In [230]:
import os
import sys
import glob
import scipy
import sklearn
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline
sns.set_style('whitegrid')
pd.set_option('display.max_rows', 100)
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42
pd.set_option('display.max_columns', 100)

## Read in information

In [231]:
# SRA metadata table.
PMdb = pd.read_csv(
    filepath_or_buffer='/home/mattolm/user_data/Covid_19/datatables/SRA_metadata_v1.csv',
)

# Local-locations table; the cells below read its 'Run', 'inStrain_succeeded'
# and 'inStrain_coverage' columns.
PLdb = pd.read_csv(
    filepath_or_buffer='/home/mattolm/user_data/Covid_19/datatables/COVID_local_locations_v2.csv',
)

In [232]:
# Count how many rows carry each value of the inStrain_succeeded flag.
PLdb.loc[:, 'inStrain_succeeded'].value_counts()

True    36
Name: inStrain_succeeded, dtype: int64

## Load and save inStrain genome coverage

In [233]:
import inStrain
import inStrain.SNVprofile

# Read the inStrain genome-coverage table of every run flagged as successful
# and tag each table with the run it came from.
succeeded = PLdb[PLdb['inStrain_succeeded'].eq(True)]
dbs = [
    pd.read_csv(coverage_path, sep='\t').assign(Run=run)
    for run, coverage_path in zip(succeeded['Run'], succeeded['inStrain_coverage'])
]

# Stack the per-run tables, overwrite the genome label, and drop the two
# scaffold-count columns.
COdb = (
    pd.concat(dbs)
    .reset_index(drop=True)
    .assign(genome='MT039887.1')
    .drop(columns=['detected_scaffolds', 'true_scaffolds'])
)

COdb.to_csv('/home/mattolm/user_data/Covid_19/datatables/COVID_genome_coverage_v2.csv', index=False)
COdb.head()

Unnamed: 0,genome,true_length,SNPs,BiAllelic_SNPs,MultiAllelic_SNPs,consensus_SNPs,population_SNPs,breadth,coverage,std_cov,mean_clonality,rarefied_mean_microdiversity,conANI,popANI,unmaskedBreadth,expected_breadth,Run
0,MT039887.1,29903,23,22,1,1,0,0.99806,51015.203658,16355.048169,0.994172,0.005656,0.999966,1.0,0.997392,1.0,SRR11177792
1,MT039887.1,29903,25,20,0,6,5,0.993613,242.371066,104.383155,0.998488,0.001427,0.999798,0.999832,0.992977,1.0,SRR11278092
2,MT039887.1,29903,19,14,0,6,5,0.999799,1430.516771,591.378209,0.998533,0.001413,0.999799,0.999833,0.999799,1.0,SRR11278091
3,MT039887.1,29903,111,106,0,7,5,0.979166,56.407819,32.761759,0.998343,0.001674,0.999761,0.999829,0.977661,1.0,SRR11278090
4,MT039887.1,29903,192,184,3,5,5,0.989232,80.702438,61.107891,0.997692,0.002322,0.999831,0.999831,0.988496,1.0,SRR11278168


## Load inStrain positional coverage

In [234]:
# Import the submodules used below explicitly. `import sklearn` / `import inStrain`
# on their own do not guarantee that `sklearn.preprocessing`,
# `inStrain.profileUtilities` or `inStrain.plottingUtilities` are loaded, so the
# attribute lookups in this cell could fail on a fresh kernel.
import sklearn.preprocessing
import inStrain.SNVprofile
import inStrain.profileUtilities
import inStrain.plottingUtilities

TRUE_LENGTH = 29903       # length used to build the 0-based position backbone
SCAFFOLD = 'NC_045512.2'  # scaffold key looked up in the inStrain profile objects

# Backbone with one row per position, so positions missing from a profile
# still appear in the merged output.
bldb = pd.DataFrame({'position': range(TRUE_LENGTH)})

dbs = []
for i, row in PLdb[PLdb['inStrain_succeeded'] == True].iterrows():
    # The inStrain profile folder sits two path components above the coverage table.
    IS_loc = '/'.join(row['inStrain_coverage'].split('/')[:-2])
    IS = inStrain.SNVprofile.SNVprofile(IS_loc)

    # Positional coverage.
    # NOTE(review): presumably collapses per-mismatch-level counts into a single
    # per-position vector indexed by position -- confirm against inStrain.
    covT = IS.get('covT')
    coverage = inStrain.profileUtilities._mm_counts_to_counts_shrunk(covT[SCAFFOLD])
    db = pd.DataFrame(coverage).reset_index(drop=False).rename(columns={'index':'position', 0:'coverage'})
    db['coverage'] = db['coverage'].astype(int)

    # Positional microdiversity (1 - clonality).
    # NOTE(review): the meaning of the literal 100 passed to _get_basewise_clons3
    # is not visible here -- confirm against inStrain.
    clonT = IS.get('clonT')
    clonality = inStrain.plottingUtilities._get_basewise_clons3(clonT[SCAFFOLD], 100)
    cdb = clonality.reset_index(drop=False).rename(columns={'index':'position', 0:'clonality'})
    cdb['clonality'] = cdb['clonality'].astype(float)
    cdb['microdiversity'] = 1 - cdb['clonality']
    # Normalize here, before the outer merge below introduces NaN values.
    # normalize(..., axis=0) rescales the column as one vector (L2 norm by default).
    cdb['normalized_microdiversity'] = sklearn.preprocessing.normalize(
        cdb['microdiversity'].values.reshape(-1, 1), axis=0)

    # Rarefied positional microdiversity, processed the same way.
    clonTR = IS.get('clonTR')
    rarefied_clonality = inStrain.plottingUtilities._get_basewise_clons3(clonTR[SCAFFOLD], 100)
    crdb = rarefied_clonality.reset_index(drop=False).rename(columns={'index':'position', 0:'rarefied_clonality'})
    crdb['rarefied_clonality'] = crdb['rarefied_clonality'].astype(float)
    crdb['rarefied_microdiversity'] = 1 - crdb['rarefied_clonality']
    try:
        crdb['normalized_rarefied_microdiversity'] = sklearn.preprocessing.normalize(
            crdb['rarefied_microdiversity'].values.reshape(-1, 1), axis=0)
    except ValueError:
        # normalize() rejected the input (presumably no rarefied values for this
        # run); keep the column but leave it empty.
        crdb['normalized_rarefied_microdiversity'] = np.nan

    # Merge everything onto the per-position backbone.
    db = pd.merge(bldb, db, on='position', how='outer')
    db = pd.merge(db, cdb, on='position', how='outer')
    db = pd.merge(db, crdb, on='position', how='outer')
    db['Run'] = row['Run']
    db = db.sort_values('position')

    # Positions absent from the profile get coverage 0; normalize afterwards so
    # those zeros are part of the vector being normalized.
    db['coverage'] = db['coverage'].fillna(0).astype(int)
    db['normalized_coverage'] = sklearn.preprocessing.normalize(
        db['coverage'].values.reshape(-1, 1), axis=0)

    dbs.append(db)

# Stack all runs and put each metric next to its normalized counterpart.
Odb = pd.concat(dbs).reset_index(drop=True)
order = ['Run', 'position']
for item in ['coverage', 'microdiversity', 'rarefied_microdiversity']:
    order += [item, 'normalized_' + item]
Odb = Odb[order]

# Save
Odb.to_csv('/home/mattolm/user_data/Covid_19/datatables/COVID_positional_coverage_v2.csv', index=False)
Odb.head()


Unnamed: 0,Run,position,coverage,normalized_coverage,microdiversity,normalized_microdiversity,rarefied_microdiversity,normalized_rarefied_microdiversity
0,SRR11177792,0,3,3.238322e-07,,,,
1,SRR11177792,1,3,3.238322e-07,,,,
2,SRR11177792,2,3,3.238322e-07,,,,
3,SRR11177792,3,3,3.238322e-07,,,,
4,SRR11177792,4,3,3.238322e-07,,,,


## Make new genes table

In [278]:
import inStrain
import Bio
import logging
from Bio import SeqIO

def _append_feature_row(table, gene, scaffold, feature, translated):
    """Append one feature as a row of the column-oriented ``table`` (dict of lists)."""
    loc = feature.location
    table['gene'].append(gene)
    table['scaffold'].append(scaffold)
    table['direction'].append(loc.strand)
    table['translated'].append(translated)
    table['start'].append(loc.start)
    table['end'].append(loc.end)
    table['location'].append(loc)


def parse_genbank_genes_COV(gene_file, gene_name='gene'):
    """Parse a GenBank file into a feature table plus sequence lookups.

    Handled feature types:
      * ``CDS``          -> named by ``feature.qualifiers[gene_name][0]``, translated
      * ``*UTR*`` / ``*stem_loop*`` -> named by the feature type, not translated
      * ``mat_peptide``  -> named by ``feature.qualifiers['protein_id'][0]``, translated
      * ``gene`` / ``source`` -> ignored
    Any other feature type is printed and skipped.

    Parameters
    ----------
    gene_file : path or handle accepted by ``SeqIO.parse(..., 'gb')``
    gene_name : str
        Qualifier key used to name CDS features.

    Returns
    -------
    Gdb : pandas.DataFrame
        One row per kept feature, with columns gene, scaffold, direction,
        translated, start, end, location.
    gene2sequence : dict
        Feature name -> extracted nucleotide sequence (translated features only).
    gene2aa : dict
        Feature name -> translation string with the final residue removed.
    sequence
        ``record.seq`` of the last record in the file.

    Raises
    ------
    ValueError
        If the file contains no records.
    AssertionError
        If a CDS translation disagrees with its ``translation`` qualifier, or a
        mat_peptide length is not a multiple of 3.
    """
    table = defaultdict(list)
    gene2sequence = {}
    gene2aa = {}
    genome_seq = None

    for record in SeqIO.parse(gene_file, 'gb'):
        scaffold = record.id
        genome_seq = record.seq

        for feature in record.features:
            if feature.type == 'CDS':
                sequence = feature.location.extract(record).seq
                gene = feature.qualifiers[gene_name][0]
                _append_feature_row(table, gene, scaffold, feature, True)

                # NOTE(review): if two CDS features share the same qualifier
                # value, the later one overwrites the earlier dict entries.
                gene2sequence[gene] = sequence
                # Drop the final translated symbol, then verify against the
                # translation recorded in the file.
                TransSeq = str(sequence.translate()[:-1])
                assert TransSeq == str(feature.qualifiers['translation'][0]), [TransSeq, str(feature.qualifiers['translation'][0])]
                gene2aa[gene] = TransSeq

            elif ('UTR' in feature.type) or ('stem_loop' in feature.type):
                _append_feature_row(table, feature.type, scaffold, feature, False)

            elif feature.type == 'mat_peptide':
                gene = feature.qualifiers['protein_id'][0]
                sequence = feature.location.extract(record).seq

                # Trace of every mature-peptide location encountered.
                print(feature.location)

                _append_feature_row(table, gene, scaffold, feature, True)

                # Check for whole codons before translating.
                assert len(sequence) % 3 == 0
                gene2sequence[gene] = sequence
                # NOTE(review): [:-1] mirrors the CDS branch, but a mat_peptide
                # location may not include a stop codon, in which case this
                # drops a real residue -- confirm against how gene2aa is consumed.
                gene2aa[gene] = str(sequence.translate()[:-1])

            elif feature.type in ['gene', 'source']:
                pass

            else:
                # Unhandled feature type: report it so it is not silently lost.
                print(feature.type)

    if genome_seq is None:
        raise ValueError('No GenBank records found in {0}'.format(gene_file))

    Gdb = pd.DataFrame(table)

    return Gdb, gene2sequence, gene2aa, genome_seq

# Parse the GenBank record into the feature table, the per-feature nucleotide
# and amino-acid lookups, and the record sequence.
Gdb, gene2sequence, gene2aa, source = parse_genbank_genes_COV(
    gene_file='/home/mattolm/user_data/Covid_19/genomes/NC_045512.2.gb',
)

[265:805](+)
[805:2719](+)
[2719:8554](+)
[8554:10054](+)
[10054:10972](+)
[10972:11842](+)
[11842:12091](+)
[12091:12685](+)
[12685:13024](+)
[13024:13441](+)
join{[13441:13468](+), [13467:16236](+)}
[16236:18039](+)
[18039:19620](+)
[19620:20658](+)
[20658:21552](+)
[13441:13480](+)


# Convert the gene table to a per-position table

In [279]:
def make_positional_gene_table_COV(Gdb, gene2sequence, gene2aa, source):
    """Expand the feature table into one row per reference position.

    Parameters
    ----------
    Gdb : pandas.DataFrame
        Feature table with at least 'gene', 'translated' and 'location' columns;
        each location must support ``len()`` and iterate over 0-based positions.
    gene2sequence : dict
        Feature name -> nucleotide sequence; must support ``len()``, iteration
        over bases and ``.translate()``.
    gene2aa : dict
        Feature name -> amino-acid string. Used only as a consistency check:
        it must be a prefix of the translation of the nucleotide sequence.
    source : iterable of str
        Reference sequence, one base per element.

    Returns
    -------
    pandas.DataFrame
        Columns position, ref_seq, protein, orf_position, aa, aa_index, gene.
        A position covered by more than one translated (or untranslated)
        feature appears once per covering feature, because of the left merges.

    Notes
    -----
    Per-codon amino-acid labels come from translating the whole nucleotide
    sequence, with '*' reported as 'stop'. Features that end in a stop codon
    get a final 'stop' codon; features without one (e.g. mature peptides) keep
    their real last residue instead of having it relabelled as 'stop'.
    """
    # Per-base table of the reference, plus a position -> base lookup.
    l2c = dict(enumerate(source))
    Sdb = pd.DataFrame({'position': list(l2c.keys()), 'ref_seq': list(l2c.values())})

    # Per-position annotation of translated features.
    table = defaultdict(list)
    for j, row in Gdb[Gdb['translated'] == True].iterrows():
        gene = row['gene']

        # Skip the polyprotein; it is represented by its cleavage products.
        if gene in ['orf1ab']:
            continue

        NSEQ = gene2sequence[gene]
        location = row['location']

        # Translate every codon of the feature, so the final codon is labelled
        # with what it actually encodes.
        translation = str(NSEQ.translate())
        assert translation.startswith(gene2aa[gene]), [gene, translation, gene2aa[gene]]

        EXPA = []
        for a in translation:
            EXPA += ['stop' if a == '*' else a] * 3
        assert len(NSEQ) == len(location) == len(EXPA), [len(NSEQ), len(location), len(EXPA)]

        for pos, (loc, char, a) in enumerate(zip(location, NSEQ, EXPA)):
            # The extracted feature sequence must agree with the reference.
            assert l2c[loc] == char, [gene, l2c[loc], char]
            table['position'].append(loc)
            table['protein'].append(gene)
            table['orf_position'].append(pos % 3)
            table['aa'].append(a)
            table['aa_index'].append(pos // 3)
    Tdb = pd.DataFrame(table)

    # Per-position annotation of untranslated features.
    table = defaultdict(list)
    for j, row in Gdb[Gdb['translated'] == False].iterrows():
        for loc in row['location']:
            table['position'].append(loc)
            table['gene'].append(row['gene'])
    gdb = pd.DataFrame(table)

    # Left-merge so that every reference position is kept.
    db = pd.merge(Sdb, Tdb, on='position', how='left').merge(gdb, on='position', how='left')
    return db
        
# Build the per-position annotation table from the parsed GenBank features.
POdb = make_positional_gene_table_COV(
    Gdb=Gdb,
    gene2sequence=gene2sequence,
    gene2aa=gene2aa,
    source=source,
)

## Save the gene tables

In [237]:
# Work on a copy of the gene table, with each location object replaced by its
# string form, then write it out as CSV.
GOdb = Gdb.assign(location=Gdb['location'].map(str))
GOdb.to_csv(
    '/home/mattolm/user_data/Covid_19/datatables/COVID_genes_table_v2.csv',
    index=False,
)
GOdb.head()

Unnamed: 0,gene,scaffold,direction,translated,start,end,location
0,5'UTR,NC_045512.2,1,False,0,265,[0:265](+)
1,orf1ab,NC_045512.2,1,True,265,21555,"join{[265:13468](+), [13467:21555](+)}"
2,YP_009725297.1,NC_045512.2,1,True,265,805,[265:805](+)
3,YP_009725298.1,NC_045512.2,1,True,805,2719,[805:2719](+)
4,YP_009725299.1,NC_045512.2,1,True,2719,8554,[2719:8554](+)


In [238]:
# Write the per-position annotation table and preview its first rows.
POdb.to_csv(
    path_or_buf='/home/mattolm/user_data/Covid_19/datatables/COVID_genes_positional_v2.csv',
    index=False,
)
POdb.head()

Unnamed: 0,position,ref_seq,protein,orf_position,aa,aa_index,gene
0,0,A,,,,,5'UTR
1,1,T,,,,,5'UTR
2,2,T,,,,,5'UTR
3,3,A,,,,,5'UTR
4,4,A,,,,,5'UTR


## Save version 2.2 with CCs names

In [276]:
## Do it with CCs names
# Table that maps each 'gene' identifier to a display 'name'.
Ndb = pd.read_csv(
    '/home/mattolm/user_data/Covid_19/datatables/COVID_genes_table_v2.tsv',
    sep='\t',
)

# Attach the display name to a copy of the gene table.
GOdb = Gdb.assign(name=Gdb['gene'].map(dict(zip(Ndb['gene'], Ndb['name']))))

# NOTE(review): row label 17 is hard-coded and depends on the feature order of
# the parsed GenBank file -- confirm it still points at the intended row.
GOdb.loc[17, 'name'] = 'orf1a'

# Replace location objects with their string form before writing the CSV.
GOdb['location'] = GOdb['location'].map(str)
GOdb.to_csv(
    '/home/mattolm/user_data/Covid_19/datatables/COVID_genes_table_v2.2.csv',
    index=False,
)
GOdb.head()

Unnamed: 0,gene,scaffold,direction,translated,start,end,location,name
0,5'UTR,NC_045512.2,1,False,0,265,[0:265](+),5'UTR
1,orf1ab,NC_045512.2,1,True,265,21555,"join{[265:13468](+), [13467:21555](+)}",orf1ab
2,YP_009725297.1,NC_045512.2,1,True,265,805,[265:805](+),leader protein
3,YP_009725298.1,NC_045512.2,1,True,805,2719,[805:2719](+),nsp2
4,YP_009725299.1,NC_045512.2,1,True,2719,8554,[2719:8554](+),nsp3


In [282]:
# Gene identifier -> display name lookup, built once and used for both columns.
name_lookup = GOdb.set_index('gene')['name'].to_dict()

# Copy of the positional table with display names for the protein and gene columns.
POdb2 = POdb.assign(
    protein_name=POdb['protein'].map(name_lookup),
    gene_name=POdb['gene'].map(name_lookup),
)

POdb2.to_csv(
    '/home/mattolm/user_data/Covid_19/datatables/COVID_genes_positional_v2.2.csv',
    index=False,
)
POdb2.head()

Unnamed: 0,position,ref_seq,protein,orf_position,aa,aa_index,gene,protein_name,gene_name
0,0,A,,,,,5'UTR,,5'UTR
1,1,T,,,,,5'UTR,,5'UTR
2,2,T,,,,,5'UTR,,5'UTR
3,3,A,,,,,5'UTR,,5'UTR
4,4,A,,,,,5'UTR,,5'UTR
