In [2]:
import os
import sys
import glob
import scipy
import sklearn
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline
sns.set_style('whitegrid')
pd.set_option('display.max_rows', 100)
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42
pd.set_option('display.max_columns', 100)

## Read in information

In [3]:
DATE = '02_04_2020'
PLdb = pd.read_csv('/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_full_info.csv'.format(DATE))

In [8]:
PLdb = PLdb[PLdb['aligned_reads'] > 0]
PLdb['inStrain_succeeded'].value_counts()

True    95
Name: inStrain_succeeded, dtype: int64

## Adjust metadata

In [9]:
saveloc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_tables/'.format(DATE)
if not os.path.isdir(saveloc):
    os.mkdir(saveloc)

In [10]:
# PLdb = pd.read_csv('/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_full_info.csv'.format(DATE))

# VARIABLES = ['library_strategy', 'library_source', 'instrument', 'collected_by']
# for v in VARIABLES:
#     print("{0} samples have no {1}".format(len(PLdb[PLdb[v].isna()]), v))
# print()
    
# # Add collected_by
# PLdb['collected_by'] = ['NCBI (GEO)' if 'ACE2 library' in str(e) else c for e, c in zip(PLdb['experiment_title'], PLdb['collected_by'])]
# for v in VARIABLES:
#     print("{0} samples have no {1}".format(len(PLdb[PLdb[v].isna()]), v))
    
# # Add center_ID
# c2c = { 'Paragon Genomics':'Paragon_Genomics', 
#         'University of Washington Virology laboratory':'Univ_Washington_Vir',
#         'Victorian Infectious Diseases Reference Laboratory (VIDRL)':'VIDRL',
#         'Wuhan Jinyintan Hospital':"Wuhan_Jinyintan",
#         "Utah Public Health Laboratory":'Utah_PublicHealth',
#         'NCBI (GEO)':'NCBI (GEO)',
#         'Institute of Pathogen Biology, Chinese Academy of Medical Sciences and Peking Union Medical College':'Peking_Beijing',
#         'Wisconsin State Lab of Hygiene':'Wisconsin_StateLab',
#         'BEI Resources/American Type and Culture Collection (ATCC)':'ATCC',
#         'University of Washington Medical Center department of Laboratory Medicine, Virology':'Univ_Washington_Med',
#         'Andersen Lab, The Scripps Research Institute':'Scripps',
#         'National Influenza Centre, National Public health Laboratory, Kathmandu, Nepal':'Univ_tech_Pereira'
#       }
# PLdb['Center_ID'] = PLdb['collected_by'].map(c2c)
# PLdb[PLdb['Center_ID'].isna()]['collected_by'].value_counts()

# # Save
# PLdb.to_csv('/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_parsed_info.csv'.format(DATE), index=False)

## Load and save coverage

In [12]:
import inStrain
import inStrain.SNVprofile

dbs = []
for i, row in PLdb[PLdb['inStrain_succeeded'] == True].iterrows():
    db = pd.read_csv(row['inStrain_coverage'], sep='\t')
    db['Run'] = row['Run']
    dbs.append(db)

COdb = pd.concat(dbs).reset_index(drop=True)
COdb['genome'] = 'MT039887.1'
# for c in ['detected_scaffolds', 'true_scaffolds']:
#     del COdb[c]
    
COdb.to_csv(saveloc + 'COVID_genome_coverage.csv', index=False)
COdb.head()

Unnamed: 0,scaffold,length,breadth,coverage,median_cov,std_cov,bases_w_0_coverage,mean_clonality,median_clonality,mean_microdiversity,median_microdiversity,rarefied_mean_microdiversity,rarefied_median_microdiversity,unmaskedBreadth,rarefied_breadth,expected_breadth,SNPs,Reference_SNPs,BiAllelic_SNPs,MultiAllelic_SNPs,consensus_SNPs,population_SNPs,conANI,popANI,Run,genome
0,NC_045512.2,29903,0.965522,6.168746,5,4.292021,1031,0.997119,1.0,0.002881,0.0,,,0.617664,0.0,0.995691,14,5,9,0,6,5,0.999675,0.999729,SRR11454606,MT039887.1
1,NC_045512.2,29903,0.997559,40.258502,37,20.940226,73,0.996198,1.0,0.003802,0.0,0.003515,0.0,0.996622,0.28957,1.0,69,4,64,1,5,4,0.999832,0.999866,SRR11454607,MT039887.1
2,NC_045512.2,29903,0.998428,364.855165,265,337.567648,47,0.9958,1.0,0.0042,0.0,0.004157,0.0,0.995887,0.971809,1.0,81,5,72,4,5,5,0.999832,0.999832,SRR11454608,MT039887.1
3,NC_045512.2,29903,0.998763,139.966692,115,114.470238,37,0.995933,1.0,0.004067,0.0,0.003715,0.0,0.998261,0.869645,1.0,114,2,105,7,7,2,0.999766,0.999933,SRR11454609,MT039887.1
4,NC_045512.2,29903,0.998729,284.849948,248,183.518004,38,0.996902,1.0,0.003098,0.0,0.002892,0.0,0.998428,0.976792,1.0,50,2,45,3,2,2,0.999933,0.999933,SRR11454610,MT039887.1


## SNVs

In [13]:
import inStrain
import inStrain.SNVprofile

dbs = []
for i, row in PLdb[PLdb['inStrain_succeeded'] == True].iterrows():
    loc = row['inStrain_coverage'].replace('_genomeWide_scaffold_info.tsv', '_SNVs.tsv')
    try:
        db = pd.read_csv(loc, sep='\t')
        db['Run'] = row['Run']
        dbs.append(db)
    except:
        continue


Sdb = pd.concat(dbs).reset_index(drop=True)
Sdb['genome'] = 'MT039887.1'

Sdb.to_csv(saveloc + 'COVID_raw_SNVs.csv', index=False)
Sdb.head()

Unnamed: 0,scaffold,length,breadth,coverage,median_cov,std_cov,bases_w_0_coverage,mean_clonality,median_clonality,mean_microdiversity,median_microdiversity,rarefied_mean_microdiversity,rarefied_median_microdiversity,unmaskedBreadth,rarefied_breadth,expected_breadth,SNPs,Reference_SNPs,BiAllelic_SNPs,MultiAllelic_SNPs,consensus_SNPs,population_SNPs,conANI,popANI,Run,genome
0,NC_045512.2,29903,0.965522,6.168746,5,4.292021,1031,0.997119,1.0,0.002881,0.0,,,0.617664,0.0,0.995691,14,5,9,0,6,5,0.999675,0.999729,SRR11454606,MT039887.1
1,NC_045512.2,29903,0.997559,40.258502,37,20.940226,73,0.996198,1.0,0.003802,0.0,0.003515,0.0,0.996622,0.28957,1.0,69,4,64,1,5,4,0.999832,0.999866,SRR11454607,MT039887.1
2,NC_045512.2,29903,0.998428,364.855165,265,337.567648,47,0.9958,1.0,0.0042,0.0,0.004157,0.0,0.995887,0.971809,1.0,81,5,72,4,5,5,0.999832,0.999832,SRR11454608,MT039887.1
3,NC_045512.2,29903,0.998763,139.966692,115,114.470238,37,0.995933,1.0,0.004067,0.0,0.003715,0.0,0.998261,0.869645,1.0,114,2,105,7,7,2,0.999766,0.999933,SRR11454609,MT039887.1
4,NC_045512.2,29903,0.998729,284.849948,248,183.518004,38,0.996902,1.0,0.003098,0.0,0.002892,0.0,0.998428,0.976792,1.0,50,2,45,3,2,2,0.999933,0.999933,SRR11454610,MT039887.1


## Load and save positional coverage

In [16]:
TRUE_LENGTH = 29903
SCAFFOLD = 'NC_045512.2'
bldb = pd.DataFrame(list(range(0, TRUE_LENGTH))).rename(columns={0:'position'})

dbs = []
for i, row in PLdb[PLdb['inStrain_succeeded'] == True].iterrows():
    IS_loc = '/'.join(row['inStrain_coverage'].split('/')[:-2])
    IS = inStrain.SNVprofile.SNVprofile(IS_loc)
    
    # Load positional coverage
    covT = IS.get('covT')
    cov = inStrain.profileUtilities._mm_counts_to_counts_shrunk(covT[SCAFFOLD])
    db = pd.DataFrame(cov).reset_index(drop=False).rename(columns={'index':'position', 0:'coverage'})
    db['coverage'] = db['coverage'].astype(int)
    
    # Load positional microdiversity
    covT = IS.get('clonT')
    cov = inStrain.plottingUtilities._get_basewise_clons3(covT[SCAFFOLD], 100)
    cdb = cov.reset_index(drop=False).rename(columns={'index':'position', 0:'clonality'})
    cdb['clonality'] = cdb['clonality'].astype(float)
    cdb['microdiversity'] = 1 - cdb['clonality']
    cdb['normalized_microdiversity'] = sklearn.preprocessing.normalize(cdb['microdiversity'].values.reshape(-1, 1), axis=0) # Normalize clonality before NaN values are introduced
    
    # Load rarefied positional microdiversity
    covTR = IS.get('clonTR')
    cov = inStrain.plottingUtilities._get_basewise_clons3(covTR[SCAFFOLD], 100)
    crdb = cov.reset_index(drop=False).rename(columns={'index':'position', 0:'rarefied_clonality'})
    crdb['rarefied_clonality'] = crdb['rarefied_clonality'].astype(float)
    crdb['rarefied_microdiversity'] = 1 - crdb['rarefied_clonality']
    try:
        crdb['normalized_rarefied_microdiversity'] = sklearn.preprocessing.normalize(crdb['rarefied_microdiversity'].values.reshape(-1, 1), axis=0) # Normalize clonality before NaN values are introduced
    except ValueError:
        crdb['normalized_rarefied_microdiversity'] = np.nan
    
    # Merge
    db = pd.merge(bldb, db, on='position', how='outer')
    db = pd.merge(db, cdb, on='position', how='outer')
    db = pd.merge(db, crdb, on='position', how='outer')
    db['Run'] = row['Run']
    db = db.sort_values('position')
    
    # Backfill coverage with 0s and normalize
    db['coverage'] = db['coverage'].fillna(0).astype(int)
    db['normalized_coverage'] = sklearn.preprocessing.normalize(db['coverage'].values.reshape(-1, 1), axis=0)
    
    # Add mean and adjust
#     db['mean_coverage'] = db['coverage'].mean()
#     db['adjusted_coverage'] = db['coverage'] / db['mean_coverage']
    
#     db['mean_microdiversity'] = db['microdiversity'].mean()
#     db['adjusted_microdiversity'] = db['coverage'] / db['mean_microdiversity']
    
#     db['mean_rarefied_microdiversity'] = db['rarefied_microdiversity'].mean()
#     db['adjusted_rarefied_microdiversity'] = db['coverage'] / db['mean_rarefied_microdiversity']
    
    dbs.append(db)
    
# Re-order columns
Odb = pd.concat(dbs).reset_index(drop=True)
order = ['Run', 'position']
for item in ['coverage', 'microdiversity', 'rarefied_microdiversity']:
    order += [item, 'normalized_' + item]
Odb = Odb[order]

# Save
Odb.to_csv(saveloc + 'COVID_positional_coverage_v2.csv.gz', index=False)
Odb.head()


Unnamed: 0,Run,position,coverage,normalized_coverage,microdiversity,normalized_microdiversity,rarefied_microdiversity,normalized_rarefied_microdiversity
0,SRR11454606,0,0,0.0,,,,
1,SRR11454606,1,0,0.0,,,,
2,SRR11454606,2,0,0.0,,,,
3,SRR11454606,3,0,0.0,,,,
4,SRR11454606,4,0,0.0,,,,


## Gene-level

In [15]:
ON = 'normalized_microdiversity'
MIN_COV = 0

POdb = pd.read_csv('/home/mattolm/user_data/Covid_19/datatables/COVID_genes_positional_v2.2.csv')

fdb = pd.merge(Odb, POdb, on='position', how='left').merge(COdb, how='left', on='Run')
fdb = fdb[fdb['coverage_y'] >= MIN_COV]

table = defaultdict(list)
for run, rdb in fdb.groupby('Run'):
    for gene, db in rdb.groupby('protein'):
        table['Run'].append(run)
        table['gene'].append(gene)
        table['coverage'].append(db['coverage_x'].mean())
        table[ON].append(db[ON].mean())
        
    for gene, db in rdb.groupby('gene'):
        table['Run'].append(run)
        table['gene'].append(gene)
        table['coverage'].append(db['coverage_x'].mean())
        table[ON].append(db[ON].mean())

run = 'all'
rdb = fdb
for gene, db in rdb.groupby('protein'):
    table['Run'].append(run)
    table['gene'].append(gene)
    table['coverage'].append(db['coverage_x'].mean())
    table[ON].append(db[ON].mean())

for gene, db in rdb.groupby('gene'):
    table['Run'].append(run)
    table['gene'].append(gene)
    table['coverage'].append(db['coverage_x'].mean())
    table[ON].append(db[ON].mean())
    
MGdb = pd.DataFrame(table)

# Add rank order
dbs = defaultdict(list)
ITEMS = [ON, 'coverage']
for run, db in MGdb.groupby('Run'):
    for item in ITEMS:
        db['{0}_rank'.format(item)] = db[item].rank(method='min')#.astype(int)
        dbs[item].append(db)
for item in ITEMS:
    db = pd.concat(dbs[item]).reset_index(drop=True)
    MGdb = pd.merge(MGdb, db, how='left', on=list(MGdb.columns))
    
# Save
MGdb.to_csv(saveloc + 'COVID_gene_coverage_v2.csv', index=False)
MGdb.head()

Unnamed: 0,Run,gene,coverage,normalized_microdiversity,normalized_microdiversity_rank,coverage_rank
0,SRR11059940,E,1.921053,0.0,1.0,2.0
1,SRR11059940,M,21.856502,0.000929,7.0,25.0
2,SRR11059940,N,35.115873,0.001697,17.0,29.0
3,SRR11059940,ORF10,17.744361,0.002567,26.0,21.0
4,SRR11059940,ORF3a,9.589372,0.000642,4.0,8.0
