In [1]:
import glob
import os
import sys
from collections import defaultdict

import matplotlib
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
import sklearn.preprocessing
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

import inStrain
import inStrain.SNVprofile
import inStrain.plottingUtilities
import inStrain.profileUtilities

%matplotlib inline
sns.set_style('whitegrid')
pd.set_option('display.max_rows', 100)
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42
pd.set_option('display.max_columns', 100)

## Read in information

In [2]:
# Date stamp selecting which set of pipeline tables to use
# (looks like MMDDYYYY -- confirm)
DATE = '04202020'

# SRA info table for that date stamp
sra_info_csv = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_full_info.csv'.format(DATE)
PLdb = pd.read_csv(sra_info_csv)

In [3]:
# Keep only the rows whose 'aligned_reads' value is greater than zero
has_aligned_reads = PLdb['aligned_reads'].gt(0)
PLdb = PLdb.loc[has_aligned_reads]

# Tally of the 'inStrain_succeeded' flag among the remaining rows
PLdb.loc[:, 'inStrain_succeeded'].value_counts()

True     449
False      1
Name: inStrain_succeeded, dtype: int64

In [4]:
# Tally of the values in the 'LibraryLayout' column
PLdb.loc[:, 'LibraryLayout'].value_counts()

PAIRED    440
SINGLE     10
Name: LibraryLayout, dtype: int64

## Set up output directory

In [5]:
# Directory that the tables written by the cells below are saved into
# (one directory per DATE stamp)
saveloc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_tables/'.format(DATE)

# makedirs(exist_ok=True) does nothing when the directory is already there and
# also creates missing parent directories; the previous isdir()/mkdir() pair
# could race with another process and failed when a parent was absent
os.makedirs(saveloc, exist_ok=True)

## Load and save coverage

In [6]:
# Rows flagged as successful inStrain runs. `.eq(True)` behaves like the
# original `== True` comparison, so missing flags are treated as not succeeded.
succeeded_runs = PLdb[PLdb['inStrain_succeeded'].eq(True)]

# Read each run's tab-separated coverage table (path stored in the
# 'inStrain_coverage' column) and tag its rows with the run accession
dbs = [
    pd.read_csv(row['inStrain_coverage'], sep='\t').assign(Run=row['Run'])
    for _, row in succeeded_runs.iterrows()
]

COdb = pd.concat(dbs).reset_index(drop=True)

# NOTE(review): the scaffold shown in the displayed output is NC_045512.2, while
# the genome label written here is MT039887.1 -- confirm this label is intended
COdb['genome'] = 'MT039887.1'

COdb.to_csv(saveloc + 'COVID_genome_coverage.csv', index=False)
COdb.head()

Unnamed: 0,scaffold,length,breadth,coverage,median_cov,std_cov,bases_w_0_coverage,mean_clonality,median_clonality,mean_microdiversity,median_microdiversity,rarefied_mean_microdiversity,rarefied_median_microdiversity,unmaskedBreadth,rarefied_breadth,expected_breadth,SNPs,Reference_SNPs,BiAllelic_SNPs,MultiAllelic_SNPs,consensus_SNPs,population_SNPs,conANI,popANI,Run,genome
0,NC_045512.2,29903,0.999131,969.343812,812,807.97274,26,0.991916,0.995506,0.008084,0.004494,0.007682,0.0,0.997458,0.991974,1.0,64,2,50,12,6,3,0.999799,0.999899,SRR11542288,MT039887.1
1,NC_045512.2,29903,0.999064,1372.82577,1180,1062.479273,28,0.9918,0.995488,0.0082,0.004512,0.00768,0.0,0.998027,0.995519,1.0,91,2,78,11,4,3,0.999866,0.999899,SRR11542289,MT039887.1
2,NC_045512.2,29903,0.989265,2968.623549,1286,3938.03032,321,0.998092,1.0,0.001908,0.0,0.001493,0.0,0.954754,0.897669,1.0,59,6,51,2,10,6,0.99965,0.99979,SRR11542243,MT039887.1
3,NC_045512.2,29903,0.790623,436.001371,202,632.044924,6261,0.998721,1.0,0.001279,0.0,0.001078,0.0,0.750694,0.654416,1.0,26,2,24,0,4,2,0.999822,0.999911,SRR11542244,MT039887.1
4,NC_045512.2,29903,0.034813,0.034813,0,0.183305,28862,,,,,,,0.0,0.0,0.030272,0,0,0,0,0,0,0.0,0.0,SRR11524818,MT039887.1


## Load and save SNVs

In [7]:
# Rows flagged as successful inStrain runs (same semantics as `== True`)
succeeded_runs = PLdb[PLdb['inStrain_succeeded'].eq(True)]

dbs = []
skipped_runs = []  # (Run, error class name) for every run whose SNV table could not be read
for _, row in succeeded_runs.iterrows():
    # The SNV table path is derived from the coverage-table path by swapping the file-name suffix
    loc = row['inStrain_coverage'].replace('_genomeWide_scaffold_info.tsv', '_SNVs.tsv')
    try:
        db = pd.read_csv(loc, sep='\t')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as error:
        # Still best effort: a missing, empty or unparsable SNV table skips the
        # run. Unlike a bare `except`, this no longer hides KeyboardInterrupt
        # or programming errors, and the skipped runs are reported below.
        skipped_runs.append((row['Run'], type(error).__name__))
        continue
    db['Run'] = row['Run']
    dbs.append(db)

if skipped_runs:
    print('Skipped {0} run(s) without a readable SNV table: {1}'.format(len(skipped_runs), skipped_runs))

Sdb = pd.concat(dbs).reset_index(drop=True)
Sdb['genome'] = 'MT039887.1'

Sdb.to_csv(saveloc + 'COVID_raw_SNVs.csv', index=False)
Sdb.head()

Unnamed: 0,scaffold,length,breadth,coverage,median_cov,std_cov,bases_w_0_coverage,mean_clonality,median_clonality,mean_microdiversity,median_microdiversity,rarefied_mean_microdiversity,rarefied_median_microdiversity,unmaskedBreadth,rarefied_breadth,expected_breadth,SNPs,Reference_SNPs,BiAllelic_SNPs,MultiAllelic_SNPs,consensus_SNPs,population_SNPs,conANI,popANI,Run,genome
0,NC_045512.2,29903,0.999131,969.343812,812,807.97274,26,0.991916,0.995506,0.008084,0.004494,0.007682,0.0,0.997458,0.991974,1.0,64,2,50,12,6,3,0.999799,0.999899,SRR11542288,MT039887.1
1,NC_045512.2,29903,0.999064,1372.82577,1180,1062.479273,28,0.9918,0.995488,0.0082,0.004512,0.00768,0.0,0.998027,0.995519,1.0,91,2,78,11,4,3,0.999866,0.999899,SRR11542289,MT039887.1
2,NC_045512.2,29903,0.989265,2968.623549,1286,3938.03032,321,0.998092,1.0,0.001908,0.0,0.001493,0.0,0.954754,0.897669,1.0,59,6,51,2,10,6,0.99965,0.99979,SRR11542243,MT039887.1
3,NC_045512.2,29903,0.790623,436.001371,202,632.044924,6261,0.998721,1.0,0.001279,0.0,0.001078,0.0,0.750694,0.654416,1.0,26,2,24,0,4,2,0.999822,0.999911,SRR11542244,MT039887.1
4,NC_045512.2,29903,0.034813,0.034813,0,0.183305,28862,,,,,,,0.0,0.0,0.030272,0,0,0,0,0,0,0.0,0.0,SRR11524818,MT039887.1


## Load and save positional coverage

In [10]:
TRUE_LENGTH = 29903
SCAFFOLD = 'NC_045512.2'
# Second positional argument handed to inStrain's `_get_basewise_clons3`
# NOTE(review): presumably the maximum read-mismatch level to include -- confirm
CLONALITY_ARG = 100


def _l2_normalize(series):
    """Return `series` scaled with sklearn's column-wise `normalize`, as a 1-D array.

    Raises ValueError (from sklearn) when the input is empty or contains NaN.
    """
    return sklearn.preprocessing.normalize(series.values.reshape(-1, 1), axis=0).ravel()


def _positional_microdiversity(clonality_table, prefix=''):
    """Build a per-position microdiversity table from one inStrain clonality table.

    Parameters
    ----------
    clonality_table : mapping of scaffold name -> clonality data, as returned by
        `IS.get('clonT')` / `IS.get('clonTR')`; only `SCAFFOLD` is read.
    prefix : str
        Prepended to the value column names ('' or 'rarefied_').

    Returns
    -------
    DataFrame with columns 'position', '<prefix>clonality',
    '<prefix>microdiversity' (1 - clonality) and
    'normalized_<prefix>microdiversity'.
    """
    clon_col = prefix + 'clonality'
    micro_col = prefix + 'microdiversity'
    norm_col = 'normalized_' + micro_col

    basewise = inStrain.plottingUtilities._get_basewise_clons3(clonality_table[SCAFFOLD], CLONALITY_ARG)
    out = basewise.reset_index(drop=False).rename(columns={'index': 'position', 0: clon_col})
    out[clon_col] = out[clon_col].astype(float)
    out[micro_col] = 1 - out[clon_col]

    # Normalize here, before the outer merges below introduce NaN values.
    # sklearn raises ValueError on empty (or NaN-containing) input; both the
    # plain and the rarefied table now fall back to NaN in that case.
    try:
        out[norm_col] = _l2_normalize(out[micro_col])
    except ValueError:
        out[norm_col] = np.nan
    return out


# One row per genome position, so positions absent from a profile still appear
bldb = pd.DataFrame({'position': list(range(TRUE_LENGTH))})

# Rows flagged as successful inStrain runs (same semantics as `== True`)
succeeded_runs = PLdb[PLdb['inStrain_succeeded'].eq(True)]

dbs = []
for _, row in succeeded_runs.iterrows():
    # Profile location = coverage-table path minus the file name and its containing directory
    IS_loc = '/'.join(row['inStrain_coverage'].split('/')[:-2])
    IS = inStrain.SNVprofile.SNVprofile(IS_loc)

    # Positional coverage
    covT = IS.get('covT')
    cov = inStrain.profileUtilities._mm_counts_to_counts_shrunk(covT[SCAFFOLD])
    db = pd.DataFrame(cov).reset_index(drop=False).rename(columns={'index': 'position', 0: 'coverage'})
    db['coverage'] = db['coverage'].astype(int)

    # Positional microdiversity, plain and rarefied
    cdb = _positional_microdiversity(IS.get('clonT'))
    crdb = _positional_microdiversity(IS.get('clonTR'), prefix='rarefied_')

    # Merge everything onto the full list of positions
    db = pd.merge(bldb, db, on='position', how='outer')
    db = pd.merge(db, cdb, on='position', how='outer')
    db = pd.merge(db, crdb, on='position', how='outer')
    db['Run'] = row['Run']
    db = db.sort_values('position')

    # Positions without a coverage value count as 0; then normalize
    db['coverage'] = db['coverage'].fillna(0).astype(int)
    db['normalized_coverage'] = _l2_normalize(db['coverage'])

    dbs.append(db)

# Re-order columns: each metric is followed by its normalized counterpart
Odb = pd.concat(dbs).reset_index(drop=True)
order = ['Run', 'position']
for item in ['coverage', 'microdiversity', 'rarefied_microdiversity']:
    order += [item, 'normalized_' + item]
Odb = Odb[order]

# Save
Odb.to_csv(saveloc + 'COVID_positional_coverage_v2.csv.gz', index=False)
Odb.head()


Unnamed: 0,Run,position,coverage,normalized_coverage,microdiversity,normalized_microdiversity,rarefied_microdiversity,normalized_rarefied_microdiversity
0,SRR11542288,0,3,1.4e-05,,,,
1,SRR11542288,1,3,1.4e-05,,,,
2,SRR11542288,2,3,1.4e-05,,,,
3,SRR11542288,3,3,1.4e-05,,,,
4,SRR11542288,4,3,1.4e-05,,,,


## Gene-level coverage and microdiversity

In [11]:
ON = 'normalized_microdiversity'  # positional metric averaged per gene
MIN_COV = 0                       # minimum genome-wide coverage a run needs to be kept

POdb = pd.read_csv('/home/mattolm/user_data/Covid_19/datatables/COVID_genes_positional_v4.csv')

# Attach gene annotations (by position) and the per-run genome-wide table (by Run).
# Both Odb and COdb carry a 'coverage' column, so after the merge 'coverage_x' is
# the positional value from Odb and 'coverage_y' the genome-wide value from COdb.
fdb = pd.merge(Odb, POdb, on='position', how='left').merge(COdb, how='left', on='Run')
fdb = fdb[fdb['coverage_y'] >= MIN_COV]


def _summarise_genes(run_label, frame):
    """Return one record per gene ('ID') in `frame`: mean positional coverage and mean `ON`.

    `run_label` is stored in the 'Run' field of every record. Rows of `frame`
    without an 'ID' value are left out by the groupby.
    """
    return [
        {'Run': run_label,
         'gene': gene,
         'coverage': gene_db['coverage_x'].mean(),
         ON: gene_db[ON].mean()}
        for gene, gene_db in frame.groupby('ID')
    ]


# Per-run gene summaries, followed by a pooled summary over every run labelled 'all'
records = []
for run, rdb in fdb.groupby('Run'):
    records.extend(_summarise_genes(run, rdb))
records.extend(_summarise_genes('all', fdb))

MGdb = pd.DataFrame(records, columns=['Run', 'gene', 'coverage', ON])

# Add rank order of each gene within its run (ties share the lowest rank, NaN stays NaN).
# Ranking through groupby keeps the result aligned with MGdb's rows, so there is
# no need to mutate the per-run slices or merge them back on float-valued keys.
ITEMS = [ON, 'coverage']
for item in ITEMS:
    MGdb['{0}_rank'.format(item)] = MGdb.groupby('Run')[item].rank(method='min')

# Save
MGdb.to_csv(saveloc + 'COVID_gene_coverage_v4.csv', index=False)
MGdb.head()

Unnamed: 0,Run,gene,coverage,normalized_microdiversity,normalized_microdiversity_rank,coverage_rank
0,SRR11059940,3'UTR,13.19214,0.001564,14.0,15.0
1,SRR11059940,5'UTR,1.607547,,,1.0
2,SRR11059940,GU280_gp01_stem_loop,16.060241,0.004236,30.0,22.0
3,SRR11059940,GU280_gp11_stem_loop,19.430769,0.004316,31.0,27.0
4,SRR11059940,YP_009724389.1,12.438965,0.001565,15.0,13.0
