In [2]:
import os
import sys
import glob
import scipy
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from subprocess import call

# Render matplotlib figures directly in the notebook output
%matplotlib inline
sns.set_style('whitegrid')
# Allow up to 100 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 100)
# Font type 42 = TrueType: text in saved PS/PDF figures is embedded as real
# fonts (Type 42) instead of the default Type 3 outlines
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42
# Allow up to 100 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 100)

In [3]:
def SRA_to_bioproject(accession):
    '''
    From an SRA accession (like SRR1883283) return its BioProject accession.

    Three Entrez calls are chained:
      1. esearch: SRA accession -> SRA UID (first hit)
      2. elink:   SRA UID -> BioProject UID (first link of "sra_bioproject")
      3. esummary: BioProject UID -> 'Project_Acc'

    Returns the BioProject accession string, or the sentinel string 'Fail'
    when any step raises (request error, no hits, unexpected record layout).
    The sentinel is kept because callers compare the result against 'Fail'.
    '''
    from Bio import Entrez
    Entrez.email = "mattolm@gmail.com"

    def _read(handle):
        # Parse an Entrez response and always release the handle, even when
        # parsing fails
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    try:
        # GO FROM SRA ACCESSION TO SRA ID
        record = _read(Entrez.esearch(db='sra', term=accession, retmode='text'))
        sra_uid = record['IdList'][0]

        # ELINK FROM SRA ID TO BIOPROJECT ID
        record = _read(Entrez.elink(dbfrom="sra", retmax=10, id=sra_uid, linkname="sra_bioproject"))
        bioproject_uid = record[0]["LinkSetDb"][0]["Link"][0]["Id"]

        # SUMMARY FROM BIOPROJECT ID TO ACCESSION
        record = _read(Entrez.esummary(db="bioproject", id=bioproject_uid))
        return record['DocumentSummarySet']['DocumentSummary'][0]['Project_Acc']

    except Exception:
        # Deliberately best-effort, but `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit can still stop a long batch of lookups
        return 'Fail'
    
def SRA_to_library(accession):
    '''
    From an SRA accession (like SRR1883283) return its library layout.

    The SRA record is fetched with Entrez.efetch and scanned line by line.
    On every line mentioning LIBRARY_LAYOUT, the first token after the tag is
    taken as the layout (e.g. '<LIBRARY_LAYOUT><PAIRED/>' -> 'PAIRED').
    If several lines match, the last one wins.

    Returns the layout token, or 'UNK' when none could be extracted.
    '''
    from Bio import Entrez
    Entrez.email = "mattolm@gmail.com"

    lib = 'UNK'
    handle = Entrez.efetch(db="sra", id=accession, retmode='text')
    try:
        for line in handle.readlines():
            line = line.strip()
            if 'LIBRARY_LAYOUT' in line:
                # Drop the two characters closing/opening the tags ('><'),
                # then keep the element name up to its '/' or first attribute
                tokens = line.split('LIBRARY_LAYOUT')[1][2:].split('/')[0].split()
                # A line holding only the opening or closing tag has no token;
                # skip it rather than failing with IndexError
                if tokens:
                    lib = tokens[0]
    finally:
        # Release the connection even if reading or parsing raises
        handle.close()
    return lib


# Set up

In [8]:
val_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/validated_downloads.txt'
meta_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Metadata/metadata.xlsx'

THREADS = 6

# Runs whose download was already validated: one accession per line
with open(val_loc, 'r') as validated_fh:
    VALIDATED_RUNS = {entry.strip() for entry in validated_fh}

# Hand-curated metadata workbook: one sheet per BioProject, one per sample
BioMeta = pd.read_excel(meta_loc, sheet_name='BioProjects')
RunMeta = pd.read_excel(meta_loc, sheet_name='IndividualSamples')
BioMeta.head()

Unnamed: 0,BioProject,Link,Group,Center_ID,Method,IndividualLabels?,Publication link,Publication title,Description
0,PRJNA615032,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,Icahn School of Medicine at Mount Sina,Icahn_NY,IRRELEVANT,No,https://www.biorxiv.org/content/10.1101/2020.0...,SARS-CoV-2 launches a unique transcriptional s...,Evaluating lots of things; worth following up....
1,PRJNA610428,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,UNIVERSITY OF WASHINGTON,UW_WA,IRRELEVANT,No,,,The are isolating the virus first using human ...
2,PRJNA613958,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,The Peter Doherty Institute for Infection and ...,Doherty_Melbourne,PCR_ARCTIC,No,,,
3,PRJNA614546,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,Paragon Genomics,Paragon_CA,Mixed,No,https://www.biorxiv.org/content/10.1101/2020.0...,High sensitivity detection of coronavirus SARS...,Evaluating lots of things; worth following up....
4,PRJNA616446,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,Hubei Provincial Center for Disease Control an...,Hubei_China,RNA-seq,No,https://www.biorxiv.org/content/10.1101/2020.0...,Genome-wide data inferring the evolution and p...,Sequencing genomes


# Get a list of all SRA samples

In [4]:
from datetime import date
today = date.today()
d4 = today.strftime("%m%d%Y")
dloc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA.tsv'.format(d4)

SEARCH_STRING = '(\\"Severe acute respiratory syndrome coronavirus 2\\"[Organism] OR SARS-CoV-2[All Fields]) AND \\"platform illumina\\"[Properties]'

! pysradb search "$SEARCH_STRING" --detailed --saveto "$dloc"

SRdb = pd.read_csv(dloc, sep='\t')
SRdb['Run'] = SRdb['run_accession']
SRdb = SRdb[~SRdb['run_accession'].isna()]

# Add BioProject and layout
SRdb['BioProject'] = [SRA_to_bioproject(r) for r in SRdb['Run']]
SRdb['TrueLibraryLayout'] = [SRA_to_library(r) for r in SRdb['Run']]


  from pandas import Panel
  from pandas import Panel


In [5]:
# Any unknown BioProjects? (present in the SRA results, absent from the metadata sheet)
unk = set(SRdb['BioProject'].tolist()).difference(BioMeta['BioProject'].tolist())
if unk:
    print("{0} new BioProjects with no info".format(len(unk)))
    print("\n".join(unk))
else:
    print("All BioProjects are known")

4 new BioProjects with no info
PRJNA624358
Fail
PRJNA624792
PRJNA616147


In [7]:
# Inspect the runs that belong to one of the newly seen BioProjects
SRdb.loc[SRdb['BioProject'].eq('PRJNA624358')]

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_strategy,library_source,library_selection,sample_accession,sample_title_x,instrument,total_spots,total_size,run_accession,run_total_spots,run_total_bases,run_alias,sra_url_alt,sra_url,experiment_alias,isolate,collected_by,collection_date,geo_loc_name,host,host_disease,isolation_source,lat_lon,BioSampleModel,passage_history,strain,source_name,subject status,tissue/cell type,treatment,time after treatment,sub_species,cell line,description,env_broad_scale,env_local_scale,env_medium,host_taxid,isol_growth_condt,propagation,sample_title_y,seq_methods,source_uvig,virus_enrich_appr,culture_collection,genotype,host_age,host_description,host_disease_outcome,host_disease_stage,host_health_state,host_sex,host_subject_id,host_tissue_sampled,pathotype,serotype,serovar,specimen_voucher,subgroup,subtype,cell type,time point,identification_method,sample type,Laboratory Host,Extraction Method,ref_biomaterial,link_addit_analys,country,sra_url_alt1,sra_url_alt2,ena_fastq_url,ena_fastq_ftp,Run,BioProject
6,SRP255993,SRX8095881,WGS of SARS-CoV-2 R03006_2020_2,WGS of SARS-CoV-2 R03006_2020_2,2697049.0,Severe acute respiratory syndrome coronavirus 2,WGS,GENOMIC,RANDOM,SRS6462475,,Illumina MiSeq,2971431.0,1079579000.0,SRR11524818,2971431.0,1787565000.0,R03006_2_2020_S2_L001_R1_001.fastq.gz,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,,"National Institute for Communicable Diseases, ...",2020-03-07,South Africa: KwaZulu-Natal,Homo sapiens,COVID-19,Combined nasopharyngeal and oropharyngeal swab,28.5306 S 30.8958 E,Pathogen.cl,,R03006_2020_2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SRR11524818,PRJNA624358
7,SRP255993,SRX8095880,WGS of SARS-CoV-2 R03006_2020_1,WGS of SARS-CoV-2 R03006_2020_1,2697049.0,Severe acute respiratory syndrome coronavirus 2,WGS,GENOMIC,RANDOM,SRS6462474,,Illumina MiSeq,1731908.0,686253000.0,SRR11524819,1731908.0,1042411000.0,R03006_1_2020_S2_L001_R1_001.fastq.gz,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,,"National Institute for Communicable Diseases, ...",2020-03-07,South Africa: KwaZulu-Natal,Homo sapiens,COVID-19,Combined nasopharyngeal and oropharyngeal swab,28.5306 S 30.8958 E,Pathogen.cl,,R03006_2020_1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SRR11524819,PRJNA624358


In [11]:
# d4 = '04202020'
# dloc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA.tsv'.format(d4)
# SRdb = pd.read_csv(dloc, sep='\t')
# SRdb['Run'] = SRdb['run_accession']
# SRdb = SRdb[~SRdb['run_accession'].isna()]

# # Add BioProject
# SRdb['BioProject'] = [SRA_to_bioproject(r) for r in SRdb['Run']]
# SRdb['TrueLibraryLayout'] = [SRA_to_library(r) for r in SRdb['Run']]

In [59]:
# Manual fix: runs whose BioProject lookup returned 'Fail' and that were
# collected by the Utah Public Health Laboratory are assigned to PRJNA614995
SRdb['BioProject'] = [
    'PRJNA614995' if (collector == 'Utah Public Health Laboratory' and project == 'Fail') else project
    for project, collector in zip(SRdb['BioProject'], SRdb['collected_by'])
]

# Any unknown BioProjects? (present in the SRA results, absent from the metadata sheet)
unk = set(SRdb['BioProject'].tolist()).difference(BioMeta['BioProject'].tolist())
if unk:
    print("{0} new BioProjects with no info".format(len(unk)))
    print("\n".join(unk))
else:
    print("All BioProjects are known")

All BioProjects are known


In [60]:
# Filter out irrelevant bioprojects (Method == 'IRRELEVANT' in the metadata sheet)
irrelevant_projects = BioMeta.loc[BioMeta['Method'] == 'IRRELEVANT', 'BioProject']
SRdb = SRdb[~SRdb['BioProject'].isin(irrelevant_projects)]
NEW = set(SRdb['Run'].tolist())
print(
    "{0} new samples (and {1} already processed = {2} total (including irrelevant already processed))".format(
        len(NEW - VALIDATED_RUNS), len(VALIDATED_RUNS), len(SRdb)
    )
)

0 new samples (and 773 already processed = 459 total (including irrelevant already processed))


In [95]:
# Tally the library layouts as reported by the SRA records themselves
# (TrueLibraryLayout is filled by SRA_to_library)
print(SRdb['TrueLibraryLayout'].value_counts())

PAIRED    453
SINGLE      6
Name: TrueLibraryLayout, dtype: int64


# START PROCESSING!

## Save preliminary list

In [5]:
# Checkpoint. The save below is commented out; this cell instead RELOADS a
# previously saved filtered table, replacing the SRdb built in the cells above.
#SRdb.to_csv('/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_filtered.csv'.format(d4), index=False)
# NOTE(review): d4 is pinned to a fixed date stamp here, overriding the value
# derived from today's date earlier; every later file name uses this stamp.
d4 = '04202020'
SRdb = pd.read_csv('/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_filtered.csv'.format(d4))

# Download samples that need downloading

In [9]:
import subprocess
def check_sra_file(file):
    """
    Return True when vdb-validate exits successfully for `file`, else False.

    The validator is called with an argument list (no shell), so unusual
    characters in the path are passed through literally. Its output is
    discarded because only the exit status decides the result.

    Returns False both when validation fails and when the validator cannot
    be started (e.g. the executable is missing).
    """
    try:
        result = subprocess.run(
            ["/home/mattolm/miniconda3/bin/vdb-validate", str(file)],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except (OSError, ValueError):
        # OSError: executable not found / not runnable; ValueError: invalid argument
        return False
    return result.returncode == 0

In [10]:
base_loc = '/home/mattolm/user_data/Covid_19/reads/'
odir = '/home/mattolm/user_data/Covid_19/Pipeline/test/'

# Local path of every run's .sra file. Runs already listed as validated are
# trusted as-is; every other file is checked with check_sra_file.
SRdb['sra_file'] = [base_loc + run for run in SRdb['Run']]
SRdb['sra_file_consistant'] = [
    True if run in VALIDATED_RUNS else check_sra_file(sra_path)
    for run, sra_path in zip(SRdb['Run'], SRdb['sra_file'])
]

# Download everything that fails validation, re-validate, and repeat until
# all files pass or TOTAL_LOOPS rounds have been run
TOTAL_LOOPS = 50
loop = 0
pending = SRdb[SRdb['sra_file_consistant'] == False]
while len(pending) > 0:
    print('Running loop {0}; downloading {1} new files'.format(loop, len(pending)))
    cmd_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/{0}_download.txt'.format(d4)
    with open(cmd_loc, 'w') as o:
        o.writelines(
            "wget -c {0} -O {1}\n".format(url, dest)
            for url, dest in zip(pending['sra_url'], pending['sra_file'])
        )
    cmd = "cat {0} | parallel -j {1}".format(cmd_loc, THREADS)
    call(cmd, shell=True)

    # Only files that were still failing are validated again
    SRdb['sra_file_consistant'] = [
        ok or check_sra_file(sra_path)
        for sra_path, ok in zip(SRdb['sra_file'], SRdb['sra_file_consistant'])
    ]
    pending = SRdb[SRdb['sra_file_consistant'] == False]
    loop += 1
    if loop == TOTAL_LOOPS:
        break

if len(pending) > 0:
    print("THE FOLLOWING HAVE PROBLEMS:")
    for url, dest in zip(pending['sra_url'], pending['sra_file']):
        print("vdb-validate {0}".format(dest))
        print("wget -c {0} -O {1}\n".format(url, dest))


In [11]:
# Print copy-pasteable commands to retry each file that still fails validation
problem_runs = SRdb[SRdb['sra_file_consistant'] == False]
if len(problem_runs) == 0:
    print("All are good!")
else:
    print("THE FOLLOWING {0} HAVE PROBLEMS:".format(len(problem_runs)))
    for url, dest in zip(problem_runs['sra_url'], problem_runs['sra_file']):
        print("/usr/bin/vdb-validate {0}".format(dest))
        print("wget -c {0} -O {1}\n".format(url, dest))

All are good!


In [12]:
# Save new validations
VALIDATED_RUNS = VALIDATED_RUNS.union(set(SRdb[SRdb['sra_file_consistant'] == True]['Run'].tolist()))
with open('/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/validated_downloads.txt', 'w') as o:
    for v in VALIDATED_RUNS:
        o.write(v + '\n')

In [13]:
# Keep only runs whose .sra file validated. `.copy()` gives an independent
# frame, because later cells add many columns to SRdb and would otherwise be
# writing into a slice (pandas SettingWithCopyWarning).
SRdb = SRdb[SRdb['sra_file_consistant'] == True].copy()

## Fastq Dump

In [14]:
base_loc = '/home/mattolm/user_data/Covid_19/reads/'

# Expected fastq-dump outputs for every run
SRdb['fastq1'] = [x + '_1.fastq' for x in SRdb['sra_file']]
SRdb['fastq2'] = [x + '_2.fastq' for x in SRdb['sra_file']]


def _refresh_missing_fastq(db):
    """
    Re-check on disk which fastq files exist (updating both fastq1_exists and
    fastq2_exists in place) and return the rows still missing their dump:
    the _1 file for TrueLibraryLayout == 'SINGLE', the _2 file otherwise.
    """
    db['fastq1_exists'] = [os.path.exists(x) for x in db['fastq1']]
    db['fastq2_exists'] = [os.path.exists(x) for x in db['fastq2']]
    missing = [
        (not has_1) if layout == 'SINGLE' else (not has_2)
        for layout, has_1, has_2 in zip(db['TrueLibraryLayout'], db['fastq1_exists'], db['fastq2_exists'])
    ]
    return db[missing]


loop = 0
MISSING_FASTA = _refresh_missing_fastq(SRdb)
while len(MISSING_FASTA) > 0:
    print("Loop {0} - {1} samples are missing their fastq dumps".format(loop, len(MISSING_FASTA)))

    cmd_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/{0}_dump.txt'.format(d4)
    with open(cmd_loc, 'w') as o:
        for i, row in MISSING_FASTA.iterrows():
            o.write("fastq-dump {0} --split-files -O {1}".format(row['sra_file'], base_loc) + '\n')
    cmd = "cat {0} | parallel -j {1}".format(cmd_loc, THREADS)
    call(cmd, shell=True)

    # Both existence columns must be refreshed here. Previously only
    # fastq1_exists was, so paired samples (judged on fastq2_exists) stayed
    # "missing" after a successful dump and were later labelled SINGLE.
    MISSING_FASTA = _refresh_missing_fastq(SRdb)
    loop += 1

    # Give up after three attempts
    if loop == 3:
        break

if len(MISSING_FASTA) > 0:
    print("{0} samples are still missing their fastq dumps. They will be marked as single".format(len(MISSING_FASTA)))
    for i, row in MISSING_FASTA.iterrows():
        print("fastq-dump {0} --split-files -O {1}".format(row['sra_file'], base_loc) + '\n')

# Layout used downstream is decided by what is actually on disk
SRdb['LibraryLayout'] = ['PAIRED' if t else 'SINGLE' for t in SRdb['fastq2_exists']]

Loop 0 - 7 samples are missing their fastq dumps
Loop 1 - 7 samples are missing their fastq dumps
Loop 2 - 7 samples are missing their fastq dumps
7 samples are still missing their fastq dumps. They will be marked as single
fastq-dump /home/mattolm/user_data/Covid_19/reads/SRR11542288 --split-files -O /home/mattolm/user_data/Covid_19/reads/

fastq-dump /home/mattolm/user_data/Covid_19/reads/SRR11542289 --split-files -O /home/mattolm/user_data/Covid_19/reads/

fastq-dump /home/mattolm/user_data/Covid_19/reads/SRR11513114 --split-files -O /home/mattolm/user_data/Covid_19/reads/

fastq-dump /home/mattolm/user_data/Covid_19/reads/SRR11513118 --split-files -O /home/mattolm/user_data/Covid_19/reads/

fastq-dump /home/mattolm/user_data/Covid_19/reads/SRR11479033 --split-files -O /home/mattolm/user_data/Covid_19/reads/

fastq-dump /home/mattolm/user_data/Covid_19/reads/SRR11479040 --split-files -O /home/mattolm/user_data/Covid_19/reads/

fastq-dump /home/mattolm/user_data/Covid_19/reads/SRR114

In [15]:
# Tally the layout inferred from the files on disk (PAIRED when the _2 fastq exists)
SRdb['LibraryLayout'].value_counts()

PAIRED    449
SINGLE     10
Name: LibraryLayout, dtype: int64

In [16]:
# Cross-tabulate the layout reported by SRA against the layout found on disk
for true_layout, layout_db in SRdb.groupby('TrueLibraryLayout'):
    print(true_layout)
    print(layout_db['LibraryLayout'].value_counts())

PAIRED
PAIRED    446
SINGLE      7
Name: LibraryLayout, dtype: int64
SINGLE
SINGLE    3
PAIRED    3
Name: LibraryLayout, dtype: int64


## Process reads

In [17]:
# Expected outputs of the read-filtering step, named like the files that
# process_reads / process_reads_s write (*_bbduk_1, *_bbduk_2, *_bbduk)
filtered_dir = '/home/mattolm/user_data/Covid_19/reads/filtered/'

SRdb['filtered_fastq1'] = [
    filtered_dir + os.path.basename(rp).replace('_1.fastq', '_rep1.fastq').replace('_rep1.fastq', '_bbduk_1.fastq')
    for rp in SRdb['fastq1']
]
SRdb['filtered_fastq2'] = [
    filtered_dir + os.path.basename(rp).replace('_2.fastq', '_rep2.fastq').replace('_rep2.fastq', '_bbduk_2.fastq')
    for rp in SRdb['fastq2']
]
SRdb['filtered_fastqS'] = [
    filtered_dir + os.path.basename(rp).replace('_1.fastq', '_rep1.fastq').replace('_rep1.fastq', '_bbduk.fastq')
    for rp in SRdb['fastq1']
]

SRdb['filtered_fastq1_exists'] = list(map(os.path.exists, SRdb['filtered_fastq1']))
SRdb['filtered_fastq2_exists'] = list(map(os.path.exists, SRdb['filtered_fastq2']))
SRdb['filtered_fastqS_exists'] = list(map(os.path.exists, SRdb['filtered_fastqS']))


In [18]:
from subprocess import call
def process_reads(r1, r2, outfolder):
    """
    Run repair.sh and then bbduk.sh on a pair of fastq files.

    r1, r2    -- paths of the forward / reverse fastq files
                 (named *_1.fastq and *_2.fastq)
    outfolder -- directory for the outputs; a trailing slash is optional

    Intermediate repaired files are written as *_rep1.fastq / *_rep2.fastq and
    the final files as *_bbduk_1.fastq / *_bbduk_2.fastq, all in outfolder.
    Each command is printed before it is run through the shell.

    Returns (rb1, rb2), the paths of the two bbduk outputs. A failing command
    does not raise: a warning is printed and callers are expected to check
    that the outputs exist.
    """
    # repair
    rr1 = os.path.join(outfolder, os.path.basename(r1).replace('_1.fastq', '_rep1.fastq'))
    rr2 = os.path.join(outfolder, os.path.basename(r2).replace('_2.fastq', '_rep2.fastq'))
    cmd = "repair.sh in={0} in2={1} out={2} out2={3}".format(r1, r2, rr1, rr2)
    print(cmd)
    if call(cmd, shell=True) != 0:
        print("WARNING: repair.sh exited with a non-zero status for {0}".format(r1))

    # bbduk
    rb1 = rr1.replace('_rep1.fastq', '_bbduk_1.fastq')
    rb2 = rr2.replace('_rep2.fastq', '_bbduk_2.fastq')
    cmd = "bbduk.sh in={0} in2={1} out={2} out2={3} threads={4}".format(rr1, rr2, rb1, rb2, THREADS)
    print(cmd)
    if call(cmd, shell=True) != 0:
        print("WARNING: bbduk.sh exited with a non-zero status for {0}".format(r1))

    return rb1, rb2

def process_reads_s(r1, outfolder):
    """
    Run bbduk.sh on a single (unpaired) fastq file.

    r1        -- path of the fastq file (named *_1.fastq)
    outfolder -- directory for the output; a trailing slash is optional

    The output is written to outfolder as *_bbduk.fastq. The command is
    printed before it is run through the shell.

    Returns the output path. A failing command does not raise: a warning is
    printed and callers are expected to check that the output exists.
    """
    # bbduk (no repair step for unpaired reads)
    rr1 = os.path.join(outfolder, os.path.basename(r1).replace('_1.fastq', '_rep1.fastq'))
    rb1 = rr1.replace('_rep1.fastq', '_bbduk.fastq')
    cmd = "bbduk.sh in={0} out={1} threads={2}".format(r1, rb1, THREADS)
    print(cmd)
    if call(cmd, shell=True) != 0:
        print("WARNING: bbduk.sh exited with a non-zero status for {0}".format(r1))

    return rb1

filtered_dir = '/home/mattolm/user_data/Covid_19/reads/filtered/'

# Refresh which filtered outputs already exist so finished samples are skipped
SRdb['filtered_fastq1_exists'] = list(map(os.path.exists, SRdb['filtered_fastq1']))
SRdb['filtered_fastq2_exists'] = list(map(os.path.exists, SRdb['filtered_fastq2']))
SRdb['filtered_fastqS_exists'] = list(map(os.path.exists, SRdb['filtered_fastqS']))

# Process paired: any sample missing either of its two filtered files
paired_todo = (SRdb['LibraryLayout'] == 'PAIRED') & (
    (SRdb['filtered_fastq1_exists'] == False) | (SRdb['filtered_fastq2_exists'] == False)
)
for _, row in SRdb[paired_todo].iterrows():
    rb1, rb2 = process_reads(row['fastq1'], row['fastq2'], filtered_dir)

# Process unpaired: any sample missing its single filtered file
single_todo = (SRdb['LibraryLayout'] == 'SINGLE') & (SRdb['filtered_fastqS_exists'] == False)
for _, row in SRdb[single_todo].iterrows():
    b1 = process_reads_s(row['fastq1'], filtered_dir)

# Record what exists now that processing has run
SRdb['filtered_fastq1_exists'] = list(map(os.path.exists, SRdb['filtered_fastq1']))
SRdb['filtered_fastq2_exists'] = list(map(os.path.exists, SRdb['filtered_fastq2']))
SRdb['filtered_fastqS_exists'] = list(map(os.path.exists, SRdb['filtered_fastqS']))

In [19]:
# List every run whose filtered reads are still missing (paired first, then single)
paired_unfinished = (SRdb['LibraryLayout'] == 'PAIRED') & (
    (SRdb['filtered_fastq1_exists'] == False) | (SRdb['filtered_fastq2_exists'] == False)
)
for run in SRdb.loc[paired_unfinished, 'Run']:
    print("{0} still has a problem".format(run))

single_unfinished = (SRdb['LibraryLayout'] == 'SINGLE') & (SRdb['filtered_fastqS_exists'] == False)
for run in SRdb.loc[single_unfinished, 'Run']:
    print("{0} still has a problem".format(run))
#     rb1, rb2 = process_reads(row['fastq1'], row['fastq2'], '/home/mattolm/user_data/Covid_19/reads/filtered/')

## Run mapping

In [20]:
BTL = '/home/mattolm/user_data/Covid_19/genomes/NC_045512.2.fasta.bt2'
ML = '/home/mattolm/user_data/Covid_19/inStrain/mapping_files/'
s2l = SRdb.set_index('Run')['LibraryLayout'].to_dict()

# One .sam per run; a run counts as mapped when its .sam already exists
SRdb['sam_file'] = ["{0}{1}-vs-{2}.sam".format(ML, os.path.basename(BTL), r) for r in SRdb['Run']]
SRdb['mapping_done'] = [os.path.isfile(s) for s in SRdb['sam_file']]

cmd_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/{0}_mapping.txt'.format(d4)

print("Need to run {0} mapping jobs".format(len(SRdb[SRdb['mapping_done'] == False])))

# Write one bowtie2 command per unmapped run. The context manager guarantees
# the file is flushed and closed before it is piped to bash, even if building
# a command raises. The handle is no longer named `re`, which shadowed the
# standard-library module name.
with open(cmd_loc, 'w') as cmd_file:
    for i, row in SRdb[SRdb['mapping_done'] == False].iterrows():
    # for i, row in SRdb.iterrows():
        sam_loc = row['sam_file']
        if s2l[row['Run']] == 'PAIRED':
            cmd = "bowtie2 -x {3} -1 {0} -2 {1} --no-unal -S {2} -p {4} 2> {2}.log".format(
                    row['filtered_fastq1'], row['filtered_fastq2'], sam_loc, BTL, THREADS)
        else:
            # The empty second argument keeps the placeholder numbering
            # identical to the paired command
            cmd = "bowtie2 -x {3} -U {0} --no-unal -S {2} -p {4} 2> {2}.log".format(
                    row['filtered_fastqS'], '', sam_loc, BTL, THREADS)
        cmd_file.write(cmd + '\n')

# Jobs run one after another; each bowtie2 call already uses THREADS threads
cmd = "cat {0} | bash".format(cmd_loc)
call(cmd, shell=True)

SRdb['mapping_done'] = [os.path.isfile(s) for s in SRdb['sam_file']]
SRdb['mapping_done'].value_counts()

Need to run 0 mapping jobs


True    459
Name: mapping_done, dtype: int64

## Parse bowtie2 logs for information

In [21]:
def parse_bt2(log):
    """
    Parse a bowtie2 stderr log into a DataFrame of mapping statistics.

    log -- path of the log file

    Lines are matched by substring and the first whitespace-separated token
    is taken as the value:
      'reads;'                 -> total_reads (int)
      'paired;'                -> paired_reads (int)
      'overall alignment rate' -> percent_aligned (float, '%' stripped)
    aligned_reads is derived as total_reads * percent_aligned / 100,
    truncated to int.

    Returns one row per matched set of lines. When the log holds no read
    count or no alignment rate (empty or truncated log), an empty DataFrame
    with the four expected columns is returned instead of raising KeyError.

    NOTE(review): 'paired;' also matches a line containing 'unpaired;', so
    for single-end logs paired_reads appears to hold the unpaired read count
    -- confirm this is intended.
    """
    table = defaultdict(list)
    with open(log, 'r') as o:
        for line in o:
            line = line.strip()
            if 'reads;' in line:
                table['total_reads'].append(int(line.split()[0]))
            if 'paired;' in line:
                table['paired_reads'].append(int(line.split()[0]))
            if 'overall alignment rate' in line:
                table['percent_aligned'].append(float(line.split()[0][:-1]))

    # Membership test (not indexing) so the defaultdict gains no empty keys
    if 'total_reads' not in table or 'percent_aligned' not in table:
        return pd.DataFrame(columns=['total_reads', 'paired_reads', 'percent_aligned', 'aligned_reads'])

    db = pd.DataFrame(table)
    db['aligned_reads'] = db['total_reads'] * (db['percent_aligned'] / 100)
    db['aligned_reads'] = db['aligned_reads'].astype(int)
    return db

dbs = []
SRdb['mapping_log'] = SRdb['sam_file'] + '.log'
SRdb['mapping_log_exists'] = [os.path.exists(x) for x in SRdb['mapping_log']]
for i, row in SRdb.iterrows():
    # A run without a log is reported and skipped (its statistics stay NaN)
    # instead of aborting the whole cell with FileNotFoundError
    if not row['mapping_log_exists']:
        print("No bowtie2 log for {0}; its mapping statistics are left empty".format(row['Run']))
        continue
    db = parse_bt2(row['mapping_log'])
    db['Run'] = row['Run']
    dbs.append(db)

# pd.concat raises on an empty list, so fall back to an empty table
Mdb = pd.concat(dbs).reset_index(drop=True) if dbs else pd.DataFrame(columns=['Run'])

# Copy every statistic onto SRdb, matched by run accession
for col in Mdb.columns:
    if col in ['Run']:
        continue
    r2c = Mdb.set_index('Run')[col].to_dict()
    SRdb[col] = SRdb['Run'].map(r2c)

## Run inStrain

In [23]:
# inStrain is considered successful for a run when its scaffold_info table exists
SRdb['inStrain_coverage'] = [
    "/home/mattolm/user_data/Covid_19/inStrain/profiles_v2/{0}.IS/output/{0}.IS_scaffold_info.tsv".format(
        os.path.basename(sam_path))
    for sam_path in SRdb['sam_file']
]
SRdb['inStrain_succeeded'] = list(map(os.path.isfile, SRdb['inStrain_coverage']))
SRdb['inStrain_succeeded'].value_counts()

False    459
Name: inStrain_succeeded, dtype: int64

In [24]:
cmd_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/{0}_instrain.txt'.format(d4)
fasta = '/home/mattolm/user_data/Covid_19/genomes/NC_045512.2.fasta'
genes ='/home/mattolm/user_data/Covid_19/genomes/NC_045512.2.gb'
s2l = SRdb.set_index('Run')['LibraryLayout'].to_dict()

# Write one `inStrain profile` command per run. The context manager closes
# the file even if building a command raises, and the handle is no longer
# named `re` (which shadowed the standard-library module name).
with open(cmd_loc, 'w') as cmd_file:
    #for i, row in SRdb[SRdb['inStrain_succeeded'] == False].iterrows():
    for i, row in SRdb.iterrows():
        if s2l[row['Run']] == 'PAIRED':
            cmd = "inStrain profile {0} {2} -o /home/mattolm/user_data/Covid_19/inStrain/profiles_v2/{1}.IS -p 1 -g {3} --skip_mm_profiling".format(row['sam_file'], os.path.basename(row['sam_file']), fasta, genes)
        else:
            # Unpaired reads additionally get --pairing_filter non_discordant
            cmd = "inStrain profile {0} {2} -o /home/mattolm/user_data/Covid_19/inStrain/profiles_v2/{1}.IS -p 1 --pairing_filter non_discordant -g {3} --skip_mm_profiling".format(row['sam_file'], os.path.basename(row['sam_file']), fasta, genes)
        cmd_file.write(cmd + '\n')

# The command is only printed; the call itself is commented out, so the jobs
# are launched by hand
cmd = "cat {0} | parallel -j {1}".format(cmd_loc, THREADS)
print(cmd)
#call(cmd, shell=True)

SRdb['inStrain_succeeded'] = [os.path.isfile(s) for s in SRdb['inStrain_coverage']]
SRdb['inStrain_succeeded'].value_counts()

cat /home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/04202020_instrain.txt | parallel -j 6


False    459
Name: inStrain_succeeded, dtype: int64

In [31]:
# Refresh the success flag from disk (re-run this cell once the inStrain jobs have finished)
SRdb['inStrain_succeeded'] = SRdb['inStrain_coverage'].map(os.path.isfile)
SRdb['inStrain_succeeded'].value_counts()

True     458
False      1
Name: inStrain_succeeded, dtype: int64

In [32]:
# Samples that have aligned reads but no inStrain output are unexpected failures
unexpected_failures = SRdb[(SRdb['aligned_reads'] > 0) & (SRdb['inStrain_succeeded'] == False)]
print("{0} samples failed inStrain that should have passed".format(len(unexpected_failures)))

1 samples failed inStrain that should have passed


In [34]:
# For every unexpectedly failed sample, drop the last two components of the expected
# scaffold_info path and point at log/log.log under the remaining directory
failed_outputs = SRdb.loc[(SRdb['aligned_reads'] > 0) & (SRdb['inStrain_succeeded'] == False), 'inStrain_coverage']
['/'.join(path.split('/')[:-2]) + '/log/log.log' for path in failed_outputs]

['/home/mattolm/user_data/Covid_19/inStrain/profiles_v2/NC_045512.2.fasta.bt2-vs-SRR11059947.sam.IS/log/log.log']

# Save information

In [35]:
## Add sequence release date
# NOTE: the stored value is os.path.getctime() of the path in `sra_file` (a Unix
# timestamp of the local file), which is what this notebook uses as the release date
SRdb['release_date'] = SRdb['sra_file'].map(os.path.getctime)

In [36]:
# Full: write every SRdb column to the dated "full info" CSV and preview the table
full_info_csv = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_full_info.csv'.format(d4)
SRdb.to_csv(full_info_csv, index=False)
SRdb.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_strategy,library_source,library_selection,sample_accession,sample_title_x,instrument,total_spots,total_size,run_accession,run_total_spots,run_total_bases,run_alias,sra_url_alt,sra_url,experiment_alias,isolate,collected_by,collection_date,geo_loc_name,host,host_disease,isolation_source,lat_lon,BioSampleModel,passage_history,strain,source_name,subject status,tissue/cell type,treatment,time after treatment,sub_species,cell line,description,env_broad_scale,env_local_scale,env_medium,host_taxid,isol_growth_condt,propagation,sample_title_y,seq_methods,source_uvig,virus_enrich_appr,...,host_health_state,host_sex,host_subject_id,host_tissue_sampled,pathotype,serotype,serovar,specimen_voucher,subgroup,subtype,cell type,time point,identification_method,sample type,Laboratory Host,Extraction Method,ref_biomaterial,link_addit_analys,country,sra_url_alt1,sra_url_alt2,ena_fastq_url,ena_fastq_ftp,Run,BioProject,TrueLibraryLayout,sra_file,sra_file_consistant,fastq1,fastq2,fastq1_exists,fastq2_exists,LibraryLayout,filtered_fastq1,filtered_fastq2,filtered_fastqS,filtered_fastq1_exists,filtered_fastq2_exists,filtered_fastqS_exists,sam_file,mapping_done,mapping_log,mapping_log_exists,total_reads,paired_reads,percent_aligned,aligned_reads,inStrain_coverage,inStrain_succeeded,release_date
0,SRP250294,SRX8112385,SISPA of SARS-CoV-2 from cell culture,SISPA of SARS-CoV-2 from cell culture,9606.0,Homo sapiens,WGS,METAGENOMIC,RANDOM PCR,SRS6187447,,Illumina MiSeq,128808.0,12862840.0,SRR11542288,128808.0,30352180.0,p2b_vero76.fastq,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,SRR11542288,PRJNA607948,PAIRED,/home/mattolm/user_data/Covid_19/reads/SRR1154...,True,/home/mattolm/user_data/Covid_19/reads/SRR1154...,/home/mattolm/user_data/Covid_19/reads/SRR1154...,True,False,SINGLE,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,False,False,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,128808,128808,98.6,127004,/home/mattolm/user_data/Covid_19/inStrain/prof...,True,1587408000.0
1,SRP250294,SRX8112384,SISPA of SARS-CoV-2 from cell culture,SISPA of SARS-CoV-2 from cell culture,9606.0,Homo sapiens,WGS,METAGENOMIC,RANDOM PCR,SRS6187448,,Illumina MiSeq,187666.0,17959590.0,SRR11542289,187666.0,42712570.0,p2a_vero76.fastq,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,SRR11542289,PRJNA607948,PAIRED,/home/mattolm/user_data/Covid_19/reads/SRR1154...,True,/home/mattolm/user_data/Covid_19/reads/SRR1154...,/home/mattolm/user_data/Covid_19/reads/SRR1154...,True,False,SINGLE,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,False,False,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,187666,187666,98.98,185751,/home/mattolm/user_data/Covid_19/inStrain/prof...,True,1587408000.0
2,SRP254488,SRX8112342,SARS-CoV-2/190300/human/2020/Malaysia _EPI_ISL...,SARS-CoV-2/190300/human/2020/Malaysia _EPI_ISL...,2697049.0,Severe acute respiratory syndrome coronavirus 2,AMPLICON,VIRAL RNA,PCR,SRS6477428,,Illumina iSeq 100,898380.0,54608710.0,SRR11542243,898380.0,153868600.0,EPI_ISL_417920_S3_L001_R1_001.fastq.gz,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,SARS-CoV-2/190300/human/2020/Malaysia,Universiti Malaya COVID Research group,22-Mar-2020,Malaysia,Homo sapiens,COVID-19,Nasopharyngeal/throat swab,3.1390 N 101.6869 E,Pathogen.cl,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,SRR11542243,PRJNA616147,PAIRED,/home/mattolm/user_data/Covid_19/reads/SRR1154...,True,/home/mattolm/user_data/Covid_19/reads/SRR1154...,/home/mattolm/user_data/Covid_19/reads/SRR1154...,True,True,PAIRED,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,True,True,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,898380,898380,91.41,821209,/home/mattolm/user_data/Covid_19/inStrain/prof...,True,1587408000.0
3,SRP254488,SRX8112341,SARS-CoV-2/186197/human/2020/Malaysia_EPI_ISL_...,SARS-CoV-2/186197/human/2020/Malaysia_EPI_ISL_...,2697049.0,Severe acute respiratory syndrome coronavirus 2,AMPLICON,VIRAL RNA,PCR,SRS6477427,,Illumina iSeq 100,733611.0,42237980.0,SRR11542244,733611.0,118456300.0,EPI_ISL_417919_S4_L001_R1_001.fastq.gz,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,SARS-CoV-2/186197/human/2020/Malaysia,Universiti Malaya COVID Research group,14-Mar-2020,Malaysia,Homo sapiens,COVID-19,Nasopharyngeal/throat swab,3.1390 N 101.6869 E,Pathogen.cl,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,SRR11542244,PRJNA616147,PAIRED,/home/mattolm/user_data/Covid_19/reads/SRR1154...,True,/home/mattolm/user_data/Covid_19/reads/SRR1154...,/home/mattolm/user_data/Covid_19/reads/SRR1154...,True,True,PAIRED,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,True,True,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,733611,733611,19.18,140706,/home/mattolm/user_data/Covid_19/inStrain/prof...,True,1587408000.0
4,SRP255993,SRX8095881,WGS of SARS-CoV-2 R03006_2020_2,WGS of SARS-CoV-2 R03006_2020_2,2697049.0,Severe acute respiratory syndrome coronavirus 2,WGS,GENOMIC,RANDOM,SRS6462475,,Illumina MiSeq,2971431.0,1079579000.0,SRR11524818,2971431.0,1787565000.0,R03006_2_2020_S2_L001_R1_001.fastq.gz,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,,"National Institute for Communicable Diseases, ...",2020-03-07,South Africa: KwaZulu-Natal,Homo sapiens,COVID-19,Combined nasopharyngeal and oropharyngeal swab,28.5306 S 30.8958 E,Pathogen.cl,,R03006_2020_2,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,SRR11524818,PRJNA624358,PAIRED,/home/mattolm/user_data/Covid_19/reads/SRR1152...,True,/home/mattolm/user_data/Covid_19/reads/SRR1152...,/home/mattolm/user_data/Covid_19/reads/SRR1152...,True,True,PAIRED,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,True,True,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,38800,38800,0.01,3,/home/mattolm/user_data/Covid_19/inStrain/prof...,True,1587408000.0


In [37]:
# Basic: export a compact subset of SRdb (run identity, library description,
# collection details and mapping statistics); 'BioSample' is not part of the export
basic_columns = [
    'Run', 'experiment_title', 'experiment_desc', 'sample_accession',
    'library_strategy', 'library_source', 'library_selection', 'LibraryLayout',
    'instrument', 'collected_by', 'collection_date', 'isolation_source',
    'total_reads', 'paired_reads', 'percent_aligned', 'aligned_reads', 'release_date',
]
PMdb = SRdb[basic_columns]

basic_info_csv = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_basic_info.csv'.format(d4)
PMdb.to_csv(basic_info_csv, index=False)
PMdb.head()

Unnamed: 0,Run,experiment_title,experiment_desc,sample_accession,library_strategy,library_source,library_selection,LibraryLayout,instrument,collected_by,collection_date,isolation_source,total_reads,paired_reads,percent_aligned,aligned_reads,release_date
0,SRR11542288,SISPA of SARS-CoV-2 from cell culture,SISPA of SARS-CoV-2 from cell culture,SRS6187447,WGS,METAGENOMIC,RANDOM PCR,SINGLE,Illumina MiSeq,,,,128808,128808,98.6,127004,1587408000.0
1,SRR11542289,SISPA of SARS-CoV-2 from cell culture,SISPA of SARS-CoV-2 from cell culture,SRS6187448,WGS,METAGENOMIC,RANDOM PCR,SINGLE,Illumina MiSeq,,,,187666,187666,98.98,185751,1587408000.0
2,SRR11542243,SARS-CoV-2/190300/human/2020/Malaysia _EPI_ISL...,SARS-CoV-2/190300/human/2020/Malaysia _EPI_ISL...,SRS6477428,AMPLICON,VIRAL RNA,PCR,PAIRED,Illumina iSeq 100,Universiti Malaya COVID Research group,22-Mar-2020,Nasopharyngeal/throat swab,898380,898380,91.41,821209,1587408000.0
3,SRR11542244,SARS-CoV-2/186197/human/2020/Malaysia_EPI_ISL_...,SARS-CoV-2/186197/human/2020/Malaysia_EPI_ISL_...,SRS6477427,AMPLICON,VIRAL RNA,PCR,PAIRED,Illumina iSeq 100,Universiti Malaya COVID Research group,14-Mar-2020,Nasopharyngeal/throat swab,733611,733611,19.18,140706,1587408000.0
4,SRR11524818,WGS of SARS-CoV-2 R03006_2020_2,WGS of SARS-CoV-2 R03006_2020_2,SRS6462475,WGS,GENOMIC,RANDOM,PAIRED,Illumina MiSeq,"National Institute for Communicable Diseases, ...",2020-03-07,Combined nasopharyngeal and oropharyngeal swab,38800,38800,0.01,3,1587408000.0


In [39]:
# Parsed
# Build the per-run summary table: SRdb + BioProject-level metadata + inStrain
# coverage/breadth, reduced to a fixed set of columns and written to CSV.

PLdb = SRdb.copy()

# Add from BioMeta
PLdb = pd.merge(PLdb, BioMeta[['BioProject', 'Group', 'Method', 'Center_ID']], on='BioProject', how='left')

# Report any grouping variable that is missing for some samples
VARIABLES = ['Center_ID', 'Group', 'BioProject', 'Method', 'library_strategy', 'library_source', 'instrument', 'LibraryLayout']
for v in VARIABLES:
    n_missing = PLdb[v].isna().sum()
    if n_missing > 0:
        print("{0} samples have no {1}".format(n_missing, v))

# Remove irrelevant samples: runs flagged 'NonPatient' in RunMeta and projects whose Method is 'IRRELEVANT'.
# .copy() makes the filtered frame independent so the column assignment below
# does not raise pandas' SettingWithCopyWarning.
IRR = set(RunMeta[RunMeta['Status'] == 'NonPatient']['Run'].tolist())
PLdb = PLdb[(~PLdb['Run'].isin(IRR)) & (PLdb['Method'] != 'IRRELEVANT')].copy()

# Adjust method: a run-level Method in RunMeta overrides the project-level one
r2m = RunMeta.set_index('Run')['Method'].to_dict()
PLdb['Method'] = [r2m.get(r, m) for r, m in zip(PLdb['Run'], PLdb['Method'])]

# Add basic instrain: read the scaffold_info table of every sample whose profile exists
dbs = []
succeeded = PLdb[PLdb['inStrain_succeeded'] == True]
for run, scaffold_info in zip(succeeded['Run'], succeeded['inStrain_coverage']):
    db = pd.read_csv(scaffold_info, sep='\t')
    db['Run'] = run
    dbs.append(db)
COdb = pd.concat(dbs).reset_index(drop=True)

PLdb = pd.merge(PLdb, COdb[['Run', 'coverage', 'breadth']], how='left', on='Run')
# At most one sample with aligned reads is tolerated to lack inStrain coverage
n_unprofiled = len(PLdb[(PLdb['coverage'].isna()) & (PLdb['aligned_reads'] > 0)])
assert n_unprofiled <= 1, "{0} samples with aligned reads have no inStrain coverage".format(n_unprofiled)

# Filter down to the exported columns (copied so 'Sufficient_cov' can be added safely)
DESC_VARS = ['sample_accession', 'experiment_desc', 'collection_date', 'isolation_source', 'release_date']
INFO_VARS = ['total_reads', 'percent_aligned', 'coverage', 'breadth']
PLdb = PLdb[['Run'] + VARIABLES + INFO_VARS + DESC_VARS].copy()

# Print a little description
COV_LIM = 50
BRE_LIM = 0.9

enough_coverage = PLdb['coverage'] >= COV_LIM
enough_breadth = PLdb['breadth'] >= BRE_LIM

fdb = PLdb[enough_coverage]
print("{0} of {1} samples have >={2} coverage".format(len(fdb), len(PLdb), COV_LIM))

fdb = PLdb[enough_breadth]
print("{0} of {1} samples have >={2} breadth".format(len(fdb), len(PLdb), BRE_LIM))

fdb = PLdb[enough_coverage & enough_breadth]
print("{0} of {1} samples have >={2} coverage and >={3} breadth".format(len(fdb), len(PLdb), COV_LIM, BRE_LIM))

# Samples without coverage/breadth (NaN) compare False and are therefore not sufficient
PLdb['Sufficient_cov'] = enough_coverage & enough_breadth
PLdb.to_csv('/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_parsed_info_v2.csv'.format(d4), index=False)
PLdb.head()

425 of 453 samples have >=50 coverage
413 of 453 samples have >=0.9 breadth
401 of 453 samples have >=50 coverage and >=0.9 breadth


Unnamed: 0,Run,Center_ID,Group,BioProject,Method,library_strategy,library_source,instrument,LibraryLayout,total_reads,percent_aligned,coverage,breadth,sample_accession,experiment_desc,collection_date,isolation_source,release_date,Sufficient_cov
0,SRR11542243,UM_Malaysia,University Malaya,PRJNA616147,PCR_ARCTIC,AMPLICON,VIRAL RNA,Illumina iSeq 100,PAIRED,898380,91.41,2968.623549,0.989265,SRS6477428,SARS-CoV-2/190300/human/2020/Malaysia _EPI_ISL...,22-Mar-2020,Nasopharyngeal/throat swab,1587408000.0,True
1,SRR11542244,UM_Malaysia,University Malaya,PRJNA616147,PCR_ARCTIC,AMPLICON,VIRAL RNA,Illumina iSeq 100,PAIRED,733611,19.18,436.001371,0.790623,SRS6477427,SARS-CoV-2/186197/human/2020/Malaysia_EPI_ISL_...,14-Mar-2020,Nasopharyngeal/throat swab,1587408000.0,False
2,SRR11524818,NICD_SouthAfrica,National Institute for Communicable Diseases,PRJNA624358,RNA-seq,WGS,GENOMIC,Illumina MiSeq,PAIRED,38800,0.01,0.034813,0.034813,SRS6462475,WGS of SARS-CoV-2 R03006_2020_2,2020-03-07,Combined nasopharyngeal and oropharyngeal swab,1587408000.0,False
3,SRR11514749,UM_Malaysia,University Malaya,PRJNA616147,PCR_ARCTIC,AMPLICON,VIRAL RNA,Illumina iSeq 100,PAIRED,1028510,94.68,3470.610942,0.997726,SRS6395996,SARS-CoV-2/188407/human/2020/Malaysia_EPI_ISL_...,18-Mar-2020,Nasopharyngeal/throat swab,1587408000.0,True
4,SRR11514750,UM_Malaysia,University Malaya,PRJNA616147,PCR_ARCTIC,AMPLICON,VIRAL RNA,Illumina iSeq 100,PAIRED,947580,95.87,3274.178276,0.997392,SRS6395995,SARS-CoV-2/189332/human/2020/Malaysia_EPI_ISL_...,20-Mar-2020,Nasopharyngeal/throat swab,1587408000.0,True


In [40]:
# Count sufficiently covered samples per `Group` and print them as "group;count"
group_counts = PLdb.loc[PLdb['Sufficient_cov'] == True, 'Group'].value_counts()
for group, n_samples in group_counts.items():
    print(group, n_samples, sep=';')

The Peter Doherty Institute for Infection and Immunity;319
Utah Public Health Laboratory;55
CDC Pathogen Discovery Team;9
Hubei Provincial Center for Disease Control and Prevention;6
University Malaya;3
Beijing Institute of Genomics, Chinese Academy of Sciences;2
University of Maryland Institute for Genome Sciences (UMIGS);2
Peruvian National Institute of Health;1
Wuhan Institute of Virology, Chinese Academy of Sciences;1
The Scripps Research Institute;1
Universidad Tecnologica de Pereira, Nepal;1
University of Wisconsin - Madison;1


In [41]:
# Count sufficiently covered samples per `Method` and print them as "method;count"
method_counts = PLdb.loc[PLdb['Sufficient_cov'] == True, 'Method'].value_counts()
for method, n_samples in method_counts.items():
    print(method, n_samples, sep=';')

PCR_ARCTIC;378
RNA-seq;12
PCR_CUSTOM;10
Capture_Twist;1
