In [3]:
import os
import sys
import glob
import scipy
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from subprocess import call

%matplotlib inline
# Notebook-wide display settings.
# seaborn theme applied to every figure drawn below.
sns.set_style('whitegrid')
# Show up to 100 rows before pandas truncates a displayed DataFrame.
pd.set_option('display.max_rows', 100)
# Font type 42 for PS and PDF output (per the matplotlib docs this embeds
# TrueType fonts instead of the default Type 3).
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['pdf.fonttype'] = 42
# Show up to 100 columns before pandas truncates a displayed DataFrame.
pd.set_option('display.max_columns', 100)

In [35]:
def SRA_to_bioproject(accession):
    '''
    From an SRA accession (like SRR1883283) return its BioProject accession

    Three NCBI Entrez requests are made per call:
      1. esearch on db "sra"         : accession       -> SRA UID
      2. elink "sra_bioproject"      : SRA UID         -> BioProject UID
      3. esummary on db "bioproject" : BioProject UID  -> 'Project_Acc' field

    Raises IndexError (with the offending accession in the message) when the
    accession has no SRA record or the SRA record has no linked BioProject.
    '''
    from Bio import Entrez
    Entrez.email = "mattolm@gmail.com"

    def _read(handle):
        # Parse an Entrez reply and always release the handle, even if parsing fails
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    # GO FROM SRA ACCESSION TO SRA ID
    record = _read(Entrez.esearch(db='sra', term=accession, retmode='text'))
    if not record['IdList']:
        raise IndexError("No SRA record found for accession {0!r}".format(accession))
    sra_id = record['IdList'][0]

    # ELINK FROM SRA ID TO BIOPROJECT ID
    record = _read(Entrez.elink(dbfrom="sra", retmax=10, id=sra_id, linkname="sra_bioproject"))
    link_sets = record[0]["LinkSetDb"]
    if not link_sets or not link_sets[0]["Link"]:
        raise IndexError("No BioProject linked to SRA accession {0!r}".format(accession))
    bioproject_id = link_sets[0]["Link"][0]["Id"]

    # SUMMARY FROM BIOPROJECT ID TO BIOPROJECT ACCESSION
    record = _read(Entrez.esummary(db="bioproject", id=bioproject_id))
    return record['DocumentSummarySet']['DocumentSummary'][0]['Project_Acc']

# Set up

In [42]:
# Inputs: the list of runs whose .sra download was validated in an earlier
# session (one accession per line) and the hand-curated metadata workbook.
val_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/validated_downloads.txt'
meta_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Metadata/metadata.xlsx'

# Number of parallel jobs / threads handed to the external tools below.
THREADS = 6

# Read the accessions straight into a set, stripping surrounding whitespace.
with open(val_loc, 'r') as validated_handle:
    VALIDATED_RUNS = {entry.strip() for entry in validated_handle}

# Per-BioProject annotations live on the 'BioProjects' sheet.
BioMeta = pd.read_excel(meta_loc, sheet_name='BioProjects')
BioMeta.head()

Unnamed: 0,BioProject,Link,Group,Method,Publication link,Publication title,Description
0,PRJNA615032,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,Icahn School of Medicine at Mount Sina,RNA-seq,https://www.biorxiv.org/content/10.1101/2020.0...,SARS-CoV-2 launches a unique transcriptional s...,Looking at the response of human cells to COVID
1,PRJNA610428,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,UNIVERSITY OF WASHINGTON,RNA-seq,,,
2,PRJNA613958,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,The Peter Doherty Institute for Infection and ...,PCR_ARTIC v1,,,
3,PRJNA614546,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,Paragon Genomics,Mixed,https://www.biorxiv.org/content/10.1101/2020.0...,High sensitivity detection of coronavirus SARS...,Evaluating lots of things; worth following up....
4,PRJNA616446,https://www.ncbi.nlm.nih.gov/bioproject/?term=...,Hubei Provincial Center for Disease Control an...,RNA-seq,https://www.biorxiv.org/content/10.1101/2020.0...,Genome-wide data inferring the evolution and p...,Sequencing genomes


# Get a list of all SRA samples

In [36]:
from datetime import date
today = date.today()
d4 = today.strftime("%d_%m_%Y")
dloc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA.tsv'.format(d4)

SEARCH_STRING = '(\\"Severe acute respiratory syndrome coronavirus 2\\"[Organism] OR SARS-CoV-2[All Fields]) AND \\"platform illumina\\"[Properties]'

! pysradb search "$SEARCH_STRING" --detailed --saveto "$dloc"

SRdb = pd.read_csv(dloc, sep='\t')
SRdb['Run'] = SRdb['run_accession']
SRdb = SRdb[~SRdb['run_accession'].isna()]

# Add BioProject
SRdb['BioProject'] = [SRA_to_bioproject(r) for r in SRdb['Run']]

# Any unknown BioProjects?
unk = set(SRdb['BioProject'].tolist()) - set(BioMeta['BioProject'].tolist())
if len(unk) == 0:
    print("All BioProjects are known")
else:
    print("{0} new BioProjects with no info".format(len(unk)))
    print("\n".join(list(unk)))


  from pandas import Panel
  from pandas import Panel
All BioProjects are known


In [40]:
# Filter out irrelevant bioprojects
# Filter out irrelevant bioprojects (those whose 'Method' in BioMeta is 'IRRELEVANT')
irrelevant_projects = BioMeta[BioMeta['Method'] == 'IRRELEVANT']['BioProject']
# .copy(): later cells add many columns to SRdb; without it those assignments
# are made on a boolean-filtered slice of the previous frame.
SRdb = SRdb[~SRdb['BioProject'].isin(irrelevant_projects)].copy()
NEW = set(SRdb['Run'].tolist())
print("{0} new samples (and {1} already processed = {2} total (including irrelevant already processed))".format(len(NEW - VALIDATED_RUNS), len(VALIDATED_RUNS), len(SRdb)))

90 new samples (and 77 already processed = 159 total (including irrelevant already processed))


# START PROCESSING!

# Download samples that need downloading

In [48]:
import subprocess
def check_sra_file(file):
    """
    Return True if /usr/bin/vdb-validate exits successfully for `file`.

    The path is passed as a single argv entry (no shell), so names containing
    spaces or shell metacharacters are validated as-is instead of being
    interpreted by a shell. The exit status is the only criterion; the
    tool's output is captured and discarded.

    Returns False when the tool exits non-zero, cannot be launched, or the
    argument cannot be passed to a subprocess.
    """
    try:
        subprocess.check_output(
            ["/usr/bin/vdb-validate", str(file)],
            stderr=subprocess.STDOUT,
            universal_newlines=True)
    except (subprocess.CalledProcessError, OSError, ValueError):
        # CalledProcessError: validation failed; OSError: tool missing or not
        # executable; ValueError: argument rejected (e.g. embedded NUL byte)
        return False
    return True

In [None]:
base_loc = '/home/mattolm/user_data/Covid_19/reads/'
odir = '/home/mattolm/user_data/Covid_19/Pipeline/test/'
TOTAL_LOOPS = 50

# Local path of every .sra file, and whether it is known to be intact.
# Runs validated in an earlier session are trusted; all others are checked now.
SRdb['sra_file'] = [base_loc + run for run in SRdb['Run']]
SRdb['sra_file_consistant'] = [
    True if run in VALIDATED_RUNS else check_sra_file(sra)
    for run, sra in zip(SRdb['Run'], SRdb['sra_file'])]

# Download whatever is still invalid, re-validate, and repeat for at most
# TOTAL_LOOPS rounds.
loop = 0
while loop < TOTAL_LOOPS:
    pending = SRdb[SRdb['sra_file_consistant'] == False]
    if len(pending) == 0:
        break
    print('Running loop {0}; downloading {1} new files'.format(loop, len(pending)))

    # One wget command per pending run, executed THREADS at a time by GNU parallel
    cmd_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/{0}_download.txt'.format(d4)
    with open(cmd_loc, 'w') as o:
        for url, target in zip(pending['sra_url'], pending['sra_file']):
            o.write("wget -c {0} -O {1}\n".format(url, target))
    cmd = "cat {0} | parallel -j {1}".format(cmd_loc, THREADS)
    call(cmd, shell=True)

    # Only files that were not yet valid are validated again
    SRdb['sra_file_consistant'] = [
        ok or check_sra_file(sra)
        for sra, ok in zip(SRdb['sra_file'], SRdb['sra_file_consistant'])]
    loop += 1

# Anything still invalid: print the commands needed to investigate by hand
still_bad = SRdb[SRdb['sra_file_consistant'] == False]
if len(still_bad) > 0:
    print("THE FOLLOWING HAVE PROBLEMS:")
    for url, target in zip(still_bad['sra_url'], still_bad['sra_file']):
        print("/usr/bin/vdb-validate {0}".format(target))
        print("wget -c {0} -O {1}\n".format(url, target))


Running loop 0; downloading 23 new files


In [52]:
# Report runs whose .sra file is still not valid, with the commands to retry by hand
still_bad = SRdb[SRdb['sra_file_consistant'] == False]
if len(still_bad) == 0:
    print("All are good!")
else:
    print("THE FOLLOWING HAVE PROBLEMS:")
    for url, target in zip(still_bad['sra_url'], still_bad['sra_file']):
        print("/usr/bin/vdb-validate {0}".format(target))
        print("wget -c {0} -O {1}\n".format(url, target))

All are good!


In [53]:
# Save new validations
# Save new validations: add every run whose .sra file validated in this
# session, then rewrite the on-disk list (one accession per line).
newly_validated = SRdb.loc[SRdb['sra_file_consistant'] == True, 'Run'].tolist()
VALIDATED_RUNS = VALIDATED_RUNS | set(newly_validated)
with open('/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/validated_downloads.txt', 'w') as o:
    o.writelines(run + '\n' for run in VALIDATED_RUNS)

## Fastq Dump

In [54]:
base_loc = '/home/mattolm/user_data/Covid_19/reads/'
# Upper bound on dump attempts, so a run that can never be dumped cannot hang
# the notebook (the download loop is bounded the same way).
MAX_DUMP_LOOPS = 50

# Expected fastq-dump outputs: <sra_file>_1.fastq and, for paired runs, <sra_file>_2.fastq
SRdb['fastq1'] = [x + '_1.fastq' for x in SRdb['sra_file']]
SRdb['fastq2'] = [x + '_2.fastq' for x in SRdb['sra_file']]

SRdb['fastq1_exists'] = [os.path.exists(x) for x in SRdb['fastq1']]
SRdb['fastq2_exists'] = [os.path.exists(x) for x in SRdb['fastq2']]

loop = 0
MISSING_FASTA = SRdb[SRdb['fastq1_exists'] == False]
while len(MISSING_FASTA) > 0 and loop < MAX_DUMP_LOOPS:
    print("Loop {0} - {1} samples are missing their fastq dumps".format(loop, len(MISSING_FASTA)))

    # One fastq-dump command per missing run, executed THREADS at a time
    cmd_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/{0}_dump.txt'.format(d4)
    with open(cmd_loc, 'w') as o:
        for i, row in MISSING_FASTA.iterrows():
            o.write("fastq-dump {0} --split-files -O {1}".format(row['sra_file'], base_loc) + '\n')
    cmd = "cat {0} | parallel -j {1}".format(cmd_loc, THREADS)
    call(cmd, shell=True)

    SRdb['fastq1_exists'] = [os.path.exists(x) for x in SRdb['fastq1']]
    MISSING_FASTA = SRdb[SRdb['fastq1_exists'] == False]
    loop += 1

if len(MISSING_FASTA) > 0:
    print("WARNING: gave up after {0} loops; {1} samples still have no fastq dump:".format(loop, len(MISSING_FASTA)))
    print("\n".join(MISSING_FASTA['Run'].tolist()))

# Re-check the mate files AFTER dumping. The layout must reflect what
# fastq-dump just wrote; the pre-dump check is False for every run dumped in
# this session and would label all of them SINGLE.
SRdb['fastq2_exists'] = [os.path.exists(x) for x in SRdb['fastq2']]
SRdb['LibraryLayout'] = ['PAIRED' if t else 'SINGLE' for t in SRdb['fastq2_exists']]

Loop 0 - 90 samples are missing their fastq dumps


In [55]:
# Sanity check: within each layout, tally whether the second-mate fastq exists
for layout, layout_db in SRdb.groupby('LibraryLayout'):
    mate_counts = layout_db['fastq2_exists'].value_counts()
    print(layout)
    print(mate_counts)

PAIRED
True    55
Name: fastq2_exists, dtype: int64
SINGLE
False    104
Name: fastq2_exists, dtype: int64


## Process reads

In [57]:
# Where each run's filtered reads are expected: <run>_bbduk_1.fastq and
# <run>_bbduk_2.fastq for paired runs, <run>_bbduk.fastq for single-end runs.
filtered_dir = '/home/mattolm/user_data/Covid_19/reads/filtered/'
mate1_names = [os.path.basename(rp) for rp in SRdb['fastq1']]
mate2_names = [os.path.basename(rp) for rp in SRdb['fastq2']]

SRdb['filtered_fastq1'] = [
    filtered_dir + name.replace('_1.fastq', '_rep1.fastq').replace('_rep1.fastq', '_bbduk_1.fastq')
    for name in mate1_names]
SRdb['filtered_fastq2'] = [
    filtered_dir + name.replace('_2.fastq', '_rep2.fastq').replace('_rep2.fastq', '_bbduk_2.fastq')
    for name in mate2_names]
SRdb['filtered_fastqS'] = [
    filtered_dir + name.replace('_1.fastq', '_rep1.fastq').replace('_rep1.fastq', '_bbduk.fastq')
    for name in mate1_names]

# Record which of those outputs are already on disk
for path_col in ('filtered_fastq1', 'filtered_fastq2', 'filtered_fastqS'):
    SRdb[path_col + '_exists'] = [os.path.exists(p) for p in SRdb[path_col]]


In [58]:
from subprocess import call
def process_reads(r1, r2, outfolder):
    """
    Run repair.sh and then bbduk.sh on a pair of fastq files.

    r1, r2    : paths of the forward / reverse reads (names ending in
                '_1.fastq' / '_2.fastq')
    outfolder : output prefix; file names are appended to it directly, so it
                needs to end with a path separator

    Each command is printed and run through the shell; exit statuses are not
    checked. Returns the two bbduk output paths
    (<outfolder><name>_bbduk_1.fastq, <outfolder><name>_bbduk_2.fastq).
    """
    def _run(command):
        # Echo the command, then hand it to the shell
        print(command)
        call(command, shell=True)

    # repair: intermediate files <name>_rep1.fastq / <name>_rep2.fastq
    repaired_fwd = outfolder + os.path.basename(r1).replace('_1.fastq', '_rep1.fastq')
    repaired_rev = outfolder + os.path.basename(r2).replace('_2.fastq', '_rep2.fastq')
    _run("repair.sh in={0} in2={1} out={2} out2={3}".format(
        r1, r2, repaired_fwd, repaired_rev))

    # bbduk: works on the repaired pair, using THREADS threads
    filtered_fwd = repaired_fwd.replace('_rep1.fastq', '_bbduk_1.fastq')
    filtered_rev = repaired_rev.replace('_rep2.fastq', '_bbduk_2.fastq')
    _run("bbduk.sh in={0} in2={1} out={2} out2={3} threads={4}".format(
        repaired_fwd, repaired_rev, filtered_fwd, filtered_rev, THREADS))

    return filtered_fwd, filtered_rev

def process_reads_s(r1, outfolder):
    """
    Run bbduk.sh on a single-end fastq file (no repair step).

    r1        : path of the reads (name ending in '_1.fastq')
    outfolder : output prefix; the file name is appended to it directly

    The command is printed and run through the shell; its exit status is not
    checked. Returns the bbduk output path (<outfolder><name>_bbduk.fastq).
    """
    # The output name is derived via the same '_rep1' intermediate name used for paired reads
    intermediate = outfolder + os.path.basename(r1).replace('_1.fastq', '_rep1.fastq')
    filtered = intermediate.replace('_rep1.fastq', '_bbduk.fastq')

    command = "bbduk.sh in={0} out={1} threads={2}".format(r1, filtered, THREADS)
    print(command)
    call(command, shell=True)

    return filtered

filtered_out = '/home/mattolm/user_data/Covid_19/reads/filtered/'
filtered_cols = ('filtered_fastq1', 'filtered_fastq2', 'filtered_fastqS')

# Refresh which filtered outputs already exist
for path_col in filtered_cols:
    SRdb[path_col + '_exists'] = [os.path.exists(p) for p in SRdb[path_col]]

# Process paired: any PAIRED run missing either filtered mate
missing_mate = (SRdb['filtered_fastq1_exists'] == False) | (SRdb['filtered_fastq2_exists'] == False)
paired_todo = SRdb[(SRdb['LibraryLayout'] == 'PAIRED') & missing_mate]
for fq1, fq2 in zip(paired_todo['fastq1'], paired_todo['fastq2']):
    rb1, rb2 = process_reads(fq1, fq2, filtered_out)

# Process unpaired: any SINGLE run missing its filtered file
single_todo = SRdb[(SRdb['LibraryLayout'] == 'SINGLE') & (SRdb['filtered_fastqS_exists'] == False)]
for fq1 in single_todo['fastq1']:
    b1 = process_reads_s(fq1, filtered_out)

# Refresh again so the columns reflect what was just produced
for path_col in filtered_cols:
    SRdb[path_col + '_exists'] = [os.path.exists(p) for p in SRdb[path_col]]

bbduk.sh in=/home/mattolm/user_data/Covid_19/reads/SRR11454606_1.fastq out=/home/mattolm/user_data/Covid_19/reads/filtered/SRR11454606_bbduk.fastq threads=6
bbduk.sh in=/home/mattolm/user_data/Covid_19/reads/SRR11454607_1.fastq out=/home/mattolm/user_data/Covid_19/reads/filtered/SRR11454607_bbduk.fastq threads=6
bbduk.sh in=/home/mattolm/user_data/Covid_19/reads/SRR11454608_1.fastq out=/home/mattolm/user_data/Covid_19/reads/filtered/SRR11454608_bbduk.fastq threads=6
bbduk.sh in=/home/mattolm/user_data/Covid_19/reads/SRR11454609_1.fastq out=/home/mattolm/user_data/Covid_19/reads/filtered/SRR11454609_bbduk.fastq threads=6
bbduk.sh in=/home/mattolm/user_data/Covid_19/reads/SRR11454610_1.fastq out=/home/mattolm/user_data/Covid_19/reads/filtered/SRR11454610_bbduk.fastq threads=6
bbduk.sh in=/home/mattolm/user_data/Covid_19/reads/SRR11454611_1.fastq out=/home/mattolm/user_data/Covid_19/reads/filtered/SRR11454611_bbduk.fastq threads=6
bbduk.sh in=/home/mattolm/user_data/Covid_19/reads/SRR1145

## Run mapping

In [59]:
# bowtie2 index prefix and the folder that receives the .sam files
BTL = '/home/mattolm/user_data/Covid_19/genomes/NC_045512.2.fasta.bt2'
ML = '/home/mattolm/user_data/Covid_19/inStrain/mapping/'
s2l = SRdb.set_index('Run')['LibraryLayout'].to_dict()

# Output name: <ML><index basename>-vs-<run>.sam; a run counts as mapped once that file exists
SRdb['sam_file'] = ["{0}{1}-vs-{2}.sam".format(ML, os.path.basename(BTL), r) for r in SRdb['Run']]
SRdb['mapping_done'] = [os.path.isfile(s) for s in SRdb['sam_file']]

# Write one bowtie2 command per unmapped run. The context manager guarantees
# the command file is flushed and closed even if a row raises.
cmd_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/{0}_mapping.txt'.format(d4)
with open(cmd_loc, 'w') as cmd_file:
    for i, row in SRdb[SRdb['mapping_done'] == False].iterrows():
        sam_loc = row['sam_file']
        if s2l[row['Run']] == 'PAIRED':
            cmd = "bowtie2 -x {3} -1 {0} -2 {1} --no-unal -S {2} -p {4} 2> {2}.log".format(
                    row['filtered_fastq1'], row['filtered_fastq2'], sam_loc, BTL, THREADS)
        else:
            cmd = "bowtie2 -x {1} -U {0} --no-unal -S {2} -p {3} 2> {2}.log".format(
                    row['filtered_fastqS'], BTL, sam_loc, THREADS)
        # bowtie2's stderr goes to <sam_file>.log, which is parsed further down
        cmd_file.write(cmd + '\n')

# Commands run one after another; each bowtie2 call itself uses THREADS threads
cmd = "cat {0} | bash".format(cmd_loc)
call(cmd, shell=True)

SRdb['mapping_done'] = [os.path.isfile(s) for s in SRdb['sam_file']]
SRdb['mapping_done'].value_counts()

True    159
Name: mapping_done, dtype: int64

## Parse bowtie2 logs for information

In [60]:
def parse_bt2(log):
    """
    Parse a bowtie2 stderr log into a DataFrame of alignment statistics.

    Every line is stripped and tested against three substrings; on a match
    the line's first whitespace-separated token is stored:
      'reads;'                 -> total_reads     (int)
      'paired;'                -> paired_reads    (int)
      'overall alignment rate' -> percent_aligned (float, trailing '%' removed)

    Because these are substring tests, 'paired;' also matches lines that
    contain 'unpaired;', so for such logs paired_reads holds the unpaired
    read count.

    aligned_reads = total_reads * percent_aligned / 100, truncated to int.
    """
    # (marker substring, output column, converter applied to the first token)
    extractors = (
        ('reads;', 'total_reads', int),
        ('paired;', 'paired_reads', int),
        ('overall alignment rate', 'percent_aligned', lambda token: float(token[:-1])),
    )

    parsed = defaultdict(list)
    with open(log, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            for marker, column, convert in extractors:
                if marker in text:
                    parsed[column].append(convert(text.split()[0]))

    stats = pd.DataFrame(parsed)
    stats['aligned_reads'] = (stats['total_reads'] * (stats['percent_aligned'] / 100)).astype(int)
    return stats

# bowtie2's stderr was redirected to <sam_file>.log by the mapping commands
SRdb['mapping_log'] = SRdb['sam_file'] + '.log'
SRdb['mapping_log_exists'] = [os.path.exists(log) for log in SRdb['mapping_log']]

# Parse every log and tag the resulting rows with their run accession
dbs = []
for run, log in zip(SRdb['Run'], SRdb['mapping_log']):
    db = parse_bt2(log)
    db['Run'] = run
    dbs.append(db)
Mdb = pd.concat(dbs).reset_index(drop=True)

# Copy each parsed statistic onto SRdb, looked up by run accession
for col in Mdb.columns:
    if col != 'Run':
        r2c = Mdb.set_index('Run')[col].to_dict()
        SRdb[col] = SRdb['Run'].map(r2c)

## Run inStrain

In [74]:
# An inStrain profile counts as finished once its scaffold_info table exists:
# <profiles>/<sam basename>.IS/output/<sam basename>.IS_scaffold_info.tsv
profiles_dir = "/home/mattolm/user_data/Covid_19/inStrain/profiles/"
SRdb['inStrain_coverage'] = [
    "{0}{1}.IS/output/{1}.IS_scaffold_info.tsv".format(profiles_dir, os.path.basename(sam))
    for sam in SRdb['sam_file']]
SRdb['inStrain_succeeded'] = [os.path.isfile(tsv) for tsv in SRdb['inStrain_coverage']]
SRdb['inStrain_succeeded'].value_counts()

True     123
False     36
Name: inStrain_succeeded, dtype: int64

In [75]:
cmd_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/{0}_instrain.txt'.format(d4)
fasta = '/home/mattolm/user_data/Covid_19/genomes/NC_045512.2.fasta'
genes ='/home/mattolm/user_data/Covid_19/genomes/NC_045512.2.gb'
s2l = SRdb.set_index('Run')['LibraryLayout'].to_dict()

# Write one `inStrain profile` command per run that has no profile yet. The
# context manager guarantees the command file is flushed and closed even if a
# row raises.
with open(cmd_loc, 'w') as cmd_file:
    for i, row in SRdb[SRdb['inStrain_succeeded'] == False].iterrows():
        # Runs that are not PAIRED get the extra pairing-filter option;
        # everything else in the command is identical.
        if s2l[row['Run']] == 'PAIRED':
            pairing = ''
        else:
            pairing = ' --pairing_filter non_discordant'
        cmd = "inStrain profile {0} {2} -o /home/mattolm/user_data/Covid_19/inStrain/profiles/{1}.IS -p 1{4} -g {3} --skip_mm_profiling".format(
            row['sam_file'], os.path.basename(row['sam_file']), fasta, genes, pairing)
        cmd_file.write(cmd + '\n')

# Each profile is single-process (-p 1); GNU parallel runs THREADS of them at once
cmd = "cat {0} | parallel -j {1}".format(cmd_loc, THREADS)
print(cmd)
call(cmd, shell=True)

SRdb['inStrain_succeeded'] = [os.path.isfile(s) for s in SRdb['inStrain_coverage']]
SRdb['inStrain_succeeded'].value_counts()

cat /home/mattolm/user_data/Covid_19/Pipeline/Jupyter/cmds/02_04_2020_instrain.txt | parallel -j 6


True     123
False     36
Name: inStrain_succeeded, dtype: int64

In [76]:
# A run with aligned reads but no inStrain output is an unexpected failure
unexpected_failures = SRdb[(SRdb['inStrain_succeeded'] == False) & (SRdb['aligned_reads'] > 0)]
print("{0} samples failed inStrain that should have passed".format(len(unexpected_failures)))

0 samples failed inStrain that should have passed


In [77]:
# inStrain log of every unexpected failure: replace the trailing
# 'output/<file>' of the expected table path with 'log/log.log'
failed_tables = SRdb.loc[(SRdb['inStrain_succeeded'] == False) & (SRdb['aligned_reads'] > 0), 'inStrain_coverage'].tolist()
['/'.join(table.split('/')[:-2]) + '/log/log.log' for table in failed_tables]

[]

# Save information

In [89]:
# Full: every column accumulated on SRdb, written to a date-stamped CSV
full_info_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_full_info.csv'.format(d4)
SRdb.to_csv(full_info_loc, index=False)
SRdb.head()

Unnamed: 0,study_accession,experiment_accession,experiment_title,experiment_desc,organism_taxid,organism_name,library_strategy,library_source,library_selection,sample_accession,sample_title,instrument,total_spots,total_size,run_accession,run_total_spots,run_total_bases,run_alias,sra_url_alt,sra_url,experiment_alias,isolate,collected_by,collection_date,geo_loc_name,host,host_disease,isolation_source,lat_lon,BioSampleModel,source_name,cell line,cell type,treatment,strain,time point,identification_method,culture_collection,host_description,passage_history,sample type,Laboratory Host,Extraction Method,ref_biomaterial,link_addit_analys,host_age,host_sex,country,sra_url_alt1,sra_url_alt2,host_disease_outcome,host_disease_stage,ena_fastq_url,ena_fastq_ftp,Run,BioProject,sra_file,sra_exists,sra_file_consistant,fastq1,fastq2,fastq1_exists,fastq2_exists,LibraryLayout,filtered_fastq1,filtered_fastq2,filtered_fastqS,filtered_fastq1_exists,filtered_fastq2_exists,filtered_fastqS_exists,sam_file,mapping_done,mapping_log,mapping_log_exists,total_reads,paired_reads,percent_aligned,aligned_reads,inStrain_coverage,inStrain_succeeded
0,SRP254688,SRX8032211,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2697049.0,Severe acute respiratory syndrome coronavirus 2,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SRS6404546,,Illumina MiniSeq,5668472.0,489620824.0,SRR11454606,5668472.0,1350169000.0,BetaCoV_Tianmen_HBCDC-HB-07_2020_R1_001.fastq.gz,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,human BetaCoV Tianmen HBCDC-HB-07/2020,Tianmen Center for Disease Control and Prevention,2020-02-08,China: Hubei,Homo sapiens,COVID-19,Tianmen,not collected,Pathogen.cl,,,,,,,,,,,,,,,,,,,,,,,,,SRR11454606,PRJNA616446,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,True,/home/mattolm/user_data/Covid_19/reads/SRR1145...,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,False,SINGLE,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,False,False,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,5668472,5668472,0.03,1700,/home/mattolm/user_data/Covid_19/inStrain/prof...,True
1,SRP254688,SRX8032210,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2697049.0,Severe acute respiratory syndrome coronavirus 2,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SRS6404545,,Illumina MiniSeq,4407436.0,413461584.0,SRR11454607,4407436.0,1157795000.0,BetaCoV_Wuhan_HBCDC-HB-06_2020_R1_001.fastq.gz,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,human BetaCoV Wuhan HBCDC-HB-06/2020,Wuhan Lung Hospital,2020-02-07,China: Hubei,Homo sapiens,COVID-19,Wuhan,not collected,Pathogen.cl,,,,,,,,,,,,,,,,,,,,,,,,,SRR11454607,PRJNA616446,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,True,/home/mattolm/user_data/Covid_19/reads/SRR1145...,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,False,SINGLE,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,False,False,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,4407436,4407436,0.25,11018,/home/mattolm/user_data/Covid_19/inStrain/prof...,True
2,SRP254688,SRX8032209,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2697049.0,Severe acute respiratory syndrome coronavirus 2,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SRS6404544,,Illumina iSeq 100,2690410.0,232046490.0,SRR11454608,2690410.0,725883100.0,BetaCoV_Jingzhou_HBCDC-HB-01_2020_R1_001.fastq.gz,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,human BetaCoV Jingzhou HBCDC-HB-01/2020,Jingzhou Center for Disease Control and Preven...,2020-01-08,China: Hubei,Homo sapiens,COVID-19,Jingzhou,not collected,Pathogen.cl,,,,,,,,,,,,,,,,,,,,,,,,,SRR11454608,PRJNA616446,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,True,/home/mattolm/user_data/Covid_19/reads/SRR1145...,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,False,SINGLE,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,False,False,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,2690410,2690410,3.2,86093,/home/mattolm/user_data/Covid_19/inStrain/prof...,True
3,SRP254688,SRX8032208,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2697049.0,Severe acute respiratory syndrome coronavirus 2,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SRS6404543,,NextSeq 550,17121629.0,517518269.0,SRR11454609,17121629.0,1284122000.0,BetaCoV_Wuhan_HBCDC-HB-02_2020_S10_R1_001.fast...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,human BetaCoV Wuhan HBCDC-HB-02/2020,The Central Hospital Of Wuhan,2020-01-17,China: Hubei,Homo sapiens,COVID-19,Wuhan,not collected,Pathogen.cl,,,,,,,,,,,,,,,,,,,,,,,,,SRR11454609,PRJNA616446,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,True,/home/mattolm/user_data/Covid_19/reads/SRR1145...,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,False,SINGLE,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,False,False,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,17121629,17121629,0.37,63350,/home/mattolm/user_data/Covid_19/inStrain/prof...,True
4,SRP254688,SRX8032207,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2697049.0,Severe acute respiratory syndrome coronavirus 2,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SRS6404542,,NextSeq 550,14337950.0,413719955.0,SRR11454610,14337950.0,1075346000.0,BetaCoV_Wuhan_HBCDC-HB-03_2020_S24_R1_001.fast...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,https://sra-download.ncbi.nlm.nih.gov/traces/s...,,human BetaCoV Wuhan HBCDC-HB-03/2020,"Union Hospital of Tongji Medical College, Huaz...",2020-01-18,China: Hubei,Homo sapiens,COVID-19,Wuhan,not collected,Pathogen.cl,,,,,,,,,,,,,,,,,,,,,,,,,SRR11454610,PRJNA616446,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,True,/home/mattolm/user_data/Covid_19/reads/SRR1145...,/home/mattolm/user_data/Covid_19/reads/SRR1145...,True,False,SINGLE,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,/home/mattolm/user_data/Covid_19/reads/filtere...,False,False,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,/home/mattolm/user_data/Covid_19/inStrain/mapp...,True,14337950,14337950,0.87,124740,/home/mattolm/user_data/Covid_19/inStrain/prof...,True


In [88]:
# Basic: run identity, library description, collection metadata and mapping statistics
basic_columns = [
    'Run', 'experiment_title', 'experiment_desc', 'sample_accession',
    'library_strategy', 'library_source', 'library_selection', 'LibraryLayout',
    'instrument', 'collected_by', 'collection_date', 'isolation_source',
    'total_reads', 'paired_reads', 'percent_aligned', 'aligned_reads',
]
PMdb = SRdb[basic_columns]
basic_info_loc = '/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_basic_info.csv'.format(d4)
PMdb.to_csv(basic_info_loc, index=False)
PMdb.head()

Unnamed: 0,Run,experiment_title,experiment_desc,sample_accession,library_strategy,library_source,library_selection,LibraryLayout,instrument,collected_by,collection_date,isolation_source,total_reads,paired_reads,percent_aligned,aligned_reads
0,SRR11454606,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,SRS6404546,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SINGLE,Illumina MiniSeq,Tianmen Center for Disease Control and Prevention,2020-02-08,Tianmen,5668472,5668472,0.03,1700
1,SRR11454607,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,SRS6404545,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SINGLE,Illumina MiniSeq,Wuhan Lung Hospital,2020-02-07,Wuhan,4407436,4407436,0.25,11018
2,SRR11454608,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,SRS6404544,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SINGLE,Illumina iSeq 100,Jingzhou Center for Disease Control and Preven...,2020-01-08,Jingzhou,2690410,2690410,3.2,86093
3,SRR11454609,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,SRS6404543,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SINGLE,NextSeq 550,The Central Hospital Of Wuhan,2020-01-17,Wuhan,17121629,17121629,0.37,63350
4,SRR11454610,RNA-Seq of Homo sapiens: hCov-19 infected pati...,RNA-Seq of Homo sapiens: hCov-19 infected pati...,SRS6404542,RNA-Seq,TRANSCRIPTOMIC,RANDOM PCR,SINGLE,NextSeq 550,"Union Hospital of Tongji Medical College, Huaz...",2020-01-18,Wuhan,14337950,14337950,0.87,124740


In [123]:
# Parsed

PLdb = SRdb.copy()

# Add from BioMeta
PLdb = pd.merge(PLdb, BioMeta[['BioProject', 'Group', 'Method', 'Center_ID']], on='BioProject', how='left')

# Report any descriptive variable that is missing for some samples
VARIABLES = ['Center_ID', 'Group', 'BioProject', 'Method', 'library_strategy', 'library_source', 'instrument', 'LibraryLayout']
for v in VARIABLES:
    n_missing = PLdb[v].isna().sum()
    if n_missing > 0:
        print("{0} samples have no {1}".format(n_missing, v))

# Add basic instrain: read the scaffold_info table of every run that has one
dbs = []
for i, row in PLdb[PLdb['inStrain_succeeded'] == True].iterrows():
    db = pd.read_csv(row['inStrain_coverage'], sep='\t')
    db['Run'] = row['Run']
    dbs.append(db)
COdb = pd.concat(dbs).reset_index(drop=True)

PLdb = pd.merge(PLdb, COdb[['Run', 'coverage', 'breadth']], how='left', on='Run')
# Every run with aligned reads must have received a coverage value
assert len(PLdb[(PLdb['coverage'].isna()) & (PLdb['aligned_reads'] > 0)]) == 0

# Filter. The .copy() makes the subset an independent frame, so adding
# 'Sufficient_cov' below is not an assignment onto a slice.
DESC_VARS = ['sample_accession', 'experiment_desc', 'collection_date', 'isolation_source']
INFO_VARS = ['total_reads', 'percent_aligned', 'coverage', 'breadth']
PLdb = PLdb[['Run'] + VARIABLES + INFO_VARS + DESC_VARS].copy()

# Print a little description
COV_LIM = 50
BRE_LIM = 0.9

# Comparisons against NaN are False, so runs without an inStrain profile fail both tests
enough_cov = PLdb['coverage'] >= COV_LIM
enough_breadth = PLdb['breadth'] >= BRE_LIM

fdb = PLdb[enough_cov]
print("{0} of {1} samples have >={2} coverage".format(len(fdb), len(PLdb), COV_LIM, BRE_LIM))

fdb = PLdb[enough_breadth]
print("{0} of {1} samples have >={3} breadth".format(len(fdb), len(PLdb), COV_LIM, BRE_LIM))

fdb = PLdb[enough_cov & enough_breadth]
print("{0} of {1} samples have >={2} coverage and >={3} breadth".format(len(fdb), len(PLdb), COV_LIM, BRE_LIM))

PLdb['Sufficient_cov'] = enough_cov & enough_breadth
PLdb.to_csv('/home/mattolm/user_data/Covid_19/Pipeline/Jupyter/{0}_SRA_parsed_info.csv'.format(d4), index=False)
PLdb.head()

59 of 159 samples have >=50 coverage
60 of 159 samples have >=0.9 breadth
50 of 159 samples have >=50 coverage and >=0.9 breadth


Unnamed: 0,Run,Center_ID,Group,BioProject,Method,library_strategy,library_source,instrument,LibraryLayout,total_reads,percent_aligned,coverage,breadth,sample_accession,experiment_desc,collection_date,isolation_source,Sufficient_cov
0,SRR11454606,Hubei_China,Hubei Provincial Center for Disease Control an...,PRJNA616446,RNA-seq,RNA-Seq,TRANSCRIPTOMIC,Illumina MiniSeq,SINGLE,5668472,0.03,6.168746,0.965522,SRS6404546,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2020-02-08,Tianmen,False
1,SRR11454607,Hubei_China,Hubei Provincial Center for Disease Control an...,PRJNA616446,RNA-seq,RNA-Seq,TRANSCRIPTOMIC,Illumina MiniSeq,SINGLE,4407436,0.25,40.258502,0.997559,SRS6404545,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2020-02-07,Wuhan,False
2,SRR11454608,Hubei_China,Hubei Provincial Center for Disease Control an...,PRJNA616446,RNA-seq,RNA-Seq,TRANSCRIPTOMIC,Illumina iSeq 100,SINGLE,2690410,3.2,364.855165,0.998428,SRS6404544,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2020-01-08,Jingzhou,True
3,SRR11454609,Hubei_China,Hubei Provincial Center for Disease Control an...,PRJNA616446,RNA-seq,RNA-Seq,TRANSCRIPTOMIC,NextSeq 550,SINGLE,17121629,0.37,139.966692,0.998763,SRS6404543,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2020-01-17,Wuhan,True
4,SRR11454610,Hubei_China,Hubei Provincial Center for Disease Control an...,PRJNA616446,RNA-seq,RNA-Seq,TRANSCRIPTOMIC,NextSeq 550,SINGLE,14337950,0.87,284.849948,0.998729,SRS6404542,RNA-Seq of Homo sapiens: hCov-19 infected pati...,2020-01-18,Wuhan,True


In [122]:

# Tally how many runs pass (True) or fail (False) the combined coverage-and-breadth threshold.
PLdb['Sufficient_cov'].value_counts()

False    109
True      50
Name: Sufficient_cov, dtype: int64