## Merge with Metadata

This notebook takes a CSV produced by run_pipeline.py and merges it with the SRA Run Table TSV file. It can optionally normalise the counts by MBases as well. Note that this will override normalisation by number of reads if that has already been done.

In [97]:
import pandas as pd
import re

In [98]:
# Inputs: the SRA Run Table (tab-separated metadata) and the counts CSV.
# index_col=0 makes the CSV's first column the row index of df_data.
df_meta = pd.read_csv('../../accession_lists/allergy_SRARunTable.txt', sep='\t')
df_data = pd.read_csv('../data_files/allergy.allergy.csv', index_col=0)
# File name the merged table is written to at the end of the notebook.
output_name = 'allergy.meta.mbase_norm.csv'
# When True, norm_* columns (count / MBases) are added after the merge.
normalise_with_mbases = True

In [99]:
df_data.head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact
allergy.SRR2240319.sam.sorted.bam,../results/allergy_results/allergy.SRR2240319....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0
allergy.SRR2240320.sam.sorted.bam,../results/allergy_results/allergy.SRR2240320....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0
allergy.SRR2240321.sam.sorted.bam,../results/allergy_results/allergy.SRR2240321....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0
allergy.SRR2240322.sam.sorted.bam,../results/allergy_results/allergy.SRR2240322....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0
allergy.SRR2240323.sam.sorted.bam,../results/allergy_results/allergy.SRR2240323....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0


In [100]:
# Run accession: a D, S or E followed by "RR" and digits (e.g. SRR2240319).
# Compiled once here rather than on every row handed over by DataFrame.apply.
_RUN_ACCESSION_PATTERN = re.compile(r"[DSE]RR[0-9]+")


def get_srr_accession(df):
    """Return the run accession embedded in a row's ``file_location``.

    Despite the parameter name, ``df`` is a single row: the function is used
    with ``DataFrame.apply(..., axis=1)``, so it receives one Series (any
    mapping with a ``'file_location'`` key works).

    Args:
        df: Row whose ``'file_location'`` value is a path containing a
            DRR/SRR/ERR-prefixed run accession.

    Returns:
        The first accession found in the path, e.g. ``'SRR2240319'``.

    Raises:
        ValueError: If ``file_location`` is not a string (for example NaN
            from a missing value) or contains no accession. Raised
            explicitly so the offending path is named, instead of the
            opaque ``AttributeError`` from calling ``.group`` on ``None``.
    """
    file_location = df['file_location']

    match = None
    if isinstance(file_location, str):
        match = _RUN_ACCESSION_PATTERN.search(file_location)

    if match is None:
        raise ValueError(
            "No run accession (DRR/SRR/ERR + digits) found in "
            "file_location: {!r}".format(file_location)
        )

    return match.group(0)

In [101]:
# Call get_srr_accession on every row (axis=1) and store the extracted
# accession in a new 'srr_acc' column.
df_data['srr_acc'] = df_data.apply(get_srr_accession, axis=1)

In [102]:
# Re-key the counts table by run accession, in place. The previous index
# (the value read from the CSV's first column) is discarded.
df_data.set_index('srr_acc',inplace=True)

In [103]:
df_data.head()

Unnamed: 0_level_0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact
srr_acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
SRR2240319,../results/allergy_results/allergy.SRR2240319....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0
SRR2240320,../results/allergy_results/allergy.SRR2240320....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0
SRR2240321,../results/allergy_results/allergy.SRR2240321....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0
SRR2240322,../results/allergy_results/allergy.SRR2240322....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0
SRR2240323,../results/allergy_results/allergy.SRR2240323....,allergy,0,0,0,0,0,0,0,0,0,0,0,0,0


In [104]:
# Key the metadata by its 'Run' column, in place, so both tables share the
# run accession as their index for the join further down.
df_meta.set_index('Run', inplace=True)

In [105]:
df_meta.head()

Unnamed: 0_level_0,Assay_Type,AvgSpotLen,BioProject,BioSample,Center_Name,Experiment,Instrument,LibrarySelection,Library_Name,LoadDate,...,DATASTORE_filetype,DATASTORE_provider,InsertSize,LibraryLayout,LibrarySource,Organism,analyte,biomaterial_provider,health_state,sex
Run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR2240322,RNA-Seq,51,PRJNA287538,SAMN04028900,GEO,SRX1182514,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,
SRR2240323,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,
SRR2240324,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,
SRR2240325,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,
SRR2240326,RNA-Seq,51,PRJNA287538,SAMN04028902,GEO,SRX1182516,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,


In [106]:
# Clear the index labels ('Run' / 'srr_acc') on both tables.
# Assign None rather than using `del`: in current pandas releases Index.name
# is a property without a deleter, so `del df.index.name` raises
# AttributeError, whereas assignment works on old and new versions alike.
df_meta.index.name = None
df_data.index.name = None

In [107]:
df_meta.head()

Unnamed: 0,Assay_Type,AvgSpotLen,BioProject,BioSample,Center_Name,Experiment,Instrument,LibrarySelection,Library_Name,LoadDate,...,DATASTORE_filetype,DATASTORE_provider,InsertSize,LibraryLayout,LibrarySource,Organism,analyte,biomaterial_provider,health_state,sex
SRR2240322,RNA-Seq,51,PRJNA287538,SAMN04028900,GEO,SRX1182514,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,
SRR2240323,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,
SRR2240324,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,
SRR2240325,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,
SRR2240326,RNA-Seq,51,PRJNA287538,SAMN04028902,GEO,SRX1182516,Illumina HiSeq 2500,cDNA,,2015-09-04,...,sra,ncbi,0,SINGLE,TRANSCRIPTOMIC,Homo sapiens,,,,


In [108]:
# Index-on-index join of metadata with counts. DataFrame.join defaults to a
# left join, so every metadata row is kept and runs that have no row in
# df_data get NaN in the count columns.
master_df = df_meta.join(df_data)

In [109]:
master_df.columns

Index(['Assay_Type', 'AvgSpotLen', 'BioProject', 'BioSample', 'Center_Name',
       'Experiment', 'Instrument', 'LibrarySelection', 'Library_Name',
       'LoadDate', 'MBases', 'MBytes', 'Platform', 'ReleaseDate', 'SRA_Sample',
       'SRA_Study', 'Sample_Name', 'age', 'cell_type', 'isolate',
       'patient_phenotype', 'run_label', 'seasonality', 'source_name',
       'timepoint', 'tissue', 'treatment', 'BioSampleModel', 'Consent',
       'DATASTORE_filetype', 'DATASTORE_provider', 'InsertSize',
       'LibraryLayout', 'LibrarySource', 'Organism', 'analyte',
       'biomaterial_provider', 'health_state', 'sex', 'file_location',
       'source', 'alignment_count', 'alpha_wt_count', 'alpha_dup_count',
       'beta_count', 'alpha_wt_zero_edit_count', 'alpha_dup_zero_edit_count',
       'beta_zero_edit_count', 'alpha_read_covers_snps_count',
       'alpha_dup_read_covers_snps_count', 'beta_read_covers_snps_count',
       'alpha_read_covers_snps_count_exact',
       'alpha_dup_read_covers_snps_count_exact',
       'beta_read_covers_snps_count_exact'],
      dtype='object')

In [110]:
# Optionally add a 'norm_<column>' companion for each count column below,
# computed as the count divided by the run's 'MBases' metadata value.
# Existing columns with the same 'norm_*' names are overwritten.
if normalise_with_mbases:
    columns_to_normalise = [
        'alignment_count',
        'alpha_wt_count',
        'alpha_dup_count',
        'beta_count',
        'alpha_read_covers_snps_count_exact',
        'alpha_dup_read_covers_snps_count_exact',
        'beta_read_covers_snps_count_exact',
    ]

    for column in columns_to_normalise:
        # pandas division does not raise on a zero or missing MBases; such
        # rows end up as NaN (or inf) in the normalised column.
        master_df['norm_' + column] = master_df[column] / master_df['MBases']

In [111]:
master_df.head()

Unnamed: 0,Assay_Type,AvgSpotLen,BioProject,BioSample,Center_Name,Experiment,Instrument,LibrarySelection,Library_Name,LoadDate,...,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact,norm_alignment_count,norm_alpha_wt_count,norm_alpha_dup_count,norm_beta_count,norm_alpha_read_covers_snps_count_exact,norm_alpha_dup_read_covers_snps_count_exact,norm_beta_read_covers_snps_count_exact
SRR2240322,RNA-Seq,51,PRJNA287538,SAMN04028900,GEO,SRX1182514,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR2240323,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR2240324,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,,,,,,,
SRR2240325,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR2240326,RNA-Seq,51,PRJNA287538,SAMN04028902,GEO,SRX1182516,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,0.006289,0.0,0.0,0.006289,0.0,0.0,0.0


In [112]:
# Write the merged table (index included) to output_name; the path is
# resolved relative to the notebook's working directory.
master_df.to_csv(output_name)

In [113]:
master_df.head()

Unnamed: 0,Assay_Type,AvgSpotLen,BioProject,BioSample,Center_Name,Experiment,Instrument,LibrarySelection,Library_Name,LoadDate,...,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact,norm_alignment_count,norm_alpha_wt_count,norm_alpha_dup_count,norm_beta_count,norm_alpha_read_covers_snps_count_exact,norm_alpha_dup_read_covers_snps_count_exact,norm_beta_read_covers_snps_count_exact
SRR2240322,RNA-Seq,51,PRJNA287538,SAMN04028900,GEO,SRX1182514,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR2240323,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR2240324,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,,,,,,,
SRR2240325,RNA-Seq,51,PRJNA287538,SAMN04028901,GEO,SRX1182515,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SRR2240326,RNA-Seq,51,PRJNA287538,SAMN04028902,GEO,SRX1182516,Illumina HiSeq 2500,cDNA,,2015-09-04,...,0,0,0,0.006289,0.0,0.0,0.006289,0.0,0.0,0.0
