In [3]:
import pandas as pd

## Data import

In [14]:
### Input mutation files

# Annotated mutation call positions with respect to the EB reference, one CSV
# per genotype. The three files share the same results directory and file
# name, so those parts are defined once and only the genotype folder varies.
results_dir = '../../resources/run_workflow/results_cluster'
mutations_basename = 'all_mutations.annotated.corrected.csv'

fname_mutations_wild_type = '/'.join([results_dir, 'wild_type', mutations_basename])
fname_mutations_ko = '/'.join([results_dir, 'dicer_KO', mutations_basename])
fname_mutations_overexpressed = '/'.join([results_dir, 'dicer_overexpression', mutations_basename])

In [17]:
def load_genotype_mutations(fname, genotype):
    """Read one annotated mutation table and label it with its genotype.

    Parameters
    ----------
    fname : str
        Path of the annotated mutation CSV to read.
    genotype : str
        Value written to the ``genotype`` column of every row (the column is
        created if missing and overwritten if already present).

    Returns
    -------
    pandas.DataFrame
        The table without the ``Unnamed: 0`` column and with ``genotype`` set.

    Raises
    ------
    KeyError
        If the file does not contain an ``Unnamed: 0`` column.
    """
    mutations = pd.read_csv(fname)
    # 'Unnamed: 0' is presumably the row index stored when the CSV was written
    # -- TODO confirm; it is not used further and is dropped here.
    mutations = mutations.drop(columns=['Unnamed: 0'])
    mutations['genotype'] = genotype
    return mutations


# One table per genotype; the same loading steps apply to all three files.
df_wild_type = load_genotype_mutations(fname_mutations_wild_type, 'wild_type')
df_ko = load_genotype_mutations(fname_mutations_ko, 'dicer_ko')
df_overexpressed = load_genotype_mutations(fname_mutations_overexpressed, 'dicer_overexpressed')

In [24]:
# Dataframe with all mutations from all samples.
# Each per-genotype table carries its own row labels starting at 0, so a plain
# concat would leave repeated index labels in the combined frame.
# ignore_index=True renumbers the rows 0..n-1 so every row has a unique label.
df = pd.concat([df_ko, df_overexpressed, df_wild_type], ignore_index=True)

In [25]:
# Show the column names of the combined mutation table
df.columns

Index(['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
       'genotype', 'replicate', 'passage', 'Fvar', 'Rvar', 'Ftot', 'Rtot',
       'Pval', 'Qval', 'Freq1', 'Freq2', 'Freq3', 'Post1', 'Post2', 'Post3',
       'RefCodon', 'AltCodon', 'RefAminoAcid', 'AltAminoAcid', 'CodonPosition',
       'SNPCodonPosition', 'AminoAcidChange', 'IsSynonymous', 'IsTransition',
       'IsGenic', 'IsPseudo', 'LocusTag', 'Gene', 'Note', 'Inference',
       'Product', 'ProteinID', 'Comments', 'VariantType', 'FeatureType'],
      dtype='object')

## Add derived columns

In [26]:
# Add derived columns to the combined mutation table.

# Convert the Freq columns to numbers; errors='coerce' turns any entry that
# cannot be parsed as a number into NaN instead of raising.
for freq_column in ['Freq1', 'Freq2', 'Freq3']:
    df[freq_column] = pd.to_numeric(df[freq_column], errors='coerce')

# Variant read support and total coverage, each summed over the R* and F*
# counts (presumably reverse/forward strand -- TODO confirm).
df['n_reads_var'] = df['Rvar'] + df['Fvar']
df['coverage'] = df['Rtot'] + df['Ftot']
df['frequency'] = df['n_reads_var'] / df['coverage']

# Row-wise mean of the three Freq columns (mean() skips NaN entries).
df['Frq_ave'] = df[['Freq1', 'Freq2', 'Freq3']].mean(axis=1)

# Keep only the part of the passage label after the last '_' and store it as
# float. The dtype guard makes this cell safe to re-run: once the column is
# numeric, the .str accessor would raise, so the string parsing is skipped.
passage = df['passage']
if not pd.api.types.is_numeric_dtype(passage):
    passage = passage.str.split('_').str[-1]
df['passage'] = passage.astype('float')

# Lower-case alias of the POS column.
df['position'] = df['POS']

## Filtering

In [27]:
# Post-processing filters applied to the mutation calls.
# Each criterion is computed as a boolean mask on the unfiltered table and the
# masks are combined, which keeps exactly the rows passing all three filters.

min_strand_bias_pval = 0.05
minimum_read_support = 10

# Frq_ave == 0.0 is something unexpected happening in ShoRAH, attributed to
# the super high coverage; such calls are discarded.
has_nonzero_frq_ave = df['Frq_ave'] != 0

# Strand bias test: keep calls whose Pval is at least the threshold.
passes_strand_bias = df['Pval'] >= min_strand_bias_pval

# Minimum read support: enough reads must carry the variant.
has_read_support = df['n_reads_var'] >= minimum_read_support

df = df[has_nonzero_frq_ave & passes_strand_bias & has_read_support]

In [28]:
# Preview the first 10 rows of df
df.head(10)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,genotype,replicate,...,Product,ProteinID,Comments,VariantType,FeatureType,n_reads_var,coverage,frequency,Frq_ave,position
0,AF014388,190,.,A,G,15.2481,PASS,Fvar=28;Rvar=89;Ftot=15146;Rtot=64088;Pval=0.4...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,117,79234,0.001477,0.0012,190
2,AF014388,219,.,A,T,4.40133,PASS,Fvar=15;Rvar=28;Ftot=21761;Rtot=38660;Pval=1.0...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,43,60421,0.000712,6.7e-05,219
3,AF014388,222,.,A,G,4.53375,PASS,Fvar=21;Rvar=53;Ftot=22546;Rtot=35907;Pval=0.1...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,74,58453,0.001266,0.000367,222
4,AF014388,246,.,T,C,4.55477,PASS,Fvar=13;Rvar=8;Ftot=27030;Rtot=22623;Pval=0.67...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,21,49653,0.000423,3.3e-05,246
8,AF014388,292,.,A,T,20.8619,PASS,Fvar=82;Rvar=33;Ftot=35315;Rtot=17916;Pval=0.4...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,115,53231,0.00216,0.0009,292
9,AF014388,293,.,C,T,4.71426,PASS,Fvar=46;Rvar=12;Ftot=35629;Rtot=18124;Pval=0.1...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,58,53753,0.001079,0.000133,293
12,AF014388,331,.,-,C,24.318,PASS,Fvar=15;Rvar=18;Ftot=40664;Rtot=29252;Pval=0.2...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,33,69916,0.000472,0.000167,331
13,AF014388,331,.,-,T,25.7403,PASS,Fvar=1635;Rvar=795;Ftot=40664;Rtot=29252;Pval=...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,2430,69916,0.034756,0.0091,331
14,AF014388,354,.,A,G,4.48062,PASS,Fvar=63;Rvar=47;Ftot=45154;Rtot=35426;Pval=0.9...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,110,80580,0.001365,0.0007,354
15,AF014388,412,.,T,C,17.2507,PASS,Fvar=148;Rvar=134;Ftot=46470;Rtot=40694;Pval=0...,dicer_ko,replicate_d,...,.,.,.,SNP,inter_genic,282,87164,0.003235,0.003,412


In [29]:
# Write the filtered mutation table to a CSV in the working directory.
# to_csv is called with its defaults, so the row index is written as well.
fname_filtered_mutations = 'annotated_mutations_of_all_samples_after_filtering.csv'
df.to_csv(fname_filtered_mutations)