In [15]:
import pandas as pd

## Data import

In [16]:
### Input mutation files

# Mutation calls per genotype; positions are given with respect to the EB reference.
fname_mutations_wild_type = (
    '../../resources/run_workflow/results_cluster/wild_type/all_mutations.EB_space.csv'
)
fname_mutations_ko = (
    '../../resources/run_workflow/results_cluster/dicer_KO/all_mutations.EB_space.csv'
)
fname_mutations_overexpressed = (
    '../../resources/run_workflow/results_cluster/dicer_overexpression/all_mutations.EB_space.csv'
)

# Mutation calls of the parental stock; positions are also given with respect to the EB reference.
fname_parental = (
    '../../resources/run_workflow/results_cluster/parental_stock_ref_EBref/all_mutations.csv'
)

In [17]:
def _load_mutation_calls(fname, genotype, passage=None, replicate=None):
    """Read one mutation-call CSV and tag every row with sample metadata.

    Parameters
    ----------
    fname : str
        Path of the CSV file to read.
    genotype : str
        Label stored in the new ``genotype`` column.
    passage, replicate : optional
        Constant value for the ``passage`` / ``replicate`` column. When left
        as ``None`` the column is copied from the file's ``date`` /
        ``patient`` column respectively.

    Returns
    -------
    pandas.DataFrame
        The file content without the ``Unnamed: 0`` column, followed by the
        new columns ``genotype``, ``passage`` and ``replicate`` (in that order).

    Raises
    ------
    KeyError
        If ``Unnamed: 0`` is absent, or if ``date`` / ``patient`` is needed
        (no constant given) but absent.
    """
    # 'Unnamed: 0' is presumably a row index saved by an earlier to_csv -- TODO confirm
    calls = pd.read_csv(fname).drop(['Unnamed: 0'], axis=1)
    calls['genotype'] = genotype
    calls['passage'] = calls['date'] if passage is None else passage
    calls['replicate'] = calls['patient'] if replicate is None else replicate
    return calls


df_wild_type = _load_mutation_calls(fname_mutations_wild_type, 'wild_type')
df_ko = _load_mutation_calls(fname_mutations_ko, 'dicer_ko')
df_overexpressed = _load_mutation_calls(fname_mutations_overexpressed, 'dicer_overexpressed')

# read in parental strain without shift of positions
# passage/replicate are fixed to the string '0' instead of being copied from the file;
# passage stays a string because a later cell parses that column with the .str accessor
df_parental = _load_mutation_calls(
    fname_parental, 'aaa_parental_stock', passage='0', replicate='0'
)

In [18]:
# one table holding the mutations of every sample (three genotypes plus the parental stock);
# row labels of the inputs are kept as they are, so they may repeat across inputs
df = pd.concat(
    objs=[df_ko, df_overexpressed, df_wild_type, df_parental],
    axis=0,
)

In [19]:
# remove columns that are no longer needed
# ('date' and 'patient' were copied into 'passage' and 'replicate' when the files were read)
df = df.drop(columns=['Unnamed: 0.1', 'sample', 'patient', 'date'])

In [20]:
# (rows, columns) of the combined table after dropping the unneeded columns
df.shape

(92347, 19)

## Add information

In [21]:
# add information

# make the three frequency columns numeric; entries that cannot be parsed become NaN
for frq_col in ('Frq1', 'Frq2', 'Frq3'):
    df[frq_col] = pd.to_numeric(df[frq_col], errors='coerce')

# variant read support and total coverage, each summed over the R* and F* columns
# (presumably reverse / forward strand counts -- TODO confirm)
df['n_reads_var'] = df['Rvar'] + df['Fvar']
df['coverage'] = df['Rtot'] + df['Ftot']
df['frequency'] = df['n_reads_var'] / df['coverage']
# row-wise mean of the frequency columns; NaN entries are skipped by DataFrame.mean
df['Frq_ave'] = df[['Frq1', 'Frq2', 'Frq3']].mean(axis=1)

# passage number = text after the last '_' of the passage label, stored as float.
# The dtype guard keeps this cell re-runnable: once the column is numeric the
# .str accessor would raise AttributeError, so only the float cast is applied then.
if pd.api.types.is_numeric_dtype(df['passage']):
    df['passage'] = df['passage'].astype('float')
else:
    df['passage'] = df['passage'].str.split('_').str[-1].astype('float')

# lower-case alias of the 'Pos' column
df['position'] = df['Pos']

## Filtering

In [22]:
# Post-processing filtering of mutation calling

# minimum read support
minimum_read_support = 10

# A call is kept only if it passes all three row-wise criteria:
#   * Frq_ave != 0 -- a value of 0.0 is something unexpected happening in ShoRAH,
#     which is due to the super high coverage
#   * strand bias test: Pval >= 0.05
#   * at least `minimum_read_support` reads carry the variant
df = df.loc[
    df['Frq_ave'].ne(0)
    & df['Pval'].ge(0.05)
    & df['n_reads_var'].ge(minimum_read_support)
]

In [23]:
# (rows, columns) of the table that remains after the three filters
df.shape

(18088, 24)

In [24]:
# preview the first ten filtered mutation calls
df.head(10)

Unnamed: 0,Chromosome,Pos,Ref,Var,Frq1,Frq2,Frq3,Pst1,Pst2,Pst3,...,Pval,Qval,genotype,passage,replicate,n_reads_var,coverage,frequency,Frq_ave,position
3,parental_stock_consensus,190,A,G,0.001,0.0011,0.0015,1.0000,0.9105,0.9999,...,0.401522,1.0,dicer_ko,10.0,replicate_d,117,79234,0.001477,0.0012,190
4,parental_stock_consensus,195,T,-,0.0005,0.0004,,1.0000,1.0,-,...,0.067867,0.548765,dicer_ko,10.0,replicate_d,25,77049,0.000324,0.00045,195
5,parental_stock_consensus,196,A,-,0.0005,0.0004,,1.0000,1.0,-,...,0.050775,0.429836,dicer_ko,10.0,replicate_d,26,76259,0.000341,0.00045,196
6,parental_stock_consensus,208,T,-,,0.0017,0.0016,-,0.9952,1.0000,...,0.709037,1.0,dicer_ko,10.0,replicate_d,117,68340,0.001712,0.00165,208
8,parental_stock_consensus,219,A,T,0.0,0.0002,,0.9546,0.9565,-,...,1.0,1.0,dicer_ko,10.0,replicate_d,43,60421,0.000712,0.0001,219
9,parental_stock_consensus,222,A,G,0.0011,0.0,,1.0000,0.9438,-,...,0.197119,1.0,dicer_ko,10.0,replicate_d,74,58453,0.001266,0.00055,222
11,parental_stock_consensus,246,T,C,0.0,0.0001,,0.9565,0.9924,-,...,0.677219,1.0,dicer_ko,10.0,replicate_d,21,49653,0.000423,5e-05,246
44,parental_stock_consensus,276,T,-,0.0351,0.0433,0.0314,0.9998,0.9948,0.9991,...,0.098921,0.748033,dicer_ko,10.0,replicate_d,1640,51556,0.03181,0.0366,276
48,parental_stock_consensus,292,A,T,0.0001,0.0009,0.0017,0.9868,1.0,0.9886,...,0.4918,1.0,dicer_ko,10.0,replicate_d,115,53231,0.00216,0.0009,292
49,parental_stock_consensus,293,C,T,0.0001,0.0003,,0.9868,1.0,-,...,0.107136,0.777279,dicer_ko,10.0,replicate_d,58,53753,0.001079,0.0002,293


In [25]:
#df.to_csv('mutations_of_all_samples_after_filtering.csv')

In [26]:
# write the filtered table to a CSV file (path relative to the working directory);
# to_csv also writes the row index as the first column by default
df.to_csv('mutations_of_all_samples_after_filtering_test.csv')