In [1]:
### Author: Leonie Küchenhoff
### Date: October 2022
### Purpose of script: Test stat. significance on fractions of SNP type per treatment condition
### on RNA and WGS data

In [2]:
import os
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.special import expit
from scipy.stats import chi2_contingency
from scipy.stats import fisher_exact

from config import outdir, basedir, wgs_vartype_dir


In [None]:
# Make the project base directory (`basedir`, imported from config) the working directory.
os.chdir(basedir)

In [3]:
# WGS replicate identifiers and the per-replicate count tables they map to.
names = ['279', '282', '450']
paths_anno = [
    f'{wgs_vartype_dir}/{replicate}_absolute_numbers.txt'
    for replicate in names
]

In [4]:
# Read each replicate's tab-separated count table and tag every row with the
# replicate it came from, keyed by replicate ID.
abs_dict_snv = {
    replicate: pd.read_csv(
        f'{wgs_vartype_dir}/{replicate}_absolute_numbers.txt', sep='\t'
    ).assign(replicate=replicate)
    for replicate in names
}

### WGS data

In [5]:
# WGS, heart vs. tail: compare the share of 'TC' calls among all calls between tissues.
# Missing counts are treated as 0.
abs_count = pd.concat([abs_dict_snv['279'], abs_dict_snv['282'], abs_dict_snv['450']]).fillna(0)

# Long format: one row per (replicate, replacement type, tissue).
full_df = abs_count.melt(id_vars=['replicate', 'replacement'], value_vars=['heart', 'tail'])

# All replacement types other than 'TC', summed per replicate and tissue.
rest = (
    full_df[full_df['replacement'].ne('TC')]
    .groupby(['replicate', 'variable'])
    .sum()
    .reset_index()
    .assign(replacement='other')
)

# NOTE(review): the 'AG'-named objects hold the 'TC' rows here; presumably T>C and A>G
# are treated as the same change on unstranded WGS data -- confirm.
df_AG = full_df[full_df['replacement'].eq('TC')]

# Long table with both groups; the model below uses the wide table instead.
bin_data = pd.concat([df_AG, rest])

# Wide table: one row per (replicate, tissue) with both counts side by side.
bin_data_large = rest.merge(
    df_AG,
    on=['replicate', 'variable'],
    how='outer',
    suffixes=['nonAG', 'AG'],
)

# Binomial GLM on the two-column response (valuenonAG, valueAG) with tissue as predictor.
# statsmodels reads the first column as successes, so coefficients are on the
# log-odds scale of a non-'TC' call.
smf.glm(
    "valuenonAG + valueAG ~ variable ",
    family=sm.families.Binomial(),
    data=bin_data_large,
).fit().summary()

0,1,2,3
Dep. Variable:,"['valuenonAG', 'valueAG']",No. Observations:,6.0
Model:,GLM,Df Residuals:,4.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-25.487
Date:,"Mon, 16 Jan 2023",Deviance:,25.729
Time:,17:04:03,Pearson chi2:,22.7
No. Iterations:,4,Pseudo R-squ. (CS):,0.09318
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.2826,0.197,6.519,0.000,0.897,1.668
variable[T.tail],-0.1659,0.219,-0.759,0.448,-0.594,0.263


In [6]:
# WGS, liver vs. tail: compare the share of 'TC' calls among all calls between tissues.
# Missing counts are treated as 0.
abs_count = pd.concat([abs_dict_snv['279'], abs_dict_snv['282'], abs_dict_snv['450']]).fillna(0)

# Long format: one row per (replicate, replacement type, tissue).
full_df = abs_count.melt(id_vars=['replicate', 'replacement'], value_vars=['liver', 'tail'])

# All replacement types other than 'TC', summed per replicate and tissue.
rest = (
    full_df[full_df['replacement'].ne('TC')]
    .groupby(['replicate', 'variable'])
    .sum()
    .reset_index()
    .assign(replacement='other')
)

# NOTE(review): the 'AG'-named objects hold the 'TC' rows here; presumably T>C and A>G
# are treated as the same change on unstranded WGS data -- confirm.
df_AG = full_df[full_df['replacement'].eq('TC')]

# Long table with both groups; the model below uses the wide table instead.
bin_data = pd.concat([df_AG, rest])

# Wide table: one row per (replicate, tissue) with both counts side by side.
bin_data_large = rest.merge(
    df_AG,
    on=['replicate', 'variable'],
    how='outer',
    suffixes=['nonAG', 'AG'],
)

# Binomial GLM on the two-column response (valuenonAG, valueAG) with tissue as predictor.
# statsmodels reads the first column as successes, so coefficients are on the
# log-odds scale of a non-'TC' call.
smf.glm(
    "valuenonAG + valueAG ~ variable ",
    family=sm.families.Binomial(),
    data=bin_data_large,
).fit().summary()

0,1,2,3
Dep. Variable:,"['valuenonAG', 'valueAG']",No. Observations:,6.0
Model:,GLM,Df Residuals:,4.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-23.78
Date:,"Mon, 16 Jan 2023",Deviance:,21.866
Time:,16:10:18,Pearson chi2:,19.5
No. Iterations:,4,Pseudo R-squ. (CS):,0.4734
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.5315,0.195,7.856,0.000,1.149,1.914
variable[T.tail],-0.4148,0.217,-1.911,0.056,-0.840,0.011


### RNA data

In [23]:
# RNA sample identifiers; a trailing 'R' marks the r636q samples (see `mutation` below).
names = [
    '028_pbs_R', '029_pbs_R', '032_pbs_R', '033_nrch_R', '030_nrch_R', '036_nrch_R',
    '011_pbs', '012_nrch', '013_nrch', '014_nrch', '279_spry',
    '321_pbs', '333_pbs', '450_spry', '283_spry',
]
# Treatment of each sample, positionally aligned with `names`.
base_editor = [
    'pbs', 'pbs', 'pbs', '8e-NRCH', '8e-NRCH', '8e-NRCH',
    'pbs', '8e-NRCH', '8e-NRCH', '8e-NRCH', 'max-SpRY',
    'pbs', 'pbs', 'max-SpRY', 'max-SpRY',
]
varcallers = ['hc', 'pl', 'st']

# Mutation label derived from the sample name suffix.
mutation = ['r636q' if sample.endswith('R') else 'p635l' for sample in names]

# Every (sample, varcaller) pair, samples varying slowest.
combinations = [(sample, caller) for sample in names for caller in varcallers]

# Lookup tables: sample name -> treatment / mutation.
pairing = {sample: editor for sample, editor in zip(names, base_editor)}
pairing_mut = {sample: label for sample, label in zip(names, mutation)}

In [24]:
# Comma-separated SNP-type tables from {outdir}/SNP_type.
# Tier-level tables, with and without strand information (going by the file names).
strand = pd.read_csv(f"{outdir}/SNP_type/strand_tier.txt", sep=",")
no_strand = pd.read_csv(f"{outdir}/SNP_type/tier.txt", sep=",")

# Grouped tables; `group_strand` is the input to the stranded analysis below.
group_strand = pd.read_csv(f"{outdir}/SNP_type/strand.txt", sep=",")
group_no_strand = pd.read_csv(f"{outdir}/SNP_type/grouped.txt", sep=",")

In [25]:
def edit_df(df, col, treatment_map=None, mutation_map=None):
    """Collapse a replacement-by-sample count table into `col` vs. all-other counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with a 'replacement' column; every other column is melted
        and treated as the counts of one sample.
    col : str
        Value of 'replacement' to single out (e.g. 'AG').
    treatment_map, mutation_map : mapping, optional
        Sample name -> treatment / mutation label. When omitted, the
        notebook-level `pairing` and `pairing_mut` dicts are used, which is
        the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the columns 'variable', 'replacementnonAG',
        'valuenonAG', 'replacementAG', 'valueAG', 'treatment', 'mutation'.
        Samples missing from a mapping get NaN in the respective column.

    Raises
    ------
    ValueError
        If `col` does not occur in df['replacement']. Previously this case
        silently produced NaN counts through the outer merge.
    """
    if treatment_map is None:
        treatment_map = pairing
    if mutation_map is None:
        mutation_map = pairing_mut

    # melt() returns a new frame, so the caller's table is not modified.
    full_df = df.melt(id_vars='replacement')

    is_col = full_df['replacement'] == col
    if not is_col.any():
        raise ValueError(f"replacement type {col!r} not found in df['replacement']")

    # Sum only the counts; the label column is set explicitly instead of being
    # "summed" (string-concatenated) and then overwritten.
    rest = full_df[~is_col].groupby('variable')['value'].sum().reset_index()
    rest.insert(1, 'replacement', 'other')

    df_AG = full_df[is_col].groupby('variable')['value'].sum().reset_index()
    # The label stays 'AG' whatever `col` is, so existing outputs do not change.
    df_AG.insert(1, 'replacement', 'AG')

    bin_data_large = rest.merge(df_AG, on='variable', how='outer', suffixes=('nonAG', 'AG'))
    bin_data_large['treatment'] = bin_data_large['variable'].map(treatment_map)
    bin_data_large['mutation'] = bin_data_large['variable'].map(mutation_map)

    return bin_data_large

In [26]:
# stranded 

In [27]:
# Per-sample 'AG' vs. other counts from the stranded grouped table.
# Note: this rebinds `bin_data_large`, which held the WGS table above.
bin_data_large = edit_df(group_strand, 'AG')

In [28]:
# r636q samples: 8e-NRCH vs. PBS.
# Binomial GLM on the two-column response (valuenonAG, valueAG); the fitted term
# is treatment[T.pbs], i.e. PBS relative to the other treatment level.
data = bin_data_large.loc[bin_data_large['mutation'].eq('r636q')]
smf.glm(
    "valuenonAG + valueAG ~ treatment ",
    family=sm.families.Binomial(),
    data=data,
).fit().summary()

0,1,2,3
Dep. Variable:,"['valuenonAG', 'valueAG']",No. Observations:,6.0
Model:,GLM,Df Residuals:,4.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-19.183
Date:,"Mon, 16 Jan 2023",Deviance:,2.4916
Time:,17:07:13,Pearson chi2:,2.53
No. Iterations:,5,Pseudo R-squ. (CS):,0.5428
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.4216,0.064,22.320,0.000,1.297,1.546
treatment[T.pbs],0.2117,0.098,2.159,0.031,0.020,0.404


In [29]:
# p635l samples: max-SpRY vs. PBS (8e-NRCH samples are excluded).
# Binomial GLM on the two-column response (valuenonAG, valueAG); the fitted term
# is treatment[T.pbs], i.e. PBS relative to the other treatment level.
data = bin_data_large.loc[
    bin_data_large['mutation'].eq('p635l')
    & bin_data_large['treatment'].ne('8e-NRCH')
]
smf.glm(
    "valuenonAG + valueAG ~ treatment ",
    family=sm.families.Binomial(),
    data=data,
).fit().summary()

0,1,2,3
Dep. Variable:,"['valuenonAG', 'valueAG']",No. Observations:,6.0
Model:,GLM,Df Residuals:,4.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-18.073
Date:,"Mon, 16 Jan 2023",Deviance:,4.2028
Time:,17:07:20,Pearson chi2:,4.07
No. Iterations:,5,Pseudo R-squ. (CS):,0.347
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.5385,0.064,23.853,0.000,1.412,1.665
treatment[T.pbs],0.2064,0.131,1.580,0.114,-0.050,0.462


In [30]:
# p635l samples: 8e-NRCH vs. PBS (max-SpRY samples are excluded).
# Binomial GLM on the two-column response (valuenonAG, valueAG); the fitted term
# is treatment[T.pbs], i.e. PBS relative to the other treatment level.
data = bin_data_large.loc[
    bin_data_large['mutation'].eq('p635l')
    & bin_data_large['treatment'].ne('max-SpRY')
]
smf.glm(
    "valuenonAG + valueAG ~ treatment ",
    family=sm.families.Binomial(),
    data=data,
).fit().summary()

0,1,2,3
Dep. Variable:,"['valuenonAG', 'valueAG']",No. Observations:,6.0
Model:,GLM,Df Residuals:,4.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-16.656
Date:,"Mon, 16 Jan 2023",Deviance:,3.2317
Time:,17:07:20,Pearson chi2:,3.15
No. Iterations:,5,Pseudo R-squ. (CS):,0.349
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.5013,0.101,14.810,0.000,1.303,1.700
treatment[T.pbs],0.2436,0.152,1.600,0.110,-0.055,0.542
