In [None]:
import pandas as pd
from pathlib import Path
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import plotly.express as px
import yaml
# Read the notebook settings from the 'default' section of config.yaml.
with open("config.yaml", "r") as config_handle:
    parsed_yaml = yaml.safe_load(config_handle)
config_dict = parsed_yaml['default']
from sklearn.metrics import precision_score, recall_score, confusion_matrix

In [None]:
# IPython `%ls` magic: list the contents of the modeling scratch directory (hardcoded absolute /nfs path).
%ls /nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/03_23_transcriptomics/modeling

In [None]:
# Resolve the data locations relative to the configured project root.
root = Path(config_dict["root"])
synth_dir = root / config_dict["synth_dir"]
sample_data_file = root / config_dict["sample_data_file"]

# Header-less two-column table, read as 'feature' and 'positive'.
spikes_path = synth_dir / "synth_mgx_mtx/true-exp.mtx_spiked.tsv"
spikes = pd.read_table(spikes_path, header=None, names=['feature', 'positive'])
spikes.shape

# Saving synthetic data in the format accepted by DESeq2
- A pseudocount of 1 is added to every count because DESeq2 fails at such low 'depth'.

In [None]:
 
# Read the abundance table and transpose it so each row is labelled by 'sample_id'.
abunds_path = synth_dir/"synth_mgx_mtx/true-exp.mtx_abunds.tsv"
dsdf = pd.read_table(abunds_path, index_col=0).T.reset_index()
dsdf = dsdf.rename(columns={'index': 'sample_id'})
# Per-sample metadata that accompanies the count matrix.
ds_meta = dsdf[['sample_id', 'Phenotype']]
# Drop the metadata columns, transpose back and name the row index 'ID'.
dsdf = dsdf.drop(columns=['Phenotype', 'SeqDepth']).set_index('sample_id').T
dsdf = dsdf.reset_index().rename(columns={'#':'ID'}).set_index('ID')
# Shift every count by one (see the note above this cell).
dsdf = dsdf + 1
#ds_meta.to_csv(synth_dir/'true-exp-meta.csv', index=False)
#dsdf.reset_index().to_csv(synth_dir/"true-exp-counts.csv", index=False)

# DESeq results on all samples with 1 added to all counts

In [None]:

# Both result tables get the spike-in labels attached with the same outer join,
# which keeps features that appear in only one of the two tables.
spike_join = dict(left_on='ID', right_on='feature', how='outer')

# Analysis with global normalisation
deres = pd.read_csv(
    synth_dir/"2023-07-31_true-exp-deseq-1_vs_0_l0a0.01_results.csv"
).merge(spikes, **spike_join)

# Analysis with taxon specific normalisation
deres_tx = pd.read_csv(
    synth_dir/"2023-07-31_true-exp-deseq-within-taxon-1_vs_0_l0a0.01_results.csv"
).merge(spikes, **spike_join)

In [None]:
fdr = 0.01
labels = ['Precision', 'Recall', 'TN', 'FP', 'FN', 'TP']

# A feature is "predicted" when its adjusted p-value is below the cut-off and
# "actual" when the outer join found a spike-in label for it.
predicted = (deres.padj < fdr).astype(int)
actual = deres.positive.notnull().astype(int)
predicted_tx = (deres_tx.padj < fdr).astype(int)
actual_tx = deres_tx.positive.notnull().astype(int)

# Precision and recall first, then the flattened confusion matrix.
deseq_all = [precision_score(actual, predicted), recall_score(actual, predicted)]
deseq_all += list(confusion_matrix(actual, predicted).ravel().astype(int))
deseq_tx = [precision_score(actual_tx, predicted_tx), recall_score(actual_tx, predicted_tx)]
deseq_tx += list(confusion_matrix(actual_tx, predicted_tx).ravel().astype(int))

bench_df = pd.DataFrame(
    [deseq_all, deseq_tx],
    columns=labels,
    index=['Global', 'Taxon'],
).apply(round, ndigits=2)

In [None]:
# Show the precision / recall / confusion-matrix table for the two DESeq result sets.
bench_df

# Same analysis with linear models

In [None]:
# Long format: one row per (sample, gene) with the count in 'raw_cnt'.
df = (
    pd.read_table(synth_dir/"synth_mgx_mtx/true-exp.mtx_abunds.tsv", index_col=0)
    .T
    .reset_index()
    .rename(columns={'index': 'sample_id'})
    .melt(id_vars=["sample_id", "Phenotype", "SeqDepth"], var_name="gene_name", value_name='raw_cnt')
)
# Count scaled by the sample's SeqDepth, per million.
df['tss'] = df['raw_cnt']/df['SeqDepth']*1e6
# 'bug' is the part of the gene name before the first underscore.
df['bug'] = df['gene_name'].str.split("_", expand=True)[0]
# Total raw count per (sample, bug), joined back onto every gene row.
bug_totals = (
    df.groupby(['sample_id', 'bug']).raw_cnt.sum()
    .reset_index()
    .rename(columns={'raw_cnt':'bug_cnt'})
)
df = df.merge(bug_totals, on=['sample_id', 'bug'], how='left')
# Count scaled by the bug's total in that sample, per million.
df['within_bug'] = df['raw_cnt']/df['bug_cnt']*1e6
# Bug total as a fraction of the sample's SeqDepth.
df['bug_perc'] = df['bug_cnt']/df['SeqDepth']


In [None]:
# Peek at five random gene rows from a single sample.
df[df.sample_id == 'SAMPLE0001'].sample(5)

In [None]:
# Gene used for the single-gene checks further down; show all of its rows.
test_gene = "BUG0007_GROUP001780"
df[df.gene_name == test_gene]

In [None]:
# Keep genes with more than 100 reads summed over all samples that also have
# a non-zero count in more than 10 distinct samples.
reads_per_gene = df.groupby(['gene_name']).raw_cnt.sum()
mask = reads_per_gene > 100
samples_per_gene = df[df.raw_cnt > 0].groupby('gene_name').sample_id.nunique()
mask2 = samples_per_gene > 10
df_filtered = df.set_index('gene_name').loc[mask&mask2,:].reset_index()

In [None]:
# Independent copy of the rows for the single test gene, used for one-off model checks.
gene_df = df[df.gene_name == test_gene].copy()
def linear_on_gene(gene_df, expr_col, fixed_effects):
    """Fit a Gaussian GLM of log10 expression on the given terms for one gene.

    Parameters
    ----------
    gene_df : pandas.DataFrame
        Rows belonging to a single gene. The frame is not modified; the
        log-transformed column is added to an internal copy.
    expr_col : str
        Name of the expression column to model.
    fixed_effects : list of str
        Right-hand-side terms of the model formula; they are joined with ' + '.

    Returns
    -------
    list
        ``[coef, se, pval]`` taken from the second fitted parameter (the one
        right after the first, which is the intercept for these formulas),
        or ``[]`` when ``gene_df`` is empty or none of ``fixed_effects`` is
        one of its columns.
    """
    # Work on a copy so neither the caller's frame nor a groupby group is mutated.
    gene_df = gene_df.copy()
    # Half of the smallest strictly positive value, so zeros can be log-transformed.
    pseudo_count = gene_df.loc[gene_df[expr_col] > 0, expr_col].min()/2
    gene_df[f'{expr_col}_log'] = np.log10(gene_df[expr_col] + pseudo_count)
    # Only one term has to be a column: interaction terms such as
    # 'bug_perc*Phenotype' are formula syntax and never match a column name.
    if not gene_df.empty and any(term in gene_df.columns for term in fixed_effects):
        formula = f"{expr_col}_log ~ {' + '.join(fixed_effects)}"
        md = smf.glm(formula=formula, data=gene_df, family=sm.families.Gaussian()).fit()
        # Explicit positional access: integer keys on a label-indexed Series
        # are deprecated in pandas.
        return [md.params.iloc[1], md.bse.iloc[1], md.pvalues.iloc[1]]
    return []

def linear_on_df(df, expr_col, fixed_effects):
    """Run `linear_on_gene` for every gene and add BH-adjusted p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with a 'gene_name' column; rows containing any
        missing value are dropped before fitting.
    expr_col : str
        Expression column passed on to `linear_on_gene`.
    fixed_effects : list of str
        Model terms passed on to `linear_on_gene`.

    Returns
    -------
    pandas.DataFrame
        One row per gene with columns 'gene_name', 'coef', 'se', 'pval' and
        'padj'. Genes for which no model was fitted have NaN in all four
        statistic columns and are left out of the multiple-testing correction.
    """
    # Imported here so the function does not depend on the submodule having
    # been loaded as a side effect of another import.
    from statsmodels.stats.multitest import multipletests

    fits = (
        df.dropna()
        .groupby('gene_name')
        .apply(linear_on_gene, expr_col=expr_col, fixed_effects=fixed_effects)
        .reset_index()
    )
    # Column 0 holds one [coef, se, pval] list (or []) per gene. Reindexing to
    # three columns keeps the assignment valid even if every list is empty.
    stats = (
        pd.DataFrame(fits[0].to_list(), index=fits.index)
        .reindex(columns=range(3))
        .astype(float)
    )
    fits[['coef', 'se', 'pval']] = stats
    f = fits.drop(columns=[0])

    # A single NaN p-value turns every BH-adjusted value into NaN, so only the
    # genes that actually produced a p-value are corrected.
    f['padj'] = np.nan
    tested = f['pval'].notna()
    if tested.any():
        f.loc[tested, 'padj'] = multipletests(f.loc[tested, 'pval'].to_numpy(), method='fdr_bh')[1]
    return f

In [None]:
# Shape of the full long-format table.
df.shape

In [None]:
# Shape after the read-count / sample-presence filter.
df_filtered.shape

In [None]:
# Per-gene model on within-bug normalised expression with Phenotype as the only term.
phenotype_tx = linear_on_df(df_filtered, 'within_bug', ['Phenotype'])

In [None]:
phenotype_tx

In [None]:
def get_scores(result_df, ground_truth, left_feat, right_feat, name, fdr=0.05):
    """Score significance calls against the known positive features.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Model results; must contain `left_feat` and 'padj'.
    ground_truth : pandas.DataFrame
        Table of true positives; must contain `right_feat` and 'positive'.
    left_feat, right_feat : str
        Join keys in `result_df` and `ground_truth` respectively.
    name : str
        Name given to the returned Series.
    fdr : float, default 0.05
        A feature is called positive when its 'padj' is below this value.

    Returns
    -------
    pandas.Series
        Precision, Recall, TN, FP, FN and TP. Because the join is an outer
        join, ground-truth features missing from `result_df` have no 'padj'
        and count as not called.
    """
    merged = result_df[[left_feat, 'padj']].merge(
        ground_truth, left_on=left_feat, right_on=right_feat, how='outer'
    )
    predicted = (merged.padj < fdr).astype(int)
    actual = merged.positive.notnull().astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs;
    # otherwise ravel() yields fewer than four values and the Series below
    # cannot be built.
    counts = confusion_matrix(actual, predicted, labels=[0, 1]).ravel().astype(int)
    return pd.Series(
        [precision_score(actual, predicted), recall_score(actual, predicted)] + list(counts),
        index=['Precision', 'Recall', 'TN', 'FP', 'FN', 'TP'],
        name=name,
    )


In [None]:
# Score the Phenotype-only, within-bug model against the spiked features.
get_scores(phenotype_tx, spikes, 'gene_name', 'feature', 'linear-phenotype')

In [None]:
# Same model with the bug's share of the sample ('bug_perc') as an extra term.
phenotype_bug_tx = linear_on_df(df_filtered, 'within_bug', ['Phenotype', 'bug_perc'])

In [None]:
get_scores(phenotype_bug_tx, spikes, 'gene_name', 'feature', 'linear-bug-phenotype')

In [None]:
# Phenotype-only model on depth-normalised ('tss') expression.
phenotype_global = linear_on_df(df_filtered, 'tss', ['Phenotype'])

In [None]:
# Scores for the Phenotype-only model on 'tss'. The label is distinct so it is
# not confused with the 'linear-bug-phenotype' scores of the bug_perc model.
get_scores(phenotype_global, spikes, 'gene_name', 'feature', 'linear-global-phenotype')

In [None]:
# Single-gene check of the model on the test gene.
linear_on_gene(gene_df, 'within_bug', ['Phenotype'])

In [None]:
# NOTE(review): `m` is not assigned anywhere in the visible notebook - confirm where it came from.
m.summary()

In [None]:
# NOTE(review): `test_genes` is not assigned anywhere in the visible notebook - TODO confirm.
test = df_filtered[df_filtered.gene_name.isin(test_genes)]

In [None]:
# Per-gene Phenotype-only model on depth-normalised ('tss') expression.
res = linear_on_df(df_filtered, 'tss', ['Phenotype'])

In [None]:
# Median per-bug total count, smallest first.
df_filtered.groupby('bug').bug_cnt.median().sort_values()

In [None]:
gene_df.sample(10)

In [None]:
test

In [None]:
# Attach the spike labels; the outer join keeps genes present in only one table.
# Re-running this cell merges `res` with `spikes` a second time.
res = res.merge(spikes, left_on='gene_name', right_on='feature', how='outer')

In [None]:
#import modin.pandas as pd

In [None]:
# Same call as the one that produced `phenotype_tx`.
res2 = linear_on_df(df_filtered, 'within_bug', ['Phenotype'])

In [None]:
res2 = res2.merge(spikes, left_on='gene_name', right_on='feature', how='outer')

In [None]:
# Number of rows called at padj < 0.05.
res2[res2.padj < 0.05].shape[0]

In [None]:
# NOTE(review): hand-typed ratio; the source of 121 and 2085 is not shown - confirm.
121/2085

In [None]:
# Missing values per column among the rows called at padj < 0.05.
res2[res2.padj < 0.05].isna().sum()

In [None]:
# Same call as the one that produced `phenotype_bug_tx`.
res3 = linear_on_df(df_filtered, 'within_bug', ['Phenotype', 'bug_perc'])

In [None]:
# Adds a 'bug_perc*Phenotype' term to the formula.
res4 = linear_on_df(df_filtered, 'within_bug', ['Phenotype', 'bug_perc', 'bug_perc*Phenotype'])

In [None]:
res4 = res4.merge(spikes, left_on='gene_name', right_on='feature', how='outer')

In [None]:
# Rows that exist only in `spikes` have no gene_name after the outer join; use 'feature' instead.
res4['gene_name'] = res4['gene_name'].fillna(res4.feature)


In [None]:
# Rows whose gene name contains 'BUG0007'.
b7 = res4[res4.gene_name.str.contains('BUG0007')].copy()

In [None]:
b7.shape

In [None]:
# Spiked BUG0007 rows that were also called at padj < 0.05.
b7[(b7.positive.notnull()) & (b7.padj < 0.05)].shape

In [None]:
res4[res4.padj < 0.05]

In [None]:
res3 = res3.merge(spikes, left_on='gene_name', right_on='feature', how='outer')

In [None]:
# Rows that have both a modelled gene and a spike label. One mask built on
# `res3` itself avoids indexing a filtered copy with a misaligned boolean Series.
res3[res3.gene_name.notna() & res3.positive.notna()].shape

In [None]:
# NOTE(review): hand-typed ratio; the source of 2000 and 6818 is not shown - confirm.
2000/6818

In [None]:
res3[res3.padj < 0.05]

In [None]:
# Repeats the `import statsmodels` from the first cell.
import statsmodels
# NOTE(review): `f` is only a local variable inside linear_on_df; no notebook-level `f`
# is assigned in the visible cells - confirm.
statsmodels.stats.multitest.multipletests(f.pval.values, method='fdr_bh')[1]

In [None]:
# Per-gene fits on the `test` subset, with and without the interaction term.
x = test.dropna().groupby('gene_name').apply(linear_on_gene, expr_col = 'within_bug', fixed_effects = ['Phenotype', 'bug_perc', 'Phenotype*bug_perc']).reset_index()
y = test.dropna().groupby('gene_name').apply(linear_on_gene, expr_col = 'within_bug', fixed_effects = ['Phenotype']).reset_index()

In [None]:
# Right-hand side of the formula that linear_on_gene builds from these terms.
' + '.join(['Phenotype', 'bug_perc', 'Phenotype*bug_perc'])

In [None]:
x

In [None]:
y

In [None]:
# NOTE(review): linear_on_gene as defined in this notebook returns a list, which has no
# `.summary()`; presumably this cell dates from a version that returned the fitted model - confirm.
x[0][3].summary()

In [None]:
# NOTE(review): same concern as the cell above.
y[0][3].summary()

In [None]:
# NOTE(review): no visible cell creates a column named 'within' (the derived column is
# 'within_bug') - confirm.
linear_on_gene(test, 'within', ['Phenotype'])

In [None]:
# NOTE(review): 'tss_log_std' is only produced by a commented-out line in linear_on_gene;
# no visible cell adds it to `gene_df` - confirm.
px.box(gene_df, x="Phenotype", y='tss_log_std', width=400, height=400)

In [None]:
# # Run linear model
# md = smf.ols(formula="tss_log ~ Phenotype", data=test_df)
# mdf = md.fit()
# mdf.summary()

In [None]:
# Run linear model
# Gaussian GLM of the 'tss_log_std' column on Phenotype for the single test gene.
md = smf.glm(formula="tss_log_std ~ Phenotype", data=gene_df, family=sm.families.Gaussian())
mdf = md.fit()

In [None]:
mdf.summary()

In [None]:
# Fitted coefficients.
mdf.params

In [None]:
# Standard errors of the coefficients.
mdf.bse

In [None]:
# P-values of the coefficients.
mdf.pvalues

In [None]:
# Results from re-running mtx package
# Rebinds `res`, replacing the linear-model results held under that name.
res = pd.read_table("../test_out/all_results.tsv")

In [None]:
# Outer join on 'feature' keeps spiked features that are missing from the results.
res = res.merge(spikes, on='feature', how='outer')

In [None]:
res.head(10)

In [None]:
# Spiked features with qval below 0.25.
res[(~res.positive.isna()) & (res.qval < 0.25)]

In [None]:
# Load results for M1 model for this dataset
# NOTE(review): `data_dir` is not assigned anywhere in the visible notebook - confirm.
m1_res = pd.read_table(data_dir/"strict_filtering/true-exp_RNA/all_results.fdr_correction.tsv")


In [None]:
m1_res[m1_res.feature.str.contains(test_gene)]

In [None]:
m1_res.sort_values(['coef', 'qval'], ascending=False)

In [None]:
# Split `df` into per-sample metadata and a count table with one row per 'BUG*' column.
# NOTE(review): if `df` is still the long-format table built earlier, none of its column
# names contain 'BUG', so `count_data` would have no rows - confirm which `df` this expects.
meta_data= df[['sample_id', 'Phenotype', 'SeqDepth']]
count_data = df[['sample_id'] + [c for c in df.columns if 'BUG' in c]].set_index('sample_id').T

In [None]:
count_data.head()

In [None]:
# NOTE(review): `data_dir` is not assigned anywhere in the visible notebook - confirm.
meta_data.to_csv(data_dir/"true-exp.mtx_abunds_meta.csv",  index=False)
count_data.to_csv(data_dir/"true-exp.mtx_abunds_count.csv")

In [None]:
# Read the bug abundance table, skipping its first two data rows.
cov_df = pd.read_table(data_dir/"true-exp.bug_abunds.tsv").iloc[2:, :]

In [None]:
cov_df.head()

In [None]:
cov_df.to_csv(data_dir/"true-exp.bug_abunds_edited.csv", index=False)

In [None]:
# Results

# Reloads the same results file read earlier, discarding the merged `res`.
res = pd.read_table('../test_out/all_results.tsv')

In [None]:
res[res.qval < 0.25]

In [None]:
# Rebinds `spikes` from a file under `data_dir`.
# NOTE(review): `data_dir` is not assigned anywhere in the visible notebook - confirm.
spikes = pd.read_table(data_dir/"true-exp.mtx_spiked.tsv", header=None, names=['feature', 'positive'])
spikes.shape

In [None]:
# Default (inner) join: only features present in both tables are kept.
res = res.merge(spikes, on ='feature')

In [None]:
res[(res.qval < 0.25) & (res.positive == 1)]

In [None]:
res.shape

In [None]:
res.positive.value_counts()

In [None]:
# Hardcoded absolute path to a second results table.
pub_res = pd.read_table("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/03_23_transcriptomics/modeling/strict_filtering/true-exp_mtx_vs_bug/all_results.fdr_correction.tsv")

In [None]:
# The sort has no effect on what is shown: .sample() picks random rows.
res.sort_values('qval').sample(10)

In [None]:
# Side-by-side comparison; overlapping columns get the '_me' / '_pub' suffixes.
x = res.merge(pub_res, on='feature', suffixes=["_me", "_pub"])

In [None]:
pub_res.head()

In [None]:
count_data.head()

In [None]:
cov_df.sum()

In [None]:
# TSS within each bug 
# BUG abundances based on TX data

In [None]:
cdf = count_data.reset_index().rename(columns={'#': 'gene_name'}).copy()
# The bug identifier is the part of the gene name before the first underscore
# (e.g. 'BUG0007' in 'BUG0007_GROUP000391'), the same rule used for `df['bug']`.
cdf['bug'] = cdf.gene_name.str.split("_", expand=True)[0]

In [None]:
cdf.head()

In [None]:
# Long format: one row per (gene, sample) with the count in 'cnts'.
cdf = cdf.melt(id_vars=['gene_name', 'bug'], var_name='sample_id', value_name='cnts')

In [None]:
cdf.head()

In [None]:
sm = cdf.groupby(['sample_id', 'bug']).cnts.sum()

In [None]:
bug_tx_abund = (sm/sm.groupby(level=0).transform('sum') * 100).reset_index()

In [None]:
bug_tx_abund

In [None]:
sm = sm.reset_index()
sm.columns = ['sample_id', 'bug', 'bug_ab']

In [None]:
cdf = cdf.merge(sm, on=['sample_id', 'bug'])

In [None]:
# Gene count divided by the 'bug_ab' total of its (sample, bug) group.
cdf['tx_cnts'] = cdf['cnts']/cdf['bug_ab']

In [None]:
cdf.head()

In [None]:
# Wide view: genes as rows, samples as columns.
cdf.pivot(index='gene_name', columns='sample_id', values='tx_cnts')

In [None]:
# Rebinds `test` to the rows of a single gene.
test= cdf[cdf.gene_name == 'BUG0007_GROUP000391']

In [None]:
# Keep only samples where the gene has a positive normalised count.
test = test[test.tx_cnts>0]

In [None]:
# Add sample metadata and the bug's percentage share; the 'cnts' column present in both
# tables gets the '_raw' / '_bug' suffixes.
test = test.merge(meta_data, on='sample_id')
test = test.merge(bug_tx_abund, on=['sample_id', 'bug'], suffixes=['_raw', '_bug'])

In [None]:
test

In [None]:
# OLS of the normalised count on Phenotype and the bug total for the single gene in `test`.
mod = smf.ols(formula='tx_cnts ~ Phenotype + bug_ab', data=test)

In [None]:
# Rebinds `res` (previously a results table) to the fitted model.
res = mod.fit()

In [None]:
res.summary()

In [None]:
# Load the "dietox" dataset of the "geepack" package through statsmodels' get_rdataset.
# NOTE(review): presumably this needs network access on first use - confirm.
data = sm.datasets.get_rdataset("dietox", "geepack").data

In [None]:
data