In [None]:
# Compares different ways of running DESeq2 and linear models on a synthetic dataset

# Conclusions: within-taxon normalisation works almost as well as analysing each taxon individually;

# global normalisation is not appropriate.

# To do: look at the description of the synthetic dataset, re. how variable different taxa are between samples
# To do: all the analysis was done on the filtered dataset (i.e. excluding low expression); redo without filtering?

In [None]:
import pandas as pd
from pathlib import Path
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import plotly.express as px
import yaml
# Parse config.yaml, then keep only its 'default' section as the notebook settings.
with open("config.yaml", "r") as fh:
    parsed_config = yaml.safe_load(fh)
config_dict = parsed_config['default']
from sklearn.metrics import precision_score, recall_score, confusion_matrix

In [None]:
# List the contents of the modeling scratch directory.
# NOTE(review): this is a hard-coded absolute path, independent of config.yaml.
%ls /nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/03_23_transcriptomics/modeling

In [None]:
# Resolve data locations from the config; everything is relative to `root`.
root = Path(config_dict["root"])
synth_dir = root / config_dict["synth_dir"]
sample_data_file = root / config_dict["sample_data_file"]

# The spiked-feature table has no header row; its columns are named here.
spikes = pd.read_table(
    synth_dir / "synth_mgx_mtx/true-exp.mtx_spiked.tsv",
    header=None,
    names=['feature', 'positive'],
)
spikes.shape

In [None]:
# Preview the first rows of the spiked-feature table.
spikes.head()

# Saving the synthetic data in the format accepted by DESeq2
- 1 is added to every count because DESeq2 fails at such low 'depth'.

In [None]:
 
# Read the abundance table and transpose it so that each row is one sample.
dsdf = (
    pd.read_table(synth_dir/"synth_mgx_mtx/true-exp.mtx_abunds.tsv", index_col=0)
    .T
    .reset_index()
    .rename(columns={'index': 'sample_id'})
)

# Sample metadata table: sample ID and phenotype.
ds_meta = dsdf[['sample_id', 'Phenotype']]

# Drop the metadata columns and transpose back, giving a count matrix indexed by 'ID'.
dsdf = (
    dsdf
    .drop(columns=['Phenotype', 'SeqDepth'])
    .set_index('sample_id')
    .T
    .reset_index()
    .rename(columns={'#': 'ID'})
    .set_index('ID')
)

# Shift every count by one.
dsdf = dsdf.add(1)

# Export to CSV (currently disabled):
#ds_meta.to_csv(synth_dir/'true-exp-meta.csv', index=False)
#dsdf.reset_index().to_csv(synth_dir/"true-exp-counts.csv", index=False)

# DESeq results on all samples with 1 added to all counts

In [None]:

# DESeq2 result tables, one per normalisation strategy.

# 1) Global normalisation
deres = pd.read_csv(
    synth_dir / "2023-07-31_true-exp-deseq-1_vs_0_l0a0.01_results.csv"
)

# 2) Taxon-specific normalisation
deres_tx = pd.read_csv(
    synth_dir / "2023-07-31_true-exp-deseq-within-taxon-1_vs_0_l0a0.01_results.csv"
)

# 3) Each taxon analysed on its own (first CSV column is used as the index)
deres_1tx = pd.read_csv(
    synth_dir / "2023-08-03_true-exp-deseq-taxon-one-by-one-1_vs_0_l0a0.01_results.csv",
    index_col=0,
)


In [None]:
def get_scores(result_df, ground_truth, left_feat, right_feat, name, fdr=0.05):
    """Score a table of adjusted p-values against the known positive features.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Test results; must contain the feature column ``left_feat`` and ``padj``.
    ground_truth : pandas.DataFrame
        Known positives; must contain the feature column ``right_feat`` and a
        ``positive`` column. A tested feature counts as a true positive when
        its ``positive`` value is non-null after the join.
    left_feat, right_feat : str
        Names of the feature-ID columns in ``result_df`` and ``ground_truth``.
    name : str
        Name given to the returned Series (used as the row label downstream).
    fdr : float, default 0.05
        A feature is called positive when ``padj < fdr``; a missing ``padj``
        compares False and is therefore called negative.

    Returns
    -------
    pandas.Series
        Entries ``Precision``, ``Recall``, ``TN``, ``FP``, ``FN``, ``TP``.
    """
    # A left join keeps exactly the tested features. An outer join followed by
    # dropna() does not remove truth-only rows when both key columns share a
    # name, because pandas then coalesces them into a single key column.
    df = (result_df[[left_feat, 'padj']]
          .merge(ground_truth, left_on=left_feat, right_on=right_feat, how='left')
          .dropna(subset=[left_feat]))
    predicted = (df['padj'] < fdr).astype(int)
    actual = df['positive'].notnull().astype(int)

    # labels=[0, 1] forces a 2x2 matrix even if only one class is present;
    # otherwise ravel() yields fewer than four counts and the index below
    # no longer matches.
    tn, fp, fn, tp = (confusion_matrix(actual, predicted, labels=[0, 1])
                      .ravel()
                      .astype(int))
    return pd.Series(
        [precision_score(actual, predicted), recall_score(actual, predicted),
         tn, fp, fn, tp],
        index=['Precision', 'Recall', 'TN', 'FP', 'FN', 'TP'],
        name=name,
    )


In [None]:
# Score each DESeq2 run against the spiked features at a 1% FDR cut-off.
fdr = 0.01
score_list = []
for label, res in zip(['Global', 'Taxon', 'IndivTaxon'], [deres, deres_tx, deres_1tx]):
    run_scores = get_scores(
        result_df=res,
        ground_truth=spikes,
        left_feat='ID',
        right_feat='feature',
        name=label,
        fdr=fdr,
    )
    score_list.append(run_scores)

# One row per run, one column per metric.
scores = pd.DataFrame(score_list)



In [None]:
# Show precision, recall and confusion-matrix counts for each DESeq2 run.
scores

In [None]:
# False-positive rate per run: FP / (FP + TN).
frp = scores['FP'].div(scores['FP'].add(scores['TN']))

In [None]:
# Show the false-positive rate, FP / (FP + TN), for each DESeq2 run.
frp

# Same analysis with linear models

In [None]:
# Build a long-format table with one row per (sample, gene), then add
# the global and within-taxon normalisations.
df = (
    pd.read_table(synth_dir/"synth_mgx_mtx/true-exp.mtx_abunds.tsv", index_col=0)
    .T
    .reset_index()
    .rename(columns={'index': 'sample_id'})
    .melt(id_vars=["sample_id", "Phenotype", "SeqDepth"],
          var_name="gene_name", value_name='raw_cnt')
    .assign(
        # Global normalisation: counts scaled by the sample's SeqDepth, times 1e6.
        tss=lambda d: d['raw_cnt'] / d['SeqDepth'] * 1e6,
        # Taxon label: the part of the gene name before the first underscore.
        bug=lambda d: d['gene_name'].str.split("_", expand=True)[0],
    )
    # Attach the total raw count of each taxon within each sample.
    .pipe(lambda d: d.merge(
        d.groupby(['sample_id', 'bug']).raw_cnt.sum()
         .reset_index()
         .rename(columns={'raw_cnt': 'bug_cnt'}),
        on=['sample_id', 'bug'],
        how='left',
    ))
    .assign(
        # Within-taxon normalisation: counts scaled by the taxon total, times 1e6.
        within_bug=lambda d: d['raw_cnt'] / d['bug_cnt'] * 1e6,
        # Share of the sample's SeqDepth that belongs to the taxon.
        bug_perc=lambda d: d['bug_cnt'] / d['SeqDepth'],
    )
)

In [None]:
# Keep a gene only if it passes both abundance filters:
#   mask  - more than 100 reads in total across samples
#   mask2 - a non-zero count in more than 10 distinct samples
mask = df.groupby('gene_name')['raw_cnt'].sum().gt(100)
mask2 = (
    df.loc[df['raw_cnt'] > 0]
    .groupby('gene_name')['sample_id']
    .nunique()
    .gt(10)
)
df_filtered = (
    df.set_index('gene_name')
    .loc[mask & mask2, :]
    .reset_index()
)

In [None]:
# Number of spiked (true positive) genes remaining after filtering:
# the first element of the displayed shape (6818 when this note was written).
(
    df_filtered
    .merge(spikes, left_on='gene_name', right_on='feature', how='inner')
    .loc[:, ['gene_name', 'positive']]
    .drop_duplicates()
    .shape
)

In [None]:
def linear_on_gene(gene_df, expr_col, fixed_effects):
    """Fit a Gaussian GLM of log10 expression on the fixed effects for one gene.

    The response is ``log10(expr + pseudo_count)``, where the pseudo-count is
    half of the smallest positive value of ``expr_col`` for this gene.

    Parameters
    ----------
    gene_df : pandas.DataFrame
        Rows belonging to a single gene. The frame is not modified.
    expr_col : str
        Name of the expression column to model.
    fixed_effects : list of str
        Terms joined with ``+`` to form the right-hand side of the formula.

    Returns
    -------
    list
        ``[coefficient, standard error, p-value]`` of the second model
        parameter (position 1, i.e. the first term after the intercept in a
        default formula), or ``[]`` when no model is fitted.
    """
    # Same precondition as before: non-empty data and at least one listed
    # effect present as a column.
    if gene_df.empty or not any(term in gene_df.columns for term in fixed_effects):
        return []

    positive_values = gene_df.loc[gene_df[expr_col] > 0, expr_col]
    if positive_values.empty:
        # No positive value means the pseudo-count would be NaN and the
        # response column entirely undefined, so there is nothing to fit.
        return []
    pseudo_count = positive_values.min() / 2

    # Work on a copy so the group handed in by groupby.apply is not mutated.
    log_col = f'{expr_col}_log'
    model_df = gene_df.assign(**{log_col: np.log10(gene_df[expr_col] + pseudo_count)})

    formula = f"{log_col} ~ {' + '.join(fixed_effects)}"
    md = smf.glm(formula=formula, data=model_df, family=sm.families.Gaussian()).fit()

    # The result Series are indexed by term name; use .iloc for positional
    # access, since integer keys on a label-indexed Series are deprecated.
    return [md.params.iloc[1], md.bse.iloc[1], md.pvalues.iloc[1]]

def linear_on_df(df, expr_col, fixed_effects):
    """Run ``linear_on_gene`` for every gene and add BH-adjusted p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with a ``gene_name`` column. Rows containing a
        missing value in any column are dropped before fitting.
    expr_col : str
        Expression column passed on to ``linear_on_gene``.
    fixed_effects : list of str
        Model terms passed on to ``linear_on_gene``.

    Returns
    -------
    pandas.DataFrame
        One row per gene with columns ``gene_name``, ``coef``, ``se``,
        ``pval`` and ``padj``. Genes for which no model was fitted have NaN
        in all four statistic columns.
    """
    # Imported explicitly rather than relying on the submodule having been
    # loaded as a side effect of another statsmodels import.
    from statsmodels.stats.multitest import multipletests

    stat_cols = ['coef', 'se', 'pval']
    fits = (df.dropna()
              .groupby('gene_name')
              .apply(linear_on_gene, expr_col=expr_col, fixed_effects=fixed_effects)
              .reset_index())

    # linear_on_gene returns [] for genes it could not fit; pad those to NaN
    # so the expansion into three columns always has a consistent shape.
    rows = [list(r) if len(r) == len(stat_cols) else [np.nan] * len(stat_cols)
            for r in fits[0]]
    fits[stat_cols] = pd.DataFrame(rows, index=fits.index, columns=stat_cols)
    fits = fits.drop(columns=[0])

    # Adjust only the p-values that exist, so unfitted genes (NaN) cannot
    # distort the correction for the others.
    fits['padj'] = np.nan
    tested = fits['pval'].notna()
    if tested.any():
        fits.loc[tested, 'padj'] = multipletests(
            fits.loc[tested, 'pval'].values, method='fdr_bh')[1]
    return fits

In [None]:
# Linear model on within-taxon normalised expression, with Phenotype as the only fixed effect.
phenotype_tx = linear_on_df(df_filtered, 'within_bug', ['Phenotype'])

In [None]:
# Same within-taxon model, with the taxon's share of sequencing depth (bug_perc) added as a second fixed effect.
phenotype_bug_tx = linear_on_df(df_filtered, 'within_bug', ['Phenotype', 'bug_perc'])

In [None]:
# Linear model on globally normalised expression (tss), with Phenotype as the only fixed effect.
phenotype_global = linear_on_df(df_filtered, 'tss', ['Phenotype'])

In [None]:
# Score each linear-model run against the spiked features at a 5% FDR cut-off.
fdr = 0.05
score_list = []
for label, res in zip(['LM_taxon_Ph', 'LM_taxon_Ph_bug', 'LM_global'],
                      [phenotype_tx, phenotype_bug_tx, phenotype_global]):
    run_scores = get_scores(
        result_df=res,
        ground_truth=spikes,
        left_feat='gene_name',
        right_feat='feature',
        name=label,
        fdr=fdr,
    )
    score_list.append(run_scores)

# One row per run, one column per metric.
scores_lm = pd.DataFrame(score_list)

In [None]:
# Show precision, recall and confusion-matrix counts for each linear-model run.
scores_lm

In [None]:
# Stack the DESeq2 and linear-model score tables for a side-by-side view.
# NOTE(review): the DESeq2 rows were scored with fdr = 0.01 and the linear-model
# rows with fdr = 0.05, so the two halves use different cut-offs.
pd.concat([scores, scores_lm])