## TF activity differential analysis between R1 and other subgroups

In [1]:
import tensorflow as tf
import anndata
import h5py
import numpy as np
import math
import scipy
import scipy.sparse as sparse
from scipy.sparse import csr_matrix
import scanpy as sc
import copy
import pandas as pd
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Root directory that the input files are resolved against (relative path).
path = '../../../'

# Locations of the two .h5ad inputs; named first so the reads below stay short.
tf_activity_h5ad = f'{path}/2_TFactivity/activity_out/covid19_tfactivity.h5ad'
mono_h5ad = f'{path}/3_mono/2_R1R4/coviddata/mono_ms.h5ad'

# `raw`  : AnnData read from the TF-activity file (presumably cells x TFs -- confirm).
# `mono` : AnnData read from the monocyte file; its obs annotations are used below.
raw = sc.read_h5ad(tf_activity_h5ad)
mono = sc.read_h5ad(mono_h5ad)

In [4]:
# Replace `mono` with the AnnData reconstructed from its `.raw` slot.
mono = mono.raw.to_adata()

# Print how many cells fall into each severity class and each celltypeL0 label.
severity_counts = mono.obs['Severity'].value_counts()
celltype_counts = mono.obs['celltypeL0'].value_counts()
print(severity_counts, celltype_counts)

# Store the sub-cluster ids as plain strings so they can be compared with
# string literals such as '0' later on.
mono.obs['new.id'] = mono.obs['new.id'].astype(str)

mild      19977
severe    18472
Name: Severity, dtype: int64 cMono          31594
ncMono          3818
CD163.cMono     3037
Name: celltypeL0, dtype: int64


In [5]:
# Restrict the TF-activity object to the cells that are also present in `mono`
# and label every cell as 'R1' (sub-cluster '0') or 'Other'.
obj = raw.copy()
shared_cells = mono.obs_names.intersection(obj.obs_names)
obj_subset = obj[obj.obs_names.isin(shared_cells)].copy()

# Both objects must describe exactly the same cells in the same order.
# This is an explicit raise rather than `assert`, because asserts are removed
# under `python -O`; without the check, any cell missing from `mono` would get
# a NaN sub-cluster and be silently labelled 'Other' by np.where below.
if not obj_subset.obs_names.equals(mono.obs.index):
    raise ValueError('Indexes differ!')

# Copy the sub-cluster id over from `mono` (aligned on the cell index).
obj_subset.obs['subcluster'] = mono.obs['new.id']

# Binary grouping used by the differential analysis: sub-cluster '0' vs the rest.
obj_subset.obs['Group'] = np.where(
    obj_subset.obs['subcluster'] == '0',
    'R1',
    'Other'
)
obj_subset.obs['Group'] = obj_subset.obs['Group'].astype('category')

In [14]:
# Show the number of cells in each group ('R1' vs 'Other') as a sanity check.
obj_subset.obs['Group'].value_counts()

Other    34178
R1        4271
Name: Group, dtype: int64

In [9]:
def calculate_pvalues(obj,group,g1,g2,tf):
    """Return the two-sided Mann-Whitney U p-value for one variable between two groups.

    Parameters
    ----------
    obj
        AnnData-like object; `obj.obs[group]` holds the group labels and
        `tf` must be one of its variable names.
    group
        Name of the column in `obj.obs` that contains the group labels.
    g1, g2
        The two labels in `obj.obs[group]` to compare.
    tf
        Name of the variable (column of the data matrix) to test.

    Returns
    -------
    float
        The p-value of `mannwhitneyu(a, b, alternative="two-sided")`, or
        `np.nan` when either group has 3 or fewer observations.
    """
    labels = obj.obs[group]
    # Slice down to the single requested column *before* converting to a
    # DataFrame. Converting the whole matrix and then picking one column
    # copies every variable on every call, which dominates the runtime when
    # this function is called once per variable.
    a = obj[(labels == g1).values, [tf]].to_df()[tf]
    b = obj[(labels == g2).values, [tf]].to_df()[tf]

    # Too few observations for a meaningful test: report "not available".
    if len(a) > 3 and len(b) > 3:
        _, p = mannwhitneyu(a, b, alternative="two-sided")
        return p
    return np.nan

In [11]:
## calculate difference p-val
# One Mann-Whitney test per TF, comparing R1 cells against all other cells.
tfs = obj_subset.var_names
# A comprehension keeps the iteration variable local; a plain `for tf in tfs`
# loop would rebind the module-level name `tf` (the tensorflow import) to a
# string once the loop finishes.
results = [
    {
        "tf": tf_name,
        "pval": calculate_pvalues(obj_subset, group='Group', g1='R1', g2='Other', tf=tf_name),
    }
    for tf_name in tfs
]

# Fix the column set up front so the table has the same schema even when
# there are no TFs or no valid p-values at all.
results_df = pd.DataFrame(results, columns=["tf", "pval"])
results_df['padj'] = np.nan

# Benjamini-Hochberg correction over the TFs that actually have a p-value;
# TFs whose test was skipped keep NaN in both `pval` and `padj`.
mask = results_df['pval'].notna()
pvals = results_df.loc[mask, 'pval']
if len(pvals) > 0:
    _, padj, _, _ = multipletests(pvals, method="fdr_bh")
    results_df.loc[mask, 'padj'] = padj

In [12]:
## calculate difference size
# Effect size per TF = mean value in R1 cells minus mean value in Other cells,
# so a positive `mean_difference` means the TF is higher in R1.

# `to_df()` builds the cells x TFs table with obs/var names attached and is the
# same accessor used in calculate_pvalues; unlike pd.DataFrame(obj_subset.X)
# it does not depend on X being a dense array.
df_zscore = obj_subset.to_df()
df_zscore["Group"] = obj_subset.obs["Group"]

# `Group` is categorical: pass `observed` explicitly so the result does not
# depend on the pandas default (which newer pandas versions warn about).
grouped_mean = df_zscore.groupby("Group", observed=False).mean()
R1_mean = grouped_mean.loc["R1"]
Other_mean = grouped_mean.loc["Other"]
effect_sizes = R1_mean - Other_mean
es_long = effect_sizes.rename_axis('tf').reset_index(name='mean_difference')

# Explicit raise rather than `assert`, which is stripped under `python -O`.
if len(results_df) != len(es_long):
    raise ValueError("The number of TFS does not match, and there may be unprocessed TFs")

# Drop a previous `mean_difference` column first so that re-running this cell
# does not produce `mean_difference_x` / `mean_difference_y` duplicates.
results_df = results_df.drop(columns='mean_difference', errors='ignore')
results_df = results_df.merge(es_long, on='tf', how='left', validate='one_to_one')

In [13]:
# Write the per-TF results table to a CSV file in the current working
# directory; index=False leaves the row index out of the file.
results_df.to_csv('./DAresults_R1vsOther.csv', index=False)

In [15]:
# Display the final per-TF results table.
results_df

Unnamed: 0,tf,pval,padj,mean_difference
0,ALX1,0.000000e+00,0.000000e+00,0.006584
1,ALX3,0.000000e+00,0.000000e+00,0.003114
2,ALX4,0.000000e+00,0.000000e+00,0.003297
3,ANHX,0.000000e+00,0.000000e+00,0.004886
4,AR,0.000000e+00,0.000000e+00,0.004440
...,...,...,...,...
1060,ZSCAN4,4.150454e-42,4.420234e-39,0.002648
1061,ZSCAN5,0.000000e+00,0.000000e+00,0.005168
1062,ZSCAN5C,0.000000e+00,0.000000e+00,-0.002082
1063,ZSCAN9,1.697279e-171,1.807603e-168,0.010091
