In [1]:
import anndata
import numpy as np
import math
import scipy
import scipy.sparse as sparse
from scipy.sparse import csr_matrix
import scanpy as sc
import copy
import pandas as pd
from scipy.stats import pearsonr
from scipy import stats
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Project directory holding the preprocessed RNA object and the TF-activity output.
path = '/picb/bigdata/project/miaoyuanyuan/train/XChrom_analysis/4_covid19/'

# Expression: rebuild an AnnData from the `.raw` slot of the preprocessed object,
# then scale every cell to 1e4 total counts and log1p-transform.
rna0 = sc.read_h5ad(f'{path}/0_preprocess/data/mild_severe_all.h5ad')
rna = anndata.AnnData(
    X=rna0.raw.X,
    obs=rna0.obs,
    var=rna0.raw.var,
)
sc.pp.normalize_total(rna, target_sum=1e4)
sc.pp.log1p(rna)

# TF activity object.
obj = sc.read_h5ad(f'{path}/2_TFactivity/activity_out/covid19_tfactivity.h5ad')

# Upper-cased variable names serve as the key for matching the two objects.
rna.var['tf_clean'] = rna.var.index.str.upper()
obj.var['tf_clean'] = obj.var.index.str.upper()

# TFs present (by upper-cased name) in both objects.
common_tfs = list(
    set(obj.var['tf_clean']) & set(rna.var['tf_clean'])
)
print(f"have {len(common_tfs)} common TFs")

have 863 common TFs


In [5]:
# Sort by name so that both datasets are indexed in the same TF order.
common_tfs_sorted = sorted(common_tfs)

# Variables of `obj` (original spelling) whose cleaned name is among the shared TFs.
obj_common_names = obj.var[obj.var['tf_clean'].isin(common_tfs_sorted)].index.tolist()

# Resolve every shared TF through its cleaned (upper-case) name instead of the
# raw index: the raw index may be mixed-case, and looking an upper-cased name
# up in it would then fail. The first variable carrying a cleaned name wins,
# unless a variable whose raw name already equals the cleaned name exists.
obj_clean_to_pos = {}
for pos, (raw_name, clean_name) in enumerate(zip(obj.var.index, obj.var['tf_clean'])):
    if clean_name not in obj_clean_to_pos or raw_name == clean_name:
        obj_clean_to_pos[clean_name] = pos

obj_common_names_sorted = [name for name in common_tfs_sorted if name in obj_clean_to_pos]
obj_common_idx = np.array(
    [obj_clean_to_pos[name] for name in obj_common_names_sorted],
    dtype=np.intp,
)

# Every shared TF must resolve; otherwise the selected columns would no longer
# line up one-to-one with `common_tfs_sorted` in later cells.
if len(obj_common_names_sorted) != len(common_tfs_sorted):
    missing_tfs = [name for name in common_tfs_sorted if name not in obj_clean_to_pos]
    raise ValueError(f"The following TF was not found in obj: {missing_tfs}")

In [6]:
# Variables of `rna` (original spelling) whose cleaned name is among the shared TFs.
rna_common_names = rna.var[rna.var['tf_clean'].isin(common_tfs_sorted)].index.tolist()

# Resolve every shared TF through its cleaned (upper-case) name instead of the
# raw index: gene symbols in the raw index may be mixed-case, and looking an
# upper-cased name up in it would then fail. The first variable carrying a
# cleaned name wins, unless a variable whose raw name already equals the
# cleaned name exists.
rna_clean_to_pos = {}
for pos, (raw_name, clean_name) in enumerate(zip(rna.var.index, rna.var['tf_clean'])):
    if clean_name not in rna_clean_to_pos or raw_name == clean_name:
        rna_clean_to_pos[clean_name] = pos

rna_common_names_sorted = [name for name in common_tfs_sorted if name in rna_clean_to_pos]
rna_common_idx = np.array(
    [rna_clean_to_pos[name] for name in rna_common_names_sorted],
    dtype=np.intp,
)

# Every shared TF must resolve; otherwise the selected columns would no longer
# line up one-to-one with `common_tfs_sorted` in later cells.
if len(rna_common_names_sorted) != len(common_tfs_sorted):
    missing_tfs = [name for name in common_tfs_sorted if name not in rna_clean_to_pos]
    raise ValueError(f"The following TF was not found in rna: {missing_tfs}")

In [7]:
## Calculate the correlation between TF activity and expression values in cMono
objm = obj[obj.obs['celltypeL0'].isin(['cMono', 'CD163.cMono'])]
rnam = rna[rna.obs['celltypeL0'].isin(['cMono', 'CD163.cMono'])]


def _to_dense(matrix):
    """Return `matrix` as a dense NumPy array, whether it is stored sparse or dense."""
    return matrix.toarray() if sparse.issparse(matrix) else np.asarray(matrix)


# Columns are selected with the index arrays built per dataset, so column i of
# both matrices is meant to correspond to common_tfs_sorted[i].
obj_data = _to_dense(objm[:, obj_common_idx].X)
rna_data = _to_dense(rnam[:, rna_common_idx].X)

n_tfs = len(common_tfs_sorted)
if obj_data.shape[1] != n_tfs or rna_data.shape[1] != n_tfs:
    raise ValueError(
        f"Column mismatch: expected {n_tfs} TFs, got {obj_data.shape[1]} (activity) "
        f"and {rna_data.shape[1]} (expression)"
    )
if obj_data.shape[0] != rna_data.shape[0]:
    raise ValueError(
        f"Cell mismatch: {obj_data.shape[0]} cells (activity) vs "
        f"{rna_data.shape[0]} cells (expression)"
    )
# NOTE(review): pearsonr pairs rows by position; this assumes `objm` and `rnam`
# hold the same cells in the same order — confirm against the input objects.

# One Pearson test per TF: activity column vs. expression column.
correlations = []
ps = []
for i in range(n_tfs):
    corr, p = pearsonr(obj_data[:, i], rna_data[:, i])
    correlations.append(corr)
    ps.append(p)

results = pd.DataFrame({
    "TF": common_tfs_sorted,
    "Correlation": correlations,
    'p_val': ps,
})



In [3]:
# Benjamini-Hochberg adjustment of the per-TF p-values.
# Exact zeros are first replaced by the smallest positive float.
pvals = results["p_val"].replace(0.0, np.nextafter(0, 1)).copy()

# Rows without a p-value are left out of the correction.
pvals_clean = pvals.dropna()
_, pvals_adj, _, _ = multipletests(pvals_clean, method="fdr_bh")

# Start from an all-NaN column, then fill only the rows that had a p-value.
results["pval_adj"] = np.nan
results.loc[pvals.notna(), "pval_adj"] = pvals_adj

print(results.head())

   Unnamed: 0      TF  Correlation         p_val      pval_adj
0           0    ALX3    -0.003881  4.528810e-01  5.926679e-01
1           1    ALX4     0.001937  7.078759e-01  8.158737e-01
2           2      AR    -0.000359  9.447177e-01  9.658327e-01
3           3   ARGFX    -0.003816  4.604488e-01  5.969313e-01
4           4  ARID3A    -0.029144  1.721102e-08  9.227327e-08


In [4]:
# Save the per-TF correlation table. The row index is a plain running number, so
# it is not written: writing it makes every reload with pd.read_csv gain another
# "Unnamed: 0" column.
results.to_csv('./pearsonr_TFactivity_TFexpr_incMono.csv', index=False)

In [5]:
# Show the per-TF table (TF, Correlation, p_val, pval_adj) as the cell output.
results

Unnamed: 0.1,Unnamed: 0,TF,Correlation,p_val,pval_adj
0,0,ALX3,-0.003881,4.528810e-01,5.926679e-01
1,1,ALX4,0.001937,7.078759e-01,8.158737e-01
2,2,AR,-0.000359,9.447177e-01,9.658327e-01
3,3,ARGFX,-0.003816,4.604488e-01,5.969313e-01
4,4,ARID3A,-0.029144,1.721102e-08,9.227327e-08
...,...,...,...,...,...
858,858,ZSCAN26,-0.015113,3.465724e-03,9.000456e-03
859,859,ZSCAN29,0.011197,3.033840e-02,6.255883e-02
860,860,ZSCAN30,-0.038133,1.606207e-13,1.308586e-12
861,861,ZSCAN31,-0.000771,8.814348e-01,9.288141e-01
