In [31]:
import scanpy as sc
import numpy as np
import pandas as pd

In [15]:
adata = sc.read_h5ad('../data/rna_figure_ready.h5ad')

In [20]:
sc.pp.highly_variable_genes(adata, n_top_genes=None)  # No filtering yet

In [55]:
threshold_75 = adata.var['dispersions_norm'].quantile(0.75)
adata.var['highly_variable'] = adata.var['dispersions_norm'] > threshold_75
hvg = adata.var_names[adata.var['highly_variable']]
len(hvg)

5351

In [56]:
tf_df = pd.read_csv('../data/uniprotkb_organism_id_9606_AND_go_00037_2025_01_29.tsv', sep='\t')
tf_list = tf_df['Gene Names'].str.split().explode().tolist()
len(tf_list)

3044

In [57]:
adata.var['is_tf'] = adata.var_names.isin(tf_list)
adata.var['is_tf'].sum()

1281

In [60]:
threshold = adata[:,adata.var['is_tf']].var['dispersions_norm'].quantile(0.75)
adata.var['is_tf_hvg'] = adata.var['is_tf'] & (adata.var['dispersions_norm'] > threshold)
hv_tf = adata.var_names[adata.var['is_tf_hvg']]
len(hv_tf)

320

In [61]:
print(f"Intersection length: {len(set(hvg).intersection(hv_tf))}")
print(f"HVG length: {len(hvg)}, HV TF length: {len(hv_tf)}")

Intersection length: 271
HVG length: 5351, HV TF length: 320


In [71]:
# Save HVGs and HV TFs to files
with open('../data/hvg.txt', 'w') as f:
    f.write('\n'.join(hvg))
    
with open('../data/hv_tf.txt', 'w') as f:
    f.write('\n'.join(hv_tf))


In [None]:
assert False

In [39]:
tf_hvg_intersection = set(tf_list).intersection(hvg)
print(f"Number of TFs: {len(tf_list)}")
print(f"Number of HVGs: {len(hvg)}")
print(f"Number of TFs in HVGs: {len(tf_hvg_intersection)}")

Number of TFs: 3044
Number of HVGs: 5429
Number of TFs in HVGs: 273


In [26]:
# sc.pp.highly_variable_genes(adata, n_top_genes=400)
alex_genes = open('../data/alex_genes.txt').read().splitlines()
alex_tfs = open('../data/alex_tfs.txt').read().splitlines()

In [29]:
alex_genes_in_hvg = [g for g in alex_genes if g in hvg]
alex_tfs_in_hvg = [t for t in alex_tfs if t in hvg]
print(f"Number of Alex genes in HVG: {len(alex_genes_in_hvg)}, total number of Alex genes: {len(alex_genes)}")
print(f"Number of Alex TFS in HVG: {len(alex_tfs_in_hvg)}, total number of Alex TFS: {len(alex_tfs)}")

Number of Alex genes in HVG: 1446, total number of Alex genes: 1970
Number of Alex TFS in HVG: 150, total number of Alex TFS: 331


In [40]:
tf_hvg_intersection = set(tf_list).intersection(hvg)
alex_tf_set = set(alex_tfs)
common_tfs = tf_hvg_intersection.intersection(alex_tf_set)

print(f"Number of TFs in HVGs: {len(tf_hvg_intersection)}")
print(f"Number of Alex TFs: {len(alex_tf_set)}")
print(f"Number of TFs in both sets: {len(common_tfs)}")


Number of TFs in HVGs: 273
Number of Alex TFs: 331
Number of TFs in both sets: 127


In [42]:
'ESRRA' in tf_hvg_intersection

False

In [43]:
'ESRRA' in hvg

False