# 1k Fibroblasts Perturbed with All HumanTFs

# Set up

In [1]:
import numpy as np
import pandas as pd
import scanpy as sp
import anndata as ad # JP add this line
import os
import sys
import time

In [12]:
# Subset 1k Fibroblasts
# Load firboblast source cells
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/unperturbed"
FILE = "fibroblast.h5ad"
adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))

adata.var['ensemblid'] = adata.var['ensemblid'].str.split('.').str[0] # JP Add this line

# Change the layers to show the raw counts
adata.layers = {'raw_counts': adata.layers['raw_counts']}
adata.X = adata.layers['raw_counts'].copy()

# Select 1000 random cells
adata = adata[np.random.randint(0,adata.shape[0], 100), :]

# Show data to the user
adata.var.head()

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972,False,6.398244e-05,0.835044,-0.573947,3.9e-05,0.005574
WASH7P,WASH7P,Gene Expression,ENSG00000227232,False,0.002274395,2.44228,0.533203,0.00108,0.031731
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267,False,6.175251e-05,1.295335,-0.256874,3.3e-05,0.005634
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485,False,0.0001372886,2.656352,0.680668,4.8e-05,0.008041
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332,False,1e-12,,0.0,0.0,1.0


In [13]:
adata

View of AnnData object with n_obs × n_vars = 100 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'raw_counts'
    obsp: 'connectivities', 'distances'

# Perturbations

## Load HumanTF

In [4]:
humanTfdf = pd.read_csv('data/HumanTFs_v_1.01.csv') # JP this line is changed
humanTfdf.head()

Unnamed: 0.1,Unnamed: 0,Ensembl ID,HGNC symbol,DBD,Is TF?,TF assessment,Binding mode,Motif status,Final Notes,Final Comments,...,CisBP considers it a TF?,TFCat classification,Is a GO TF?,Initial assessment,Curator 1,Curator 2,TFclass considers it a TF?,Go Evidence,Pfam Domains (By ENSP ID),Is C2H2 ZF(KRAB)?
0,0,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,Monomer or homomultimer,High-throughput in vitro,,,...,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,"1a1, Direct HQ evidence",Sam Lambert,Yimeng Yin,Yes,$#ENSG00000137203#GO:0000981#sequence-specific...,$#ENSP00000368928#ENSG00000137203#ENST00000379...,False
1,1,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,Monomer or homomultimer,High-throughput in vitro,,,...,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,"1a1, Direct HQ evidence",Matt Weirauch,Yimeng Yin,Yes,$#ENSG00000008196#GO:0000981#sequence-specific...,$#ENSP00000377265#ENSG00000008196#ENST00000393...,False
2,2,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,Monomer or homomultimer,High-throughput in vitro,,,...,Yes,No,Yes,"1a1, Direct HQ evidence",Matt Weirauch,Yimeng Yin,Yes,$#ENSG00000087510#GO:0001077#RNA polymerase II...,$#ENSP00000201031#ENSG00000087510#ENST00000201...,False
3,3,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,...,Yes,No,Yes,"2a1, Lower confidence direct evidence",Arttu Jolma,Sam Lambert,Yes,$#ENSG00000008197#GO:0000981#sequence-specific...,$#ENSP00000008391#ENSG00000008197#ENST00000008...,False
4,4,ENSG00000116819,TFAP2E,AP-2,Yes,Known motif,Monomer or homomultimer,High-throughput in vitro,,,...,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,"1a1, Direct HQ evidence",Sam Lambert,Laura Campitelli,Yes,$#ENSG00000116819#GO:0000981#sequence-specific...,$#ENSP00000362332#ENSG00000116819#ENST00000373...,False


## Verify HumanTFs in Tabula Sapiens Data

In [5]:
# Check which human TFs are in the dataframe
tfIds = humanTfdf['Ensembl ID']

ct = 0
sub = 0
for i, tf in enumerate(tfIds):
    if humanTfdf['Is TF?'].iloc[i] != 'Yes':
        sub += 1
        continue
    if tf in list(adata.var['ensemblid']):
        ct += 1

print(ct)
print(ct/(len(tfIds) - sub))

1637
0.9987797437461867


## Perform Perturbations

In [6]:
adata.obs['TF'] = None
adata.obs['u']  = 0
adata.obs

  adata.obs['TF'] = None
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender,TF,u
AGATGCTTCATAAGGA_TSP2_Thymus_NA_10X_1_1,Thymus,10X,TSP2,,10048.0,3323,fibroblast,fibroblast,True,stromal,female,,0
TGTAACGTCGGCTCTT_TSP14_SalivaryGland_Parotid_10X_1_1,Salivary_Gland,10X,TSP14,Parotid,3208.0,1226,fibroblast,Fibroblast,True,stromal,male,,0
GTTTGGACATTCTTCA_TSP4_Uterus_Endometrium_10X_1_1,Uterus,10X,TSP4,Endometrium,6766.0,2644,fibroblast,Endometrial stromal fibbroblast,True,stromal,female,,0
CGGCAGTTCCCTTGTG_TSP2_Vasculature_Aorta_10X_1_1,Vasculature,10X,TSP2,Aorta,2905.0,1294,fibroblast,fibroblast,True,stromal,female,,0
CCTATCGAGGCGATAC_TSP2_Vasculature_Aorta_10X_2_1,Vasculature,10X,TSP2,Aorta,16290.0,3890,fibroblast,fibroblast,True,stromal,female,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTCCGTGCACGTACAT_TSP1_bladder_1,Bladder,10X,TSP1,,12448.0,2992,fibroblast,fibroblast,True,stromal,female,,0
AACCAACTCTAGTGTG_TSP10_FAT_SCAT_10X_1_1,Fat,10X,TSP10,SCAT,11502.0,3212,myofibroblast cell,Myofibroblasts,True,stromal,male,,0
CAACCAATCCACCTCA_TSP14_Vasculature_CoronaryArteries_10X_1_1,Vasculature,10X,TSP14,CoronaryArteries,9086.0,2109,fibroblast,fibroblast,True,stromal,male,,0
CTGCCTAAGGTGCAGT_TSP2_Vasculature_Aorta_10X_1_2,Vasculature,10X,TSP2,Aorta,15170.0,3878,fibroblast,fibroblast,True,stromal,female,,0


In [10]:
0 % 10

0

In [None]:
# Create a copy of the AnnData object
adataFull = adata.copy()

# Define the perturbation values
U = [-1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75, 1, 1.25, 1.5]

# Calculate the maximum gene expression for each cell
maximumGeneExpression = np.max(adata.X, axis=1).toarray()[:, np.newaxis]

# Loop through the transcription factors in the dataframe
ctr = 0
for tfi, tf in enumerate(humanTfdf['Ensembl ID']):
    # Skip if it is not marked as a TF or if the TF is not found in the adata variable names
    if humanTfdf['Is TF?'].iloc[tfi] != 'Yes':
        continue
    if tf not in list(adata.var['ensemblid']):
        continue

    # Get the transcription factor index and expression
    tfIdx = list(adata.var['ensemblid']).index(tf)
    tfExpression = adata.X[:, tfIdx].copy()  # Copy the expression to avoid overwriting

    start_time = time.time()
    
    # Perturb the transcription factor expression for each value of U
    for u in U:
        # Make a fresh copy of the AnnData object for each perturbation
        adata_perturbed = adata.copy()

        # Apply the perturbation
        if u < 0:
            perturbedExpression = tfExpression - (u * tfExpression)
        else:
            perturbedExpression = u * maximumGeneExpression

        # Update the expression of the TF with the perturbed values
        adata_perturbed.X[:, tfIdx] = perturbedExpression
        adata_perturbed.obs['TF']   = tf
        adata_perturbed.obs['u']    = u
        
        # Concatenate the perturbed AnnData object to adataFull
        adataFull = adataFull.concatenate(adata_perturbed)
    
    print("time=" + str(time.time() - start_time))
    print(f"{ctr=}")
    ctr += 1


    if tfi % 5 == 0:
        # Save the full dataset with all perturbations
        adataFull.write("/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/all-tfs/jpic-Sep-26-2024.h5ad")

# Save the full dataset with all perturbations
adataFull.write("/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/all-tfs/jpic-Sep-26-2024.h5ad")


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=1.6425261497497559
ctr=0


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=2.4025962352752686
ctr=1


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=3.150596857070923
ctr=2


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=4.319546222686768
ctr=3


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=5.694421768188477
ctr=4


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=6.385369777679443
ctr=5


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=7.613742351531982
ctr=6


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=8.685608863830566
ctr=7


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=10.004846572875977
ctr=8


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=10.945104122161865
ctr=9


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


time=12.066322326660156
ctr=10


  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)
  self._set_arrayXarray(i, j, x)
  adataFull = adataFull.concatenate(adata_perturbed)


In [None]:
adataFull.obs['batch']

In [56]:
adataFull = adata.copy()
U = [-1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75, 1, 1.25, 1.5]
for tfi, tf in enumerate(humanTfdf['Ensembl ID']):
    # skip the TF if either (1) it isnot a transcription factor or (2) it is not found in the dataframe
    if humanTfdf['Is TF?'].iloc[i] != 'Yes':
        continue
    if tf not in list(adata.var['ensemblid']):
        continue

    # Get the transcription factor location and expression
    tfIdx = list(adata.var['ensemblid']).index(tf)
    tfExpression = adata.X[:, tfIdx]
    for u in U:

        # Do perturbation of gene expression
        if u < 0:
            perturbedExpression = tfExpression - (u * tfExpression)
        else:
            perturbedExpression = u * maximumGeneExpression
        adata.X[:, tfIdx] = perturbedExpression

        
        adataFull = adataFull concatenated with adata.X.copy()


SyntaxError: incomplete input (1386281128.py, line 11)

In [87]:
1000*10*1600

16000000

In [77]:
adata.layers

Layers with keys: decontXcounts, raw_counts

In [58]:
tfIdx = list(adata.var['ensemblid']).index(tf)

In [69]:
adata.layers['raw_counts'][:, tfIdx]

<1000x1 sparse matrix of type '<class 'numpy.float32'>'
	with 847 stored elements in Compressed Sparse Row format>

In [70]:
tfExpression = adata.layers['raw_counts'][:, tfIdx]

In [85]:
tfExpression = adata.X[:, tfIdx]

In [86]:
tfExpression.toarray()

array([[2.000e+00],
       [9.000e+00],
       [2.000e+01],
       [9.000e+00],
       [2.000e+00],
       [4.900e+01],
       [1.000e+01],
       [6.000e+01],
       [9.000e+00],
       [3.000e+00],
       [5.000e+00],
       [1.000e+00],
       [0.000e+00],
       [3.000e+00],
       [2.000e+00],
       [1.200e+01],
       [3.000e+00],
       [3.000e+00],
       [1.000e+01],
       [2.000e+00],
       [3.000e+00],
       [1.300e+01],
       [7.000e+00],
       [1.000e+00],
       [2.000e+00],
       [5.000e+00],
       [1.000e+00],
       [2.000e+00],
       [2.000e+00],
       [1.000e+00],
       [4.000e+00],
       [6.000e+00],
       [5.000e+00],
       [1.000e+00],
       [3.000e+00],
       [1.000e+00],
       [1.000e+00],
       [1.900e+01],
       [7.000e+00],
       [1.000e+00],
       [3.000e+00],
       [2.000e+00],
       [1.000e+00],
       [1.300e+01],
       [2.000e+00],
       [9.000e+00],
       [2.000e+00],
       [2.000e+00],
       [2.400e+01],
       [1.400e+01],


# Embeddings