# 1k Fibroblasts Perturbed with All HumanTFs

# Set up

In [3]:
import numpy as np
import pandas as pd
import scanpy as sp
import numpy.matlib

In [2]:
import numpy as np
import pandas as pd
import scanpy as sp
import anndata as ad # JP add this line
import os
import sys
import time

In [5]:
# Subset 1k Fibroblasts
# Load firboblast source cells
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/unperturbed"
FILE = "fibroblast.h5ad"

print(FILE)

adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))

adata.var['ensemblid'] = adata.var['ensemblid'].str.split('.').str[0] # JP Add this line

# Change the layers to show the raw counts
adata.layers = {'raw_counts': adata.layers['raw_counts']}
adata.X = adata.layers['raw_counts'].copy()

# Select 1000 random cells
adata = adata[np.random.randint(0,adata.shape[0], 1000), :]

# Show data to the user
adata.var.head()

fibroblast.h5ad


Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972,False,6.398244e-05,0.835044,-0.573947,3.9e-05,0.005574
WASH7P,WASH7P,Gene Expression,ENSG00000227232,False,0.002274395,2.44228,0.533203,0.00108,0.031731
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267,False,6.175251e-05,1.295335,-0.256874,3.3e-05,0.005634
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485,False,0.0001372886,2.656352,0.680668,4.8e-05,0.008041
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332,False,1e-12,,0.0,0.0,1.0


In [6]:
adata

View of AnnData object with n_obs × n_vars = 1000 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'raw_counts'
    obsp: 'connectivities', 'distances'

# Perturbations

## Load HumanTF

In [7]:
humanTfdf = pd.read_csv('data/HumanTFs_v_1.01.csv') # JP this line is changed
humanTfdf.head()

Unnamed: 0.1,Unnamed: 0,Ensembl ID,HGNC symbol,DBD,Is TF?,TF assessment,Binding mode,Motif status,Final Notes,Final Comments,...,CisBP considers it a TF?,TFCat classification,Is a GO TF?,Initial assessment,Curator 1,Curator 2,TFclass considers it a TF?,Go Evidence,Pfam Domains (By ENSP ID),Is C2H2 ZF(KRAB)?
0,0,ENSG00000137203,TFAP2A,AP-2,Yes,Known motif,Monomer or homomultimer,High-throughput in vitro,,,...,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,"1a1, Direct HQ evidence",Sam Lambert,Yimeng Yin,Yes,$#ENSG00000137203#GO:0000981#sequence-specific...,$#ENSP00000368928#ENSG00000137203#ENST00000379...,False
1,1,ENSG00000008196,TFAP2B,AP-2,Yes,Known motif,Monomer or homomultimer,High-throughput in vitro,,,...,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,"1a1, Direct HQ evidence",Matt Weirauch,Yimeng Yin,Yes,$#ENSG00000008196#GO:0000981#sequence-specific...,$#ENSP00000377265#ENSG00000008196#ENST00000393...,False
2,2,ENSG00000087510,TFAP2C,AP-2,Yes,Known motif,Monomer or homomultimer,High-throughput in vitro,,,...,Yes,No,Yes,"1a1, Direct HQ evidence",Matt Weirauch,Yimeng Yin,Yes,$#ENSG00000087510#GO:0001077#RNA polymerase II...,$#ENSP00000201031#ENSG00000087510#ENST00000201...,False
3,3,ENSG00000008197,TFAP2D,AP-2,Yes,Known motif,Monomer or homomultimer,In vivo/Misc source,Only known motifs are from Transfac or HocoMoc...,Binds the same GCCTGAGGC sequence as the other...,...,Yes,No,Yes,"2a1, Lower confidence direct evidence",Arttu Jolma,Sam Lambert,Yes,$#ENSG00000008197#GO:0000981#sequence-specific...,$#ENSP00000008391#ENSG00000008197#ENST00000008...,False
4,4,ENSG00000116819,TFAP2E,AP-2,Yes,Known motif,Monomer or homomultimer,High-throughput in vitro,,,...,Yes,TF Gene_DNA-Binding: sequence-specific_DNA Bin...,Yes,"1a1, Direct HQ evidence",Sam Lambert,Laura Campitelli,Yes,$#ENSG00000116819#GO:0000981#sequence-specific...,$#ENSP00000362332#ENSG00000116819#ENST00000373...,False


## Verify HumanTFs in Tabula Sapiens Data

In [8]:
# Check which human TFs are in the dataframe
tfIds = humanTfdf['Ensembl ID']

ct = 0
sub = 0
for i, tf in enumerate(tfIds):
    if humanTfdf['Is TF?'].iloc[i] != 'Yes':
        sub += 1
        continue
    if tf in list(adata.var['ensemblid']):
        ct += 1

print(ct)
print(ct/(len(tfIds) - sub))

1637
0.9987797437461867


## Perform Perturbations

In [9]:
adata.obs['TF'] = None
adata.obs['u']  = 0
adata.obs

  adata.obs['TF'] = None
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender,TF,u
CCCTAACTCCGTGTAA_TSP14_Thymus_NA_10X_2_1,Thymus,10X,TSP14,,19279.0,4187,fibroblast,fibroblast,True,stromal,male,,0
CCTCTCCCATGCCGAC_TSP10_FAT_MAT_10X_1_1,Fat,10X,TSP10,MAT,14059.0,3878,fibroblast,Fibroblasts,True,stromal,male,,0
GATCAGTAGACGGATC_TSP14_SalivaryGland_Parotid_10X_1_1,Salivary_Gland,10X,TSP14,Parotid,5127.0,1864,fibroblast,Fibroblast,True,stromal,male,,0
CGCCATTCATCGGTTA_TSP14_Vasculature_CoronaryArteries_10X_1_1,Vasculature,10X,TSP14,CoronaryArteries,23119.0,5178,fibroblast,fibroblast,True,stromal,male,,0
AGTGACTCAGGGATAC_TSP4_Uterus_Endometrium_10X_1_1,Uterus,10X,TSP4,Endometrium,21895.0,4631,fibroblast,Endometrial stromal fibbroblast,True,stromal,female,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
GACCAATCACAACGAG_TSP2_Vasculature_Aorta_10X_2_1,Vasculature,10X,TSP2,Aorta,11084.0,2971,fibroblast,fibroblast,True,stromal,female,,0
CCTTTGGAGCAACTTC_TSP2_Thymus_NA_10X_1_1,Thymus,10X,TSP2,,10402.0,3298,fibroblast,fibroblast,True,stromal,female,,0
CCTCATGTCACTGAAC_TSP2_Vasculature_Aorta_10X_1_2,Vasculature,10X,TSP2,Aorta,11047.0,3307,fibroblast,fibroblast,True,stromal,female,,0
AGTGTTGAGCACTAAA_TSP14_LI_Proximal_10X_1_1,Large_Intestine,10X,TSP14,Proximal,6932.0,2268,fibroblast,fibroblast,True,stromal,male,,0


### v2

In [10]:
import pickle
import numpy as np
import time

output_dir = "/scratch/indikar_root/indikar0/jpic/pb1k/data"

# Define the perturbation values
U = [-1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75, 1, 1.25, 1.5]

# Calculate the maximum gene expression for each cell
maximumGeneExpression = np.max(adata.X, axis=1).toarray()[:, np.newaxis]

# Initialize the counter
ctr = 0

adata.obs = adata.obs[['TF', 'u']]
adata.obs

Unnamed: 0,TF,u
CCCTAACTCCGTGTAA_TSP14_Thymus_NA_10X_2_1,,0
CCTCTCCCATGCCGAC_TSP10_FAT_MAT_10X_1_1,,0
GATCAGTAGACGGATC_TSP14_SalivaryGland_Parotid_10X_1_1,,0
CGCCATTCATCGGTTA_TSP14_Vasculature_CoronaryArteries_10X_1_1,,0
AGTGACTCAGGGATAC_TSP4_Uterus_Endometrium_10X_1_1,,0
...,...,...
GACCAATCACAACGAG_TSP2_Vasculature_Aorta_10X_2_1,,0
CCTTTGGAGCAACTTC_TSP2_Thymus_NA_10X_1_1,,0
CCTCATGTCACTGAAC_TSP2_Vasculature_Aorta_10X_1_2,,0
AGTGTTGAGCACTAAA_TSP14_LI_Proximal_10X_1_1,,0


In [None]:
# Loop through each cell
for cell_idx in range(adata.n_obs):  # Iterate over each cell
    print(f"Processing cell {cell_idx}/{adata.n_obs}")

    adata_cell = adata[cell_idx, :].copy()  # Create a copy of the current cell's data
    fname = adata_cell.obs.index[0]

    output_cell_file = os.path.join(output_dir, fname + ".h5ad")
    if os.path.exists(output_cell_file):
        print(f"File already exists, skipping: {output_cell_file}")
        continue

    X = np.matlib.repmat(adata_cell.X.toarray(), 1 + 1637 * len(U), 1)
    obsTF = [None]
    obsU  = [0]
    
    # Loop through the transcription factors
    for tfi, tf in enumerate(humanTfdf['Ensembl ID']):
        # Skip if not marked as a TF or TF not found in adata variable names
        if humanTfdf['Is TF?'].iloc[tfi] != 'Yes':
            continue
        if tf not in list(adata.var['ensemblid']):
            continue

        # Get the transcription factor index and expression
        tfIdx = list(adata.var['ensemblid']).index(tf)
        tfExpression = adata_cell.X[:, tfIdx].copy()

        start_time = time.time()

        # Perturb the transcription factor expression for each value of U
        for u in U:
            # Apply perturbation
            if u < 0:
                perturbedExpression = tfExpression - (u * tfExpression)
            else:
                perturbedExpression = u * maximumGeneExpression[cell_idx]  # Apply max gene expression for this cell

            # Update the expression of the TF with the perturbed values
            X[len(obsTF), tfIdx] = perturbedExpression[0,0]
            obsTF.append(tf)
            obsU.append(u)

        # print(f"{tfi}/{humanTfdf['Ensembl ID'].shape[0]}")
        # print(f"Time for TF {tf}: {time.time() - start_time:.2f} seconds")
    
    # Concatenate all perturbed datasets for this cell at once
    obsDf = pd.DataFrame({'TF': obsTF, 'U': obsU})
    adataFull_cell = ad.AnnData(X=X, var=adata.var, obs=obsDf)
    adataFull_cell.write_h5ad(filename=output_cell_file)


Processing cell 0/1000




Processing cell 1/1000




Processing cell 2/1000




Processing cell 3/1000




Processing cell 4/1000




Processing cell 5/1000




Processing cell 6/1000




Processing cell 7/1000




Processing cell 8/1000




Processing cell 9/1000




Processing cell 10/1000




Processing cell 11/1000




Processing cell 12/1000




Processing cell 13/1000




Processing cell 14/1000




Processing cell 15/1000




Processing cell 16/1000




Processing cell 17/1000




Processing cell 18/1000




Processing cell 19/1000




Processing cell 20/1000




Processing cell 21/1000




Processing cell 22/1000




Processing cell 23/1000




Processing cell 24/1000




Processing cell 25/1000




Processing cell 26/1000




Processing cell 27/1000




Processing cell 28/1000




Processing cell 29/1000




Processing cell 30/1000




Processing cell 31/1000




Processing cell 32/1000




Processing cell 33/1000




Processing cell 34/1000




Processing cell 35/1000




Processing cell 36/1000




Processing cell 37/1000




Processing cell 38/1000




Processing cell 39/1000




Processing cell 40/1000




Processing cell 41/1000




Processing cell 42/1000




Processing cell 43/1000




Processing cell 44/1000




Processing cell 45/1000




Processing cell 46/1000




Processing cell 47/1000




Processing cell 48/1000




Processing cell 49/1000




Processing cell 50/1000




Processing cell 51/1000




Processing cell 52/1000




Processing cell 53/1000




Processing cell 54/1000




Processing cell 55/1000




Processing cell 56/1000




Processing cell 57/1000




Processing cell 58/1000




Processing cell 59/1000




Processing cell 60/1000




Processing cell 61/1000




Processing cell 62/1000




Processing cell 63/1000




Processing cell 64/1000




Processing cell 65/1000




Processing cell 66/1000




Processing cell 67/1000




Processing cell 68/1000




Processing cell 69/1000
File already exists, skipping: /scratch/indikar_root/indikar0/jpic/pb1k/data/CCGGTAGTCTCCCAAC_TSP4_Mammary_NA_10X_1_2.h5ad
Processing cell 70/1000




Processing cell 71/1000




Processing cell 72/1000




Processing cell 73/1000




Processing cell 74/1000




Processing cell 75/1000




Processing cell 76/1000




Processing cell 77/1000




Processing cell 78/1000




Processing cell 79/1000




Processing cell 80/1000




Processing cell 81/1000




Processing cell 82/1000




Processing cell 83/1000




Processing cell 84/1000




Processing cell 85/1000




Processing cell 86/1000




Processing cell 87/1000




Processing cell 88/1000




Processing cell 89/1000




Processing cell 90/1000




Processing cell 91/1000




Processing cell 92/1000




Processing cell 93/1000




Processing cell 94/1000




Processing cell 95/1000




Processing cell 96/1000




Processing cell 97/1000




Processing cell 98/1000


### v1

In [None]:
import pickle
import numpy as np
import time

# Define the perturbation values
U = [-1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75, 1, 1.25, 1.5]

# Calculate the maximum gene expression for each cell
maximumGeneExpression = np.max(adata.X, axis=1).toarray()[:, np.newaxis]

# Define the output pickle file path
output_pickle_file = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/all-tfs/jpic-Sep-27-2024.pkl"

# Load the AnnData object from the pickled file
with open(output_pickle_file, 'rb') as f:
    adataFull = pickle.load(f)

# # Initialize adataFull with a copy of adata to handle large datasets
# adataFull = adata.copy()

ctr = 0

# Loop through the transcription factors in the dataframe
for tfi, tf in enumerate(humanTfdf['Ensembl ID']):
    # Skip if not marked as a TF or TF not found in adata variable names
    if humanTfdf['Is TF?'].iloc[tfi] != 'Yes':
        continue
    if tf in list(adataFull.obs['TF']):
        print("TF= " + str(tf) + " already complete. Skip")
        continue
    if tf not in list(adata.var['ensemblid']):
        print('2')
        continue
    print(tf)

    # Get the transcription factor index and expression
    tfIdx = list(adata.var['ensemblid']).index(tf)
    tfExpression = adata.X[:, tfIdx].copy()

    start_time = time.time()
    
    # Perturb the transcription factor expression for each value of U
    for u in U:
        # Apply perturbation
        if u < 0:
            perturbedExpression = tfExpression - (u * tfExpression)
        else:
            perturbedExpression = u * maximumGeneExpression
        
        # Update the expression of the TF with the perturbed values
        adataPerturbed = adata.copy()
        adataPerturbed.X[:, tfIdx] = perturbedExpression
        adataPerturbed.obs['TF']   = tf
        adataPerturbed.obs['u']    = u
        
        # Concatenate the perturbed AnnData object incrementally
        adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
    
    print("time=" + str(time.time() - start_time))
    print(f"{ctr=}")
    ctr += 1

    # Save the full dataset as a pickled file every few iterations
    if tfi % 10 == 0:
        with open(output_pickle_file, 'wb') as f:
            pickle.dump(adataFull, f)

# Save the final version of the dataset as a pickled file
with open(output_pickle_file, 'wb') as f:
    pickle.dump(adataFull, f)

print("Dataset successfully saved as a pickled file.")


TF= ENSG00000137203 already complete. Skip
TF= ENSG00000008196 already complete. Skip
TF= ENSG00000087510 already complete. Skip
TF= ENSG00000008197 already complete. Skip
TF= ENSG00000116819 already complete. Skip
TF= ENSG00000116017 already complete. Skip
TF= ENSG00000179361 already complete. Skip
TF= ENSG00000205143 already complete. Skip
TF= ENSG00000196843 already complete. Skip
TF= ENSG00000150347 already complete. Skip
TF= ENSG00000117139 already complete. Skip
TF= ENSG00000189079 already complete. Skip
TF= ENSG00000153207 already complete. Skip
TF= ENSG00000126705 already complete. Skip
TF= ENSG00000106948 already complete. Skip
TF= ENSG00000116539 already complete. Skip
TF= ENSG00000173894 already complete. Skip
TF= ENSG00000101457 already complete. Skip
TF= ENSG00000104885 already complete. Skip
TF= ENSG00000140632 already complete. Skip
TF= ENSG00000137309 already complete. Skip
TF= ENSG00000149948 already complete. Skip
TF= ENSG00000025293 already complete. Skip
TF= ENSG000

  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adat

time=26.941601514816284
ctr=0
ENSG00000029153


  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adat

time=28.16948962211609
ctr=1
ENSG00000139352


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  u

time=27.69321370124817
ctr=2
ENSG00000183734


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  u

time=29.66997790336609
ctr=3
ENSG00000176009


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  u

time=30.59893035888672
ctr=4
ENSG00000187855


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  u

time=32.21024537086487
ctr=5
ENSG00000232237


  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  adataFull = adataFull.concatenate(adataPerturbed) #, batch_key='perturbation', batch_categories=[f'{tf}_{u}']) #, index_unique=None)
  utils.warn_names_duplicates("obs")


In [33]:
perturbedExpression.shape

(100, 1)

In [32]:
adataFull.X[:, tfIdx].shape

(200, 1)

In [31]:
tf

'ENSG00000137203'

In [30]:
adataFull.var['ensemblid'].iloc[tfIdx]

'ENSG00000137203'

In [29]:
tfIdx

18023

In [15]:
# Define the perturbation values
U = [-1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75, 1, 1.25, 1.5]

# Calculate the maximum gene expression for each cell
maximumGeneExpression = np.max(adata.X, axis=1).toarray()[:, np.newaxis]

# Create a backed AnnData object to handle large datasets without loading everything into memory
output_file = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/all-tfs/jpic-Sep-27-2024.h5ad"

# Initialize a file for writing in backed mode
adataFull = adata.copy() # sp.AnnData(adata.X, obs=adata.obs, var=adata.var)
adataFull.write(output_file)  # Write initial data to file

# Open in backed mode for writing data incrementally
adataFull = sp.read_h5ad(output_file, backed='r+')

ctr = 0

# Loop through the transcription factors in the dataframe
for tfi, tf in enumerate(humanTfdf['Ensembl ID']):
    # Skip if not marked as a TF or TF not found in adata variable names
    if humanTfdf['Is TF?'].iloc[tfi] != 'Yes':
        continue
    if tf not in adata.var['ensemblid']:
        continue

    # Get the transcription factor index and expression
    tfIdx = list(adata.var['ensemblid']).index(tf)
    tfExpression = adata.X[:, tfIdx].copy()

    start_time = time.time()
    
    # Perturb the transcription factor expression for each value of U
    for u in U:
        # Apply perturbation
        if u < 0:
            perturbedExpression = tfExpression - (u * tfExpression)
        else:
            perturbedExpression = u * maximumGeneExpression
        
        # Update the expression of the TF with the perturbed values
        adataFull.X[:, tfIdx] = perturbedExpression
        adataFull.obs['TF']   = tf
        adataFull.obs['u']    = u
        
        # Concatenate the perturbed AnnData object incrementally
        adataFull = adataFull.concatenate(adata, batch_key='perturbation', batch_categories=[f'{tf}_{u}'], index_unique=None)
    
    print("time=" + str(time.time() - start_time))
    print(f"{ctr=}")
    ctr += 1

    # Save the full dataset every few iterations to avoid losing progress and excessive memory usage
    if tfi % 10 == 0:
        adataFull.write()

# Save the final version of the dataset
adataFull.write()


TypeError: Can't implicitly convert non-string objects to strings

In [9]:
output_file

'/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/all-tfs/jpic-Sep-27-2024.h5ad'

In [13]:
adataFull.obs

Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender,TF,u
CTACTATAGAGGGTAA_TSP14_Vasculature_AortaVeneCava_10X_1_1,Vasculature,10X,TSP14,AortaVeneCava,38630.0,6487,fibroblast,fibroblast,True,stromal,male,,0
TCACTCGGTTTCACTT_TSP14_Bladder_NA_10X_1_1,Bladder,10X,TSP14,,16419.0,3802,fibroblast,fibroblast,True,stromal,male,,0
GCATGCGGTCCAGTGC_TSP14_Thymus_NA_10X_2_1_5Prime,Thymus,10X,TSP14,,5419.0,2446,fibroblast,fibroblast,True,stromal,male,,0
TGTTCCGTCGGTAGGA_TSP2_Bladder_NA_10X_1_1,Bladder,10X,TSP2,,7584.0,2676,fibroblast,fibroblast,True,stromal,female,,0
TTCGCTGGTAACGCGA_TSP2_Thymus_NA_10X_1_2,Thymus,10X,TSP2,,10388.0,3219,fibroblast,fibroblast,True,stromal,female,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCAGCCACATAACAGA_TSP2_Vasculature_Aorta_10X_1_1,Vasculature,10X,TSP2,Aorta,18481.0,4184,fibroblast,fibroblast,True,stromal,female,,0
AAATGGAAGGCCACCT_TSP14_LI_Proximal_10X_1_1,Large_Intestine,10X,TSP14,Proximal,2594.0,1087,fibroblast,fibroblast,True,stromal,male,,0
AGGTAGGAGTAATACG_TSP8_Prostate_NA_10X_1_2,Prostate,10X,TSP8,,6892.0,2465,fibroblast,Fibroblast,True,stromal,male,,0
AATGCCAGTTGGGATG_TSP14_Fat_MAT_10X_1_1,Fat,10X,TSP14,MAT,47408.0,6242,fibroblast,Fibroblasts,True,stromal,male,,0


In [14]:
sp.write_h5ad

TypeError: Can't implicitly convert non-string objects to strings

In [12]:
adataFull.write_h5ad(str(output_file))

TypeError: Can't implicitly convert non-string objects to strings

## Verification

In [4]:
import h5py

In [5]:
with h5py.File("/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/all-tfs/jpic-Sep-26-2024.h5ad", 'r') as f:
    print(f.keys())  # List file contents


<KeysViewHDF5 ['X', 'obs', 'raw']>


In [7]:
import h5py
import pandas as pd

# Open the .h5ad file
with h5py.File("/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/all-tfs/jpic-Sep-26-2024.h5ad", 'r') as f:
    # Access the 'obs' dataset
    obs_data = f['obs']

    # Convert the 'obs' data to a pandas DataFrame
    obs_df = pd.DataFrame(obs_data[:])  # Load data into a DataFrame
    # If the 'obs' is stored with string dtype keys or values, use appropriate decoding
    obs_df.columns = [key.decode() if isinstance(key, bytes) else key for key in obs_data.attrs['columns']]
    
# Now `obs_df` holds the 'obs' data
print(obs_df)


KeyError: 'Unable to synchronously open object (address of object past end of allocation)'

In [9]:
f['X']

KeyError: 'Unable to synchronously open object (invalid identifier type to function)'

In [6]:
adata = sp.read_h5ad("/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/all-tfs/jpic-Sep-26-2024.h5ad", backed='r')

KeyError: 'Unable to synchronously open object (address of object past end of allocation)'

# Embeddings