# Reprogramming Recepies

Auth: [Joshua Pickard](jpic@umich.edu)

Date: August 26, 2024

# Day 8

Generatting the perturbation files

## Set up

In [1]:
import numpy as np
import anndata as ad
import pandas as pd
import scanpy as sp
import os

def iterate_perturb_counts(adata, tf_list, scalar_list):
    """
    Applies perturbations to the expression data of specified transcription factors across multiple scalars 
    and stores the resulting AnnData objects in a dictionary.

    This function performs the following steps:
    1. Iterates over a list of scalar values.
    2. For each scalar, creates a copy of the AnnData object to preserve the original data.
    3. Applies the `perturb_counts` function to scale the expression data of genes listed in `tf_list` by
       the maximum gene expression of each cell and the current scalar.
    4. Stores the perturbed AnnData object in a dictionary with the scalar as the key.

    Parameters:
    tf_list (list): A list of gene symbols (transcription factors) to be perturbed.
    scalar_list (list): A list of scalar values for scaling the gene expression.
    adata (AnnData): The AnnData object containing gene expression data (cells x genes).

    Returns:
    dict: A dictionary where keys are scalar values and values are the corresponding perturbed AnnData objects.
    """
    
    adata_dict = {}
    
    for scalar in scalar_list:
        # Create a copy of the AnnData object for each scalar value
        adata_temp = adata.copy()
        
        # Apply perturb_counts to the copied AnnData object
        perturbed_adata = perturb_counts(adata_temp, tf_list, scalar)
        
        # Store the perturbed AnnData object in the dictionary with scalar as the key
        adata_dict[scalar] = perturbed_adata
    
    return adata_dict

def perturb_counts(adata, tf_list, scalar): 
    """
    Applies a perturbation to the expression data of specific genes in an AnnData object.

    This function performs the following steps:
    1. Computes the maximum gene expression level for each cell.
    2. Applies a scaling operation to the expression levels of genes listed in `tf_list`.
       - Each entry of these genes in the matrix is multiplied by the maximum expression level 
         of its respective cell and a specified scalar value.
    3. Updates the AnnData object with new columns:
       - 'scaled': A boolean column indicating whether each gene is in the `tf_list`.
       - 'scaled_by': Contains the scaling factor used for each gene (the product of the maximum 
         expression level of each cell and the scalar), or `1` if the gene was not in `tf_list`.
    
    Parameters:
    tf_list (list): A list of gene symbols to be perturbed.
    scalar (float): The scalar value used to scale the expression levels.
    adata (AnnData): The AnnData object containing gene expression data.

    Returns:
    AnnData: The updated AnnData object with applied perturbations and new columns.
    """

    # Create a boolean mask for genes in tf_list
    gene_mask = adata.var['gene_symbol'].isin(tf_list)
    
    # Save the original state of the parameter objects, in case some tfs do not translate (failsafe)
    original_X = adata.X.copy()
    original_gene_mask = gene_mask.copy()
    
    # Compute maximum expression level of each cell
    max_exp = np.max(adata.X, axis=1)

    """This is new today. v """
    # Raise an error if any of the gene names in tf_list do not match column names (we will manually update these in adata):
    missing_genes = [gene for gene in tf_list if gene not in adata.var['gene_symbol'].values]
    
    if missing_genes:
        # Restore original parameter objects
        adata.X = original_X
        gene_mask = original_gene_mask
        raise ValueError(f"Genes {missing_genes} not found in anndata object")

    else:    
        
        # Apply the scaling operation to the specified genes
        adata.X[:, gene_mask] = max_exp * scalar
        
        # Add/Update 'scaled' column in var
        adata.var['scaled'] = gene_mask
        
        # Add/Update 'scaled_by' column in var
        adata.var['scaled_by'] = scalar  # Default value for genes not in tf_list

        # Add/Update 'scaled_by' column in var
        adata.obs['U'] = scalar  # Default value for genes not in tf_list
    # adata.var = 
    return adata

def validateTFs(TFs, adata):
    adata_gene_list = adata.var['gene_symbol'].values.tolist()
    for TF in TFs:
        if TF not in adata_gene_list:
            return False
    return True

## Load Data

In [2]:
# Load reprogramming recipes
df = pd.read_csv('data/recipe_table_9_6_2024.csv')
df.head()

# Load firboblast source cells
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/unperturbed"
FILE = "fibroblast.h5ad"
adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))
adata_gene_list = adata.var['gene_symbol'].values.tolist()

## Make Files

In [3]:
output_directory = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed"
scalars = [0.5, 0.75, 1.001]
for i in range(len(df['TFs'])):
    val = df['TFs'].iloc[i]
    TFs = val.split()
    if validateTFs(TFs, adata):

        # Join the TFs list into a string for the filename
        TFs_str = "_".join(TFs)
        
        # Generate the file path for saving
        file_name = f"{TFs_str}.h5ad"
        output_path = os.path.join(output_directory, file_name)

        # Check if the file already exists
        if os.path.exists(output_path):
            print(output_path + " already exist: continue!")
            continue
        
        print(TFs)
        adataDict = iterate_perturb_counts(adata.copy(), TFs, scalars)

        # Concatenate all AnnData objects along the observations axis
        concatenated_adata = ad.concat(list(adataDict.values()), axis=0)

        # This is the line was added today to fix the bug
        concatenated_adata.var = adata.var.copy()

        # Save reprogramming metadata into the concatenated_adata.obs table
        concatenated_adata.obs['Source_cells'] = df['Source cells'].iloc[i]
        concatenated_adata.obs['Target_cells'] = df['Target cells'].iloc[i]
        concatenated_adata.obs['Treatment'] = df['Treatment'].iloc[i]
        concatenated_adata.obs['Species'] = df['Species'].iloc[i]
        concatenated_adata.obs['Cell_Transplantation'] = df['Cell Transplantation'].iloc[i]
        concatenated_adata.obs['Published_Year'] = df['Published Year'].iloc[i]
        concatenated_adata.obs['PMID'] = df['PMID'].iloc[i]
        
        # Save the concatenated AnnData object to the file
        concatenated_adata.write_h5ad(output_path)
        print("    file created")

print('All recipes complete!')

/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/ASCL1_POU3F2_MYT1L_LMX1A_FOXA2.h5ad already exist: continue!
/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/SOX2.h5ad already exist: continue!
/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/NEUROG2_SOX11_ISL1_LHX3.h5ad already exist: continue!
/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/SOX2_HMGA2.h5ad already exist: continue!
/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/ASCL1_PAX6.h5ad already exist: continue!
/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/SOX2_PAX6.h5ad already exist: continue!
/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/PTF1A.h5ad already exist: continue!
/nfs/turbo/umms-indikar/shared/projects/DARPA_AI

  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


    file created
['MYOCD', 'GATA6', 'MEF2C']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


    file created
['SNAI2', 'EYA1', 'SIX1']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


    file created


AttributeError: 'float' object has no attribute 'split'

In [5]:
df['TFs']

0                                     ASCL1 LXMLA NR4A2
1                           POU3F2 ASCL1 MYT1L NEUROD11
2                        ASCL1 POU3F2 MYT1L LMX1A FOXA2
3                                                  SOX2
4                    ASCL1 NEUROG2 SOX2 NR4A2 PITX3 P53
5                               NEUROG2 SOX11 ISL1 LHX3
6                                            SOX2 HMGA2
7                                            ASCL1 PAX6
8                           POU3F2 ASCL1 MYT1L NEUROD11
9     ASCL1 ISL1 NEUROD11 POU3F2 MNX1 LHX3 MYT1L NEU...
10                                  SOX2 GATA3 NEUROD11
11                                            SOX2 PAX6
12                                                PTF1A
13        SOX2; SOX2 PAX6; SOX2 LMX1A; SOX2 LMX1A FOXA2
14                                   SOX2 ASCL1 NEUROG2
15                                           ASCL1 SOX2
16                           FOXM1 SOX2 MYC SALL4 STAT6
17            AR SOX2 SMAD3 MYC JUN WT1 TAL1 SPI

## Debug

In [4]:
adata

AnnData object with n_obs × n_vars = 38151 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'

In [5]:
adata.var

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848


In [6]:
padata = perturb_counts(adata, ['DDX11L1'], 1)


  self._set_arrayXarray_sparse(i, j, x)


In [12]:
concatenated_adata = ad.concat([adata, padata], axis=0)
concatenated_adata.var = adata.var.copy()
concatenated_adata.var

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std,scaled,scaled_by
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574,True,1
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731,False,1
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634,False,1
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041,False,1
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000,False,1
...,...,...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395,False,1
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820,False,1
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192,False,1
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848,False,1


In [10]:
adata.var

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std,scaled,scaled_by
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574,True,1
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731,False,1
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634,False,1
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041,False,1
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000,False,1
...,...,...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395,False,1
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820,False,1
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192,False,1
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848,False,1


In [9]:
concatenated_adata.var

DDX11L1
WASH7P
MIR6859-1
MIR1302-2HG
MIR1302-2
...
MT-ND6
MT-TE
MT-CYB
MT-TT
MT-TP


In [7]:
padata.var

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std,scaled,scaled_by
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574,True,1
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731,False,1
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634,False,1
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041,False,1
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000,False,1
...,...,...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395,False,1
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820,False,1
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192,False,1
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848,False,1


# Day 7

Nat says the Reproramming recipes should work

## Set up

In [1]:
import numpy as np
import anndata as ad
import pandas as pd
import scanpy as sp
import os

In [3]:
def iterate_perturb_counts(adata, tf_list, scalar_list):
    """
    Applies perturbations to the expression data of specified transcription factors across multiple scalars 
    and stores the resulting AnnData objects in a dictionary.

    This function performs the following steps:
    1. Iterates over a list of scalar values.
    2. For each scalar, creates a copy of the AnnData object to preserve the original data.
    3. Applies the `perturb_counts` function to scale the expression data of genes listed in `tf_list` by
       the maximum gene expression of each cell and the current scalar.
    4. Stores the perturbed AnnData object in a dictionary with the scalar as the key.

    Parameters:
    tf_list (list): A list of gene symbols (transcription factors) to be perturbed.
    scalar_list (list): A list of scalar values for scaling the gene expression.
    adata (AnnData): The AnnData object containing gene expression data (cells x genes).

    Returns:
    dict: A dictionary where keys are scalar values and values are the corresponding perturbed AnnData objects.
    """
    
    adata_dict = {}
    
    for scalar in scalar_list:
        # Create a copy of the AnnData object for each scalar value
        adata_temp = adata.copy()
        
        # Apply perturb_counts to the copied AnnData object
        perturbed_adata = perturb_counts(adata_temp, tf_list, scalar)
        
        # Store the perturbed AnnData object in the dictionary with scalar as the key
        adata_dict[scalar] = perturbed_adata
    
    return adata_dict

def perturb_counts(adata, tf_list, scalar): 
    """
    Applies a perturbation to the expression data of specific genes in an AnnData object.

    This function performs the following steps:
    1. Computes the maximum gene expression level for each cell.
    2. Applies a scaling operation to the expression levels of genes listed in `tf_list`.
       - Each entry of these genes in the matrix is multiplied by the maximum expression level 
         of its respective cell and a specified scalar value.
    3. Updates the AnnData object with new columns:
       - 'scaled': A boolean column indicating whether each gene is in the `tf_list`.
       - 'scaled_by': Contains the scaling factor used for each gene (the product of the maximum 
         expression level of each cell and the scalar), or `1` if the gene was not in `tf_list`.
    
    Parameters:
    tf_list (list): A list of gene symbols to be perturbed.
    scalar (float): The scalar value used to scale the expression levels.
    adata (AnnData): The AnnData object containing gene expression data.

    Returns:
    AnnData: The updated AnnData object with applied perturbations and new columns.
    """

    # Create a boolean mask for genes in tf_list
    gene_mask = adata.var['gene_symbol'].isin(tf_list)
    
    # Save the original state of the parameter objects, in case some tfs do not translate (failsafe)
    original_X = adata.X.copy()
    original_gene_mask = gene_mask.copy()
    
    # Compute maximum expression level of each cell
    max_exp = np.max(adata.X, axis=1)

    """This is new today. v """
    # Raise an error if any of the gene names in tf_list do not match column names (we will manually update these in adata):
    missing_genes = [gene for gene in tf_list if gene not in adata.var['gene_symbol'].values]
    
    if missing_genes:
        # Restore original parameter objects
        adata.X = original_X
        gene_mask = original_gene_mask
        raise ValueError(f"Genes {missing_genes} not found in anndata object")

    else:    
        
        # Apply the scaling operation to the specified genes
        adata.X[:, gene_mask] = max_exp * scalar
        
        # Add/Update 'scaled' column in var
        adata.var['scaled'] = gene_mask
        
        # Add/Update 'scaled_by' column in var
        adata.var['scaled_by'] = scalar  # Default value for genes not in tf_list

        # Add/Update 'scaled_by' column in var
        adata.obs['U'] = scalar  # Default value for genes not in tf_list
    # adata.var = 
    return adata

def validateTFs(TFs, adata):
    adata_gene_list = adata.var['gene_symbol'].values.tolist()
    for TF in TFs:
        if TF not in adata_gene_list:
            return False
    return True

## Validate Recipes

In [4]:
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/jpic/"
FILE = "fibroblast.h5ad"
adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))
adata_gene_list = adata.var['gene_symbol'].values.tolist()

In [5]:
df = pd.read_csv('data/recipe_table_9_6_2024.csv')
df.head()

Unnamed: 0,Source cells,Source,Target cells,Target,Treatment,TFs,Species,Cell Transplantation,Published Year,PMID,Unnamed: 10,Notes
0,Embryonic Fibroblasts and Adult Skin Fibroblasts,,Dopamine Neurons,,"ASCL1, LXMLA, and NURR1",ASCL1 LXMLA NR4A2,Both,Yes,2011.0,2172534_27,,
1,Fetal Fibroblasts and Postnatal Foreskin Fibro...,,Neuronal Cells,,"BRN2, ASCL1, MYT1L, and NEUROD1",POU3F2 ASCL1 MYT1L NEUROD11,Human,,2011.0,21617644_28,,
2,Embryonic Fibroblasts and Postnatal Fibroblasts,,Neuronal Cells,,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",ASCL1 POU3F2 MYT1L LMX1A FOXA2,Human,,2011.0,21646515_29,,
3,Embryonic Fibroblasts and Fetal Foreskin Fibro...,,Neural Stem Cells,,SOX2,SOX2,Both,Yes,2012.0,22683203_31,,
4,Fibroblasts (IMR90 Cells),,Dopaminergic Neurons,,"MASH1, NGN2, SOX2, NURR1, and PITX3 + A Domina...",ASCL1 NEUROG2 SOX2 NR4A2 PITX3 P53,Human,Yes,2014.0,25129808_214,,


In [6]:
true_count = 0

for i in range(df.shape[0]):
    try:
        TFs = df['TFs'].iloc[i].split()
        result = validateTFs(TFs, adata)
        # print(result)
        if result == True:
            true_count += 1
    except:
        continue

print(f'Number of times True was printed: {true_count}')
print(f'Number of times True rate: {true_count/df.shape[0]}')


Number of times True was printed: 34
Number of times True rate: 0.6296296296296297


## Make the perturbation files

In [7]:
adata.var

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848


In [8]:
output_directory = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed"
scalars = [0.5, 0.75, 1.001]
for i in range(len(df['TFs'])):
    val = df['TFs'].iloc[i]
    TFs = val.split()
    if validateTFs(TFs, adata):
        print(TFs)
        adataDict = iterate_perturb_counts(adata.copy(), TFs, scalars)

        # Concatenate all AnnData objects along the observations axis
        concatenated_adata = ad.concat(list(adataDict.values()), axis=0)

        # Save reprogramming metadata into the concatenated_adata.obs table
        concatenated_adata.obs['Source_cells'] = df['Source cells'].iloc[i]
        concatenated_adata.obs['Target_cells'] = df['Target cells'].iloc[i]
        concatenated_adata.obs['Treatment'] = df['Treatment'].iloc[i]
        concatenated_adata.obs['Species'] = df['Species'].iloc[i]
        concatenated_adata.obs['Cell_Transplantation'] = df['Cell Transplantation'].iloc[i]
        concatenated_adata.obs['Published_Year'] = df['Published Year'].iloc[i]
        concatenated_adata.obs['PMID'] = df['PMID'].iloc[i]
        
        # Join the TFs list into a string for the filename
        TFs_str = "_".join(TFs)
        
        # Generate the file path for saving
        file_name = f"{TFs_str}.h5ad"
        output_path = os.path.join(output_directory, file_name)
        
        # Save the concatenated AnnData object to the file
        concatenated_adata.write_h5ad(output_path)
        break

['ASCL1', 'POU3F2', 'MYT1L', 'LMX1A', 'FOXA2']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


OSError: [Errno 5] Unable to extend file properly, errno = 9, error message = 'bad file descriptor' (file write failed: time = Fri Sep  6 16:27:05 2024
, filename = '/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/ASCL1_POU3F2_MYT1L_LMX1A_FOXA2.h5ad', file descriptor = 54, errno = 5, error message = 'Input/output error', buf = 0x1063e960, total write size = 272, bytes this sub-write = 272, bytes actually written = 18446744073709551615, offset = 0)

In [23]:
concatenated_adata.obs

Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender,U,Source_cells,Target_cells,Treatment,Species,Cell_Transplantation,Published_Year,PMID
AACAAAGTCCGGCTTT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,10423.0,2986,fibroblast,Stellate/Fibroblast,True,stromal,male,0.500,Embryonic Fibroblasts and Postnatal Fibroblasts,Neuronal Cells,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",Human,,2011.0,21646515_29
AATGACCAGTCTTCGA_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,11223.0,3248,fibroblast,Stellate/Fibroblast,True,stromal,male,0.500,Embryonic Fibroblasts and Postnatal Fibroblasts,Neuronal Cells,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",Human,,2011.0,21646515_29
ACCACAATCCGAGAAG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,14695.0,4469,fibroblast,Stellate/Fibroblast,True,stromal,male,0.500,Embryonic Fibroblasts and Postnatal Fibroblasts,Neuronal Cells,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",Human,,2011.0,21646515_29
ACTTCGCCAACTTCTT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,8977.0,2875,fibroblast,Stellate/Fibroblast,True,stromal,male,0.500,Embryonic Fibroblasts and Postnatal Fibroblasts,Neuronal Cells,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",Human,,2011.0,21646515_29
AGAAGTATCAGGGATG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,12109.0,3239,fibroblast,Stellate/Fibroblast,True,stromal,male,0.500,Embryonic Fibroblasts and Postnatal Fibroblasts,Neuronal Cells,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",Human,,2011.0,21646515_29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TSP2_Vasculature_aorta_SS2_B114585_B133324_Stromal_P3_S99,Vasculature,smartseq2,TSP2,,308152.0,2090,fibroblast,fibroblast,True,stromal,female,1.001,Embryonic Fibroblasts and Postnatal Fibroblasts,Neuronal Cells,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",Human,,2011.0,21646515_29
TSP2_Vasculature_aorta_SS2_B114585_B133324_Stromal_P7_S103,Vasculature,smartseq2,TSP2,,387043.0,2132,fibroblast,fibroblast,True,stromal,female,1.001,Embryonic Fibroblasts and Postnatal Fibroblasts,Neuronal Cells,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",Human,,2011.0,21646515_29
TSP2_Vasculature_aorta_SS2_B114585_B133324_Stromal_P9_S105,Vasculature,smartseq2,TSP2,,412608.0,3140,fibroblast,fibroblast,True,stromal,female,1.001,Embryonic Fibroblasts and Postnatal Fibroblasts,Neuronal Cells,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",Human,,2011.0,21646515_29
TSP2_Vasculature_aorta_SS2_B113343_B133091_Immune_L6_S270,Vasculature,smartseq2,TSP2,aorta,341884.0,4379,fibroblast,fibroblast,True,stromal,female,1.001,Embryonic Fibroblasts and Postnatal Fibroblasts,Neuronal Cells,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",Human,,2011.0,21646515_29


In [24]:
adata.var

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848


In [22]:
concatenated_adata.var

DDX11L1
WASH7P
MIR6859-1
MIR1302-2HG
MIR1302-2
...
MT-ND6
MT-TE
MT-CYB
MT-TT
MT-TP


# Day 6

Make perturbations for all TFs in the [Human Transcription Factors Database](https://humantfs.ccbr.utoronto.ca/download.php)

## Set up

In [1]:
import numpy as np
import anndata as ad
import pandas as pd
import scanpy as sp
import os

In [2]:
def iterate_perturb_counts(adata, tf_list, scalar_list):
    """
    Applies perturbations to the expression data of specified transcription factors across multiple scalars 
    and stores the resulting AnnData objects in a dictionary.

    This function performs the following steps:
    1. Iterates over a list of scalar values.
    2. For each scalar, creates a copy of the AnnData object to preserve the original data.
    3. Applies the `perturb_counts` function to scale the expression data of genes listed in `tf_list` by
       the maximum gene expression of each cell and the current scalar.
    4. Stores the perturbed AnnData object in a dictionary with the scalar as the key.

    Parameters:
    tf_list (list): A list of gene symbols (transcription factors) to be perturbed.
    scalar_list (list): A list of scalar values for scaling the gene expression.
    adata (AnnData): The AnnData object containing gene expression data (cells x genes).

    Returns:
    dict: A dictionary where keys are scalar values and values are the corresponding perturbed AnnData objects.
    """
    
    adata_dict = {}
    
    for scalar in scalar_list:
        # Create a copy of the AnnData object for each scalar value
        adata_temp = adata.copy()
        
        # Apply perturb_counts to the copied AnnData object
        perturbed_adata = perturb_counts(adata_temp, tf_list, scalar)
        
        # Store the perturbed AnnData object in the dictionary with scalar as the key
        adata_dict[scalar] = perturbed_adata
    
    return adata_dict

def perturb_counts(adata, tf_list, scalar): 
    """
    Applies a perturbation to the expression data of specific genes in an AnnData object.

    This function performs the following steps:
    1. Computes the maximum gene expression level for each cell.
    2. Applies a scaling operation to the expression levels of genes listed in `tf_list`.
       - Each entry of these genes in the matrix is multiplied by the maximum expression level 
         of its respective cell and a specified scalar value.
    3. Updates the AnnData object with new columns:
       - 'scaled': A boolean column indicating whether each gene is in the `tf_list`.
       - 'scaled_by': Contains the scaling factor used for each gene (the product of the maximum 
         expression level of each cell and the scalar), or `1` if the gene was not in `tf_list`.
    
    Parameters:
    tf_list (list): A list of gene symbols to be perturbed.
    scalar (float): The scalar value used to scale the expression levels.
    adata (AnnData): The AnnData object containing gene expression data.

    Returns:
    AnnData: The updated AnnData object with applied perturbations and new columns.
    """

    # Create a boolean mask for genes in tf_list
    gene_mask = adata.var['gene_symbol'].isin(tf_list)
    
    # Save the original state of the parameter objects, in case some tfs do not translate (failsafe)
    original_X = adata.X.copy()
    original_gene_mask = gene_mask.copy()
    
    # Compute maximum expression level of each cell
    max_exp = np.max(adata.X, axis=1)

    """This is new today. v """
    # Raise an error if any of the gene names in tf_list do not match column names (we will manually update these in adata):
    missing_genes = [gene for gene in tf_list if gene not in adata.var['gene_symbol'].values]
    
    if missing_genes:
        # Restore original parameter objects
        adata.X = original_X
        gene_mask = original_gene_mask
        raise ValueError(f"Genes {missing_genes} not found in anndata object")

    else:    
        
        # Apply the scaling operation to the specified genes
        adata.X[:, gene_mask] = max_exp * scalar
        
        # Add/Update 'scaled' column in var
        adata.var['scaled'] = gene_mask
        
        # Add/Update 'scaled_by' column in var
        adata.var['scaled_by'] = scalar  # Default value for genes not in tf_list

        # Add/Update 'scaled_by' column in var
        adata.obs['U'] = scalar  # Default value for genes not in tf_list
    
    return adata

def validateTFs(TFs, adata):
    adata_gene_list = adata.var['gene_symbol'].values.tolist()
    for TF in TFs:
        if TF not in adata_gene_list:
            return False
    return True

## Load Data

In [3]:
df = pd.read_csv('data/HumanTFs_v_1.01.csv')

DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/jpic/"
FILE = "fibroblast.h5ad"
adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))
adata_gene_list = adata.var['gene_symbol'].values.tolist()


In [4]:
df = df[df['Is TF?'] == 'Yes']
all_tfs = list(df['HGNC symbol'].values)

## Make Perturbation Files

In [None]:
output_directory = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed"
scalars = [0.5, 0.75, 1.001]
for tf in all_tfs:
    TFs = [tf]
    if validateTFs(TFs, adata):
        # Join the TFs list into a string for the filename
        TFs_str = "_".join(TFs)
       
        # Generate the file path for saving
        file_name = f"{TFs_str}.h5ad"
        output_path = os.path.join(output_directory, file_name)

        # prevents writing duplicate files
        if os.path.exists(output_path):
            continue

        # display some information
        print(TFs)

        adataDict = iterate_perturb_counts(adata, TFs, scalars)

        # Concatenate all AnnData objects along the observations axis
        concatenated_adata = ad.concat(list(adataDict.values()), axis=0)

        # Save reprogramming metadata into the concatenated_adata.obs table
        concatenated_adata.obs['HumanTFs'] = True
       
        # Save the concatenated AnnData object to the file
        concatenated_adata.write_h5ad(output_path)


['ARID3C']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


['ARID5A']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


['ARID5B']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


['KDM5B']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


['ARID2']


In [10]:
concatenated_adata.obs

Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender
AACAAAGTCCGGCTTT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,10423.0,2986,fibroblast,Stellate/Fibroblast,True,stromal,male
AATGACCAGTCTTCGA_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,11223.0,3248,fibroblast,Stellate/Fibroblast,True,stromal,male
ACCACAATCCGAGAAG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,14695.0,4469,fibroblast,Stellate/Fibroblast,True,stromal,male
ACTTCGCCAACTTCTT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,8977.0,2875,fibroblast,Stellate/Fibroblast,True,stromal,male
AGAAGTATCAGGGATG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,12109.0,3239,fibroblast,Stellate/Fibroblast,True,stromal,male
...,...,...,...,...,...,...,...,...,...,...,...
TSP2_Vasculature_aorta_SS2_B114585_B133324_Stromal_P3_S99,Vasculature,smartseq2,TSP2,,308152.0,2090,fibroblast,fibroblast,True,stromal,female
TSP2_Vasculature_aorta_SS2_B114585_B133324_Stromal_P7_S103,Vasculature,smartseq2,TSP2,,387043.0,2132,fibroblast,fibroblast,True,stromal,female
TSP2_Vasculature_aorta_SS2_B114585_B133324_Stromal_P9_S105,Vasculature,smartseq2,TSP2,,412608.0,3140,fibroblast,fibroblast,True,stromal,female
TSP2_Vasculature_aorta_SS2_B113343_B133091_Immune_L6_S270,Vasculature,smartseq2,TSP2,aorta,341884.0,4379,fibroblast,fibroblast,True,stromal,female


# Day 5

Helping Nat make some changes

In [1]:
output_directory = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed"
scalars = [0.5, 0.75, 1.001]
for i in range(len(df['TFs'])):
    val = df['TFs'].iloc[i]
    val = val.replace(',',' ')
    val = val.replace(';',' ')
    val = val.replace(':',' ')
    TFs = val.split(' ')
    if not validateTFs(TFs, adata):
        print(TFs)

NameError: name 'df' is not defined

In [None]:
output_directory = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed"
scalars = [0.5, 0.75, 1.001]
for i in range(len(df['TFs'])):
    val = df['TFs'].iloc[i]
    val = val.replace(',',' ')
    val = val.replace(';',' ')
    val = val.replace(':',' ')
    TFs = val.split(' ')
    if validateTFs(TFs, adata):
        # Join the TFs list into a string for the filename
        TFs_str = "_".join(TFs)

        # Generate the file path for saving
        file_name = f"{TFs_str}.h5ad"
        output_path = os.path.join(output_directory, file_name)

        # prevents writing duplicate files
        if os.path.exists(output_filepath):
            continue

        # display some information
        print(TFs)

        adataDict = iterate_perturb_counts(adata, TFs, scalars)

        # Concatenate all AnnData objects along the observations axis
        concatenated_adata = ad.concat(list(adataDict.values()), axis=0)

        # Save reprogramming metadata into the concatenated_adata.obs table
        concatenated_adata.obs['Source_cells'] = df['Source cells'].iloc[i]
        concatenated_adata.obs['Target_cells'] = df['Target cells'].iloc[i]
        concatenated_adata.obs['Treatment'] = df['Treatment'].iloc[i]
        concatenated_adata.obs['Species'] = df['Species'].iloc[i]
        concatenated_adata.obs['Cell_Transplantation'] = df['Cell Transplantation'].iloc[i]
        concatenated_adata.obs['Published_Year'] = df['Published Year'].iloc[i]
        concatenated_adata.obs['PMID'] = df['PMID'].iloc[i]
        
        # Save the concatenated AnnData object to the file
        concatenated_adata.write_h5ad(output_path)


# Day 4

**Focus:** check out Nats code (debug a bit) and create a few perturbations
- changes made to NO's code:
    1. iterate_perturb_counts: changes the order of the arguments to `adata, tf_list, scalar_list`
    2. perturb_counts: changes the order of the arguments to `adata, tf_list, scalar_list`
    3. perturb_counts: there was an issue with the use of `[: np.newaxis]` with respect to `max_exp`, which is a `coo_matrix` (special type of sparse matrix). Code was modified to address an issue being thrown here.
- new function:
    1. validateTFs(TFs, adata): this checks if all the transcription factors are present in the adata
- pertrubation driver (`Perform Perturbations and Create new files`):
    1. loads Fibroblast data from Tabula Sapiens
    2. loads `.csv` file of known reprogrmaming protocols
    3. for each set of TFs that are validated by `validateTFs`:
        1. use `iterate_perturb_counts` to generate perturbations with scalars `[0.5, 0.75, 1]`
        2. concatenate the dataframes to make a single dataframe
        3. save metadata from reprogramming protocol (i.e. PMID, source/targets, etc.)
        4. save the new anndata as a `.h5ad` file

## Nat's Code with some modifications

In [5]:
import numpy as np
import anndata as ad
import pandas as pd
import scanpy as sp
import os

In [52]:
def iterate_perturb_counts(adata, tf_list, scalar_list):
    """
    Applies perturbations to the expression data of specified transcription factors across multiple scalars 
    and stores the resulting AnnData objects in a dictionary.

    This function performs the following steps:
    1. Iterates over a list of scalar values.
    2. For each scalar, creates a copy of the AnnData object to preserve the original data.
    3. Applies the `perturb_counts` function to scale the expression data of genes listed in `tf_list` by
       the maximum gene expression of each cell and the current scalar.
    4. Stores the perturbed AnnData object in a dictionary with the scalar as the key.

    Parameters:
    tf_list (list): A list of gene symbols (transcription factors) to be perturbed.
    scalar_list (list): A list of scalar values for scaling the gene expression.
    adata (AnnData): The AnnData object containing gene expression data (cells x genes).

    Returns:
    dict: A dictionary where keys are scalar values and values are the corresponding perturbed AnnData objects.
    """
    
    adata_dict = {}
    
    for scalar in scalar_list:
        # Create a copy of the AnnData object for each scalar value
        adata_temp = adata.copy()
        
        # Apply perturb_counts to the copied AnnData object
        perturbed_adata = perturb_counts(adata_temp, tf_list, scalar)
        
        # Store the perturbed AnnData object in the dictionary with scalar as the key
        adata_dict[scalar] = perturbed_adata
    
    return adata_dict

def perturb_counts(adata, tf_list, scalar): 
    """
    Applies a perturbation to the expression data of specific genes in an AnnData object.

    This function performs the following steps:
    1. Computes the maximum gene expression level for each cell.
    2. Applies a scaling operation to the expression levels of genes listed in `tf_list`.
       - Each entry of these genes in the matrix is multiplied by the maximum expression level 
         of its respective cell and a specified scalar value.
    3. Updates the AnnData object with new columns:
       - 'scaled': A boolean column indicating whether each gene is in the `tf_list`.
       - 'scaled_by': Contains the scaling factor used for each gene (the product of the maximum 
         expression level of each cell and the scalar), or `1` if the gene was not in `tf_list`.
    
    Parameters:
    tf_list (list): A list of gene symbols to be perturbed.
    scalar (float): The scalar value used to scale the expression levels.
    adata (AnnData): The AnnData object containing gene expression data.

    Returns:
    AnnData: The updated AnnData object with applied perturbations and new columns.
    """

    # Create a boolean mask for genes in tf_list
    gene_mask = adata.var['gene_symbol'].isin(tf_list)
    
    # Save the original state of the parameter objects, in case some tfs do not translate (failsafe)
    original_X = adata.X.copy()
    original_gene_mask = gene_mask.copy()
    
    # Compute maximum expression level of each cell
    max_exp = np.max(adata.X, axis=1)

    """This is new today. v """
    # Raise an error if any of the gene names in tf_list do not match column names (we will manually update these in adata):
    missing_genes = [gene for gene in tf_list if gene not in adata.var['gene_symbol'].values]
    
    if missing_genes:
        # Restore original parameter objects
        adata.X = original_X
        gene_mask = original_gene_mask
        raise ValueError(f"Genes {missing_genes} not found in anndata object")

    else:    
        
        # Apply the scaling operation to the specified genes
        adata.X[:, gene_mask] = max_exp * scalar
        
        # Add/Update 'scaled' column in var
        adata.var['scaled'] = gene_mask
        
        # Add/Update 'scaled_by' column in var
        adata.var['scaled_by'] = scalar  # Default value for genes not in tf_list
    
    return adata


## New code to validate lists of TFs

In [49]:
def validateTFs(TFs, adata):
    adata_gene_list = adata.var['gene_symbol'].values.tolist()
    for TF in TFs:
        if TF not in adata_gene_list:
            return False
    return True

## Load data and perturbations

In [6]:
df = pd.read_csv('data/first_5_recepies_8_29_2024.csv')

DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/jpic/"
FILE = "fibroblast.h5ad"
adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))
adata_gene_list = adata.var['gene_symbol'].values.tolist()


In [7]:
adata.var

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848


## Perform Perturbations and Create new files

In [79]:
len(df['TFs'])

50

In [None]:
output_directory = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed"
scalars = [0.5, 0.75, 1.001]
for i in range(len(df['TFs'])):
    val = df['TFs'].iloc[i]
    val = val.replace(',',' ')
    val = val.replace(';',' ')
    val = val.replace(':',' ')
    TFs = val.split(' ')
    if validateTFs(TFs, adata):
        print(TFs)
        adataDict = iterate_perturb_counts(adata, TFs, scalars)

        # Concatenate all AnnData objects along the observations axis
        concatenated_adata = ad.concat(list(adataDict.values()), axis=0)

        # Save reprogramming metadata into the concatenated_adata.obs table
        concatenated_adata.obs['Source_cells'] = df['Source cells'].iloc[i]
        concatenated_adata.obs['Target_cells'] = df['Target cells'].iloc[i]
        concatenated_adata.obs['Treatment'] = df['Treatment'].iloc[i]
        concatenated_adata.obs['Species'] = df['Species'].iloc[i]
        concatenated_adata.obs['Cell_Transplantation'] = df['Cell Transplantation'].iloc[i]
        concatenated_adata.obs['Published_Year'] = df['Published Year'].iloc[i]
        concatenated_adata.obs['PMID'] = df['PMID'].iloc[i]
        
        # Join the TFs list into a string for the filename
        TFs_str = "_".join(TFs)
        
        # Generate the file path for saving
        file_name = f"{TFs_str}.h5ad"
        output_path = os.path.join(output_directory, file_name)
        
        # Save the concatenated AnnData object to the file
        concatenated_adata.write_h5ad(output_path)


['SOX2']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


['SOX2', 'HMGA2']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


['ASCL1', 'PAX6']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


['SOX2', 'GATA3', 'NEUROD1']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


['SOX2', 'PAX6']


  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  self._set_arrayXarray_sparse(i, j, x)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


['SOX18']


## Scratch

In [4]:
adata

NameError: name 'adata' is not defined

In [2]:
file = "fibroblast.h5ad"
file[:-5]

'fibroblast'

In [77]:
concatenated_adata.obs

Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender,Source_cells
AACAAAGTCCGGCTTT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,10423.0,2986,fibroblast,Stellate/Fibroblast,True,stromal,male,Embryonic Fibroblasts and Adult Skin Fibroblasts
AATGACCAGTCTTCGA_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,11223.0,3248,fibroblast,Stellate/Fibroblast,True,stromal,male,Embryonic Fibroblasts and Adult Skin Fibroblasts
ACCACAATCCGAGAAG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,14695.0,4469,fibroblast,Stellate/Fibroblast,True,stromal,male,Embryonic Fibroblasts and Adult Skin Fibroblasts
ACTTCGCCAACTTCTT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,8977.0,2875,fibroblast,Stellate/Fibroblast,True,stromal,male,Embryonic Fibroblasts and Adult Skin Fibroblasts
AGAAGTATCAGGGATG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,12109.0,3239,fibroblast,Stellate/Fibroblast,True,stromal,male,Embryonic Fibroblasts and Adult Skin Fibroblasts
...,...,...,...,...,...,...,...,...,...,...,...,...
TSP2_Vasculature_aorta_SS2_B114585_B133324_Stromal_P3_S99,Vasculature,smartseq2,TSP2,,308152.0,2090,fibroblast,fibroblast,True,stromal,female,Embryonic Fibroblasts and Adult Skin Fibroblasts
TSP2_Vasculature_aorta_SS2_B114585_B133324_Stromal_P7_S103,Vasculature,smartseq2,TSP2,,387043.0,2132,fibroblast,fibroblast,True,stromal,female,Embryonic Fibroblasts and Adult Skin Fibroblasts
TSP2_Vasculature_aorta_SS2_B114585_B133324_Stromal_P9_S105,Vasculature,smartseq2,TSP2,,412608.0,3140,fibroblast,fibroblast,True,stromal,female,Embryonic Fibroblasts and Adult Skin Fibroblasts
TSP2_Vasculature_aorta_SS2_B113343_B133091_Immune_L6_S270,Vasculature,smartseq2,TSP2,aorta,341884.0,4379,fibroblast,fibroblast,True,stromal,female,Embryonic Fibroblasts and Adult Skin Fibroblasts


In [76]:
concatenated_adata.obs['Source_cells'] = df['Source cells'].iloc[0]

In [None]:
concatenated_adata.obs['Source_cells'] = df['Source cells'].iloc[i]
concatenated_adata.obs['Target_cells'] = df['Target cells'].iloc[i]
concatenated_adata.obs['Treatment'] = df['Treatment'].iloc[i]
concatenated_adata.obs['Species'] = df['Species'].iloc[i]
concatenated_adata.obs['Cell_Transplantation'] = df['Cell Transplantation'].iloc[i]
concatenated_adata.obs['Published_Year'] = df['Published Year'].iloc[i]
concatenated_adata.obs['PMID'] = df['PMID'].iloc[i]


In [71]:
TFs_str = "_".join(TFs)
TFs_str

'SOX2_HMGA2'

In [62]:
import anndata as ad
# Concatenate all AnnData objects along the observations axis
concatenated_adata = ad.concat(list(adataDict.values()), axis=0)

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


In [67]:
len(list(concatenated_adata.obs.index))

114453

In [68]:
len(list(set(list(concatenated_adata.obs.index))))

38151

In [48]:
adata.X[:, 0] = max_exp

  self._set_arrayXarray_sparse(i, j, x)


In [20]:
df = pd.read_csv('data/first_5_recepies_8_29_2024.csv')
words = []
for val in df['TFs']:
    val = val.replace(',',' ')
    val = val.replace(';',' ')
    val = val.replace(':',' ')
    words += val.split(' ')
words = list(set(words))

for word in words:
#     print(word in adata_gene_list)
    if word not in adata_gene_list:
        print(word)


+
Ptf1a
NEUROD
2)
c-MYC
(ETS
or
OCT4
OCT9
BRN2
L-MYC
NURR1
SV40
OSTERIX
P53
PU.1
Knockdown
Large
Variant
LXH3
Antigen
HNF6
Any
and
OCT3/4
ER71/ETV2
of
AP-2A
LEF-1
Three
NGN2
NF-κB
N-MYC
HB9
OCT6
LXMLA
PPARG2
MASH1
the
Two
Factors
T


# Day 3

In [1]:
import numpy as np
import pandas as pd
import scanpy as sp
import os

# Day 2

In [1]:
import numpy as np
import pandas as pd
import scanpy as sp
import os

## Build Embeddings of Target Data

### Neurons

In [None]:
import numpy as np
import pandas as pd
import scanpy as sp
import os

# location to data
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/public_data/cellXgene"
FILE = "neurons.h5ad"

# load data
adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))

# filter to only neuron data
neurons_adata = adata[adata.obs['cell_type'] == 'neuron']

# this data needs to be embedded with all the models


## Build Embedding Driver

In [None]:
def main(job_number, parameter_file):
    """
    Main function for performing in silico reprogramming experiments. It selects the correct model, source, target, 
    transcription factors (TFs), and other parameters based on the job number in a SLURM array job. Then, it processes 
    the data and saves the results.
    
    Args:
        job_number (int): The SLURM job array index to determine which parameters to use.
        parameter_file (str): Path to the CSV file containing model parameters.
    
    Returns:
        int: Return code (0 for success).
    """

    OUTPUTPATH = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot"
    
    # Load parameters from CSV
    df_embedding_parameters = pd.read_csv(parameter_file)
    TFs    = df_embedding_parameters['TFs'].values[job_number]
    model  = df_embedding_parameters['model'].values[job_number]
    source = df_embedding_parameters['source'].values[job_number]
    target = df_embedding_parameters['target'].values[job_number]
    PMID   = df_embedding_parameters['PMID'].values[job_number]
    year   = df_embedding_parameters['year'].values[job_number]
    organ  = df_embedding_parameters['organ'].values[job_number]

    print(f"Starting job {job_number} with model {model}, TFs {TFs}, source {source}, target {target}, PMID {PMID}, year {year}")

    # Load the source data (Fibroblasts)
    source_adata = load_fibroblasts()
    print("Source data loaded")

    # TODO: Load the target data
    targets_adata = load_targets()
    print("Target data loaded")

    # Perturb the source data
    perturbed_adata = perturbation_model(source_adata, TFs)
    print("Data perturbed")

    # Combine source, perturbed, and target data
    combined_adata = combine_adata([source_adata, perturbed_adata, targets_adata])

    # Add a new 'reprogrammed type' column to indicate the source, target, and reprogrammed cells
    combined_adata.obs['reprogrammed type'] = ['source'] * source_adata.shape[0] + \
                                              ['reprogrammed'] * perturbed_adata.shape[0] + \
                                              ['target'] * targets_adata.shape[0]
    print("Combined data and added 'reprogrammed type' column")

    # Add columns to the 'obs' field of combined_adata for metadata tracking
    combined_adata.obs['model'] = model
    combined_adata.obs['TFs'] = ','.join(TFs)
    combined_adata.obs['PMID'] = PMID
    combined_adata.obs['year'] = year
    combined_adata.obs['organ'] = organ
    print("Added metadata columns to combined data")

    # Generate embeddings using the specified model
    if model == 'geneformer':
        adata_embedded = embed_geneformer(combined_adata)
    elif model == 'tGPT':
        adata_embedded = embed_tGPT(combined_adata)
    elif model == 'scGTP':
        adata_embedded = embed_scGTP([combined_adata])
    else:
        raise ValueError(f"Unknown model: {model}")

    print(f"Embeddings generated using model: {model}")

    # Save the results to a file named according to the job number
    result_filepath = os.path.join(OUTPUTPATH, f"results_job_{job_number}.h5ad")
    adata_embedded.write(result_filepath)
    print(f"Results saved to {result_filepath}")

    return 0



# Day 1

In [31]:
import numpy as np
import pandas as pd
import scanpy as sp
import os

## Perturbation Model Discussion

E.V. = expression values

Possible algorithm:
```
1. find highest E.V.  for a single cell
2. find expression value of TFs being modified
3. have a value k for the number of different concentrations we want to test
4. choose k different amounts to increase the TFs from there measured E.V. to the 150% maximum E.V.
   - make an arbitray choice and code it up
```

**A reasonable person could write this 10s of different ways**

## Visualize Input Data

In [11]:
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"
FILE = "TS_epithelial.h5ad"

adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))

In [12]:
adata

AnnData object with n_obs × n_vars = 104148 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'

In [17]:
adata.var

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848


In [28]:
adata.X.max(axis=1) # what is the value of the highest expressed gene for each cell?

<104148x1 sparse matrix of type '<class 'numpy.float32'>'
	with 104148 stored elements in COOrdinate format>

In [34]:
TF = 'DDX11L1'
index = np.where(adata.var['gene_symbol'] == TF)[0]
index

array([0])

In [25]:
adata.var['gene_symbol']

DDX11L1            DDX11L1
WASH7P              WASH7P
MIR6859-1        MIR6859-1
MIR1302-2HG    MIR1302-2HG
MIR1302-2        MIR1302-2
                  ...     
MT-ND6              MT-ND6
MT-TE                MT-TE
MT-CYB              MT-CYB
MT-TT                MT-TT
MT-TP                MT-TP
Name: gene_symbol, Length: 58870, dtype: object

In [15]:
adata.obs

Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender
AACAGGGCATGTGCTA_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,14151.0,3404,hepatocyte,Hepatocyte,True,epithelial,male
AATGAAGTCTAAGGAA_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,10303.0,2863,hepatocyte,Hepatocyte,True,epithelial,male
ACAGAAAAGCTGTCCG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,9391.0,2738,hepatocyte,Hepatocyte,True,epithelial,male
ACGATGTTCGACTCCT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,4634.0,1634,hepatocyte,Hepatocyte,True,epithelial,male
ACGTTCCAGAACCCGA_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,33370.0,6590,intrahepatic cholangiocyte,BECS,True,epithelial,male
...,...,...,...,...,...,...,...,...,...,...,...
TGACAGTAGATGGCGT_TSP2_Vasculature_Aorta_10X_1_2,Vasculature,10X,TSP2,Aorta,10354.0,2456,epithelial cell,epithelial cell,True,epithelial,female
TGTTGGATCGACATCA_TSP2_Vasculature_Aorta_10X_1_2,Vasculature,10X,TSP2,Aorta,15677.0,3234,epithelial cell,epithelial cell,True,epithelial,female
TTGGATGGTGGCTACC_TSP2_Vasculature_Aorta_10X_1_2,Vasculature,10X,TSP2,Aorta,18638.0,3368,epithelial cell,epithelial cell,True,epithelial,female
CTTGATTTCTTGCAAG_TSP2_Vasculature_Aorta_10X_2_2,Vasculature,10X,TSP2,Aorta,36555.0,5102,epithelial cell,epithelial cell,True,epithelial,female


In [37]:
list(adata.obs['cell_ontology_class'].unique())

['hepatocyte',
 'intrahepatic cholangiocyte',
 'tracheal goblet cell',
 'ciliated cell',
 'ionocyte',
 'secretory cell',
 'basal cell',
 'mucus secreting cell',
 'serous cell of epithelium of trachea',
 'acinar cell of salivary gland',
 'duct epithelial cell',
 'myoepithelial cell',
 'epithelial cell',
 'keratinocyte',
 'luminal epithelial cell of mammary gland',
 'epithelial cell of uterus',
 'ciliated epithelial cell',
 'conjunctival epithelial cell',
 'eye photoreceptor cell',
 'limbal stem cell',
 'epithelial cell of lacrimal sac',
 'corneal keratocyte',
 'retinal pigment epithelial cell',
 'corneal epithelial cell',
 'ocular surface cell',
 'ciliary body',
 'pancreatic acinar cell',
 'pancreatic ductal cell',
 'pancreatic beta cell',
 'basal cell of prostate epithelium',
 'hillock-club cell of prostate epithelium',
 'luminal cell of prostate epithelium',
 'salivary gland cell',
 'medullary thymic epithelial cell',
 'bladder urothelial cell',
 'enterocyte of epithelium of large int

## Build driver

In [9]:
import pandas as pd

def load_fibroblasts():
    """
    Loads the single-cell RNA sequencing data for fibroblast cells.

    The function loads a specified .h5ad file containing cellular data, filters the data
    to only include fibroblasts, and returns the AnnData object.

    Returns:
        AnnData: The AnnData object containing data for fibroblast cells only.
    """

    # Path to the data
    DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/jpic/"
    FILE = "fibroblast.h5ad"
    
    # Load the data
    adata = sc.read_h5ad(os.path.join(DATAPATH, FILE))
    
    return fibroblasts

import os
import pandas as pd
import scanpy as sc

def main(job_number, parameter_file):
    """
    Main function for performing in silico reprogramming experiments. It selects the correct model, source, target, 
    transcription factors (TFs), and other parameters based on the job number in a SLURM array job. Then, it processes 
    the data and saves the results.
    
    Args:
        job_number (int): The SLURM job array index to determine which parameters to use.
        parameter_file (str): Path to the CSV file containing model parameters.
    
    Returns:
        int: Return code (0 for success).
    """

    OUTPUTPATH = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot"
    
    # Load parameters from CSV
    df_embedding_parameters = pd.read_csv(parameter_file)
    TFs    = df_embedding_parameters['TFs'].values[job_number]
    model  = df_embedding_parameters['model'].values[job_number]
    source = df_embedding_parameters['source'].values[job_number]
    target = df_embedding_parameters['target'].values[job_number]
    PMID   = df_embedding_parameters['PMID'].values[job_number]
    year   = df_embedding_parameters['year'].values[job_number]
    organ  = df_embedding_parameters['organ'].values[job_number]

    print(f"Starting job {job_number} with model {model}, TFs {TFs}, source {source}, target {target}, PMID {PMID}, year {year}")

    # Load the source data (Fibroblasts)
    source_adata = load_fibroblasts()
    print("Source data loaded")

    # TODO: Load the target data
    targets_adata = load_targets()
    print("Target data loaded")

    # Perturb the source data
    perturbed_adata = perturbation_model(source_adata, TFs)
    print("Data perturbed")

    # Combine source, perturbed, and target data
    combined_adata = combine_adata([source_adata, perturbed_adata, targets_adata])

    # Add a new 'reprogrammed type' column to indicate the source, target, and reprogrammed cells
    combined_adata.obs['reprogrammed type'] = ['source'] * source_adata.shape[0] + \
                                              ['reprogrammed'] * perturbed_adata.shape[0] + \
                                              ['target'] * targets_adata.shape[0]
    print("Combined data and added 'reprogrammed type' column")

    # Add columns to the 'obs' field of combined_adata for metadata tracking
    combined_adata.obs['model'] = model
    combined_adata.obs['TFs'] = ','.join(TFs)
    combined_adata.obs['PMID'] = PMID
    combined_adata.obs['year'] = year
    combined_adata.obs['organ'] = organ
    print("Added metadata columns to combined data")

    # Generate embeddings using the specified model
    if model == 'geneformer':
        adata_embedded = embed_geneformer(combined_adata)
    elif model == 'tGPT':
        adata_embedded = embed_tGPT(combined_adata)
    elif model == 'scGTP':
        adata_embedded = embed_scGTP([combined_adata])
    else:
        raise ValueError(f"Unknown model: {model}")

    print(f"Embeddings generated using model: {model}")

    # Save the results to a file named according to the job number
    result_filepath = os.path.join(OUTPUTPATH, f"results_job_{job_number}.h5ad")
    adata_embedded.write(result_filepath)
    print(f"Results saved to {result_filepath}")

    return 0



## Build parameter dataframe

In [8]:
embedding_parameters = {
    'source': [],
    'target': [],
    'TFs'   : [],
    'model' : []
}
models = ['geneformer', 'tGPT', 'scGTP']

df = pd.read_csv('data/first_5_recepies_8_29_2024.csv')

for i in range(5):
    TFs = df['TFs'].values[i].split()
    source = df['Source'].values[i]
    target = df['Target'].values[i]
    for model in models:
        embedding_parameters['TFs'].append(TFs)
        embedding_parameters['source'].append(source)
        embedding_parameters['target'].append(source)
        embedding_parameters['model'].append(model)

df_embedding_parameters = pd.DataFrame(embedding_parameters)

df_embedding_parameters

Unnamed: 0,source,target,TFs,model
0,,,"[ASCL1, LXMLA, NURR1]",geneformer
1,,,"[ASCL1, LXMLA, NURR1]",tGPT
2,,,"[ASCL1, LXMLA, NURR1]",scGTP
3,,,"[BRN2, ASCL1, MYT1L, NEUROD1]",geneformer
4,,,"[BRN2, ASCL1, MYT1L, NEUROD1]",tGPT
5,,,"[BRN2, ASCL1, MYT1L, NEUROD1]",scGTP
6,,,"[ASCL1, BRN2, MYT1L, LMX1A, FOXA2]",geneformer
7,,,"[ASCL1, BRN2, MYT1L, LMX1A, FOXA2]",tGPT
8,,,"[ASCL1, BRN2, MYT1L, LMX1A, FOXA2]",scGTP
9,,,[SOX2],geneformer


In [7]:
df

Unnamed: 0,Source cells,Source,Target cells,Target,Treatment,TFs,Species,Cell Transplantation,Published Year,PMID
0,Embryonic Fibroblasts and Adult Skin Fibroblasts,,Dopamine Neurons,,"ASCL1, LXMLA, and NURR1",ASCL1 LXMLA NURR1,Both,Yes,2011,2172534_27
1,Fetal Fibroblasts and Postnatal Foreskin Fibro...,,Neuronal Cells,,"BRN2, ASCL1, MYT1L, and NEUROD1",BRN2 ASCL1 MYT1L NEUROD1,Human,,2011,21617644_28
2,Embryonic Fibroblasts and Postnatal Fibroblasts,,Neuronal Cells,,"ASCL1, BRN2, MYT1L, LMX1A, and FOXA2",ASCL1 BRN2 MYT1L LMX1A FOXA2,Human,,2011,21646515_29
3,Embryonic Fibroblasts and Fetal Foreskin Fibro...,,Neural Stem Cells,,SOX2,SOX2,Both,Yes,2012,22683203_31
4,Fibroblasts (IMR90 Cells),,Dopaminergic Neurons,,"MASH1, NGN2, SOX2, NURR1, and PITX3 + A Domina...",MASH1 NGN2 SOX2 NURR1 PITX3 P53,Human,Yes,2014,25129808_214
5,Adult Skin Fibroblasts,,Motor Neurons,,"NEUROG2, SOX11, ISL1, and LXH3",NEUROG2 SOX11 ISL1 LXH3,Human,,2016,26725112_46
6,Umbilical Cord Blood Cells,,Neural Stem Cells,,SOX2 and HMGA2,SOX2 HMGA2,Human,,2017,28844127_33
7,Fibroblast-Like Cells from Retinal Tissues,,Neuronal Cells,,ASCL1 and PAX6,ASCL1 PAX6,Human,,2017,28697461_217
8,Pluripotent Stem Cell-Derived Cardiomyocytes,,Neuronal Cells,,"BRN2, ASCL1, MYT1L, and NEUROD1",BRN2 ASCL1 MYT1L NEUROD1,Both,,2017,28327614_218
9,Fibroblasts,,Motor Neurons,,"ASCL1, ISL1, NEUROD1, BRN2, HB9, LHX3, MYT1L, ...",ASCL1 ISL1 NEUROD1 BRN2 HB9 LHX3 MYT1L NGN2,Human,,2017,28099929_219


# Day 0

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# load known reprogramming regiems
df = pd.read_csv('data/known-regiems-T1.csv')

In [14]:
# get list of unique transcription factors
TFs = []
for regime in df['TFs'].unique():
    TFs += regime.replace(',', '').split()
TFs = list(set(TFs))
print(f"{len(TFs)=}")
TFs

len(TFs)=126


['PROX1',
 'P53',
 'N-MYC',
 'LEF-1',
 'PRDM16',
 'NFE2',
 'RUNX1',
 'A',
 'Ptf1a',
 'SALL4',
 'MESP1',
 'PAX6',
 'MYT1L',
 'GATA3',
 'SNAI2',
 'ER71/ETV2',
 'MAFA',
 'CEBPB',
 'Antigen',
 'LXMLA',
 'Two',
 'SOX2;',
 'FOXA3',
 'HAND2',
 'MBD2',
 'NEUROD1',
 'BACH1',
 'AR',
 'ZFPM2',
 'RUNX2',
 'CRX',
 'GATA4',
 'CEBPA',
 'OCT4',
 'FOXM1',
 'MYC',
 'PAX4',
 'MITF',
 'CDX2',
 'LMX1A',
 'OTX2',
 'Variant',
 'BRN2',
 'MAFG',
 'HB9',
 'EYA1',
 'PBX1',
 'and',
 'Factors:',
 'SV40',
 'NANOG',
 'HNF1B',
 'TP53',
 'HNF6',
 'LXH3',
 'SOX10',
 'OSTERIX',
 'Large',
 'TAL1',
 'ESRRG',
 'NEUROD',
 'MASH1',
 'SOX9',
 'PHOX2A',
 'FOXA1',
 'STAT6',
 'Any',
 'SOX18',
 'PHOX2B',
 'the',
 '(ETS',
 'WT1',
 'MEF2C',
 'IRF8',
 'NEUROG2',
 'Dominant-Negative',
 'PU.1',
 'SOX11',
 'LHX3',
 'RAX',
 'NGN2',
 'ASCL1',
 'BATF3',
 'NURR1',
 'SOX2',
 'AP-2A',
 'MAFK',
 'or',
 'PAX6;',
 'JUN',
 'ISL1',
 'Knockdown',
 'EMX2',
 'GATA6',
 '+',
 'LMX1A;',
 'POU3F2',
 'c-MYC',
 'HMGA2',
 'HNF1A',
 'SMAD3',
 'OCT3/4',
 'OC