# Reprogramming Recepies

Auth: Nat Oliven, Joshua Pickard

Date: August 26, 2024

In [7]:
import numpy as np
import pandas as pd
import scanpy as sp
import os

# Day 2

In [8]:

# Copied from a prev day

DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"
FILE = "TS_epithelial.h5ad"
adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))
adata_gene_list = adata.var['gene_symbol'].values.tolist()

# Print the first 5 entries
print("First 5 entries:")
print(adata_gene_list[:5])

# Print the last 5 entries
print("Last 5 entries:")
print(adata_gene_list[-5:])

First 5 entries:
['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2']
Last 5 entries:
['MT-ND6', 'MT-TE', 'MT-CYB', 'MT-TT', 'MT-TP']


In [13]:
# outside the function so I can manipulate these directly.
# I also checked whether it matters if I do case insensitive (capitalize everything then compare) or case sensitive.
# Unsurprisingly, case sensitive has more discrepancies (45 vs. 44), with the one extra that was picked up as "Ptf1a".
# I left the case insensitive version.
# get a list of words (potential genes, also includes and, + , etc. ) from the table from the review paper
table_1_df = pd.read_csv("/home/oliven/scFoundationModels/notebooks/reprogramming/data/table_1_data_from_paper_9_1.csv")
combined_string = ' '.join(table_1_df['TFs'].astype(str)).replace(',', '')
word_list = combined_string.split()

# get a list of genes that appear in the data matrix
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"
FILE = "TS_epithelial.h5ad"
adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))
adata_gene_list = adata.var['gene_symbol'].values.tolist()

# New today
def check_valid_tfs(word_list, adata_gene_list):

    word_list_upper = [word.upper() for word in word_list]
    adata_gene_list_upper = [gene.upper() for gene in adata_gene_list]

    # print what does not overlap
    not_valid_gene = set(word_list_upper) - set(adata_gene_list_upper)

    print("Entries in the table that are not genes in the counts matrix: ")
    
    return list(not_valid_gene)
    
len(check_valid_tfs(word_list, adata_gene_list))

Entries in the table that are not genes in the counts matrix: 


44

In [16]:

def check_valid_tfs_case_sen(word_list, adata_gene_list):

    # print what does not overlap
    not_valid_gene = set(word_list) - set(adata_gene_list)

    print("Entries in the table that are not genes in the counts matrix: ")
    
    return list(not_valid_gene)
    
len(check_valid_tfs_case_sen(word_list, adata_gene_list))



Entries in the table that are not genes in the counts matrix: 


45

In [21]:

# Get the lists of not valid genes
not_valid_genes_case_sen = check_valid_tfs_case_sen(word_list, adata_gene_list)
not_valid_genes = check_valid_tfs(word_list, adata_gene_list)

# Convert the results to lowercase
not_valid_genes_case_sen_lower = [gene.lower() for gene in not_valid_genes_case_sen]
not_valid_genes_lower = [gene.lower() for gene in not_valid_genes]

# Convert lists to sets and then subtract
set_not_valid_genes_case_sen = set(not_valid_genes_case_sen_lower)
set_not_valid_genes = set(not_valid_genes_lower)

# Calculate the difference
difference = set_not_valid_genes_case_sen - set_not_valid_genes

print(f"Difference in not valid genes: {difference}")

Entries in the table that are not genes in the counts matrix: 
Entries in the table that are not genes in the counts matrix: 
Difference in not valid genes: {'ptf1a'}


In [None]:
# for those genes with multiple aliases, checking which are valid
multiple_alias_list = {}
multiple_alias_list.update({
    'p53': ['BCC7','BMFS5', 'LFS1', 'TRP53'],
    
})

In [None]:
# The above list is small enough that I can manually check it.

# Things to remove from word_list:
not_genes = ['Variant', 'Large',  '(ETS', '2)', 'Knockdown', ]

# Valid genes to replace/rename in word_list:
genes_to_translate = ['LMX1A;', #'P53']
translated_names = ['LMX1A',]

# These ones might have appeared as multiple entries, etc. b/c of spacing. easiest way was to delete and add back
genes_to_add = ['ETS2',]

# replace then subtract and add. [--------------]
new_word_list = set(word_list) - set(not_genes)

# Running the function one more time to check:





In [None]:
""" Renamed med_nonz to max_exp to be more accurate. """

In [None]:
# Copied from a prev day

"""
Josh, please read: the adata.obs['scalar'] = scalar copies the scalar down for that call, associated with every cell in X. Same with ['scaled'] and ['scaled_by']
in var. This is good in case the data is later appended into one anndata object.
But my return from the perturb_counts loop (cell below this) is a dictionary of all of the perturb_counts, since appending along any axis will probably either overwrite
obs or var.
"""


def perturb_counts(tf_list, scalar, adata): 
    """
    Applies a perturbation to the expression data of specific genes in an AnnData object.

    This function performs the following steps:
    1. Computes the maximum gene expression level for each cell.
    2. Applies a scaling operation to the expression levels of genes listed in `tf_list`.
       - Each entry of these genes in the matrix is multiplied by the maximum expression level 
         of its respective cell and a specified scalar value.
    3. Updates the AnnData object with new columns:
       - 'scaled': A boolean column indicating whether each gene is in the `tf_list`.
       - 'scaled_by': Contains the scaling factor used for each gene (the product of the maximum 
         expression level of each cell and the scalar), or `1` if the gene was not in `tf_list`.
    
    Parameters:
    tf_list (list): A list of gene symbols to be perturbed.
    scalar (float): The scalar value used to scale the expression levels.
    adata (AnnData): The AnnData object containing gene expression data.

    Returns:
    AnnData: The updated AnnData object with applied perturbations and new columns.
    """
    # Save the original state of the parameter objects, in case some tfs do not translate (failsafe)
    original_X = adata.X.copy()
    original_gene_mask = gene_mask.copy()
    
    # Compute maximum expression level of each cell
    max_exp = np.max(adata.X, axis=1)
    
    # Create a boolean mask for genes in tf_list
    gene_mask = adata.var['gene_symbol'].isin(tf_list)

    """This is new today. v """
    # Raise an error if any of the gene names in tf_list do not match column names (we will manually update these in adata):
    missing_genes = [gene for gene in tf_list if gene not in adata.var['gene_symbol'].values]
    
    if missing_genes:
        # Restore original parameter objects
        adata.X = original_X
        gene_mask = original_gene_mask
        raise ValueError(f"Genes {missing_genes} not found in anndata object")

    else:
    """ This is new today. ^ """
    
        # Apply the scaling operation to the specified genes
        adata.X[:, gene_mask] *= max_exp[:, np.newaxis] * scalar
        
        # Add/Update 'scaled' column in var
        adata.var['scaled'] = gene_mask
        
        # Add/Update 'scaled_by' column in var
        adata.var['scaled_by'] = 1  # Default value for genes not in tf_list
        adata.var.loc[gene_mask, 'scaled_by'] = max_exp[:, np.newaxis] * scalar  # Correct scaling factor assignment
    
    return adata



In [None]:
# Copied from a prev day

import anndata

def iterate_perturb_counts(tf_list, scalar_list, adata):
    """
    Applies perturbations to the expression data of specified transcription factors across multiple scalars 
    and stores the resulting AnnData objects in a dictionary.

    This function performs the following steps:
    1. Iterates over a list of scalar values.
    2. For each scalar, creates a copy of the AnnData object to preserve the original data.
    3. Applies the `perturb_counts` function to scale the expression data of genes listed in `tf_list` by
       the maximum gene expression of each cell and the current scalar.
    4. Stores the perturbed AnnData object in a dictionary with the scalar as the key.

    Parameters:
    tf_list (list): A list of gene symbols (transcription factors) to be perturbed.
    scalar_list (list): A list of scalar values for scaling the gene expression.
    adata (AnnData): The AnnData object containing gene expression data (cells x genes).

    Returns:
    dict: A dictionary where keys are scalar values and values are the corresponding perturbed AnnData objects.
    """
    
    adata_dict = {}
    
    for scalar in scalar_list:
        # Create a copy of the AnnData object for each scalar value
        adata_temp = adata.copy()
        
        # Apply perturb_counts to the copied AnnData object
        perturbed_adata = perturb_counts(tf_list, scalar, adata_temp)
        
        # Store the perturbed AnnData object in the dictionary with scalar as the key
        adata_dict[scalar] = perturbed_adata
    
    return adata_dict



In [None]:
# We want each recipe (returned as a dictionary of anndata objects, one adata object for each scalar)
def save_perturb_to_turbo()

In [None]:
### Testing on one of the tf lists from the file.


# Day 1

## Perturbation Model Discussion

E.V. = expression values

Possible algorithm:
```
1. find highest E.V.  for a single cell
2. find expression value of TFs being modified
3. have a value k for the number of different concentrations we want to test
4. choose k different amounts to increase the TFs from there measured E.V. to the 150% maximum E.V.
   - make an arbitray choice and code it up
```

**A reasonable person could write this 10s of different ways**

In [3]:
import numpy as np
# median nonzero value of each row
# the w stands for working. I just dont want to screw up the original.
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"
FILE = "TS_epithelial.h5ad"
adata_w = sp.read_h5ad(os.path.join(DATAPATH, FILE))
adata_w
# def median_nonzero(col):
#     nonzero_vals = col[col != 0]  # Extract nonzero values
#     if len(nonzero_vals) == 0:    # If no nonzero values, return NaN
#         return np.nan
#     return np.median(nonzero_vals)

# # Apply the function to each column and store the results
# med_nonz = np.apply_along_axis(median_nonzero, axis=0, arr=adata_w.X)
# adata_w.var['med_nonz'] = med_nonz
# adata_w.var

AnnData object with n_obs × n_vars = 104148 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'

In [4]:
import numpy as np


X = adata_w.X.toarray() if not isinstance(adata_w.X, np.ndarray) else adata_w.X

def median_nonzero(col):
    nonzero_vals = col[col != 0] 
    return np.median(nonzero_vals) if len(nonzero_vals) > 0 else 0

#perform and save results of fn
med_nonz = np.apply_along_axis(median_nonzero, axis=0, arr=X)
adata_w.var['med_nonz'] = med_nonz


In [5]:
adata_w.var.head()

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std,med_nonz
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,3.9e-05,0.005574,6.029026
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,0.002274395,2.44228,0.533203,0.00108,0.031731,0.770418
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,3.3e-05,0.005634,10.0
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,0.0001372886,2.656352,0.680668,4.8e-05,0.008041,5.04275
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1e-12,,0.0,0.0,1.0,0.0


In [9]:
# seeing what it looks like before tf_list changes the first 3 rows
# Convert to dense if it's sparse and display the first five rows
import numpy as np

# Convert to a dense array if necessary
dense_X = adata_w.X.toarray() if not isinstance(adata_w.X, np.ndarray) else adata_w.X



[[0.        0.        0.        ... 4.490315  0.        0.       ]
 [0.        0.        0.        ... 4.4802847 0.        0.       ]
 [0.        0.        0.        ... 4.457717  0.        0.       ]
 ...
 [0.        0.        0.        ... 4.9418464 0.        0.       ]
 [0.        0.        0.        ... 4.8312063 0.        0.       ]
 [0.        0.        0.        ... 5.0140185 0.        0.       ]]


In [25]:
# problem, scaling by a factor of the max expressed gene in that cell means that you could be scaling by different genes for each cell,
# when the cells are all of the same type. for each get the median nonzero expression

# for testing purposes: v
tf_list = ['DDX11L1', 'WASH7P', 'MIR6859-1']
tf = 'DDX11L1'
# for testing purposes: ^

#mask = adata_w.obs_names.isin(tf_list)
mask = np.where(adata_w.var['gene_symbol'] == tf)[0]
adata_w.X[mask, :] = adata_w.X[mask, :] * adata_w.var['med_nonz'].values 



  self._set_arrayXarray(i, j, x)


In [26]:
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"
FILE = "TS_epithelial.h5ad"
adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))
(adata_w.X[mask, :] - adata.X[mask, :]).sum()

2140043500.0

In [21]:
adata_w.X[mask, :]

<0x58870 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Row format>

### Scaling by median nonzero entry of each gene across all cells (old)

In [2]:

# import numpy as np
# DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"
# FILE = "TS_epithelial.h5ad"
# adata_w = sp.read_h5ad(os.path.join(DATAPATH, FILE))
# #tf = 'DDX11L1'
# tf_list =['DDX11L1', 'WASH7P', 'MIR6859-1']

# def median_nonzero(col):
#     nonzero_vals = col[col != 0] 
#     return np.median(nonzero_vals) if len(nonzero_vals) > 0 else 0

# # requires scalar is a scalar
# def perturb_counts(tf_list, scalar, adata): 
#     # compute nonzero median expression of each gene across cells, save to var
#     med_nonz = np.apply_along_axis(median_nonzero, axis=0, arr=adata.X)
#     adata.var['med_nonz'] = med_nonz

#     # filter by desired tf(s), and apply the nonzero_median scaling operation to only these  
#     mask = np.where(adata.var['gene_symbol'].isin(tf_list))[0]
#     adata.X[mask, :] = adata.X[mask, :] * adata.var['med_nonz'].values * scalar
#     return adata




### Scaling by max gene expression within each cell

In [3]:
# old version with (I believe) improper mask that was being applied to rows and not columns

# import numpy as np
# DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"
# FILE = "TS_epithelial.h5ad"
# adata_w = sp.read_h5ad(os.path.join(DATAPATH, FILE))
# #tf = 'DDX11L1'
# tf_list =['DDX11L1', 'WASH7P', 'MIR6859-1']


# # requires scalar is a scalar
# def perturb_counts(tf_list, scalar, adata): 
#     # compute nonzero median expression of each gene across cells, save to var
#     med_nonz = med_nonz = np.max(adata.X, axis=1)
#     adata.obs['med_nonz'] = med_nonz

#     # filter by desired tf(s), and apply the nonzero_median scaling operation to only these  
#     mask = np.where(adata.var['gene_symbol'].isin(tf_list))[0]
#     adata.X[mask, :] = adata.X[mask, :] * adata.obs['med_nonz'].values * scalar
#     return adata

# old version without extra obs and var rows telling what was scaled and by how much


# def perturb_counts(tf_list, scalar, adata): 
#     # Compute maximum expression level of each cell and save it to obs
#     med_nonz = np.max(adata.X, axis=1)
#     adata.obs['med_nonz'] = med_nonz
    
#     # apply operation only to genes in tf_list
#     mask = np.where(adata.var['gene_symbol'].isin(tf_list))[0]
#     adata.X[:, mask] = adata.X[:, mask] * adata.obs['med_nonz'].values[:, np.newaxis] * scalar
    
#     return adata

# version before gpt optimized

# def perturb_counts(tf_list, scalar, adata): 
#     # Compute maximum expression level of each cell and save it to obs
#     med_nonz = np.max(adata.X, axis=1)
#     adata.obs['med_nonz'] = med_nonz
    
#     # Add a new obs column called 'scalar' containing the scalar value for each row
#     adata.obs['scalar'] = scalar
    
#     # Create a mask for genes in tf_list
#     mask = np.where(adata.var['gene_symbol'].isin(tf_list))[0]
#     # Apply the scaling operation to the specified genes
#     adata.X[:, mask] = adata.X[:, mask] * adata.obs['med_nonz'].values[:, np.newaxis] * scalar
    
#     # Add a new var column called 'scaled' with True for genes in tf_list and False otherwise
#     adata.var['scaled'] = adata.var['gene_symbol'].isin(tf_list)
    
#     # Add a new var column 'scaled_by'
#     adata.var['scaled_by'] = 1
#     # Set scaling factor for genes in tf_list

#     ############I asked gpt to do this line and am unsure if it is correct. checking now.
#     adata.var.loc[adata.var['scaled'], 'scaled_by'] = adata.obs['med_nonz'].values[:, np.newaxis] * scalar
#     ###############
    
#     return adata

# loop version before gpt optimized

# # requires that within each perturbation, all of the transcription factors in tf_list are scaled by the same amount, that is, (scalar * "max gene expression in that cell")
# # requires adata is cells x genes

# import anndata

# def iterate_perturb_counts(tf_list, scalar_list, adata):
#     adata_dict = {}
    
#     for scalar in scalar_list:
#         adata_temp = adata.copy()
#         perturbed_adata = perturb_counts(tf_list, scalar, adata_temp)
#         adata_dict[scalar] = perturbed_adata
    
#     return adata_dict



In [5]:

"""
Josh, please read: the adata.obs['scalar'] = scalar copies the scalar down for that call, associated with every cell in X. Same with ['scaled'] and ['scaled_by']
in var. This is good in case the data is later appended into one anndata object.
But my return from the perturb_counts loop (cell below this) is a dictionary of all of the perturb_counts, since appending along any axis will probably either overwrite
obs or var.
"""
import numpy as np

def perturb_counts(tf_list, scalar, adata): 
    """
    Applies a perturbation to the expression data of specific genes in an AnnData object.

    This function performs the following steps:
    1. Computes the maximum gene expression level for each cell.
    2. Applies a scaling operation to the expression levels of genes listed in `tf_list`.
       - Each entry of these genes in the matrix is multiplied by the maximum expression level 
         of its respective cell and a specified scalar value.
    3. Updates the AnnData object with new columns:
       - 'scaled': A boolean column indicating whether each gene is in the `tf_list`.
       - 'scaled_by': Contains the scaling factor used for each gene (the product of the maximum 
         expression level of each cell and the scalar), or `1` if the gene was not in `tf_list`.
    
    Parameters:
    tf_list (list): A list of gene symbols to be perturbed.
    scalar (float): The scalar value used to scale the expression levels.
    adata (AnnData): The AnnData object containing gene expression data.

    Returns:
    AnnData: The updated AnnData object with applied perturbations and new columns.
    """
    
    # Compute maximum expression level of each cell
    med_nonz = np.max(adata.X, axis=1)
    
    # Create a boolean mask for genes in tf_list
    gene_mask = adata.var['gene_symbol'].isin(tf_list)
    
    # Apply the scaling operation to the specified genes
    adata.X[:, gene_mask] *= med_nonz[:, np.newaxis] * scalar
    
    # Add/Update 'scaled' column in var
    adata.var['scaled'] = gene_mask
    
    # Add/Update 'scaled_by' column in var
    adata.var['scaled_by'] = 1  # Default value for genes not in tf_list
    adata.var.loc[gene_mask, 'scaled_by'] = med_nonz[:, np.newaxis] * scalar  # Correct scaling factor assignment
    
    return adata



In [None]:
import anndata

def iterate_perturb_counts(tf_list, scalar_list, adata):
    """
    Applies perturbations to the expression data of specified transcription factors across multiple scalars 
    and stores the resulting AnnData objects in a dictionary.

    This function performs the following steps:
    1. Iterates over a list of scalar values.
    2. For each scalar, creates a copy of the AnnData object to preserve the original data.
    3. Applies the `perturb_counts` function to scale the expression data of genes listed in `tf_list` by
       the maximum gene expression of each cell and the current scalar.
    4. Stores the perturbed AnnData object in a dictionary with the scalar as the key.

    Parameters:
    tf_list (list): A list of gene symbols (transcription factors) to be perturbed.
    scalar_list (list): A list of scalar values for scaling the gene expression.
    adata (AnnData): The AnnData object containing gene expression data (cells x genes).

    Returns:
    dict: A dictionary where keys are scalar values and values are the corresponding perturbed AnnData objects.
    """
    
    adata_dict = {}
    
    for scalar in scalar_list:
        # Create a copy of the AnnData object for each scalar value
        adata_temp = adata.copy()
        
        # Apply perturb_counts to the copied AnnData object
        perturbed_adata = perturb_counts(tf_list, scalar, adata_temp)
        
        # Store the perturbed AnnData object in the dictionary with scalar as the key
        adata_dict[scalar] = perturbed_adata
    
    return adata_dict



## Visualize Input Data

In [2]:
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"
FILE = "TS_epithelial.h5ad"

adata = sp.read_h5ad(os.path.join(DATAPATH, FILE))

In [3]:
adata

AnnData object with n_obs × n_vars = 104148 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'

In [4]:
adata.var

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848


In [5]:
adata.X.max(axis=1) # what is the value of the highest expressed gene for each cell?

<104148x1 sparse matrix of type '<class 'numpy.float32'>'
	with 104148 stored elements in COOrdinate format>

In [6]:
TF = 'DDX11L1'
index = np.where(adata.var['gene_symbol'] == TF)[0]
index

array([0])

In [7]:
adata.var['gene_symbol']

DDX11L1            DDX11L1
WASH7P              WASH7P
MIR6859-1        MIR6859-1
MIR1302-2HG    MIR1302-2HG
MIR1302-2        MIR1302-2
                  ...     
MT-ND6              MT-ND6
MT-TE                MT-TE
MT-CYB              MT-CYB
MT-TT                MT-TT
MT-TP                MT-TP
Name: gene_symbol, Length: 58870, dtype: category
Categories (57316, object): ['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', ..., 'ZYX', 'ZYXP1', 'ZZEF1', 'hsa-mir-1253']

In [8]:
adata.obs

Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender
AACAGGGCATGTGCTA_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,14151.0,3404,hepatocyte,Hepatocyte,True,epithelial,male
AATGAAGTCTAAGGAA_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,10303.0,2863,hepatocyte,Hepatocyte,True,epithelial,male
ACAGAAAAGCTGTCCG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,9391.0,2738,hepatocyte,Hepatocyte,True,epithelial,male
ACGATGTTCGACTCCT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,4634.0,1634,hepatocyte,Hepatocyte,True,epithelial,male
ACGTTCCAGAACCCGA_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,33370.0,6590,intrahepatic cholangiocyte,BECS,True,epithelial,male
...,...,...,...,...,...,...,...,...,...,...,...
TGACAGTAGATGGCGT_TSP2_Vasculature_Aorta_10X_1_2,Vasculature,10X,TSP2,Aorta,10354.0,2456,epithelial cell,epithelial cell,True,epithelial,female
TGTTGGATCGACATCA_TSP2_Vasculature_Aorta_10X_1_2,Vasculature,10X,TSP2,Aorta,15677.0,3234,epithelial cell,epithelial cell,True,epithelial,female
TTGGATGGTGGCTACC_TSP2_Vasculature_Aorta_10X_1_2,Vasculature,10X,TSP2,Aorta,18638.0,3368,epithelial cell,epithelial cell,True,epithelial,female
CTTGATTTCTTGCAAG_TSP2_Vasculature_Aorta_10X_2_2,Vasculature,10X,TSP2,Aorta,36555.0,5102,epithelial cell,epithelial cell,True,epithelial,female


## Build driver

In [9]:
import pandas as pd

def main(job_number, parameter_file):
    """
    This is the main function for the array job to perform the reprogramming experiment. job_number is a single parameter
    that will be used to look up in a parameter table which model, reprogramming recipe, and other information relevant
    to the test.
    """

    # Determine embedding parameters and recipie
    df_embedding_parameters = pd.read_csv(parameter_file)
    TFs    = df_embedding_parameters['TFs'].values[job_number]
    model  = df_embedding_parameters['model'].values[job_number]
    source = df_embedding_parameters['source'].values[job_number]
    target = df_embedding_parameters['target'].values[job_number]

    # Load the source data
    adata = 

    # Perturb the data
    perturbed_adata = perturbation_model(adata, TFs)

    # Generate embeddings
    if model == 'geneformer':
        adata_embedded = embed_geneformer([source_adata, perturbed_adata, target_adata])
    elif model == 'tGPT':
        adata_embedded = embed_tGPT([source_adata, perturbed_adata, target_adata])
    elif model == 'scGTP':
        adata_embedded = embed_scGTP([source_adata, perturbed_adata, target_adata])

    # Save the results to a file

    return 0


SyntaxError: invalid syntax (1649016643.py, line 18)

## Build parameter dataframe

In [None]:
embedding_parameters = {
    'source': [],
    'target': [],
    'TFs'   : [],
    'model' : []
}
models = ['geneformer', 'tGPT', 'scGTP']

df = pd.read_csv('data/first_5_recepies_8_29_2024.csv')

for i in range(5):
    TFs = df['TFs'].values[i].split()
    source = df['Source'].values[i]
    target = df['Target'].values[i]
    for model in models:
        embedding_parameters['TFs'].append(TFs)
        embedding_parameters['source'].append(source)
        embedding_parameters['target'].append(source)
        embedding_parameters['model'].append(model)

df_embedding_parameters = pd.DataFrame(embedding_parameters)

df_embedding_parameters

In [None]:
df

# Day 0

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# load known reprogramming regiems
df = pd.read_csv('data/known-regiems-T1.csv')

In [None]:
# get list of unique transcription factors
TFs = []
for regime in df['TFs'].unique():
    TFs += regime.replace(',', '').split()
TFs = list(set(TFs))
print(f"{len(TFs)=}")
TFs