# scGPT Reprogramming Embeddings

The purpose of this notebook is to embed perturbed single cell files. In particular:

1. one-shot perturbed single cell expression data are obtained from: `/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed`
2. each file in the input directory is read and embedded with scGPT
3. the corresponding embeddings are saved to `/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/scGPT`
4. if a particular recipie already

In [1]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import mode
import scanpy as sc
import sklearn
import warnings
import torch

sys.path.insert(0, "../../")
import scgpt as scg

# extra dependency for similarity search
try:
    import faiss

    faiss_imported = True
except ImportError:
    faiss_imported = False
    print(
        "faiss not installed! We highly recommend installing it for fast similarity search."
    )
    print("To install it, see https://github.com/facebookresearch/faiss/wiki/Installing-Faiss")

warnings.filterwarnings("ignore", category=ResourceWarning)



In [2]:
# Directory containing the .h5ad files
input_directory = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed"
output_directory = "/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/scGPT/"

# Load source data
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/jpic/"
FILE = "fibroblast.h5ad"
adata = sc.read_h5ad(os.path.join(DATAPATH, FILE))
print(f"{adata.shape=}")

adata.shape=(38151, 58870)


In [3]:
# scGPT parameters
model_dir     = Path("/nfs/turbo/umms-indikar/shared/projects/foundation_models/scGPT_human")
cell_type_key = "Celltype"
gene_col      = "gene_symbol"

In [None]:
# Loop over all .h5ad files found in input_directory
for filename in os.listdir(input_directory):
    if filename.endswith(".h5ad"):
        print(f"{filename=}")

        # Check if the output file already exists
        output_filepath = os.path.join(output_directory, filename)
        print(f"{output_filepath=}")

        if os.path.exists(output_filepath):
            print(f"File already exists, skipping: {output_filepath}")
            continue  # Skip to the next file
        else:
            print("This file does not exist. Embeddings will be generated ...")
        
        # Check the input perturbations
        filepath = os.path.join(input_directory, filename)
        print(f"{filepath=}")

        # Read in the perturbed data
        adata = sc.read_h5ad(filepath)
        adata.X = adata.X.toarray()
        print(f"{adata.shape=}")

        # Frees up all the unused cached memory on the GPU
        torch.cuda.empty_cache()
        
        # Perform the embeddings with scGPT
        ref_embed_adata = scg.tasks.embed_data(
            adata,
            model_dir,
            gene_col="index",
            obs_to_save=list(adata.obs.columns),  # optional arg, only for saving metainfo
            batch_size=64,
            return_new_adata=True,
        )

        # Save the embeddings to a new file in the output path
        ref_embed_adata.write(output_filepath)
        print("File saved")

filename='SOX2.h5ad'
output_filepath='/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/scGPT/SOX2.h5ad'
File already exists, skipping: /nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/scGPT/SOX2.h5ad
filename='SOX2_HMGA2.h5ad'
output_filepath='/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/scGPT/SOX2_HMGA2.h5ad'
File already exists, skipping: /nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/scGPT/SOX2_HMGA2.h5ad
filename='ASCL1_PAX6.h5ad'
output_filepath='/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/scGPT/ASCL1_PAX6.h5ad'
This file does not exist. Embeddings will be generated ...
filepath='/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/ASCL1_PAX6.h5ad'


  utils.warn_names_duplicates("obs")


adata.shape=(114453, 58870)
scGPT - INFO - match 37825/58870 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 1789/1789 [07:19<00:00,  4.07it/s]
  utils.warn_names_duplicates("obs")


File saved
filename='SOX2_PAX6.h5ad'
output_filepath='/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/scGPT/SOX2_PAX6.h5ad'
This file does not exist. Embeddings will be generated ...
filepath='/nfs/turbo/umms-indikar/shared/projects/DARPA_AI/in-silico-reprogramming/one-shot/perturbed/SOX2_PAX6.h5ad'


  utils.warn_names_duplicates("obs")


adata.shape=(114453, 58870)
scGPT - INFO - match 37825/58870 genes in vocabulary of size 60697.


Embedding cells:  49%|████▉     | 876/1789 [03:48<03:30,  4.33it/s] 