# Minimal Working Example to run tGPT on Great Lakes

The model parameters are stored at `/nfs/turbo/umms-indikar/shared/projects/foundation_models/transcriptome-gpt-1024-8-16-64`, and example data (ranked gene symbols and cell type labels) is located at `/nfs/turbo/umms-indikar/shared/projects/foundation_models/example_inputs/tGPT`.



In [1]:
import re
import os
import sys
import gzip
import torch
import numpy as np
import pandas as pd
import scanpy as sc
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2Model

# Set parameters and file paths

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Shared roots, written once: the tokenizer and the pretrained weights are read
# from the same directory, and both example inputs sit in the same folder.
foundation_dir = "/nfs/turbo/umms-indikar/shared/projects/foundation_models"
model_dir      = f"{foundation_dir}/transcriptome-gpt-1024-8-16-64"
example_dir    = f"{foundation_dir}/example_inputs/tGPT"

tokenizer_file = model_dir  ## Directory the tokenizer is loaded from
checkpoint     = model_dir  ## Pretrained model
celltype_path  = f"{example_dir}/Muris_cell_labels.txt.gz"  ## Cell type annotation
max_len        = 64  ## Number of top genes used for analysis (passed to the tokenizer as max_length)
text_file      = f"{example_dir}/Muris_gene_rankings.txt.gz"  ## Gene symbols ranked by expression

cuda


# Extract features

In [3]:
class LineDataset(Dataset):
    """Dataset over text lines that replaces every '-' and '.' with '_' on access."""

    def __init__(self, lines):
        """Keep the sequence of lines and precompile the substitution pattern."""
        self.regex = re.compile(r'\-|\.')
        self.lines = lines

    def __len__(self):
        """Return the number of stored lines."""
        return len(self.lines)

    def __getitem__(self, i):
        """Return line ``i`` with each '-' or '.' swapped for '_'."""
        raw_line = self.lines[i]
        return self.regex.sub('_', raw_line)

# Load the tokenizer and the pretrained network from the configured paths.
tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_file)
print('Tokenizer set')
# `.transformer` keeps the backbone of the LM-head model; its
# `last_hidden_state` is what gets averaged into features below.
model = GPT2LMHeadModel.from_pretrained(checkpoint,output_hidden_states = True).transformer
print('model set')
model = model.to(device)
print('model to device')
model.eval()
print('model eval')

# Read one entry per line of the gzipped text file. The context manager
# guarantees the file handle is closed once the lines are in memory.
with gzip.open(text_file, "rb") as fh:
    lines = [raw.decode().strip() for raw in fh]

ds = LineDataset(lines)
dl = DataLoader(ds, batch_size=64)

# One feature vector (a plain list of floats) per input line, in input order.
Xs = []
for a in tqdm(dl, total=len(dl)):
    # Tokenize the batch of lines, truncating to `max_len` tokens and padding
    # to the longest line in the batch.
    batch = tokenizer(a, max_length=max_len, truncation=True, padding=True, return_tensors="pt")
    batch = {k: v.to(device) for k, v in batch.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        x = model(**batch)

    # (number of attended tokens - 1) per line, converted to Python ints once
    # per batch instead of once per row.
    eos_idxs = (batch["attention_mask"].sum(dim=1) - 1).tolist()
    xx = x.last_hidden_state

    # Average the hidden states over positions 1 .. eos-1 of each line, i.e.
    # skipping position 0 and the position at index `eos`.
    # NOTE(review): this slice assumes padding is appended on the right, so
    # that the attended tokens occupy positions 0..eos -- confirm the
    # tokenizer's padding side.
    # NOTE(review): a line with fewer than three attended tokens yields an
    # empty slice, whose mean is NaN -- confirm the input never contains such
    # lines.
    Xs.extend(item[1:eos].mean(dim=0).tolist() for item, eos in zip(xx, eos_idxs))

features = np.stack(Xs)

Tokenizer set


  return self.fget.__get__(instance, owner)()


model set
model to device
model eval


100%|██████████| 858/858 [01:59<00:00,  7.19it/s]


# Visualization

In [5]:
# Wrap the feature matrix in an AnnData object and attach the cell type labels
# (first column of the headerless label file) as a categorical annotation.
adata = sc.AnnData(features)
celltype = pd.read_csv(celltype_path, header=None)[0].tolist()
adata.obs["celltype"] = pd.Categorical(celltype)

In [6]:
# Build the 20-nearest-neighbour graph over the cells. The recorded output
# shows scanpy falling back to `sc.pp.pca` with default parameters first.
sc.pp.neighbors(adata, n_neighbors=20)

         Falling back to preprocessing with `sc.pp.pca` and default params.


In [7]:
# Leiden clustering at resolution 0.6; the labels are read back as the
# "leiden" annotation when plotting.
sc.tl.leiden(adata, resolution=0.6)


 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(adata,resolution=0.6)


In [8]:
# Compute the UMAP embedding that the plotting cells below draw.
sc.tl.umap(adata)

In [None]:
# ---- Cell type: UMAP coloured by the annotated cell type ----
sc.pl.umap(adata, color=["celltype"], show=True)

In [None]:
# ---- Single-cell clustering: UMAP coloured by Leiden cluster ----
sc.pl.umap(adata, color=["leiden"], show=True)

In [None]:
# Switch to 300 dpi, 7x7 figures, then write both UMAP plots to disk.
# NOTE(review): scanpy appears to append the `save` string to its own default
# figure name rather than using it verbatim -- confirm the resulting file
# names are the ones you expect.
sc.set_figure_params(dpi=300, figsize=(7, 7))
sc.pl.umap(adata, color=["celltype"], save="celltype.png")
sc.pl.umap(adata, color=["leiden"], save="leiden.png")