In [9]:
!pip install -q -U scgpt "flash-attn<1.0.5"
!pip install -q flash_attn==01.0.4
!pip install -U -q ipython ipykernel

In [10]:
import os
import sys
from pathlib import Path
import warnings

import scanpy as sc
import scib
import numpy as np
import pandas as pd

import scgpt as scg
import matplotlib.pyplot as plt

import gc

In [11]:
# Put the parent directory first on the import path.
sys.path.insert(0, "../")
# Apply the default matplotlib style. The previous `plt.style.context('default')`
# only created a context manager; outside a `with` block it has no effect.
plt.style.use('default')
# Do not display ResourceWarning messages.
warnings.simplefilter("ignore", ResourceWarning)

In [13]:
model_dir = Path("/kaggle/input/scgpt_whole_human/pytorch/default/4")
covid_dataset_path = '/kaggle/input/covid19-split-small'
output_path = "/kaggle/working/"
output_file = "embedded_data_split_severity_celltype_progression_all.pkl"
output_path = os.path.join(output_path, output_file)
!rm -rf /kaggle/working/*

columns = ['embeddings', 'severity', 'celltype', 'Sample time']

for file in os.listdir(covid_dataset_path):
    data = []
    file_path = os.path.join(covid_dataset_path, file)
    if os.path.isfile(file_path):
        adata = sc.read_h5ad(file_path)
        gene_col = "gene_symbol"
        cell_type_key = "celltype"
        batch_key = "tech"
        N_HVG = 1800
        adata.var[gene_col] = adata.var.index.values

        # preprocess
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        # highly variable genes
        sc.pp.highly_variable_genes(adata, n_top_genes=N_HVG, flavor='seurat_v3')
        adata = adata[:, adata.var['highly_variable']]
        embed_adata = scg.tasks.embed_data(
            adata,
            model_dir,
            gene_col=gene_col,
            batch_size=64,
        )
        for i in range(len(embed_adata.obsm['X_scGPT'])):
            data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
        del adata
        del embed_adata
        gc.collect()            
        df = pd.DataFrame(data, columns=columns)
        del data
        gc.collect()
        if os.path.exists(output_path):
            output = pd.read_pickle(output_path)
        else:
            output = pd.DataFrame()
        output = pd.concat([output, df], ignore_index=True)
        output.to_pickle(output_path)
        del df
        del output
        gc.collect()

  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1439/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [04:18<00:00,  3.54it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1377/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [05:59<00:00,  2.55it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1320/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [01:13<00:00, 12.53it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1556/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [08:01<00:00,  1.90it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1481/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [04:28<00:00,  3.41it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1529/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [05:35<00:00,  2.73it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1401/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [03:37<00:00,  4.21it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1551/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [04:37<00:00,  3.30it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1562/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [05:35<00:00,  2.72it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1533/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [04:42<00:00,  3.24it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1471/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [05:27<00:00,  2.80it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1537/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [04:46<00:00,  3.19it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1435/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [04:09<00:00,  3.67it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1524/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [05:15<00:00,  2.90it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1555/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [05:04<00:00,  3.01it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1498/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [06:19<00:00,  2.41it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1494/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [03:25<00:00,  4.44it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1354/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [01:34<00:00,  9.69it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1526/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [04:54<00:00,  3.11it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1438/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [05:10<00:00,  2.94it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1478/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [04:17<00:00,  3.56it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1481/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [05:00<00:00,  3.04it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1388/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [04:01<00:00,  3.79it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1403/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [06:18<00:00,  2.42it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
  adata.var["id_in_vocab"] = [


scGPT - INFO - match 1538/1800 genes in vocabulary of size 60697.


Embedding cells: 100%|██████████| 915/915 [05:17<00:00,  2.88it/s]
  adata.obsm["X_scGPT"] = cell_embeddings
  data.append([embed_adata.obsm['X_scGPT'][i], embed_adata.obs['CoVID-19 severity'][i], embed_adata.obs['celltype'][i], embed_adata.obs['Sample time'][i]])
