# Embedding Generation Script for scRNA-seq and scATAC-seq Data
This script generates embeddings for single-cell RNA-seq (scRNA-seq) and ATAC-seq (scATAC-seq) data using a pre-trained transformer model. It processes input datasets, loads a pre-trained model, and produces latent representations that can be used for downstream analysis tasks.

### 1. Imported Packages, Initial Setup and Configuration

In [1]:
import os
import pickle
import subprocess
import numpy as np
import random
import torch
import pandas as pd
import json
from datasets import load_from_disk, concatenate_datasets, Dataset
from transformers.training_args import TrainingArguments
from transformers import BertConfig

from scarf.contrastive_model import TotalModel_downstream
from scarf.pretrainer_modified import Pretrainer
from scarf.data_collator_modified import DataCollatorForLanguageModeling_Inference
from scarf.pretrainer_modified import PreCollator

from scarf.utils import load_model_with_index
from tqdm import tqdm
# Process-wide environment variables, set before any GPU work is started.
# NOTE(review): CUDA_VISIBLE_DEVICES='0' presumably pins the run to the first
# GPU; it is assigned after `import torch`, so confirm it still takes effect.
os.environ.update({
    "NCCL_DEBUG": "INFO",
    "OMPI_MCA_opal_cuda_support": "true",
    "CONDA_OVERRIDE_GLIBC": "2.56",
    "CUDA_VISIBLE_DEVICES": '0',
})

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one value.
seed_val = 42
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed_val)




### 2. Input Parameters and Output Settings

In [2]:
class NotebookSettings:
    """Plain container for the run parameters of this notebook.

    Every constructor argument is stored unchanged on the instance under the
    same name; no validation or conversion is performed.
    """

    def __init__(
        self,
        checkpoint_path: str,
        dset_path: str,
        h5ad_dset_path: str,
        dset_name: str,
        embed_type: str,
        has_label: bool,
        modality: int,
    ) -> None:
        # Checkpoint and dataset locations.
        self.checkpoint_path = checkpoint_path
        self.dset_path = dset_path
        self.h5ad_dset_path = h5ad_dset_path
        # Dataset name, used when naming output files.
        self.dset_name = dset_name
        # Which embeddings to produce and how the dataset is annotated.
        self.embed_type = embed_type
        self.has_label = has_label
        self.modality = modality
# Run configuration: edit these values to point at another checkpoint/dataset.
args = NotebookSettings(
    checkpoint_path='./checkpoint-271755',
    dset_path='/public/share/t_lgl/scFM/evaluation/data_set/processed_data/09_hPBMC_10k_scGLUE_10xDemo/',
    h5ad_dset_path="/public/share/t_lgl/scFM/evaluation/data_set/raw_data/09_hPBMC_10k_scGLUE_10xDemo/",
    dset_name="hPBMC_10k_scGLUE_10xDemo",
    embed_type="RNA_and_ATAC",  # options: RNA, ATAC, RNA_and_ATAC
    has_label=True,
    modality=0,
)
checkpoint_path, dset_name = args.checkpoint_path, args.dset_name

# Load the dataset from disk and append a constant "modality" column to it.
data = load_from_disk(args.dset_path)
data_len = data.num_rows
data_modality = Dataset.from_dict({"modality": [args.modality for _ in range(data_len)]})
dataset = concatenate_datasets([data, data_modality], axis=1)
# One fixed length value (32000) per example; handed to Pretrainer later as
# `example_lengths_file`.
sorted_len = [32000 for _ in range(data_len)]

# define output directory path
# The run name combines the checkpoint's parent directory name (the part
# after the last '__') with the checkpoint directory name.
output_path = './get_embeds'
ckpt_parts = checkpoint_path.split('/')
run_name = "/" + ckpt_parts[-2].split('__')[-1] + "_" + ckpt_parts[-1] + "_temp"
output_dir = f"{output_path}{run_name}"
os.makedirs(output_dir, exist_ok=True)

### 3. Prior knowledge
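This workflow relies on the prior knowledge components listed below. The cell that follows loads only two of them directly, the token dictionary and the peak IDF values; the remaining items are not loaded in this cell.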

Token dictionary mapping genes to tokens

Homologous gene indices between species

Peak IDF (Inverse Document Frequency) values

Motif matrix for peak regions

In [3]:
# Directory holding the prior-knowledge files shipped with the model.
dict_dir = './prior_data'
token_dict_path = f'{dict_dir}/hm_ENSG2token_dict.pickle'
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load token dictionaries obtained from a trusted source.
with open(token_dict_path, "rb") as fp:
    token_dictionary = pickle.load(fp)

# Priors handed to the model and to the data collator.
priors = {}
# Open the .npz archive as a context manager so its file handle is closed as
# soon as the array has been read, instead of staying open for the session.
with np.load(f'{dict_dir}/peakToken_idf.npz') as peak_idf_npz:
    priors['peak_idf'] = peak_idf_npz['arr_0']

### 4. Model Loading and Preparation

In [4]:

# Read the weights and the JSON configuration stored in the checkpoint folder.
loaded_state_dict = load_model_with_index(checkpoint_path)
with open(f'{checkpoint_path}/config.json', 'r') as config_file:
    config = json.load(config_file)

loaded_config = BertConfig(**config)
# Force the ATAC setting `x_peak_inacc_ratio` to 0 before the model is built.
loaded_config["atac_model_cfg"]["x_peak_inacc_ratio"] = 0

# Build the model and copy the checkpoint weights into it. strict=False means
# keys missing from, or unexpected in, the checkpoint are tolerated.
model = TotalModel_downstream(loaded_config, priors=priors)
model.load_state_dict(loaded_state_dict, strict=False)
del loaded_state_dict  # drop the reference to the raw state dict
print(model)

# Evaluation-only arguments for the trainer.
batch_size = 16
training_args = TrainingArguments(
    per_device_eval_batch_size=batch_size,
    output_dir=output_dir,
    half_precision_backend='apex',
    dataloader_num_workers=8,
    dataloader_prefetch_factor=2,
    do_train=False,
    do_eval=True,
    group_by_length=False,
    length_column_name="length",
    disable_tqdm=False,
    save_safetensors=False,
)

# Collator that turns dataset rows into model inputs.
inference_collator = DataCollatorForLanguageModeling_Inference(
    tokenizer=PreCollator(token_dictionary=token_dictionary),
    mlm=True,
    config=loaded_config,
    priors=priors,
)
trainer = Pretrainer(
    model=model,
    args=training_args,
    example_lengths_file=sorted_len,
    token_dictionary=token_dictionary,
    data_collator=inference_collator,
)

BertForMaskedLMWithRNA has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Now using our modified model!!!
Now using our modified model!!!
Now using our modified model!!!
Now using our modified model!!!
Now using our modified model!!!
Now using our modified model!!!
Now using our modified model!!!
Now using our modified model!!!
Now using our modified model!!!
Now using our modified model!!!
{'CTA_num': 164, 'R2R_rec': 'poisson', 'attention_probs_dropout_prob': 0.02, 'attn_cfg': {'num_heads': 8}, 'attn_layer_idx': [6], 'd_intermediate': 0, 'd_model': 512, 'encoder_type': 'mamba', 'expand': 2, 'fused_add_norm': True, 'head_dim': 64, 'hidden_act': 'silu', 'hidden_dropout_prob': 0.02, 'hidden_size': 512, 'inidtializer_range': 0.02, 'intermediate_size': 1024, 'kl_dim_rna': 256, 'layer_norm_eps': 1e-12, 'mamba_version': 'v_manual', 'max_position_embeddings': 2049, 'n_groups': 8, 'n_layer': 12, 'num_attention_heads': 4, 'num_heads': 16, 'num_hidden_layers': 12, 'pad_token_id': 0, 'pad_vocab_size_multiple': 16, 'residual_in_fp32': True, 'rms_norm': True, 'rna_max_in

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


TotalModel_downstream(
  (encoder_rna): BertForMaskedLMWithRNA(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(40990, 512, padding_idx=0)
        (values_embeddings): ContinuousValueEncoder(
          (linear1): Linear(in_features=1, out_features=512, bias=True)
          (activation): ReLU()
          (linear2): Linear(in_features=512, out_features=512, bias=True)
          (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (concat_embeddings): Sequential(
          (cat_fc): Linear(in_features=1024, out_features=512, bias=True)
          (cat_ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (cat_gelu): QuickGELU()
          (cat_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (position_embeddings): Embedding(8195, 512)
        (token_type_embeddings): Embedding(2, 512)
        (LayerNorm): LayerNorm((512,), 

/public/share/t_lgl/scFM/shared/Final_models_250514/SCARF/conda_scarf/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


### 5. Outputs of SCARF
For each run, the script generates:

Cell embeddings (RNA and/or ATAC)

Cell names matching the embeddings

Label files (if available)

In [5]:

# Reject unsupported embedding types before any inference is run. Without
# this check an unrecognised value matches none of the forward branches in
# the loop below, so the whole dataset is processed and empty arrays are saved.
supported_embed_types = ('RNA', 'ATAC', 'RNA_and_ATAC')
if args.embed_type not in supported_embed_types:
    raise ValueError(
        f"Unsupported embed_type {args.embed_type!r}; "
        f"expected one of {supported_embed_types}"
    )

test_dataloader = trainer.get_test_dataloader(dataset)
trainer.model.eval()

# Batch entries that are moved to the GPU. The list depends only on
# embed_type, so it is chosen once instead of once per batch.
if args.embed_type == 'RNA':
    to_cuda_names = ['rna_gene_ids', 'rna_gene_values', 'rna_attention_mask', 'species', 'modality']
elif args.embed_type == 'ATAC':
    to_cuda_names = ['atac_cell_peaks', 'atac_peak_ids', 'atac_peak_idfs', 'atac_attention_mask', 'species', 'modality']
else:
    to_cuda_names = ['rna_gene_ids', 'rna_gene_values', 'rna_attention_mask', 'atac_peak_ids', 'atac_peak_idfs',
                     'atac_attention_mask', 'species', 'modality']

# Which embedding files are written for this run.
want_rna = args.embed_type in ('RNA', 'RNA_and_ATAC')
want_atac = args.embed_type in ('ATAC', 'RNA_and_ATAC')

is_only_emb = True
direct_outs = [False]
for direct_out in direct_outs:
    rna_res, atac_res, cell_names = [], [], []
    for inputs in tqdm(test_dataloader):
        with torch.no_grad():
            # Keep only the entries the model needs that are present in the batch.
            inputs_new = {
                each_name: inputs[each_name].to("cuda")
                for each_name in to_cuda_names
                if each_name in inputs
            }
            inputs_new['cell_name'] = inputs['cell_name']
            inputs_new['direct_out'] = direct_out
            # Each model call returns dict(s) keyed by cell name; the values
            # are collected as that cell's embedding.
            if args.embed_type == 'RNA_and_ATAC':
                rna_res_dict, atac_res_dict, pred, gt = trainer.model.match_forward(**inputs_new)
                rna_res.extend(rna_res_dict.values())
                atac_res.extend(atac_res_dict.values())
                cell_names.extend(rna_res_dict.keys())
                del rna_res_dict, atac_res_dict, pred, gt
            elif args.embed_type == 'RNA':
                rna_res_dict = trainer.model.get_rna_embeddings(**inputs_new)
                rna_res.extend(rna_res_dict.values())
                cell_names.extend(rna_res_dict.keys())
                del rna_res_dict
            elif args.embed_type == 'ATAC':
                atac_res_dict = trainer.model.get_atac_embeddings(**inputs_new)
                atac_res.extend(atac_res_dict.values())
                cell_names.extend(atac_res_dict.keys())
                del atac_res_dict
    save_path = f"{output_dir}/{dset_name}_{args.embed_type}_bs{batch_size}_directOut{int(direct_out)}"
    os.makedirs(save_path, exist_ok=True)

    # Keep at most data_len entries.
    # NOTE(review): presumably guards against extra rows from the dataloader - confirm.
    cell_names = cell_names[:data_len]

    if want_rna:
        rna_res = rna_res[:data_len]
        rna_embs = np.array(rna_res)
        print(data_len, rna_embs.shape, len(cell_names))
        np.save(f'{save_path}/rna_cell_embs.npy', rna_embs.astype(np.float32))

    if want_atac:
        atac_res = atac_res[:data_len]
        atac_embs = np.array(atac_res)
        print(data_len, atac_embs.shape, len(cell_names))
        np.save(f'{save_path}/atac_cell_embs.npy', atac_embs.astype(np.float32))

    # The cell names are shared by both modalities, so they are written once.
    np.save(f'{save_path}/cell_names.npy', np.array(cell_names))

    if args.has_label:
        cell_names = dataset['cell_name']
        cell_types = dataset['cell_types']
        name2type_dict = {cell_name: [cell_type] for cell_name, cell_type in zip(cell_names, cell_types)}
        # Build the table with one row per cell directly, rather than creating
        # a one-row frame with a column per cell and transposing it.
        name2type_df = pd.DataFrame.from_dict(name2type_dict, orient='index')
        # NOTE: written with to_csv's default ',' separator despite the
        # '.tsv.gz' file name.
        name2type_df.to_csv(f'{save_path}/labels.tsv.gz', header=False, index=True)


  0%|          | 0/602 [00:00<?, ?it/s]

100%|██████████| 602/602 [03:30<00:00,  2.86it/s]


9631 (0,) 0
9631 (0,) 0


100%|██████████| 602/602 [03:28<00:00,  2.89it/s]


9631 (0,) 0
9631 (0,) 0
