## scNym: https://github.com/calico/scnym 

In [1]:
import os
import random

import scnym
from scnym.api import scnym_api
import torch
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import scanpy as sc




In [2]:
def split_data(data_path: str, 
               fold: int=1,
               folds: int=5,
               batch_key: str="patientID", 
               label_key: str="cell_type",
               seed: int=42,
               HVG: bool=False,
               HVGs: int=2000):
    """Load a dataset and return the train/test split for one stratified fold.

    The file at ``data_path`` is read with ``sc.read(..., cache=True)``, the
    column ``obs[batch_key]`` is copied to ``obs["batch"]``, and the cells are
    partitioned with a shuffled ``StratifiedKFold`` stratified on
    ``obs[label_key]``.

    Parameters
    ----------
    data_path
        Path of the file handed to ``sc.read``.
    fold
        1-based index of the fold whose split is returned (``1 <= fold <= folds``).
    folds
        Number of splits used by ``StratifiedKFold``.
    batch_key
        Name of the ``obs`` column copied into ``obs["batch"]``.
    label_key
        Name of the ``obs`` column used for stratification.
    seed
        Seed applied to PyTorch, NumPy, ``random``, TensorFlow and to the
        shuffling of the K-fold splitter.
    HVG
        If True, highly variable genes are selected on the training subset
        only and both subsets are restricted to those genes.
    HVGs
        Number of highly variable genes to keep when ``HVG`` is True.

    Returns
    -------
    tuple
        ``(adata, test_adata)``: independent copies of the training and test
        subsets of the requested fold.

    Raises
    ------
    ValueError
        If ``fold`` is not in ``1..folds``.
    """
    # Fail fast instead of silently returning None for a fold that never occurs.
    if not 1 <= fold <= folds:
        raise ValueError(f"fold must be between 1 and {folds} (inclusive), got {fold}")

    adata_full = sc.read(data_path, cache=True)

    adata_full.obs["batch"] = adata_full.obs[batch_key]

    # Ensure reproducibility
    def rep_seed(seed):
        """Seed every random number generator used in this notebook."""
        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the random seed for PyTorch CUDA (GPU) operations
            torch.cuda.manual_seed(seed)
            # Set the random seed for all CUDA devices (if multiple GPUs are available)
            torch.cuda.manual_seed_all(seed)

        # Set the random seed for CPU-based PyTorch operations
        torch.manual_seed(seed)

        # Set the random seed for NumPy
        np.random.seed(seed)

        # Set the random seed for Python's built-in 'random' module
        random.seed(seed)

        # Set the random seed for TensorFlow
        tf.random.set_seed(seed)

        # Set CuDNN to deterministic mode for PyTorch (GPU)
        torch.backends.cudnn.deterministic = True

        # Disable CuDNN's benchmarking mode for deterministic behavior
        torch.backends.cudnn.benchmark = False

    rep_seed(seed)

    # Initialize Stratified K-Fold; the shuffle follows `seed` so the parameter
    # actually controls the split (the default seed of 42 keeps the old split).
    stratified_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    splits = stratified_kfold.split(adata_full.X, adata_full.obs[label_key])

    # Walk the folds but only materialise (copy / run HVG selection on) the requested one.
    for fold_number, (train_index, test_index) in enumerate(splits, start=1):
        if fold_number != fold:
            continue

        adata = adata_full[train_index, :].copy()
        test_adata = adata_full[test_index, :].copy()

        if HVG:
            # Genes are chosen on the training cells only; the same mask is
            # then applied to the test cells.
            sc.pp.highly_variable_genes(adata, n_top_genes=HVGs, flavor="cell_ranger")
            hvg_mask = adata.var["highly_variable"]
            test_adata = test_adata[:, hvg_mask].copy()
            adata = adata[:, hvg_mask].copy()

        return adata, test_adata


In [3]:
data_path = "../../../data/processed/data_for_evaluating_cell_type_annotation/Baron.h5ad"

# Number of cross-validation folds; also passed to split_data so the loop and
# the splitter cannot drift apart.
N_FOLDS = 5

# One prediction table per fold, concatenated once after the loop
fold_results = []
for fold in range(1, N_FOLDS + 1):

    adata_train, adata_test = split_data(data_path=data_path, fold=fold, folds=N_FOLDS)

    # Train model
    scnym_api(
        adata=adata_train,
        task='train',
        groupby='cell_type',
        out_path='./scnym_outputs',
        config='no_new_identity',
    )

    # Predict test data (predictions are read back from adata_test.obs['scNym'])
    scnym_api(
        adata=adata_test,
        task='predict',
        key_added='scNym',
        config='no_new_identity',
        trained_model='./scnym_outputs'
    )

    # Save this fold's predictions next to the true labels
    fold_results.append(pd.DataFrame({"pred": adata_test.obs['scNym'].to_list(),
                                      "true_label": adata_test.obs["cell_type"].to_list(),
                                      "fold": fold}))

# Stack all folds; ignore_index gives a fresh 0..n-1 index
results = pd.concat(fold_results, axis=0, ignore_index=True)

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6855, 14322)
y:  (6855,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.010722877457737923
running_acc  :  0.07421875
corrects: 19.000000 | total: 256.000000
train Loss : 0.0027
train Acc : 0.8287
TRAIN EPOCH corrects: 4601.000000 | total: 5552.000000
val Loss : 0.0009
val Acc : 0.9692
VAL EPOCH corrects: 598.000000 | total: 617.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0007319571450352669
running_acc  :  0.96484375
corrects: 247.000000 | total: 256.000000
train Loss : 0.0006
train Acc : 0.9692
TRAIN EPOCH corrects: 5381.000000 | total: 5552.000000
val Loss : 0.0006
val Acc : 0.9789
VAL EPOCH corrects: 604.000000 | total: 617.000000
Epoch 2/99
----------
It

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 14.79it/s]


Extracting model embeddings...


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6855, 14322)
y:  (6855,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011080782860517502
running_acc  :  0.05078125
corrects: 13.000000 | total: 256.000000
train Loss : 0.0028
train Acc : 0.8242
TRAIN EPOCH corrects: 4576.000000 | total: 5552.000000
val Loss : 0.0009
val Acc : 0.9433
VAL EPOCH corrects: 582.000000 | total: 617.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0007139670196920633
running_acc  :  0.9609375
corrects: 246.000000 | total: 256.000000
train Loss : 0.0006
train Acc : 0.9678
TRAIN EPOCH corrects: 5373.000000 | total: 5552.000000
val Loss : 0.0006
val Acc : 0.9741
VAL EPOCH corrects: 601.000000 | total: 617.000000
Epoch 2/99
----------
Ite

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 14.46it/s]


Extracting model embeddings...


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6855, 14322)
y:  (6855,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011110867373645306
running_acc  :  0.04296875
corrects: 11.000000 | total: 256.000000
train Loss : 0.0027
train Acc : 0.8285
TRAIN EPOCH corrects: 4600.000000 | total: 5552.000000
val Loss : 0.0012
val Acc : 0.9514
VAL EPOCH corrects: 587.000000 | total: 617.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0007264005835168064
running_acc  :  0.97265625
corrects: 249.000000 | total: 256.000000
train Loss : 0.0006
train Acc : 0.9672
TRAIN EPOCH corrects: 5370.000000 | total: 5552.000000
val Loss : 0.0006
val Acc : 0.9708
VAL EPOCH corrects: 599.000000 | total: 617.000000
Epoch 2/99
----------
It

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 14.51it/s]


Extracting model embeddings...


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6855, 14322)
y:  (6855,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011072940193116665
running_acc  :  0.0390625
corrects: 10.000000 | total: 256.000000
train Loss : 0.0027
train Acc : 0.8311
TRAIN EPOCH corrects: 4614.000000 | total: 5552.000000
val Loss : 0.0010
val Acc : 0.9676
VAL EPOCH corrects: 597.000000 | total: 617.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.000892101728823036
running_acc  :  0.96875
corrects: 248.000000 | total: 256.000000
train Loss : 0.0006
train Acc : 0.9683
TRAIN EPOCH corrects: 5376.000000 | total: 5552.000000
val Loss : 0.0006
val Acc : 0.9741
VAL EPOCH corrects: 601.000000 | total: 617.000000
Epoch 2/99
----------
Iter : 

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 14.73it/s]


Extracting model embeddings...


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6856, 14322)
y:  (6856,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.010688900016248226
running_acc  :  0.05859375
corrects: 15.000000 | total: 256.000000
train Loss : 0.0027
train Acc : 0.8271
TRAIN EPOCH corrects: 4593.000000 | total: 5553.000000
val Loss : 0.0009
val Acc : 0.9773
VAL EPOCH corrects: 603.000000 | total: 617.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0008865036070346832
running_acc  :  0.9453125
corrects: 242.000000 | total: 256.000000
train Loss : 0.0006
train Acc : 0.9678
TRAIN EPOCH corrects: 5374.000000 | total: 5553.000000
val Loss : 0.0004
val Acc : 0.9822
VAL EPOCH corrects: 606.000000 | total: 617.000000
Epoch 2/99
----------
Ite

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 12.42it/s]


Extracting model embeddings...


In [4]:
# Save results
# Create the output directory first so to_csv does not fail when it is missing
os.makedirs('results', exist_ok=True)
results.to_csv('results/scNym_output.csv', index=True)

In [5]:
# Gather every label that appears in the ground truth or in the predictions
unique_labels = np.union1d(results["true_label"], results["pred"])

# Encode both columns with one shared string -> integer mapping
label_encoder_temp = LabelEncoder().fit(unique_labels)
y_true, y_pred = (
    label_encoder_temp.transform(results[column])
    for column in ("true_label", "pred")
)

# Fraction of cells, pooled over all folds, whose prediction matches the true label
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9883300268409383
