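## scNym: https://github.com/calico/scnym

Evaluation of scNym cell type annotation on the Segerstolpe dataset using 5-fold stratified cross-validation, with the model trained on the highly variable genes selected from each training split.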

In [1]:
import scnym
from scnym.api import scnym_api
import torch
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import scanpy as sc




In [2]:
def split_data(data_path: str, 
               fold: int=1,
               folds: int=5,
               batch_key: str="patientID", 
               label_key: str="cell_type",
               seed: int=42,
               HVG: bool=True,
               HVGs: int=2000):
    """Load a dataset and return the train/test split of one stratified CV fold.

    The data at ``data_path`` is read with scanpy, the column ``batch_key`` of
    ``.obs`` is copied to ``.obs["batch"]``, and the cells are partitioned with
    a shuffled ``StratifiedKFold`` stratified on ``.obs[label_key]``.

    Parameters
    ----------
    data_path
        Path of the file passed to ``sc.read``.
    fold
        1-based index of the fold to return (``1 <= fold <= folds``).
    folds
        Total number of stratified folds.
    batch_key
        Name of the ``.obs`` column copied into ``.obs["batch"]``.
    label_key
        Name of the ``.obs`` column used for stratification.
    seed
        Seed applied to Python's ``random``, NumPy, PyTorch, TensorFlow and to
        the fold shuffling.
    HVG
        If True, restrict both splits to the highly variable genes computed on
        the training split only.
    HVGs
        Number of highly variable genes to select when ``HVG`` is True.

    Returns
    -------
    tuple
        ``(train_adata, test_adata)`` for the requested fold.

    Raises
    ------
    ValueError
        If ``fold`` is not in ``1..folds``.

    Notes
    -----
    Calling this function changes global state: it reseeds the global random
    number generators and sets the cuDNN deterministic/benchmark flags.
    """
    # Fail early and explicitly: previously an out-of-range fold fell through
    # the loop and returned None, which only surfaced as an unpacking error in
    # the caller.
    if not 1 <= fold <= folds:
        raise ValueError(
            f"fold must be between 1 and {folds} (inclusive), got {fold}"
        )

    adata = sc.read(data_path, cache=True)

    adata.obs["batch"] = adata.obs[batch_key]

    # Ensure reproducibility
    def rep_seed(seed):
        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the random seed for PyTorch CUDA (GPU) operations
            torch.cuda.manual_seed(seed)
            # Set the random seed for all CUDA devices (if multiple GPUs are available)
            torch.cuda.manual_seed_all(seed)

        # Set the random seed for CPU-based PyTorch operations
        torch.manual_seed(seed)

        # Set the random seed for NumPy
        np.random.seed(seed)

        # Set the random seed for Python's built-in 'random' module
        random.seed(seed)

        # Set the random seed for TensorFlow
        tf.random.set_seed(seed)

        # Set CuDNN to deterministic mode for PyTorch (GPU)
        torch.backends.cudnn.deterministic = True

        # Disable CuDNN's benchmarking mode for deterministic behavior
        torch.backends.cudnn.benchmark = False

    rep_seed(seed)

    # Initialize Stratified K-Fold. The shuffle is driven by `seed` (it was a
    # hard-coded 42 before, identical to the default seed), so the `seed`
    # argument now actually controls the partition.
    stratified_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    # Walk through the folds until the requested one is reached. Subsetting
    # followed by .copy() yields independent objects, so no up-front copies of
    # the full dataset are needed.
    fold_splits = stratified_kfold.split(adata.X, adata.obs[label_key])
    for fold_counter, (train_index, test_index) in enumerate(fold_splits, start=1):
        if fold_counter != fold:
            continue

        train_adata = adata[train_index, :].copy()
        test_adata = adata[test_index, :].copy()

        if HVG:
            # Gene selection is computed on the training cells only and the
            # same gene mask is then applied to the test cells.
            sc.pp.highly_variable_genes(train_adata, n_top_genes=HVGs, flavor="cell_ranger")
            hvg_mask = train_adata.var["highly_variable"]
            test_adata = test_adata[:, hvg_mask].copy()
            train_adata = train_adata[:, hvg_mask].copy()

        return train_adata, test_adata

    # Defensive: the range check above should make this unreachable.
    raise RuntimeError(f"Fold {fold} was not produced by the {folds}-fold splitter")


In [3]:
data_path = "../../../data/processed/data_for_evaluating_cell_type_annotation/Segerstolpe.h5ad"

# Number of cross-validation folds. It is passed to split_data explicitly so the
# loop range and the splitter cannot drift apart.
n_folds = 5

# One DataFrame of predictions per fold; concatenated once after the loop
# instead of growing `results` with pd.concat on every iteration.
fold_frames = []
for fold in range(1, n_folds + 1):

    adata_train, adata_test = split_data(data_path=data_path, fold=fold, folds=n_folds)

    # Train model
    # NOTE(review): every fold uses the same out_path, so each fold's training
    # output presumably replaces the previous fold's — confirm this is intended.
    scnym_api(
        adata=adata_train,
        task='train',
        groupby='cell_type',
        out_path='./scnym_outputs',
        config='no_new_identity',
    )

    # Predict test data; the predictions are read back from adata_test.obs['scNym']
    scnym_api(
        adata=adata_test,
        task='predict',
        key_added='scNym',
        config='no_new_identity',
        trained_model='./scnym_outputs'
    )

    # Collect this fold's predictions alongside the true labels
    predictions = adata_test.obs['scNym'].to_list()
    fold_frames.append(pd.DataFrame({"pred": predictions,
                                     "true_label": adata_test.obs["cell_type"].to_list(),
                                     "fold": [fold] * len(predictions)}))

# Stack all folds; ignore_index=True gives a fresh 0..n-1 index, replacing the
# separate reset_index call.
results = pd.concat(fold_frames, axis=0, ignore_index=True)

  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2811, 2000)
y:  (2811,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.01115454826503992
running_acc  :  0.06640625
corrects: 17.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.6446
TRAIN EPOCH corrects: 1467.000000 | total: 2276.000000
val Loss : 0.0036
val Acc : 0.8024
VAL EPOCH corrects: 203.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0029217652045190334
running_acc  :  0.828125
corrects: 212.000000 | total: 256.000000
train Loss : 0.0023
train Acc : 0.8629
TRAIN EPOCH corrects: 1964.000000 | total: 2276.000000
val Loss : 0.0020
val Acc : 0.8617
VAL EPOCH corrects: 218.000000 | total: 253.000000
Epoch 2/99
----------
Iter :

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 35.82it/s]

Extracting model embeddings...



  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2811, 2000)
y:  (2811,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.010858035646378994
running_acc  :  0.0859375
corrects: 22.000000 | total: 256.000000
train Loss : 0.0054
train Acc : 0.6547
TRAIN EPOCH corrects: 1490.000000 | total: 2276.000000
val Loss : 0.0038
val Acc : 0.7708
VAL EPOCH corrects: 195.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0024549320805817842
running_acc  :  0.859375
corrects: 220.000000 | total: 256.000000
train Loss : 0.0023
train Acc : 0.8634
TRAIN EPOCH corrects: 1965.000000 | total: 2276.000000
val Loss : 0.0015
val Acc : 0.9012
VAL EPOCH corrects: 228.000000 | total: 253.000000
Epoch 2/99
----------
Iter :

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 43.48it/s]

Extracting model embeddings...



  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2811, 2000)
y:  (2811,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.01094803772866726
running_acc  :  0.0546875
corrects: 14.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.6336
TRAIN EPOCH corrects: 1442.000000 | total: 2276.000000
val Loss : 0.0036
val Acc : 0.7787
VAL EPOCH corrects: 197.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0027692916337400675
running_acc  :  0.84375
corrects: 216.000000 | total: 256.000000
train Loss : 0.0023
train Acc : 0.8598
TRAIN EPOCH corrects: 1957.000000 | total: 2276.000000
val Loss : 0.0021
val Acc : 0.8854
VAL EPOCH corrects: 224.000000 | total: 253.000000
Epoch 2/99
----------
Iter :  

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 45.45it/s]

Extracting model embeddings...



  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2811, 2000)
y:  (2811,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.010842676274478436
running_acc  :  0.08203125
corrects: 21.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.6538
TRAIN EPOCH corrects: 1488.000000 | total: 2276.000000
val Loss : 0.0032
val Acc : 0.8221
VAL EPOCH corrects: 208.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0028882422484457493
running_acc  :  0.84765625
corrects: 217.000000 | total: 256.000000
train Loss : 0.0023
train Acc : 0.8616
TRAIN EPOCH corrects: 1961.000000 | total: 2276.000000
val Loss : 0.0017
val Acc : 0.8854
VAL EPOCH corrects: 224.000000 | total: 253.000000
Epoch 2/99
----------
Ite

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 54.73it/s]

Extracting model embeddings...



  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2812, 2000)
y:  (2812,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.010919968597590923
running_acc  :  0.08203125
corrects: 21.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.6293
TRAIN EPOCH corrects: 1433.000000 | total: 2277.000000
val Loss : 0.0033
val Acc : 0.8379
VAL EPOCH corrects: 212.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.002854082267731428
running_acc  :  0.8046875
corrects: 206.000000 | total: 256.000000
train Loss : 0.0024
train Acc : 0.8533
TRAIN EPOCH corrects: 1943.000000 | total: 2277.000000
val Loss : 0.0014
val Acc : 0.9051
VAL EPOCH corrects: 229.000000 | total: 253.000000
Epoch 2/99
----------
Iter 

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 60.17it/s]

Extracting model embeddings...





In [4]:
# Write the predictions of all folds to disk; the row index is kept as the first CSV column
results.to_csv(path_or_buf='results/scNym_hvgs_output.csv', index=True)

In [5]:
# Gather every label that occurs either as ground truth or as a prediction
true_classes = np.unique(results["true_label"])
pred_classes = np.unique(results["pred"])
unique_labels = np.union1d(true_classes, pred_classes)

# Map the string labels to integer codes using one encoder shared by both columns
label_encoder_temp = LabelEncoder().fit(unique_labels)
y_true = label_encoder_temp.transform(results["true_label"])
y_pred = label_encoder_temp.transform(results["pred"])

# Fraction of cells whose predicted label equals the true label, pooled over all folds
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9083665338645418
