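## scNym: [https://github.com/calico/scnym](https://github.com/calico/scnym)

Stratified 5-fold cross-validation of scNym cell type annotation on the Segerstolpe dataset; predictions from all folds are pooled and scored with overall accuracy.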

In [1]:
import scnym
from scnym.api import scnym_api
import torch
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import scanpy as sc




In [6]:
def split_data(data_path: str, 
               fold: int=1,
               folds: int=5,
               batch_key: str="patientID", 
               label_key: str="cell_type",
               seed: int=42,
               HVG: bool=False,
               HVGs: int=2000):
    """
    Load an AnnData file and return the train/test split of one stratified CV fold.

    Parameters
    ----------
    data_path
        Path of the file handed to ``sc.read``.
    fold
        1-based index of the fold to return; must satisfy ``1 <= fold <= folds``.
    folds
        Number of stratified folds the data is divided into.
    batch_key
        Column of ``adata.obs`` that is copied into ``adata.obs["batch"]``.
    label_key
        Column of ``adata.obs`` used to stratify the folds.
    seed
        Seed applied to PyTorch, NumPy, ``random`` and TensorFlow, and used as
        the ``random_state`` of the stratified splitter.
    HVG
        If True, keep only highly variable genes. They are selected on the
        training split and the same gene mask is applied to the test split.
    HVGs
        Number of highly variable genes to keep when ``HVG`` is True.

    Returns
    -------
    tuple
        ``(train_adata, test_adata)`` for the requested fold, each an
        independent copy.

    Raises
    ------
    ValueError
        If ``fold`` does not identify one of the ``folds`` splits.
    """
    # Fail fast with a clear message instead of silently returning None,
    # which previously surfaced as an unpacking error at the call site.
    if not 1 <= fold <= folds:
        raise ValueError(f"fold must be between 1 and {folds} (inclusive), got {fold}")

    adata = sc.read(data_path, cache=True)

    adata.obs["batch"] = adata.obs[batch_key]

    # Ensure reproducibility
    def rep_seed(seed):
        """Seed every random number generator this notebook's libraries rely on."""
        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the random seed for PyTorch CUDA (GPU) operations
            torch.cuda.manual_seed(seed)
            # Set the random seed for all CUDA devices (if multiple GPUs are available)
            torch.cuda.manual_seed_all(seed)

        # Set the random seed for CPU-based PyTorch operations
        torch.manual_seed(seed)

        # Set the random seed for NumPy
        np.random.seed(seed)

        # Set the random seed for Python's built-in 'random' module
        random.seed(seed)

        # Set the random seed for TensorFlow
        tf.random.set_seed(seed)

        # Set CuDNN to deterministic mode for PyTorch (GPU)
        torch.backends.cudnn.deterministic = True

        # Disable CuDNN's benchmarking mode for deterministic behavior
        torch.backends.cudnn.benchmark = False

    rep_seed(seed)

    # Initialize Stratified K-Fold. The splitter honours `seed` (it used to be
    # pinned to 42, which matches the default and so keeps existing splits).
    stratified_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    # Walk the splits until the requested fold; earlier folds are skipped
    # without copying or preprocessing data that would be thrown away.
    splits = stratified_kfold.split(adata.X, adata.obs[label_key])
    for fold_counter, (train_index, test_index) in enumerate(splits, start=1):
        if fold_counter != fold:
            continue

        # .copy() materialises independent objects, so no up-front copies of
        # the full dataset are needed.
        train_adata = adata[train_index, :].copy()
        test_adata = adata[test_index, :].copy()

        if HVG:
            # Select genes on the training split only, then apply the same
            # mask to the test split so both share one feature space.
            sc.pp.highly_variable_genes(train_adata, n_top_genes=HVGs, flavor="cell_ranger")
            hvg_mask = train_adata.var["highly_variable"]
            test_adata = test_adata[:, hvg_mask].copy()
            train_adata = train_adata[:, hvg_mask].copy()

        return train_adata, test_adata

    # Only reachable when `fold` passed the range check but matched no split
    # (e.g. a non-integer value).
    raise ValueError(f"fold {fold!r} does not match any of the {folds} splits")


In [7]:
data_path = "../../../data/processed/data_for_evaluating_cell_type_annotation/Segerstolpe.h5ad"

# Number of cross-validation folds. It is passed to split_data as well, so the
# loop range and the splitter cannot drift apart.
n_folds = 5

# One DataFrame per fold; they are concatenated once after the loop instead of
# growing `results` with a concat on every iteration.
fold_results = []
for fold in range(1, n_folds + 1):

    adata_train, adata_test = split_data(data_path=data_path, fold=fold, folds=n_folds)

    # Train model
    scnym_api(
        adata=adata_train,
        task='train',
        groupby='cell_type',
        out_path='./scnym_outputs',
        config='no_new_identity',
    )

    # Predict test data; the predictions are read back from adata_test.obs['scNym'] below
    scnym_api(
        adata=adata_test,
        task='predict',
        key_added='scNym',
        config='no_new_identity',
        trained_model='./scnym_outputs'
    )

    # Collect predictions and ground truth for this fold
    # (the scalar `fold` is broadcast to every row)
    fold_results.append(
        pd.DataFrame({"pred": adata_test.obs['scNym'].to_list(),
                      "true_label": adata_test.obs["cell_type"].to_list(),
                      "fold": fold})
    )

# Stack all folds; ignore_index yields a fresh 0..n-1 index without an
# in-place reset afterwards.
results = pd.concat(fold_results, axis=0, ignore_index=True)



No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2811, 17883)
y:  (2811,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011322320438921452
running_acc  :  0.04296875
corrects: 11.000000 | total: 256.000000
train Loss : 0.0061
train Acc : 0.5703
TRAIN EPOCH corrects: 1298.000000 | total: 2276.000000
val Loss : 0.0046
val Acc : 0.6957
VAL EPOCH corrects: 176.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.003347627818584442
running_acc  :  0.78125
corrects: 200.000000 | total: 256.000000
train Loss : 0.0026
train Acc : 0.8326
TRAIN EPOCH corrects: 1895.000000 | total: 2276.000000
val Loss : 0.0025
val Acc : 0.8300
VAL EPOCH corrects: 210.000000 | total: 253.000000
Epoch 2/99
----------
Iter :

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 12.43it/s]

Extracting model embeddings...





No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2811, 17883)
y:  (2811,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011210799217224121
running_acc  :  0.0390625
corrects: 10.000000 | total: 256.000000
train Loss : 0.0061
train Acc : 0.5800
TRAIN EPOCH corrects: 1320.000000 | total: 2276.000000
val Loss : 0.0045
val Acc : 0.7273
VAL EPOCH corrects: 184.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.00354150403290987
running_acc  :  0.76171875
corrects: 195.000000 | total: 256.000000
train Loss : 0.0027
train Acc : 0.8260
TRAIN EPOCH corrects: 1880.000000 | total: 2276.000000
val Loss : 0.0023
val Acc : 0.8458
VAL EPOCH corrects: 214.000000 | total: 253.000000
Epoch 2/99
----------
Iter 

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 13.10it/s]

Extracting model embeddings...





No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2811, 17883)
y:  (2811,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011280572041869164
running_acc  :  0.03515625
corrects: 9.000000 | total: 256.000000
train Loss : 0.0061
train Acc : 0.5747
TRAIN EPOCH corrects: 1308.000000 | total: 2276.000000
val Loss : 0.0041
val Acc : 0.7352
VAL EPOCH corrects: 186.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.003396754153072834
running_acc  :  0.76171875
corrects: 195.000000 | total: 256.000000
train Loss : 0.0026
train Acc : 0.8374
TRAIN EPOCH corrects: 1906.000000 | total: 2276.000000
val Loss : 0.0023
val Acc : 0.8300
VAL EPOCH corrects: 210.000000 | total: 253.000000
Epoch 2/99
----------
Iter

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 13.98it/s]

Extracting model embeddings...





No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2811, 17883)
y:  (2811,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011091521941125393
running_acc  :  0.03515625
corrects: 9.000000 | total: 256.000000
train Loss : 0.0060
train Acc : 0.5800
TRAIN EPOCH corrects: 1320.000000 | total: 2276.000000
val Loss : 0.0041
val Acc : 0.7589
VAL EPOCH corrects: 192.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.003266498912125826
running_acc  :  0.80859375
corrects: 207.000000 | total: 256.000000
train Loss : 0.0026
train Acc : 0.8370
TRAIN EPOCH corrects: 1905.000000 | total: 2276.000000
val Loss : 0.0028
val Acc : 0.8538
VAL EPOCH corrects: 216.000000 | total: 253.000000
Epoch 2/99
----------
Iter

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 11.24it/s]

Extracting model embeddings...





No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (2812, 17883)
y:  (2812,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011192048899829388
running_acc  :  0.05859375
corrects: 15.000000 | total: 256.000000
train Loss : 0.0062
train Acc : 0.5665
TRAIN EPOCH corrects: 1290.000000 | total: 2277.000000
val Loss : 0.0042
val Acc : 0.7352
VAL EPOCH corrects: 186.000000 | total: 253.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.003049621358513832
running_acc  :  0.8125
corrects: 208.000000 | total: 256.000000
train Loss : 0.0029
train Acc : 0.8103
TRAIN EPOCH corrects: 1845.000000 | total: 2277.000000
val Loss : 0.0025
val Acc : 0.8261
VAL EPOCH corrects: 209.000000 | total: 253.000000
Epoch 2/99
----------
Iter : 

Finding cell types: 100%|██████████| 1/1 [00:00<00:00, 13.75it/s]

Extracting model embeddings...





In [8]:
# Write the pooled per-fold predictions to disk, keeping the row index as the first column
output_csv = 'results/scNym_output.csv'
results.to_csv(output_csv, index=True)

In [9]:
# Sorted vocabulary of every label that occurs in the ground truth or the predictions
unique_labels = np.union1d(results["true_label"], results["pred"])

# Fit one encoder on that shared vocabulary so both columns map to the same integer codes
label_encoder_temp = LabelEncoder().fit(unique_labels)
y_true = label_encoder_temp.transform(results["true_label"])
y_pred = label_encoder_temp.transform(results["pred"])

# Fraction of cells, pooled over all folds, whose predicted label equals the true label
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9143426294820717
