## scNym (https://github.com/calico/scnym): 5-fold cross-validated cell type annotation

In [1]:
import random
from pathlib import Path

import scnym
from scnym.api import scnym_api
import torch
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import scanpy as sc




In [2]:
def split_data(data_path: str,
               fold: int=1,
               folds: int=5,
               batch_key: str="patientID",
               label_key: str="cell_type",
               seed: int=42,
               HVG: bool=False,
               HVGs: int=2000):
    """Load a dataset and return the train/test split of one stratified CV fold.

    The file at ``data_path`` is read with ``sc.read(..., cache=True)``, the
    column ``obs[batch_key]`` is copied into ``obs["batch"]``, the global random
    number generators are seeded, and the observations are partitioned with a
    shuffled ``StratifiedKFold`` stratified on ``obs[label_key]``.

    Parameters
    ----------
    data_path
        Path of the data file handed to ``sc.read``.
    fold
        1-based index of the fold whose split is returned (``1 <= fold <= folds``).
    folds
        Number of splits passed to ``StratifiedKFold``.
    batch_key
        Name of the ``obs`` column that is copied into ``obs["batch"]``.
    label_key
        Name of the ``obs`` column used for stratification.
    seed
        Seed applied to PyTorch (CPU and, when available, CUDA), NumPy,
        Python's ``random`` module and TensorFlow.
    HVG
        If ``True``, highly variable genes are selected on the training split
        (``flavor="cell_ranger"``) and both splits are restricted to them.
    HVGs
        Number of top genes requested when ``HVG`` is ``True``.

    Returns
    -------
    tuple
        ``(train, test)``: independent copies of the training and the held-out
        observations of the requested fold.

    Raises
    ------
    ValueError
        If ``fold`` is not in ``1..folds``. Previously such a value silently
        produced ``None``.
    """
    # Fail fast, before any expensive I/O, instead of falling off the end of
    # the fold loop and implicitly returning None.
    if not 1 <= fold <= folds:
        raise ValueError(
            f"fold must be between 1 and folds={folds} (inclusive), got {fold}"
        )

    adata = sc.read(data_path, cache=True)

    adata.obs["batch"] = adata.obs[batch_key]

    # Ensure reproducibility
    def rep_seed(seed):
        """Seed every random number generator this notebook touches."""
        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the random seed for PyTorch CUDA (GPU) operations
            torch.cuda.manual_seed(seed)
            # Set the random seed for all CUDA devices (if multiple GPUs are available)
            torch.cuda.manual_seed_all(seed)

        # Set the random seed for CPU-based PyTorch operations
        torch.manual_seed(seed)

        # Set the random seed for NumPy
        np.random.seed(seed)

        # Set the random seed for Python's built-in 'random' module
        random.seed(seed)

        # Set the random seed for TensorFlow
        tf.random.set_seed(seed)

        # Set CuDNN to deterministic mode for PyTorch (GPU)
        torch.backends.cudnn.deterministic = True

        # Disable CuDNN's benchmarking mode for deterministic behavior
        torch.backends.cudnn.benchmark = False

    rep_seed(seed)

    # Initialize Stratified K-Fold
    # NOTE(review): the splitter uses the fixed random_state 42 rather than
    # `seed`, so fold membership does not change with `seed` - confirm this is
    # intended before wiring `seed` through.
    stratified_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    # Advance to the requested fold without materialising the earlier ones;
    # only the index arrays of the skipped folds are generated.
    splits = stratified_kfold.split(adata.X, adata.obs[label_key])
    for fold_counter, (train_index, test_index) in enumerate(splits, start=1):
        if fold_counter == fold:
            break

    train_adata = adata[train_index, :].copy()
    test_adata = adata[test_index, :].copy()

    if HVG:
        # Genes are selected on the training split only and the same gene
        # subset is then applied to the held-out split.
        sc.pp.highly_variable_genes(train_adata, n_top_genes=HVGs, flavor="cell_ranger")
        hvg_mask = train_adata.var["highly_variable"]
        test_adata = test_adata[:, hvg_mask].copy()
        train_adata = train_adata[:, hvg_mask].copy()

    return train_adata, test_adata


In [3]:
data_path = "../../../data/processed/data_for_evaluating_cell_type_annotation/MacParland.h5ad"

# Number of cross-validation folds. The same value drives the loop and is
# passed to split_data so the two cannot drift apart.
n_folds = 5

# One DataFrame per fold; concatenated once after the loop instead of growing
# `results` with pd.concat on every iteration.
fold_results = []
for fold in range(1, n_folds + 1):

    adata_train, adata_test = split_data(data_path=data_path, fold=fold, folds=n_folds)

    # Train model
    scnym_api(
        adata=adata_train,
        task='train',
        groupby='cell_type',
        out_path='./scnym_outputs',
        config='no_new_identity',
    )

    # Predict test data (predictions are read back from adata_test.obs['scNym'])
    scnym_api(
        adata=adata_test,
        task='predict',
        key_added='scNym',
        config='no_new_identity',
        trained_model='./scnym_outputs'
    )

    # Collect predictions, true labels and the fold number for this fold
    predictions = adata_test.obs['scNym'].to_list()
    fold_results.append(pd.DataFrame({"pred": predictions,
                                      "true_label": adata_test.obs["cell_type"].to_list(),
                                      "fold": [fold] * len(predictions)}))

# Stack all folds; ignore_index gives a fresh 0..n-1 index
results = pd.concat(fold_results, axis=0, ignore_index=True)



No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6755, 15386)
y:  (6755,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.01210148073732853
running_acc  :  0.0546875
corrects: 14.000000 | total: 256.000000
train Loss : 0.0059
train Acc : 0.5853
TRAIN EPOCH corrects: 3202.000000 | total: 5471.000000
val Loss : 0.0048
val Acc : 0.7714
VAL EPOCH corrects: 469.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.002864859299734235
running_acc  :  0.80859375
corrects: 207.000000 | total: 256.000000
train Loss : 0.0023
train Acc : 0.8450
TRAIN EPOCH corrects: 4623.000000 | total: 5471.000000
val Loss : 0.0027
val Acc : 0.8799
VAL EPOCH corrects: 535.000000 | total: 608.000000
Epoch 2/99
----------
Iter 

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 11.64it/s]


Extracting model embeddings...
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6755, 15386)
y:  (6755,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.012197762727737427
running_acc  :  0.0546875
corrects: 14.000000 | total: 256.000000
train Loss : 0.0060
train Acc : 0.5791
TRAIN EPOCH corrects: 3168.000000 | total: 5471.000000
val Loss : 0.0043
val Acc : 0.8174
VAL EPOCH corrects: 497.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.003031804459169507
running_acc  :  0.7890625
corrects: 202.000000 | total: 256.000000
train Loss : 0.0024
train Acc : 0.8439
TRAIN EPOCH corrects: 4617.000000 | total: 5471.000000
val Loss : 0.0051
val Acc : 0.6480
VAL EPOCH corrects: 394.000000 | total: 608.000

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 11.96it/s]


Extracting model embeddings...
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6755, 15386)
y:  (6755,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011812248267233372
running_acc  :  0.0625
corrects: 16.000000 | total: 256.000000
train Loss : 0.0059
train Acc : 0.5803
TRAIN EPOCH corrects: 3175.000000 | total: 5471.000000
val Loss : 0.0051
val Acc : 0.7072
VAL EPOCH corrects: 430.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0028045971412211657
running_acc  :  0.828125
corrects: 212.000000 | total: 256.000000
train Loss : 0.0023
train Acc : 0.8498
TRAIN EPOCH corrects: 4649.000000 | total: 5471.000000
val Loss : 0.0020
val Acc : 0.8865
VAL EPOCH corrects: 539.000000 | total: 608.000000

Finding cell types: 100%|██████████| 2/2 [00:00<00:00,  8.90it/s]


Extracting model embeddings...
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6755, 15386)
y:  (6755,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.012153228744864464
running_acc  :  0.06640625
corrects: 17.000000 | total: 256.000000
train Loss : 0.0060
train Acc : 0.5712
TRAIN EPOCH corrects: 3125.000000 | total: 5471.000000
val Loss : 0.0051
val Acc : 0.7862
VAL EPOCH corrects: 478.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.002801226219162345
running_acc  :  0.80859375
corrects: 207.000000 | total: 256.000000
train Loss : 0.0024
train Acc : 0.8399
TRAIN EPOCH corrects: 4595.000000 | total: 5471.000000
val Loss : 0.0025
val Acc : 0.8931
VAL EPOCH corrects: 543.000000 | total: 608.0

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 13.48it/s]


Extracting model embeddings...




No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6756, 15386)
y:  (6756,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.012113630771636963
running_acc  :  0.06640625
corrects: 17.000000 | total: 256.000000
train Loss : 0.0059
train Acc : 0.5682
TRAIN EPOCH corrects: 3109.000000 | total: 5472.000000
val Loss : 0.0046
val Acc : 0.7895
VAL EPOCH corrects: 480.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0030260439962148666
running_acc  :  0.78125
corrects: 200.000000 | total: 256.000000
train Loss : 0.0024
train Acc : 0.8339
TRAIN EPOCH corrects: 4563.000000 | total: 5472.000000
val Loss : 0.0033
val Acc : 0.8553
VAL EPOCH corrects: 520.000000 | total: 608.000000
Epoch 2/99
----------
Iter 

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 10.03it/s]

Extracting model embeddings...





In [4]:
# Save results
# Create the output directory first so a missing 'results/' folder does not
# discard the outcome of the training run above.
results_path = Path('results/scNym_output.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(results_path, index=True)

In [5]:
# Collect every label that occurs among the true labels or the predictions
unique_labels1 = np.unique(results["true_label"])
unique_labels2 = np.unique(results["pred"])
unique_labels = np.union1d(unique_labels1, unique_labels2)

# Map the string labels onto integer codes with one shared encoder
label_encoder_temp = LabelEncoder().fit(unique_labels)
y_true = label_encoder_temp.transform(results["true_label"])
y_pred = label_encoder_temp.transform(results["pred"])

# Fraction of cells whose predicted label equals the true label (all folds pooled)
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9515632401705353
