## scNym: cross-validated cell type annotation with scNym (https://github.com/calico/scnym)

In [1]:
import random
from pathlib import Path

import scnym
from scnym.api import scnym_api
import torch
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import scanpy as sc




In [2]:
def _set_global_seeds(seed: int) -> None:
    """Seed every random number generator this notebook touches.

    Seeds PyTorch (CPU and, when available, CUDA), NumPy, Python's built-in
    ``random`` module and TensorFlow, and switches cuDNN to deterministic mode.
    """
    if torch.cuda.is_available():
        # Seed the current CUDA device and every other visible CUDA device.
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)

    # Deterministic cuDNN kernels, with benchmarking (auto-tuning) disabled.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def split_data(data_path: str, 
               fold: int=1,
               folds: int=5,
               batch_key: str="patientID", 
               label_key: str="cell_type",
               seed: int=42,
               HVG: bool=True,
               HVGs: int=2000):
    """Load a dataset and return the train/test split of one stratified CV fold.

    The file is read with ``sc.read(..., cache=True)``, ``obs[batch_key]`` is
    copied to ``obs["batch"]``, and the cells are split with a shuffled
    ``StratifiedKFold`` stratified on ``obs[label_key]``. Only the requested
    fold is materialised; earlier folds are skipped without copying data or
    computing highly variable genes.

    Parameters
    ----------
    data_path
        Path of the file passed to ``sc.read``.
    fold
        1-based index of the fold to return; must satisfy ``1 <= fold <= folds``.
    folds
        Number of splits for ``StratifiedKFold``.
    batch_key
        Column of ``adata.obs`` copied into ``adata.obs["batch"]``.
    label_key
        Column of ``adata.obs`` used to stratify the split.
    seed
        Seed applied to PyTorch, NumPy, ``random`` and TensorFlow.
    HVG
        If True, keep only highly variable genes. They are selected on the
        training cells only and the same genes are then kept in the test set.
    HVGs
        Number of highly variable genes to keep (``n_top_genes``).

    Returns
    -------
    tuple
        ``(train_adata, test_adata)`` for the requested fold.

    Raises
    ------
    ValueError
        If ``fold`` is not in ``1..folds``.
    """
    # Fail fast, before any I/O, instead of silently returning None later.
    if not 1 <= fold <= folds:
        raise ValueError(
            f"fold must be between 1 and {folds} (inclusive), got {fold}"
        )

    adata = sc.read(data_path, cache=True)

    adata.obs["batch"] = adata.obs[batch_key]

    # Ensure reproducibility
    _set_global_seeds(seed)

    # NOTE(review): the fold assignment uses a fixed random_state=42 and does
    # not follow `seed`; confirm this is intended (e.g. to keep the folds
    # identical regardless of the seed used for training).
    stratified_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    splits = stratified_kfold.split(adata.X, adata.obs[label_key])
    for fold_counter, (train_index, test_index) in enumerate(splits, start=1):
        if fold_counter != fold:
            # Skip folds that were not requested without doing any work on them.
            continue

        train_adata = adata[train_index, :].copy()
        test_adata = adata[test_index, :].copy()

        if HVG:
            # Select genes on the training cells only, then apply the same
            # gene mask to the test cells.
            sc.pp.highly_variable_genes(train_adata, n_top_genes=HVGs, flavor="cell_ranger")
            hvg_mask = train_adata.var["highly_variable"]
            test_adata = test_adata[:, hvg_mask].copy()
            train_adata = train_adata[:, hvg_mask].copy()

        return train_adata, test_adata


In [3]:
data_path = "../../../data/processed/data_for_evaluating_cell_type_annotation/MacParland.h5ad"

# Number of cross-validation folds; passed to split_data so that the loop and
# the splitter always agree on how many folds exist.
N_FOLDS = 5

# One prediction table per fold, concatenated once after the loop.
fold_frames = []
for fold in range(1, N_FOLDS + 1):

    adata_train, adata_test = split_data(data_path=data_path, fold=fold, folds=N_FOLDS)

    # Train model
    scnym_api(
        adata=adata_train,
        task='train',
        groupby='cell_type',
        out_path='./scnym_outputs',
        config='no_new_identity',
    )

    # Predict test data (predictions are written to adata_test.obs['scNym'])
    scnym_api(
        adata=adata_test,
        task='predict',
        key_added='scNym',
        config='no_new_identity',
        trained_model='./scnym_outputs'
    )

    # Record predictions, true labels and the fold they belong to
    fold_frames.append(
        pd.DataFrame({"pred": adata_test.obs['scNym'].to_list(),
                      "true_label": adata_test.obs["cell_type"].to_list(),
                      "fold": fold})
    )

# Stack all folds into one table with a fresh 0..n-1 index
results = pd.concat(fold_frames, axis=0, ignore_index=True)

  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6755, 2000)
y:  (6755,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.01205784734338522
running_acc  :  0.03515625
corrects: 9.000000 | total: 256.000000
train Loss : 0.0060
train Acc : 0.5814
TRAIN EPOCH corrects: 3181.000000 | total: 5471.000000
val Loss : 0.0036
val Acc : 0.8586
VAL EPOCH corrects: 522.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.00307990494184196
running_acc  :  0.796875
corrects: 204.000000 | total: 256.000000
train Loss : 0.0024
train Acc : 0.8320
TRAIN EPOCH corrects: 4552.000000 | total: 5471.000000
val Loss : 0.0029
val Acc : 0.8586
VAL EPOCH corrects: 522.000000 | total: 608.000000
Epoch 2/99
----------
Iter :  0

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 39.51it/s]

Extracting model embeddings...



  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6755, 2000)
y:  (6755,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.012251749634742737
running_acc  :  0.0390625
corrects: 10.000000 | total: 256.000000
train Loss : 0.0061
train Acc : 0.5756
TRAIN EPOCH corrects: 3149.000000 | total: 5471.000000
val Loss : 0.0045
val Acc : 0.7401
VAL EPOCH corrects: 450.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0032649219501763582
running_acc  :  0.77734375
corrects: 199.000000 | total: 256.000000
train Loss : 0.0025
train Acc : 0.8318
TRAIN EPOCH corrects: 4551.000000 | total: 5471.000000
val Loss : 0.0022
val Acc : 0.9112
VAL EPOCH corrects: 554.000000 | total: 608.000000
Epoch 2/99
----------
Iter

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 33.22it/s]

Extracting model embeddings...



  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6755, 2000)
y:  (6755,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.012100618332624435
running_acc  :  0.06640625
corrects: 17.000000 | total: 256.000000
train Loss : 0.0061
train Acc : 0.5652
TRAIN EPOCH corrects: 3092.000000 | total: 5471.000000
val Loss : 0.0043
val Acc : 0.8125
VAL EPOCH corrects: 494.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.002957125660032034
running_acc  :  0.76953125
corrects: 197.000000 | total: 256.000000
train Loss : 0.0024
train Acc : 0.8368
TRAIN EPOCH corrects: 4578.000000 | total: 5471.000000
val Loss : 0.0035
val Acc : 0.8372
VAL EPOCH corrects: 509.000000 | total: 608.000000
Epoch 2/99
----------
Iter

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 31.25it/s]


Extracting model embeddings...


  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6755, 2000)
y:  (6755,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.012099924497306347
running_acc  :  0.078125
corrects: 20.000000 | total: 256.000000
train Loss : 0.0062
train Acc : 0.5739
TRAIN EPOCH corrects: 3140.000000 | total: 5471.000000
val Loss : 0.0043
val Acc : 0.8405
VAL EPOCH corrects: 511.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.00324585591442883
running_acc  :  0.73828125
corrects: 189.000000 | total: 256.000000
train Loss : 0.0025
train Acc : 0.8340
TRAIN EPOCH corrects: 4563.000000 | total: 5471.000000
val Loss : 0.0023
val Acc : 0.8980
VAL EPOCH corrects: 546.000000 | total: 608.000000
Epoch 2/99
----------
Iter : 

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 25.00it/s]


Extracting model embeddings...


  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (6756, 2000)
y:  (6756,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.012105373665690422
running_acc  :  0.08984375
corrects: 23.000000 | total: 256.000000
train Loss : 0.0061
train Acc : 0.5709
TRAIN EPOCH corrects: 3124.000000 | total: 5472.000000
val Loss : 0.0046
val Acc : 0.8125
VAL EPOCH corrects: 494.000000 | total: 608.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0032169537153095007
running_acc  :  0.77734375
corrects: 199.000000 | total: 256.000000
train Loss : 0.0026
train Acc : 0.8257
TRAIN EPOCH corrects: 4518.000000 | total: 5472.000000
val Loss : 0.0021
val Acc : 0.9227
VAL EPOCH corrects: 561.000000 | total: 608.000000
Epoch 2/99
----------
Ite

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 25.60it/s]

Extracting model embeddings...





In [4]:
# Save results (create the output directory first so the write cannot fail
# on a missing folder after the long training run)
results_path = Path('results/scNym_hvgs_output.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(results_path, index=True)

In [5]:
# Collect every label that occurs among the true labels or the predictions
unique_labels1 = np.unique(results["true_label"])
unique_labels2 = np.unique(results["pred"])
unique_labels = np.union1d(unique_labels1, unique_labels2)

# Map the string labels onto integer codes shared by both columns
label_encoder_temp = LabelEncoder().fit(unique_labels)
y_true = label_encoder_temp.transform(results["true_label"])
y_pred = label_encoder_temp.transform(results["pred"])

# Fraction of cells whose predicted label matches the true label
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.946470866887731
