## scNym: https://github.com/calico/scnym 

In [1]:
import scnym
from scnym.api import scnym_api
import torch
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import scanpy as sc




In [2]:
def split_data(data_path: str, 
               fold: int=1,
               folds: int=5,
               batch_key: str="patientID", 
               label_key: str="cell_type",
               seed: int=42,
               HVG: bool=True,
               HVGs: int=2000):
    
    adata = sc.read(data_path, cache=True)

    adata.obs["batch"] = adata.obs[batch_key]

    # Ensure reproducibility
    def rep_seed(seed):
        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the random seed for PyTorch CUDA (GPU) operations
            torch.cuda.manual_seed(seed)
            # Set the random seed for all CUDA devices (if multiple GPUs are available)
            torch.cuda.manual_seed_all(seed)
        
        # Set the random seed for CPU-based PyTorch operations
        torch.manual_seed(seed)
        
        # Set the random seed for NumPy
        np.random.seed(seed)
        
        # Set the random seed for Python's built-in 'random' module
        random.seed(seed)
        
        # Set the random seed for TensorFlow
        tf.random.set_seed(seed)
        
        # Set CuDNN to deterministic mode for PyTorch (GPU)
        torch.backends.cudnn.deterministic = True
        
        # Disable CuDNN's benchmarking mode for deterministic behavior
        torch.backends.cudnn.benchmark = False

    rep_seed(seed)

    # Initialize Stratified K-Fold
    stratified_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    # Iterate through the folds
    adata_original = adata.copy()
    test_adata_original = adata.copy()
    fold_counter = 0
    for train_index, test_index in stratified_kfold.split(adata_original.X, adata_original.obs[label_key]):
        fold_counter += 1
        
        adata = adata_original[train_index, :].copy()
        test_adata = test_adata_original[test_index, :].copy()

        # Define cell types to exclude
        exclude_cell_types = ['Mature_B_Cells', 
                            'Plasma_Cells', 
                            'alpha-beta_T_Cells', 
                            'gamma-delta_T_Cells_1', 
                            'gamma-delta_T_Cells_2']

        # Create a boolean mask to select cells that are not in the exclude list
        mask = ~adata.obs['cell_type'].isin(exclude_cell_types)

        # Apply the mask to AnnData object
        adata = adata[mask]

        if HVG:
            sc.pp.highly_variable_genes(adata, n_top_genes=HVGs, flavor="cell_ranger")
            test_adata = test_adata[:, adata.var["highly_variable"]].copy()
            adata = adata[:, adata.var["highly_variable"]].copy()
        
        if fold_counter == fold:
            
            return adata, test_adata


In [3]:
data_path = "../../../data/processed/data_for_evaluating_cell_type_annotation/MacParland.h5ad"

# Iterate through folds
results = None
for fold_idx in range(5):
    fold = fold_idx + 1

    adata_train, adata_test = split_data(data_path=data_path, fold=fold)

    # Train model
    scnym_api(
        adata=adata_train,
        task='train',
        groupby='cell_type',
        out_path='./scnym_outputs',
        config='no_new_identity',
    )

    # Predict test data
    scnym_api(
        adata=adata_test,
        task='predict',
        key_added='scNym',
        config='no_new_identity',
        trained_model='./scnym_outputs'
    )

    # Define cell as Novel if the confidence is below 0.5
    max_scores = adata_test.obs['scNym_confidence']
    for i, max_score in enumerate(max_scores):
        if max_score < 0.5:
            adata_test.obs['scNym'].iloc[i] = "Novel"

    # Save results in dataframe
    if fold == 1:
        results = pd.DataFrame({"pred": adata_test.obs['scNym'].to_list(), 
                                "true_label": adata_test.obs["cell_type"].to_list(), 
                                "fold": [fold]*adata_test.obs['scNym'].shape[0]})
    else:   
        dataframe_temp = pd.DataFrame({"pred": adata_test.obs['scNym'].to_list(), 
                                    "true_label": adata_test.obs["cell_type"].to_list(), 
                                    "fold": [fold]*adata_test.obs['scNym'].shape[0]})
        results = pd.concat([results,dataframe_temp], axis=0)

# Reset index
results.reset_index(drop=True, inplace=True)

  disp_grouped = df.groupby('mean_bin')['dispersions']
  adata.uns['hvg'] = {'flavor': flavor}


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5018, 2000)
y:  (5018,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011131412349641323
running_acc  :  0.0625
corrects: 16.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.5871
TRAIN EPOCH corrects: 2386.000000 | total: 4064.000000
val Loss : 0.0033
val Acc : 0.8119
VAL EPOCH corrects: 367.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0026288963854312897
running_acc  :  0.84375
corrects: 216.000000 | total: 256.000000
train Loss : 0.0021
train Acc : 0.8509
TRAIN EPOCH corrects: 3458.000000 | total: 4064.000000
val Loss : 0.0016
val Acc : 0.9027
VAL EPOCH corrects: 408.000000 | total: 452.000000
Epoch 2/99
----------
Iter :  0


Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 37.74it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5018, 2000)
y:  (5018,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.010900573804974556
running_acc  :  0.08984375
corrects: 23.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.5829
TRAIN EPOCH corrects: 2369.000000 | total: 4064.000000
val Loss : 0.0032
val Acc : 0.8363
VAL EPOCH corrects: 378.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0031315451487898827
running_acc  :  0.73828125
corrects: 189.000000 | total: 256.000000
train Loss : 0.0022
train Acc : 0.8356
TRAIN EPOCH corrects: 3396.000000 | total: 4064.000000
val Loss : 0.0014
val Acc : 0.9159
VAL EPOCH corrects: 414.000000 | total: 452.000000
Epoch 2/99
----------
Ite

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 45.46it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5019, 2000)
y:  (5019,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011239047162234783
running_acc  :  0.0546875
corrects: 14.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.5946
TRAIN EPOCH corrects: 2417.000000 | total: 4065.000000
val Loss : 0.0033
val Acc : 0.7810
VAL EPOCH corrects: 353.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.002688691485673189
running_acc  :  0.796875
corrects: 204.000000 | total: 256.000000
train Loss : 0.0021
train Acc : 0.8539
TRAIN EPOCH corrects: 3471.000000 | total: 4065.000000
val Loss : 0.0024
val Acc : 0.8606
VAL EPOCH corrects: 389.000000 | total: 452.000000
Epoch 2/99
----------
Iter : 

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 52.63it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5020, 2000)
y:  (5020,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.01093212328851223
running_acc  :  0.08203125
corrects: 21.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.5730
TRAIN EPOCH corrects: 2330.000000 | total: 4066.000000
val Loss : 0.0041
val Acc : 0.6748
VAL EPOCH corrects: 305.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0028401550371199846
running_acc  :  0.7421875
corrects: 190.000000 | total: 256.000000
train Loss : 0.0022
train Acc : 0.8387
TRAIN EPOCH corrects: 3410.000000 | total: 4066.000000
val Loss : 0.0024
val Acc : 0.8097
VAL EPOCH corrects: 366.000000 | total: 452.000000
Epoch 2/99
----------
Iter 

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 39.22it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5021, 2000)
y:  (5021,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011169207282364368
running_acc  :  0.0703125
corrects: 18.000000 | total: 256.000000
train Loss : 0.0056
train Acc : 0.5723
TRAIN EPOCH corrects: 2327.000000 | total: 4066.000000
val Loss : 0.0036
val Acc : 0.6881
VAL EPOCH corrects: 311.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0032073098700493574
running_acc  :  0.71875
corrects: 184.000000 | total: 256.000000
train Loss : 0.0022
train Acc : 0.8337
TRAIN EPOCH corrects: 3390.000000 | total: 4066.000000
val Loss : 0.0016
val Acc : 0.8805
VAL EPOCH corrects: 398.000000 | total: 452.000000
Epoch 2/99
----------
Iter : 

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 36.36it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

In [4]:
# Save results
results.to_csv('results/scNym_hvgs_output.csv', index=True)