## scNym: https://github.com/calico/scnym 

In [1]:
import scnym
from scnym.api import scnym_api
import torch
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import scanpy as sc




In [2]:
def split_data(data_path: str, 
               fold: int=1,
               folds: int=5,
               batch_key: str="patientID", 
               label_key: str="cell_type",
               seed: int=42,
               HVG: bool=True,
               HVGs: int=2000):
    
    adata = sc.read(data_path, cache=True)

    adata.obs["batch"] = adata.obs[batch_key]

    # Ensure reproducibility
    def rep_seed(seed):
        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the random seed for PyTorch CUDA (GPU) operations
            torch.cuda.manual_seed(seed)
            # Set the random seed for all CUDA devices (if multiple GPUs are available)
            torch.cuda.manual_seed_all(seed)
        
        # Set the random seed for CPU-based PyTorch operations
        torch.manual_seed(seed)
        
        # Set the random seed for NumPy
        np.random.seed(seed)
        
        # Set the random seed for Python's built-in 'random' module
        random.seed(seed)
        
        # Set the random seed for TensorFlow
        tf.random.set_seed(seed)
        
        # Set CuDNN to deterministic mode for PyTorch (GPU)
        torch.backends.cudnn.deterministic = True
        
        # Disable CuDNN's benchmarking mode for deterministic behavior
        torch.backends.cudnn.benchmark = False

    rep_seed(seed)

    # Initialize Stratified K-Fold
    stratified_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    # Iterate through the folds
    adata_original = adata.copy()
    test_adata_original = adata.copy()
    fold_counter = 0
    for train_index, test_index in stratified_kfold.split(adata_original.X, adata_original.obs[label_key]):
        fold_counter += 1
        
        adata = adata_original[train_index, :].copy()
        test_adata = test_adata_original[test_index, :].copy()

        if HVG:
            sc.pp.highly_variable_genes(adata, n_top_genes=HVGs, flavor="cell_ranger")
            test_adata = test_adata[:, adata.var["highly_variable"]].copy()
            adata = adata[:, adata.var["highly_variable"]].copy()

        # Define cell types to exclude
        exclude_cell_types = ['Mature_B_Cells', 
                            'Plasma_Cells', 
                            'alpha-beta_T_Cells', 
                            'gamma-delta_T_Cells_1', 
                            'gamma-delta_T_Cells_2']

        # Create a boolean mask to select cells that are not in the exclude list
        mask = ~adata.obs['cell_type'].isin(exclude_cell_types)

        # Apply the mask to AnnData object
        adata = adata[mask]
        
        if fold_counter == fold:
            
            return adata, test_adata


In [3]:
data_path = "../../../data/processed/data_for_evaluating_cell_type_annotation/MacParland.h5ad"

# Iterate through folds
results = None
for fold_idx in range(5):
    fold = fold_idx + 1

    adata_train, adata_test = split_data(data_path=data_path, fold=fold)

    # Train model
    scnym_api(
        adata=adata_train,
        task='train',
        groupby='cell_type',
        out_path='./scnym_outputs',
        config='no_new_identity',
    )

    # Predict test data
    scnym_api(
        adata=adata_test,
        task='predict',
        key_added='scNym',
        config='no_new_identity',
        trained_model='./scnym_outputs'
    )

    # Define cell as Novel if the confidence is below 0.5
    max_scores = adata_test.obs['scNym_confidence']
    for i, max_score in enumerate(max_scores):
        if max_score < 0.5:
            adata_test.obs['scNym'].iloc[i] = "Novel"

    # Save results in dataframe
    if fold == 1:
        results = pd.DataFrame({"pred": adata_test.obs['scNym'].to_list(), 
                                "true_label": adata_test.obs["cell_type"].to_list(), 
                                "fold": [fold]*adata_test.obs['scNym'].shape[0]})
    else:   
        dataframe_temp = pd.DataFrame({"pred": adata_test.obs['scNym'].to_list(), 
                                    "true_label": adata_test.obs["cell_type"].to_list(), 
                                    "fold": [fold]*adata_test.obs['scNym'].shape[0]})
        results = pd.concat([results,dataframe_temp], axis=0)

# Reset index
results.reset_index(drop=True, inplace=True)

  disp_grouped = df.groupby('mean_bin')['dispersions']


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5018, 2000)
y:  (5018,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011172466911375523
running_acc  :  0.07421875
corrects: 19.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.5987
TRAIN EPOCH corrects: 2433.000000 | total: 4064.000000
val Loss : 0.0033
val Acc : 0.7788
VAL EPOCH corrects: 352.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.002683804137632251
running_acc  :  0.796875
corrects: 204.000000 | total: 256.000000
train Loss : 0.0021
train Acc : 0.8452
TRAIN EPOCH corrects: 3435.000000 | total: 4064.000000
val Loss : 0.0019
val Acc : 0.8761
VAL EPOCH corrects: 396.000000 | total: 452.000000
Epoch 2/99
----------
Iter :

  adata.uns['scNym_train_results'] = results


train Loss : 0.0000
train Acc : 0.9995
TRAIN EPOCH corrects: 4062.000000 | total: 4064.000000
val Loss : 0.0019
val Acc : 0.9248
VAL EPOCH corrects: 418.000000 | total: 452.000000
Training complete.

Evaluating model.
EVAL LOSS:  0.19568521529436111
EVAL ACC :  0.9482071713147411
Predictions | Labels
[[10 12]
 [ 0  0]
 [10 10]
 [ 3  3]
 [12 12]
 [10 10]
 [ 3  3]
 [11 11]
 [ 0  0]
 [ 0  0]
 [10 10]
 [ 8  8]
 [ 0  0]
 [10 10]
 [ 3  3]]
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 2000 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predictin

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 43.48it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5018, 2000)
y:  (5018,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011121857911348343
running_acc  :  0.06640625
corrects: 17.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.5898
TRAIN EPOCH corrects: 2397.000000 | total: 4064.000000
val Loss : 0.0029
val Acc : 0.8274
VAL EPOCH corrects: 374.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0028940504416823387
running_acc  :  0.7578125
corrects: 194.000000 | total: 256.000000
train Loss : 0.0022
train Acc : 0.8408
TRAIN EPOCH corrects: 3417.000000 | total: 4064.000000
val Loss : 0.0015
val Acc : 0.8827
VAL EPOCH corrects: 399.000000 | total: 452.000000
Epoch 2/99
----------
Iter

  adata.uns['scNym_train_results'] = results


train Loss : 0.0000
train Acc : 0.9980
TRAIN EPOCH corrects: 4056.000000 | total: 4064.000000
val Loss : 0.0017
val Acc : 0.9381
VAL EPOCH corrects: 424.000000 | total: 452.000000
Training complete.

Evaluating model.
EVAL LOSS:  0.1657213792204857
EVAL ACC :  0.9442231075697212
Predictions | Labels
[[10 12]
 [10 10]
 [ 0  0]
 [ 8  8]
 [ 0  0]
 [13 13]
 [11 11]
 [14 14]
 [ 8  8]
 [11 11]
 [12 12]
 [10 10]
 [ 8  8]
 [12 12]
 [12 12]]
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 2000 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predicting

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 43.48it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5019, 2000)
y:  (5019,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011205763556063175
running_acc  :  0.07421875
corrects: 19.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.5948
TRAIN EPOCH corrects: 2418.000000 | total: 4065.000000
val Loss : 0.0034
val Acc : 0.7500
VAL EPOCH corrects: 339.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.002584644127637148
running_acc  :  0.81640625
corrects: 209.000000 | total: 256.000000
train Loss : 0.0021
train Acc : 0.8526
TRAIN EPOCH corrects: 3466.000000 | total: 4065.000000
val Loss : 0.0018
val Acc : 0.8761
VAL EPOCH corrects: 396.000000 | total: 452.000000
Epoch 2/99
----------
Iter

  adata.uns['scNym_train_results'] = results


train Loss : 0.0000
train Acc : 0.9995
TRAIN EPOCH corrects: 4063.000000 | total: 4065.000000
val Loss : 0.0018
val Acc : 0.9403
VAL EPOCH corrects: 425.000000 | total: 452.000000
Training complete.

Evaluating model.
EVAL LOSS:  0.17363597452640533
EVAL ACC :  0.9442231075697212
Predictions | Labels
[[10 10]
 [ 0  0]
 [ 0  0]
 [ 8  8]
 [ 0  0]
 [ 9  9]
 [ 0  0]
 [10 10]
 [10 10]
 [ 0  0]
 [10 10]
 [ 0  0]
 [10 10]
 [12 12]
 [10 10]]
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 2000 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predictin

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 43.48it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5020, 2000)
y:  (5020,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011012140661478043
running_acc  :  0.0625
corrects: 16.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.5883
TRAIN EPOCH corrects: 2392.000000 | total: 4066.000000
val Loss : 0.0036
val Acc : 0.7522
VAL EPOCH corrects: 340.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.002647967776283622
running_acc  :  0.80078125
corrects: 205.000000 | total: 256.000000
train Loss : 0.0021
train Acc : 0.8433
TRAIN EPOCH corrects: 3429.000000 | total: 4066.000000
val Loss : 0.0017
val Acc : 0.8606
VAL EPOCH corrects: 389.000000 | total: 452.000000
Epoch 2/99
----------
Iter :  

  adata.uns['scNym_train_results'] = results


train Loss : 0.0000
train Acc : 0.9970
TRAIN EPOCH corrects: 4054.000000 | total: 4066.000000
val Loss : 0.0017
val Acc : 0.9336
VAL EPOCH corrects: 422.000000 | total: 452.000000
Training complete.

Evaluating model.
EVAL LOSS:  0.1939176768064499
EVAL ACC :  0.9402390438247012
Predictions | Labels
[[10 10]
 [10 10]
 [11 11]
 [13  3]
 [ 8  8]
 [12 12]
 [10 10]
 [ 0  0]
 [ 0  0]
 [ 2  2]
 [ 8  8]
 [10 10]
 [ 0  0]
 [12 12]
 [ 0 10]]
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 2000 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predicting

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 39.22it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5021, 2000)
y:  (5021,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011009304784238338
running_acc  :  0.046875
corrects: 12.000000 | total: 256.000000
train Loss : 0.0056
train Acc : 0.5903
TRAIN EPOCH corrects: 2400.000000 | total: 4066.000000
val Loss : 0.0034
val Acc : 0.7058
VAL EPOCH corrects: 319.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0032759765163064003
running_acc  :  0.734375
corrects: 188.000000 | total: 256.000000
train Loss : 0.0021
train Acc : 0.8441
TRAIN EPOCH corrects: 3432.000000 | total: 4066.000000
val Loss : 0.0016
val Acc : 0.8827
VAL EPOCH corrects: 399.000000 | total: 452.000000
Epoch 2/99
----------
Iter : 

  adata.uns['scNym_train_results'] = results


train Loss : 0.0000
train Acc : 0.9988
TRAIN EPOCH corrects: 4061.000000 | total: 4066.000000
val Loss : 0.0013
val Acc : 0.9381
VAL EPOCH corrects: 424.000000 | total: 452.000000
Training complete.

Evaluating model.
EVAL LOSS:  0.20550469309091568
EVAL ACC :  0.9383697813121272
Predictions | Labels
[[10 12]
 [10 10]
 [ 0  0]
 [10 10]
 [11 11]
 [ 0  0]
 [10 10]
 [ 0  0]
 [ 8  8]
 [ 0  0]
 [ 8  8]
 [ 0  0]
 [10 10]
 [ 0  0]
 [10 10]]
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 2000 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predictin

Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 30.77it/s]

Extracting model embeddings...



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNy

In [4]:
# Save results
results.to_csv('results/scNym_hvgs_output.csv', index=True)