## scNym: https://github.com/calico/scnym 

In [1]:
import scnym
from scnym.api import scnym_api
import torch
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import scanpy as sc




In [2]:
def split_data(data_path: str, 
               fold: int=1,
               folds: int=5,
               batch_key: str="patientID", 
               label_key: str="cell_type",
               seed: int=42,
               HVG: bool=False,
               HVGs: int=2000):
    
    adata = sc.read(data_path, cache=True)

    adata.obs["batch"] = adata.obs[batch_key]

    # Ensure reproducibility
    def rep_seed(seed):
        # Check if a GPU is available
        if torch.cuda.is_available():
            # Set the random seed for PyTorch CUDA (GPU) operations
            torch.cuda.manual_seed(seed)
            # Set the random seed for all CUDA devices (if multiple GPUs are available)
            torch.cuda.manual_seed_all(seed)
        
        # Set the random seed for CPU-based PyTorch operations
        torch.manual_seed(seed)
        
        # Set the random seed for NumPy
        np.random.seed(seed)
        
        # Set the random seed for Python's built-in 'random' module
        random.seed(seed)
        
        # Set the random seed for TensorFlow
        tf.random.set_seed(seed)
        
        # Set CuDNN to deterministic mode for PyTorch (GPU)
        torch.backends.cudnn.deterministic = True
        
        # Disable CuDNN's benchmarking mode for deterministic behavior
        torch.backends.cudnn.benchmark = False

    rep_seed(seed)

    # Initialize Stratified K-Fold
    stratified_kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    # Iterate through the folds
    adata_original = adata.copy()
    test_adata_original = adata.copy()
    fold_counter = 0
    for train_index, test_index in stratified_kfold.split(adata_original.X, adata_original.obs[label_key]):
        fold_counter += 1
        
        adata = adata_original[train_index, :].copy()
        test_adata = test_adata_original[test_index, :].copy()

        # Define cell types to exclude
        exclude_cell_types = ['Mature_B_Cells', 
                            'Plasma_Cells', 
                            'alpha-beta_T_Cells', 
                            'gamma-delta_T_Cells_1', 
                            'gamma-delta_T_Cells_2']

        # Create a boolean mask to select cells that are not in the exclude list
        mask = ~adata.obs['cell_type'].isin(exclude_cell_types)

        # Apply the mask to AnnData object
        adata = adata[mask]

        if HVG:
            sc.pp.highly_variable_genes(adata, n_top_genes=HVGs, flavor="cell_ranger")
            test_adata = test_adata[:, adata.var["highly_variable"]].copy()
            adata = adata[:, adata.var["highly_variable"]].copy()
        
        if fold_counter == fold:
            
            return adata, test_adata


In [3]:
data_path = "../../../data/processed/data_for_evaluating_cell_type_annotation/MacParland.h5ad"

# Iterate through folds
results = None
for fold_idx in range(5):
    fold = fold_idx + 1

    adata_train, adata_test = split_data(data_path=data_path, fold=fold)

    # Train model
    scnym_api(
        adata=adata_train,
        task='train',
        groupby='cell_type',
        out_path='./scnym_outputs',
        config='no_new_identity',
    )

    # Predict test data
    scnym_api(
        adata=adata_test,
        task='predict',
        key_added='scNym',
        config='no_new_identity',
        trained_model='./scnym_outputs'
    )

    # Define cell as Novel if the confidence is below 0.5
    max_scores = adata_test.obs['scNym_confidence']
    for i, max_score in enumerate(max_scores):
        if max_score < 0.5:
            adata_test.obs['scNym'].iloc[i] = "Novel"

    # Save results in dataframe
    if fold == 1:
        results = pd.DataFrame({"pred": adata_test.obs['scNym'].to_list(), 
                                "true_label": adata_test.obs["cell_type"].to_list(), 
                                "fold": [fold]*adata_test.obs['scNym'].shape[0]})
    else:   
        dataframe_temp = pd.DataFrame({"pred": adata_test.obs['scNym'].to_list(), 
                                    "true_label": adata_test.obs["cell_type"].to_list(), 
                                    "fold": [fold]*adata_test.obs['scNym'].shape[0]})
        results = pd.concat([results,dataframe_temp], axis=0)

# Reset index
results.reset_index(drop=True, inplace=True)



No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5018, 15386)
y:  (5018,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011137683875858784
running_acc  :  0.0546875
corrects: 14.000000 | total: 256.000000
train Loss : 0.0053
train Acc : 0.6139
TRAIN EPOCH corrects: 2495.000000 | total: 4064.000000
val Loss : 0.0030
val Acc : 0.7854
VAL EPOCH corrects: 355.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0025729616172611713
running_acc  :  0.80078125
corrects: 205.000000 | total: 256.000000
train Loss : 0.0019
train Acc : 0.8625
TRAIN EPOCH corrects: 3505.000000 | total: 4064.000000
val Loss : 0.0027
val Acc : 0.8230
VAL EPOCH corrects: 372.000000 | total: 452.000000
Epoch 2/99
----------
Ite

  adata.uns['scNym_train_results'] = results


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 15386 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predicting cell types...


Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 16.26it/s]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

Extracting model embeddings...
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5018, 15386)
y:  (5018,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.01104919146746397
running_acc  :  0.08984375
corrects: 23.000000 | total: 256.000000
train Loss : 0.0055
train Acc : 0.5987
TRAIN EPOCH corrects: 2433.000000 | total: 4064.000000
val Loss : 0.0034
val Acc : 0.7478
VAL EPOCH corrects: 338.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0026039942167699337
running_acc  :  0.83203125
corrects: 213.000000 | total: 256.000000
train Loss : 0.0020
train Acc : 0.8632
TRAIN EPOCH corrects: 3508.000000 | total: 4064.000000
val Loss : 0.0018
val Acc : 0.8938
VAL EPOCH corrects: 404.000000 | total: 452.0

  adata.uns['scNym_train_results'] = results


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 15386 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predicting cell types...


Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 14.09it/s]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

Extracting model embeddings...
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5019, 15386)
y:  (5019,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011101024225354195
running_acc  :  0.0625
corrects: 16.000000 | total: 256.000000
train Loss : 0.0051
train Acc : 0.6303
TRAIN EPOCH corrects: 2562.000000 | total: 4065.000000
val Loss : 0.0033
val Acc : 0.7743
VAL EPOCH corrects: 350.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.00224399589933455
running_acc  :  0.84765625
corrects: 217.000000 | total: 256.000000
train Loss : 0.0018
train Acc : 0.8740
TRAIN EPOCH corrects: 3553.000000 | total: 4065.000000
val Loss : 0.0019
val Acc : 0.8916
VAL EPOCH corrects: 403.000000 | total: 452.000000

  adata.uns['scNym_train_results'] = results


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 15386 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predicting cell types...


Finding cell types: 100%|██████████| 2/2 [00:00<00:00, 11.83it/s]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

Extracting model embeddings...
No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5020, 15386)
y:  (5020,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011074228212237358
running_acc  :  0.1015625
corrects: 26.000000 | total: 256.000000
train Loss : 0.0054
train Acc : 0.6171
TRAIN EPOCH corrects: 2509.000000 | total: 4066.000000
val Loss : 0.0036
val Acc : 0.8230
VAL EPOCH corrects: 372.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.002598348306491971
running_acc  :  0.80859375
corrects: 207.000000 | total: 256.000000
train Loss : 0.0020
train Acc : 0.8561
TRAIN EPOCH corrects: 3481.000000 | total: 4066.000000
val Loss : 0.0014
val Acc : 0.9004
VAL EPOCH corrects: 407.000000 | total: 452.00

  adata.uns['scNym_train_results'] = results


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 15386 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predicting cell types...


Finding cell types: 100%|██████████| 2/2 [00:00<00:00,  9.85it/s]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"


Extracting model embeddings...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNym'].iloc[i] = "Novel"


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

No unlabeled data was found.
Did you forget to set some examples as `"Unlabeled"`?
Proceeding with purely supervised training.

X:  (5021, 15386)
y:  (5021,)
Not weighting classes and not balancing classes.
Training...
Epoch 0/99
----------
Iter :  0
running_loss :  0.011204532347619534
running_acc  :  0.05859375
corrects: 15.000000 | total: 256.000000
train Loss : 0.0053
train Acc : 0.6107
TRAIN EPOCH corrects: 2483.000000 | total: 4066.000000
val Loss : 0.0029
val Acc : 0.7788
VAL EPOCH corrects: 352.000000 | total: 452.000000
Epoch 1/99
----------
Iter :  0
running_loss :  0.0025209200102835894
running_acc  :  0.8359375
corrects: 214.000000 | total: 256.000000
train Loss : 0.0019
train Acc : 0.8788
TRAIN EPOCH corrects: 3573.000000 | total: 4066.000000
val Loss : 0.0023
val Acc : 0.8363
VAL EPOCH corrects: 378.000000 | total: 452.000000
Epoch 2/99
----------
Ite

  adata.uns['scNym_train_results'] = results


No CUDA device found.
Computations will be performed on the CPU.
Add a CUDA compute device to improve speed dramatically.

Loaded model predicting 15 classes from 15386 features
['Central_venous_LSECs' 'Cholangiocytes' 'Erythroid_Cells'
 'Hepatic_Stellate_Cells' 'Hepatocyte_1' 'Hepatocyte_2' 'Hepatocyte_3'
 'Hepatocyte_4' 'Hepatocyte_5' 'Hepatocyte_6' 'Inflammatory_Macrophage'
 'NK-like_Cells' 'Non-inflammatory_Macrophage' 'Periportal_LSECs'
 'Portal_endothelial_Cells']
Building a classification matrix...
Gene names match exactly, returning input.
Predicting cell types...


Finding cell types: 100%|██████████| 2/2 [00:00<00:00,  8.33it/s]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_test.obs['scNym'].iloc[i] = "Novel"


Extracting model embeddings...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata_test.obs['scNym'].iloc[i] = "Novel"


In [4]:
# Save results
results.to_csv('results/scNym_output.csv', index=True)