In [12]:
### training a doublet classifier on DNA barcoded "ground truth" singlets for Zhang Melzer et al. 2023
### Created by Madeline E Melzer on 20231110
### Last edited by Madeline E Melzer on 20240516

In [1]:
import os
import xgboost
import sklearn
import shutil
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from scipy.io import mmread
import anndata
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from joblib import dump, load
print(xgboost.__version__)

np.random.seed(23)

2.0.2


In [None]:
### variable doublet rates

dataset_list = ["0.05", "0.08", "0.10", "0.15", "0.20", "0.25"]
results_dir = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/results/"
classifiers_dir = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/classifiers/"

#summary = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

for dataset in dataset_list:
    for count in range(10):
        dataset_summary_df = main(dataset)
        summary = pd.concat([summary, dataset_summary_df])
    summary.to_csv(f"/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/results/datasetSummaries/{dataset}.csv", index = False)

summary.to_csv("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/results/summary_variableDoubletRates.csv", index = False)

In [20]:
### different datasets
np.random.seed(23)

dataset_main = "TREX_minusCluster"

dataset_list = ["sample1", "sample2"]

data_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/", dataset_main)
results_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/predictions/"
summaries_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/datasetSummaries/"
classifiers_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/classifiers/", dataset_main)

for dataset in dataset_list:

    summary = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

    for count in range(10):
        np.random.seed(count)
        dataset_summary_df = main(dataset)
        summary = pd.concat([summary, dataset_summary_df])

    summary.to_csv(f"{summaries_dir}/{dataset_main}/{dataset}.csv", index = False)

#summary.to_csv(os.path.join(summaries_dir, dataset_main, f"{dataset_main}.csv"), index = False)

Index([], dtype='object')
Index(['barcode', 'label', 'sample'], dtype='object')
{'doublet': 285, 'singlet': 2591}
100%|██████████| 10/10 [00:33<00:00,  3.31s/trial, best loss: -0.8182284681078825]
sample1 Best parameters for tree: {'colsample_bytree': 0.6791383453771085, 'gamma': 0.2640188975266044, 'learning_rate': 0.2764557012206551, 'max_depth': 4, 'min_child_weight': 5, 'n_estimators': 80, 'reg_alpha': 0.47664367717913625, 'reg_lambda': 2.074156653860055, 'scale_pos_weight': 61.61526825026569, 'subsample': 0.8911530928797715}
sample1 Best tree score: {'loss': -0.8182284681078825, 'accuracy': 0.9513888888888888, 'status': 'ok', 'auroc': 0.950782544028665, 'auprc': 0.8182284681078825}
sample1 AUROC: 0.9662331504436767
sample1 AUPRC: 0.8712190274752258
sample1 Accuracy: 0.9669565217391304
   dataset condition     auroc     auprc  accuracy  \
0  sample1   sample1  0.966233  0.871219  0.966957   

                                         best_params  
0  {'n_estimators': 81, 'max_depth'

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:37<00:00,  3.78s/trial, best loss: -0.1353671055298151]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.8827708529882821, 'gamma': 0.7692207916253541, 'learning_rate': 0.3662598181760541, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 87, 'reg_alpha': 0.21092288617152333, 'reg_lambda': 2.8355245401750797, 'scale_pos_weight': 3.9311173546777045, 'subsample': 0.6679245342241802}
sample1_neg_control_scrambled Best tree score: {'loss': -0.1353671055298151, 'accuracy': 0.8975694444444444, 'status': 'ok', 'auroc': 0.6334719264442417, 'auprc': 0.1353671055298151}
sample1_neg_control_scrambled AUROC: 0.45888369572580096
sample1_neg_control_scrambled AUPRC: 0.09643445223353081
sample1_neg_control_scrambled Accuracy: 0.9008695652173913
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.966233  0.871219  0.966957   
1  sample1  sample1_neg_control_scrambled  0.458

  summary = pd.concat([summary, dataset_summary_df])


Index([], dtype='object')
Index(['barcode', 'label', 'sample'], dtype='object')
{'doublet': 285, 'singlet': 2591}
100%|██████████| 10/10 [00:27<00:00,  2.70s/trial, best loss: -0.8469214607346068]
sample1 Best parameters for tree: {'colsample_bytree': 0.7609286216609517, 'gamma': 0.13545222290254058, 'learning_rate': 0.4802009299962159, 'max_depth': 17, 'min_child_weight': 2, 'n_estimators': 42, 'reg_alpha': 0.7497926624163822, 'reg_lambda': 1.9621895371240643, 'scale_pos_weight': 30.71593993080512, 'subsample': 0.7910143139619341}
sample1 Best tree score: {'loss': -0.8469214607346068, 'accuracy': 0.9496527777777778, 'status': 'ok', 'auroc': 0.965486935064057, 'auprc': 0.8469214607346068}
sample1 AUROC: 0.934633882002303
sample1 AUPRC: 0.7628154979197006
sample1 Accuracy: 0.9443478260869566
   dataset condition     auroc     auprc  accuracy  \
0  sample1   sample1  0.934634  0.762815  0.944348   

                                         best_params  
0  {'n_estimators': 43, 'max_depth

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:39<00:00,  3.95s/trial, best loss: -0.1231581692401844]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.7158280005517805, 'gamma': 0.8008779101458102, 'learning_rate': 0.6124462765306283, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 97, 'reg_alpha': 0.8356106952169191, 'reg_lambda': 1.683062212234173, 'scale_pos_weight': 92.76097045290388, 'subsample': 0.8241033710544237}
sample1_neg_control_scrambled Best tree score: {'loss': -0.1231581692401844, 'accuracy': 0.8333333333333334, 'status': 'ok', 'auroc': 0.5644119933745732, 'auprc': 0.1231581692401844}
sample1_neg_control_scrambled AUROC: 0.5934769355821987
sample1_neg_control_scrambled AUPRC: 0.12747717990995638
sample1_neg_control_scrambled Accuracy: 0.8330434782608696
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.934634  0.762815  0.944348   
1  sample1  sample1_neg_control_scrambled  0.593477 

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:41<00:00,  4.17s/trial, best loss: -0.12611982365465058]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.9809165635336575, 'gamma': 0.2893330469759434, 'learning_rate': 0.9854453924872291, 'max_depth': 8, 'min_child_weight': 6, 'n_estimators': 50, 'reg_alpha': 0.9168022896259536, 'reg_lambda': 1.593461704824197, 'scale_pos_weight': 77.90798506872724, 'subsample': 0.7899829342615585}
sample1_neg_control_scrambled Best tree score: {'loss': -0.12611982365465058, 'accuracy': 0.8090277777777778, 'status': 'ok', 'auroc': 0.5682317547239969, 'auprc': 0.12611982365465058}
sample1_neg_control_scrambled AUROC: 0.49187157081893923
sample1_neg_control_scrambled AUPRC: 0.09860811255307289
sample1_neg_control_scrambled Accuracy: 0.8260869565217391
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.982388  0.884985  0.956522   
1  sample1  sample1_neg_control_scrambled  0.491

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:31<00:00,  3.14s/trial, best loss: -0.12668609528509692]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.744675626254282, 'gamma': 0.600203304975212, 'learning_rate': 0.6807694368956706, 'max_depth': 15, 'min_child_weight': 1, 'n_estimators': 51, 'reg_alpha': 0.7883415902219757, 'reg_lambda': 2.3605584166406235, 'scale_pos_weight': 31.671698670566553, 'subsample': 0.8193678745220001}
sample1_neg_control_scrambled Best tree score: {'loss': -0.12668609528509692, 'accuracy': 0.8715277777777778, 'status': 'ok', 'auroc': 0.5852347632085995, 'auprc': 0.12668609528509692}
sample1_neg_control_scrambled AUROC: 0.5196437038542301
sample1_neg_control_scrambled AUPRC: 0.10740978338717923
sample1_neg_control_scrambled Accuracy: 0.8695652173913043
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.899919  0.660095  0.933913   
1  sample1  sample1_neg_control_scrambled  0.519

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:33<00:00,  3.33s/trial, best loss: -0.12949846298614512]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.8701024425301858, 'gamma': 0.9456604522038148, 'learning_rate': 0.7925533788700677, 'max_depth': 16, 'min_child_weight': 6, 'n_estimators': 74, 'reg_alpha': 0.44736260614619117, 'reg_lambda': 1.852947162436884, 'scale_pos_weight': 35.349472801706575, 'subsample': 0.7924696621683596}
sample1_neg_control_scrambled Best tree score: {'loss': -0.12949846298614512, 'accuracy': 0.8263888888888888, 'status': 'ok', 'auroc': 0.5295270932630227, 'auprc': 0.12949846298614512}
sample1_neg_control_scrambled AUROC: 0.5053512158775316
sample1_neg_control_scrambled AUPRC: 0.10676817231806868
sample1_neg_control_scrambled Accuracy: 0.8226086956521739
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.958037  0.797058  0.937391   
1  sample1  sample1_neg_control_scrambled  0.5

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:26<00:00,  2.68s/trial, best loss: -0.12824189978969455]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.6202598348165553, 'gamma': 0.5575329187701609, 'learning_rate': 0.22592500619236602, 'max_depth': 8, 'min_child_weight': 7, 'n_estimators': 56, 'reg_alpha': 0.6169537996572592, 'reg_lambda': 2.9163029495369672, 'scale_pos_weight': 6.38085083610561, 'subsample': 0.5942226574836273}
sample1_neg_control_scrambled Best tree score: {'loss': -0.12824189978969455, 'accuracy': 0.8993055555555556, 'status': 'ok', 'auroc': 0.5460906601764527, 'auprc': 0.12824189978969455}
sample1_neg_control_scrambled AUROC: 0.535257061572851
sample1_neg_control_scrambled AUPRC: 0.10780351753921841
sample1_neg_control_scrambled Accuracy: 0.8921739130434783
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.947335  0.818306  0.947826   
1  sample1  sample1_neg_control_scrambled  0.5352

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:37<00:00,  3.74s/trial, best loss: -0.15308393917420932]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.5859977717371869, 'gamma': 0.9070759950603227, 'learning_rate': 0.6897088461594026, 'max_depth': 18, 'min_child_weight': 5, 'n_estimators': 51, 'reg_alpha': 0.7863958538705595, 'reg_lambda': 1.442781468157694, 'scale_pos_weight': 53.54930372728957, 'subsample': 0.5746617024920584}
sample1_neg_control_scrambled Best tree score: {'loss': -0.15308393917420932, 'accuracy': 0.8454861111111112, 'status': 'ok', 'auroc': 0.5686711962951695, 'auprc': 0.15308393917420932}
sample1_neg_control_scrambled AUROC: 0.4945471787577051
sample1_neg_control_scrambled AUPRC: 0.10309885649653966
sample1_neg_control_scrambled Accuracy: 0.8208695652173913
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.982185  0.924209  0.973913   
1  sample1  sample1_neg_control_scrambled  0.494

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:38<00:00,  3.89s/trial, best loss: -0.13024092800132572]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.5231234334461159, 'gamma': 0.9582148074182243, 'learning_rate': 0.20869092340139273, 'max_depth': 3, 'min_child_weight': 6, 'n_estimators': 93, 'reg_alpha': 0.14360801198798145, 'reg_lambda': 2.318391746516574, 'scale_pos_weight': 23.095783971545444, 'subsample': 0.635537698007432}
sample1_neg_control_scrambled Best tree score: {'loss': -0.13024092800132572, 'accuracy': 0.8854166666666666, 'status': 'ok', 'auroc': 0.525707331913599, 'auprc': 0.13024092800132572}
sample1_neg_control_scrambled AUROC: 0.49231186073291333
sample1_neg_control_scrambled AUPRC: 0.11955131286303837
sample1_neg_control_scrambled Accuracy: 0.8834782608695653
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.955192  0.819577  0.951304   
1  sample1  sample1_neg_control_scrambled  0.49

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:28<00:00,  2.82s/trial, best loss: -0.14134482539672952]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.6201400912736421, 'gamma': 0.25847793505181665, 'learning_rate': 0.4075823243252942, 'max_depth': 0, 'min_child_weight': 6, 'n_estimators': 54, 'reg_alpha': 0.830541452637658, 'reg_lambda': 2.828225312727646, 'scale_pos_weight': 28.895374318835472, 'subsample': 0.5316206920129072}
sample1_neg_control_scrambled Best tree score: {'loss': -0.14134482539672952, 'accuracy': 0.59375, 'status': 'ok', 'auroc': 0.5414596220802488, 'auprc': 0.14134482539672952}
sample1_neg_control_scrambled AUROC: 0.4827609564451669
sample1_neg_control_scrambled AUPRC: 0.09801101965932554
sample1_neg_control_scrambled Accuracy: 0.5843478260869566
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.972905  0.870873  0.942609   
1  sample1  sample1_neg_control_scrambled  0.482761  0.0980

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:37<00:00,  3.79s/trial, best loss: -0.1375496814064651]
sample1_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.6745114579794532, 'gamma': 0.2658087802497519, 'learning_rate': 0.6919247056023231, 'max_depth': 17, 'min_child_weight': 4, 'n_estimators': 30, 'reg_alpha': 0.8480216988813162, 'reg_lambda': 1.9913049641478509, 'scale_pos_weight': 78.18027307303264, 'subsample': 0.7909695042397925}
sample1_neg_control_scrambled Best tree score: {'loss': -0.1375496814064651, 'accuracy': 0.8524305555555556, 'status': 'ok', 'auroc': 0.5497752087347463, 'auprc': 0.1375496814064651}
sample1_neg_control_scrambled AUROC: 0.49244733455259776
sample1_neg_control_scrambled AUPRC: 0.10991890508385999
sample1_neg_control_scrambled Accuracy: 0.8260869565217391
   dataset                      condition     auroc     auprc  accuracy  \
0  sample1                        sample1  0.974903  0.852012  0.953043   
1  sample1  sample1_neg_control_scrambled  0.4924

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:28<00:00,  2.87s/trial, best loss: -0.15463794667700226]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.9934997794353899, 'gamma': 0.19664584965392037, 'learning_rate': 0.7479403666398893, 'max_depth': 17, 'min_child_weight': 6, 'n_estimators': 17, 'reg_alpha': 0.3566833300858313, 'reg_lambda': 1.0733366303425596, 'scale_pos_weight': 35.50625512858694, 'subsample': 0.5242054046883757}
sample2_neg_control_scrambled Best tree score: {'loss': -0.15463794667700226, 'accuracy': 0.7685185185185185, 'status': 'ok', 'auroc': 0.5898279352226721, 'auprc': 0.15463794667700226}
sample2_neg_control_scrambled AUROC: 0.47665922468033284
sample2_neg_control_scrambled AUPRC: 0.11884671649723248
sample2_neg_control_scrambled Accuracy: 0.7610208816705336
   dataset                      condition     auroc     auprc  accuracy  \
0  sample2                        sample2  0.979501  0.947595  0.965197   
1  sample2  sample2_neg_control_scrambled  0.

  summary = pd.concat([summary, dataset_summary_df])


Index([], dtype='object')
Index(['barcode', 'label', 'sample'], dtype='object')
{'doublet': 261, 'singlet': 1896}
100%|██████████| 10/10 [00:21<00:00,  2.18s/trial, best loss: -0.9368184736063129]
sample2 Best parameters for tree: {'colsample_bytree': 0.806967040988943, 'gamma': 0.8368856063118089, 'learning_rate': 0.1831550221167391, 'max_depth': 4, 'min_child_weight': 0, 'n_estimators': 66, 'reg_alpha': 0.9125520952126791, 'reg_lambda': 1.076755406162537, 'scale_pos_weight': 21.464024235615245, 'subsample': 0.8851540278700618}
sample2 Best tree score: {'loss': -0.9368184736063129, 'accuracy': 0.9513888888888888, 'status': 'ok', 'auroc': 0.9816801619433199, 'auprc': 0.9368184736063129}
sample2 AUROC: 0.9512380759082606
sample2 AUPRC: 0.8907910368934304
sample2 Accuracy: 0.9675174013921114
   dataset condition     auroc     auprc  accuracy  \
0  sample2   sample2  0.951238  0.890791  0.967517   

                                         best_params  
0  {'n_estimators': 67, 'max_depth'

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:22<00:00,  2.23s/trial, best loss: -0.17411415101198147]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.5286254946497734, 'gamma': 0.3562120569458226, 'learning_rate': 0.6713553032885272, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 40, 'reg_alpha': 0.6048777942312696, 'reg_lambda': 2.410678766459106, 'scale_pos_weight': 98.5466150302882, 'subsample': 0.6632423554883657}
sample2_neg_control_scrambled Best tree score: {'loss': -0.17411415101198147, 'accuracy': 0.7847222222222222, 'status': 'ok', 'auroc': 0.6000506072874494, 'auprc': 0.17411415101198147}
sample2_neg_control_scrambled AUROC: 0.5420641363913132
sample2_neg_control_scrambled AUPRC: 0.12384411838566318
sample2_neg_control_scrambled Accuracy: 0.7749419953596288
   dataset                      condition     auroc     auprc  accuracy  \
0  sample2                        sample2  0.951238  0.890791  0.967517   
1  sample2  sample2_neg_control_scrambled  0.5420

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:28<00:00,  2.84s/trial, best loss: -0.14315306663621097]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.7006032595534895, 'gamma': 0.2118167184067639, 'learning_rate': 0.4438432452119713, 'max_depth': 16, 'min_child_weight': 6, 'n_estimators': 52, 'reg_alpha': 0.2065573631852996, 'reg_lambda': 2.891244625046811, 'scale_pos_weight': 12.857939547057784, 'subsample': 0.6532411019570743}
sample2_neg_control_scrambled Best tree score: {'loss': -0.14315306663621097, 'accuracy': 0.8333333333333334, 'status': 'ok', 'auroc': 0.479251012145749, 'auprc': 0.14315306663621097}
sample2_neg_control_scrambled AUROC: 0.5591637913537648
sample2_neg_control_scrambled AUPRC: 0.15051727933428208
sample2_neg_control_scrambled Accuracy: 0.8468677494199536
   dataset                      condition     auroc     auprc  accuracy  \
0  sample2                        sample2  0.968947  0.902860  0.965197   
1  sample2  sample2_neg_control_scrambled  0.559

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:23<00:00,  2.39s/trial, best loss: -0.1798196146730927]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.744928161789364, 'gamma': 0.3015320646245827, 'learning_rate': 0.1311273156125351, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 84, 'reg_alpha': 0.5508193807638289, 'reg_lambda': 1.4102277269945196, 'scale_pos_weight': 94.27085668119365, 'subsample': 0.5974632088588407}
sample2_neg_control_scrambled Best tree score: {'loss': -0.1798196146730927, 'accuracy': 0.7986111111111112, 'status': 'ok', 'auroc': 0.5671558704453441, 'auprc': 0.1798196146730927}
sample2_neg_control_scrambled AUROC: 0.49436776943373245
sample2_neg_control_scrambled AUPRC: 0.13604712624015425
sample2_neg_control_scrambled Accuracy: 0.8074245939675174
   dataset                      condition     auroc     auprc  accuracy  \
0  sample2                        sample2  0.987213  0.951401  0.974478   
1  sample2  sample2_neg_control_scrambled  0.494368

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:41<00:00,  4.18s/trial, best loss: -0.1404148640070899]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.9719548582296155, 'gamma': 0.597704203945928, 'learning_rate': 0.49164726923185703, 'max_depth': 7, 'min_child_weight': 7, 'n_estimators': 86, 'reg_alpha': 0.6310371165951536, 'reg_lambda': 1.6026460854630824, 'scale_pos_weight': 79.01028781908917, 'subsample': 0.8344608637643643}
sample2_neg_control_scrambled Best tree score: {'loss': -0.1404148640070899, 'accuracy': 0.8194444444444444, 'status': 'ok', 'auroc': 0.5221659919028341, 'auprc': 0.1404148640070899}
sample2_neg_control_scrambled AUROC: 0.5824030850416074
sample2_neg_control_scrambled AUPRC: 0.15346285795183107
sample2_neg_control_scrambled Accuracy: 0.839907192575406
   dataset                      condition     auroc     auprc  accuracy  \
0  sample2                        sample2  0.947483  0.847218  0.953596   
1  sample2  sample2_neg_control_scrambled  0.582403 

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:40<00:00,  4.05s/trial, best loss: -0.14126781154051307]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.881349782221059, 'gamma': 0.4480606297159974, 'learning_rate': 0.4865646463394279, 'max_depth': 14, 'min_child_weight': 0, 'n_estimators': 17, 'reg_alpha': 0.3974270171764631, 'reg_lambda': 2.927503898818969, 'scale_pos_weight': 88.43332010042704, 'subsample': 0.7873575890233333}
sample2_neg_control_scrambled Best tree score: {'loss': -0.14126781154051307, 'accuracy': 0.8032407407407407, 'status': 'ok', 'auroc': 0.48537449392712556, 'auprc': 0.14126781154051307}
sample2_neg_control_scrambled AUROC: 0.5328800487111833
sample2_neg_control_scrambled AUPRC: 0.15932619748030732
sample2_neg_control_scrambled Accuracy: 0.8051044083526682
   dataset                      condition     auroc     auprc  accuracy  \
0  sample2                        sample2  0.996397  0.977313  0.983759   
1  sample2  sample2_neg_control_scrambled  0.532

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:24<00:00,  2.42s/trial, best loss: -0.18445355711566686]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.7874957045709474, 'gamma': 0.13414101104123627, 'learning_rate': 0.7167553861916275, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 26, 'reg_alpha': 0.9137767953408962, 'reg_lambda': 1.4045006951096042, 'scale_pos_weight': 67.89650618391099, 'subsample': 0.9088509300494692}
sample2_neg_control_scrambled Best tree score: {'loss': -0.18445355711566686, 'accuracy': 0.7824074074074074, 'status': 'ok', 'auroc': 0.5732793522267207, 'auprc': 0.18445355711566686}
sample2_neg_control_scrambled AUROC: 0.5418104323117515
sample2_neg_control_scrambled AUPRC: 0.136032566567042
sample2_neg_control_scrambled Accuracy: 0.8004640371229699
   dataset                      condition     auroc     auprc  accuracy  \
0  sample2                        sample2  0.981733  0.912356  0.948956   
1  sample2  sample2_neg_control_scrambled  0.5418

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:27<00:00,  2.77s/trial, best loss: -0.14152404407207309]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.5281709969023181, 'gamma': 0.22913329149507655, 'learning_rate': 0.9425257121048617, 'max_depth': 17, 'min_child_weight': 2, 'n_estimators': 34, 'reg_alpha': 0.34423491037938747, 'reg_lambda': 2.907516754428174, 'scale_pos_weight': 28.284891344109294, 'subsample': 0.6129561585224256}
sample2_neg_control_scrambled Best tree score: {'loss': -0.14152404407207309, 'accuracy': 0.8101851851851852, 'status': 'ok', 'auroc': 0.5288461538461539, 'auprc': 0.14152404407207309}
sample2_neg_control_scrambled AUROC: 0.4786381165009133
sample2_neg_control_scrambled AUPRC: 0.11353233812924525
sample2_neg_control_scrambled Accuracy: 0.7726218097447796
   dataset                      condition     auroc     auprc  accuracy  \
0  sample2                        sample2  0.995230  0.975824  0.981439   
1  sample2  sample2_neg_control_scrambled  0.

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:34<00:00,  3.43s/trial, best loss: -0.15917271045773007]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.9095138716908135, 'gamma': 0.3330104387324479, 'learning_rate': 0.18181120665111203, 'max_depth': 16, 'min_child_weight': 6, 'n_estimators': 60, 'reg_alpha': 0.7567296828327666, 'reg_lambda': 2.3549326646900397, 'scale_pos_weight': 60.76330849730732, 'subsample': 0.8692821119493064}
sample2_neg_control_scrambled Best tree score: {'loss': -0.15917271045773007, 'accuracy': 0.8611111111111112, 'status': 'ok', 'auroc': 0.581174089068826, 'auprc': 0.15917271045773007}
sample2_neg_control_scrambled AUROC: 0.43865435356200533
sample2_neg_control_scrambled AUPRC: 0.10666243604007654
sample2_neg_control_scrambled Accuracy: 0.8631090487238979
   dataset                      condition     auroc     auprc  accuracy  \
0  sample2                        sample2  0.965243  0.921167  0.972158   
1  sample2  sample2_neg_control_scrambled  0.4

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:39<00:00,  3.93s/trial, best loss: -0.1822437525965624]
sample2_neg_control_scrambled Best parameters for tree: {'colsample_bytree': 0.8230658875420314, 'gamma': 0.2237049696604586, 'learning_rate': 0.4260353490107853, 'max_depth': 4, 'min_child_weight': 8, 'n_estimators': 78, 'reg_alpha': 0.6802617685634006, 'reg_lambda': 1.7416922659991554, 'scale_pos_weight': 54.6933584627515, 'subsample': 0.9090297901255493}
sample2_neg_control_scrambled Best tree score: {'loss': -0.1822437525965624, 'accuracy': 0.8425925925925926, 'status': 'ok', 'auroc': 0.48248987854251013, 'auprc': 0.1822437525965624}
sample2_neg_control_scrambled AUROC: 0.5020296326364928
sample2_neg_control_scrambled AUPRC: 0.1477837243421046
sample2_neg_control_scrambled Accuracy: 0.8352668213457076
   dataset                      condition    auroc     auprc  accuracy  \
0  sample2                        sample2  0.97945  0.932417  0.969838   
1  sample2  sample2_neg_control_scrambled  0.50203  0.

In [2]:
def main(dataset):
    if dataset.startswith("0"): # denotes variable doublet rate datasets, where the dataset name is the doublet rate
        counts_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/FM01/variable_doublet_rates_2", dataset)
        labels_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/FM01/")
    else:
        counts_dir = os.path.join(data_dir, "10X", dataset)
        labels_dir = os.path.join(data_dir, "10X", dataset)

        #counts_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/s1s2", dataset, "10X_doublets_2/")
        #labels_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/s1s2", dataset)


    summary_df = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

    features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)
    #FM01_dict = use_FM01_classifier(features, labels_encoded, barcodes_1, results_dir, classifiers_dir)
    #summary_df = pd.concat([summary_df, pd.DataFrame([FM01_dict])], ignore_index=True)
    #print(summary_df)
    self_dict = train_classifier(features, labels_encoded, barcodes_1, dataset)
    summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)
    print(summary_df)
    scrambled_dict = negative_controls(features, labels_encoded, barcodes_1, dataset, matrix_array, genes)
    #singlets_dict, doublets_dict, scrambled_dict = negative_controls(features, labels_encoded, barcodes_1, dataset, matrix_array, genes)
    #summary_df = pd.concat([summary_df, pd.DataFrame([singlets_dict])], ignore_index=True)
    #summary_df = pd.concat([summary_df, pd.DataFrame([doublets_dict])], ignore_index=True)
    #summary_df = pd.concat([summary_df, pd.DataFrame([shuffled_dict])], ignore_index=True)
    summary_df = pd.concat([summary_df, pd.DataFrame([scrambled_dict])], ignore_index=True)
    print(summary_df)

    return summary_df
    

In [3]:
def load_and_preprocess_data(counts_dir, labels_dir):
    os.chdir(counts_dir)

    # Read the .mtx file
    matrix = scipy.io.mmread("matrix.mtx")
    matrix = matrix.transpose()
    matrix_array = matrix.toarray()

    # Read the features and barcodes files
    genes = pd.read_csv("features.tsv.gz", header=None, sep="\t")
    genes_list = genes.index.tolist()
    first_column = genes.columns[0]
    genes = genes.set_index(first_column) #this is to ensure the anndata object is created correctly and that there are no extra columns in the features or barcodes dfs
    barcodes = pd.read_csv("barcodes.tsv.gz", header=None, sep="\t")
    first_column = barcodes.columns[0]
    barcodes = barcodes.set_index(first_column)

    # Create the AnnData object
    data = anndata.AnnData(X=matrix_array, var=genes_list, obs=barcodes)

    # getting singlet and multiplet labels
    os.chdir(labels_dir)
    labels_df = pd.read_csv(f'labels_2.csv') #for "variable doublet rates", indicate _{dataset} here

    ############## Preprocessing data
    print(data.obs.columns)  # Check columns in data.obs
    print(labels_df.columns) 

    # combining features matrix and labels
    data.obs.index = data.obs.index.rename('barcode')
    merged = data.obs.merge(labels_df, on='barcode', how='inner')
    #print(merged.head()) #checking what the merged looks like

    # Extract the features matrix and labels
    features = data.X
    labels = merged['label'].values
    unique_labels, counts = np.unique(labels, return_counts=True)
    print(dict(zip(unique_labels, counts))) #checking the number of singlets and multiplets

    # Encode labels
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels)
    labels_encoded = 1 - labels_encoded #switching the labels so that 1s are multiplets and 0s are singlets, so correclty identified 1s are considered true positives
    counts = np.bincount(labels_encoded)
    #print(counts) #checking that the number of singlets and multiplets is the same as above

    barcodes_1 = data.obs.index.to_numpy() #getting the barcodes for the features matrix to identify the cells that are being classified

    return features, labels_encoded, barcodes_1, matrix_array, genes

In [4]:
def train_classifier(features, labels_encoded, barcodes_1, condition):
    # split into training, testing, and validation sets
    X_train, X_test_temp, y_train, y_test_temp, barcodes_train, barcodes_test = train_test_split(features, labels_encoded, barcodes_1, test_size=0.4, random_state=count, shuffle=True, stratify=labels_encoded) # 40% test set, 60% training set
    X_test, X_valid, y_test, y_valid, barcodes_test, barcodes_valid = train_test_split(X_test_temp, y_test_temp, barcodes_test, test_size=0.5, random_state=count, shuffle=True, stratify=y_test_temp) # Split 50% of the test set into a validation set
    #print the first 10 rows of the training set
    #print(X_train[:10])
    #print the first 10 rows of the test_temp set
    #print(X_test_temp[:10])


    # Define the hyperparameter space
    space_tree = {
        'n_estimators': hp.choice('n_estimators', range(1, 100)),
        'max_depth': hp.choice('max_depth', range(1, 20)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 1),
        'objective': 'binary:logistic',
        'min_child_weight': hp.choice('min_child_weight', range(1, 10)),
        'gamma': hp.uniform('gamma', 0.1, 1.0),
        'subsample': hp.uniform('subsample', 0.5, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
        'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': hp.uniform('reg_lambda', 1.0, 3.0),
        'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 100),
        'booster': 'gbtree'
    }

    # Define objective function
    def objective(params):
        bst = XGBClassifier(**params, random_state=count)
        bst.fit(X_train, y_train)
        preds = bst.predict(X_valid)
        preds_proba = bst.predict_proba(X_valid)[:, 1]
        accuracy = accuracy_score(y_valid, preds)
        auroc = roc_auc_score(y_valid, preds_proba)  # Calculate AUROC
        auprc = average_precision_score(y_valid, preds_proba)  # Calculate AUPRC
        return {'loss': -auprc, 'accuracy': accuracy, 'status': STATUS_OK, 'auroc': auroc, 'auprc': auprc}

    # Run the hyperparameter optimization
    trials_tree = Trials()
    best_tree = fmin(fn=objective, space=space_tree, algo=tpe.suggest, max_evals=10, trials=trials_tree)
    print(f"{condition} Best parameters for tree: {best_tree}")

    # Summary of the success of the hyperparameter optimization
    best_tree_score = min(trials_tree.results, key=lambda x: x['loss'])
    print(f"{condition} Best tree score: {best_tree_score}")

    # Adjusting the hyperparameters
    best_params_tree = {
        'n_estimators': best_tree['n_estimators'] + 1,  # +1 because hp.choice returns an index
        'max_depth': best_tree['max_depth'] + 1,        # +1 for the same reason
        'learning_rate': best_tree['learning_rate'],
        'objective': 'binary:logistic',
        'min_child_weight': best_tree['min_child_weight'] + 1,  # Adjust if needed
        'gamma': best_tree['gamma'],
        'subsample': best_tree['subsample'],
        'reg_alpha': best_tree['reg_alpha'],
        'reg_lambda': best_tree['reg_lambda'],
        'scale_pos_weight': best_tree['scale_pos_weight'],
        'booster': 'gbtree'
    }

    # Retrain the classifier with the best hyperparameters
    bst_best = XGBClassifier(**best_params_tree, random_state=23)
    if condition == dataset:
        dump(bst_best, classifiers_dir + f'/{dataset}_{count}_unfit.joblib') #saving unfit classifier
    bst_best.fit(X_train, y_train)
    if condition == dataset:
        dump(bst_best, classifiers_dir + f'/{dataset}_{count}_fit.joblib') #saving fit classifier

    preds_proba = bst_best.predict_proba(X_test)[:,1]  # Get probabilities of the positive class (multiplets- 1)
    auroc = roc_auc_score(y_test, preds_proba) # Calculate AUROC
    print(f"{condition} AUROC: {auroc}")
    auprc = average_precision_score(y_test, preds_proba) # Calculate AUPRC
    print(f"{condition} AUPRC: {auprc}")
    y_preds = bst_best.predict(X_test) # Predict labels on the test set
    accuracy = accuracy_score(y_test, y_preds) # Calculate accuracy
    print(f"{condition} Accuracy: {accuracy}")
    results = pd.DataFrame({
        'barcode': barcodes_test,
        'prediction probability': preds_proba,
        'predicted': y_preds,
        'actual': y_test
    })
    results.to_csv(os.path.join(results_dir, dataset_main, f"{condition}_predicted_actual.csv"))

    summary_dict = {"dataset": dataset, "condition": condition, "auroc": auroc, "auprc": auprc, "accuracy": accuracy, "best_params": best_params_tree}

    return summary_dict

In [5]:
def negative_controls(features, labels_encoded, barcodes_1, dataset, matrix_array, genes):
    total_cells = len(features)
    num_doublets = int((len(features)*0.1)/0.9)
    num_singlets = int(total_cells - num_doublets)

    # ############### singlets only ###############

    # # selecting only singlets as a control
    # features_singlets = features[:(num_singlets), :] #singlets always first in the features matrix
    # # creating fake doublets from those singlets
    # labels_singlets = np.zeros(num_singlets)
    # fakedoubletsforsinglets = int((len(features_singlets)*0.1)/0.9)
    # # making 10% of the singlets into fake doublets
    # labels_singlets[0:fakedoubletsforsinglets] = 1 

    # barcodes_singlets = barcodes_1[:num_singlets]

    # singlets_dict = train_classifier(features_singlets, labels_singlets, barcodes_singlets, f"{dataset}_neg_control_singlets")
    
    # ############### doublets only ###############
    
    # # selecting only doublets as a control
    # features_doublets = features[-num_doublets:, :] #doublets always last in the features matrix
    # # creating fake singlets from those doublets
    # labels_doublets = np.ones(num_doublets)
    # fakesingletsfordoublets = int(num_doublets - ((len(features_doublets)*0.1)/0.9))
    # # making 90% of the doublets into fake singlets
    # labels_doublets[0:fakesingletsfordoublets] = 0

    # barcodes_doublets = barcodes_1[-num_doublets:]
    
    # doublets_dict = train_classifier(features_doublets, labels_doublets, barcodes_doublets, f"{dataset}_neg_control_doublets")
 
    ############### shuffling features ###############

    # shuffled_indices = np.random.permutation(matrix_array.shape[1]) # Generate a shuffled index
    # matrix_array_shuffled = matrix_array[:, shuffled_indices] # Shuffle the columns of the matrix
    # genes_shuffled = genes.iloc[shuffled_indices] # Reorder the features DataFrame to match the new column order (NOTE: this doesnt get used here)

    # shuffled_data = anndata.AnnData(X=matrix_array_shuffled, var=genes_shuffled, obs=barcodes_1) #(NOTE: this doesnt get used here-- should i use it? probably unnecessary to specify gene names)

    # features_shuffled = shuffled_data.X

    # shuffled_dict = train_classifier(features_shuffled, labels_encoded, barcodes_1, f"{dataset}_shuffled")

    ############### scrambling features ###############

    flattened_matrix = matrix_array.flatten()
    np.random.shuffle(flattened_matrix)
    scrambled_matrix_array = flattened_matrix.reshape(matrix_array.shape)

    features_scrambled = scrambled_matrix_array

    scrambled_dict = train_classifier(features_scrambled, labels_encoded, barcodes_1, f"{dataset}_neg_control_scrambled")

    return scrambled_dict #(removed shuffled_dict 20240513, singlets_dict, doublets_dict on 20240515)

In [None]:
dataset = "s1nc_positiveControl"
condition = f"{dataset}_standard"
results_dir = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/results/"
classifiers_dir = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/classifiers/"

counts_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier", dataset, "10X_doublets_2/")
#counts_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/FM01/variable_doublet_rates", dataset)
labels_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier", dataset)
#labels_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/FM01/")

summary_df = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)
#FM01_dict = use_FM01_classifier(features, labels_encoded, barcodes_1, results_dir, classifiers_dir)
#summary_df = pd.concat([summary_df, pd.DataFrame([FM01_dict])], ignore_index=True)
#print(summary_df)
self_dict = train_classifier(features, labels_encoded, barcodes_1, condition)
summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)
print(summary_df)
singlets_dict, doublets_dict, shuffled_dict, scrambled_dict = negative_controls(features, labels_encoded, barcodes_1, condition, matrix_array, genes)
summary_df = pd.concat([summary_df, pd.DataFrame([singlets_dict])], ignore_index=True)
summary_df = pd.concat([summary_df, pd.DataFrame([doublets_dict])], ignore_index=True)
summary_df = pd.concat([summary_df, pd.DataFrame([shuffled_dict])], ignore_index=True)
summary_df = pd.concat([summary_df, pd.DataFrame([scrambled_dict])], ignore_index=True)
print(summary_df)


In [None]:
####### writing a function to scramble the data 

def negative_controls(features, labels_encoded, barcodes_1, dataset, matrix_array, genes):
    flattened_matrix = matrix_array.flatten()
    np.random.shuffle(flattened_matrix)
    scrambled_matrix_array = flattened_matrix.reshape(matrix_array.shape)

    features_scrambled = scrambled_matrix_array

    scrambled_dict = train_classifier(features_scrambled, labels_encoded, barcodes_1, f"{dataset}_neg_control_scrambled")

    return scrambled_dict #(removed shuffled_dict 20240513, singlets_dict, doublets_dict on 20240515)

In [None]:
######## writing a function to use the classifier on a 

In [30]:
######### testing classifiers on scrambled samples as a negative control

dataset_main = "FateMap"

classifiers_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/classifiers", dataset_main)
print(classifiers_dir)

# loading and formatting sample 2 data
dataset_classifier = "sample2"
dataset_predict = "sample1"

counts_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset_predict)
labels_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset_predict)
results_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/predictions/"
summaries_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/datasetSummaries/"
features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)

#scerambling matrix
flattened_matrix = matrix_array.flatten()
np.random.shuffle(flattened_matrix)
scrambled_matrix_array = flattened_matrix.reshape(matrix_array.shape)
features_scrambled = scrambled_matrix_array

summary_scrambled = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

#classifier list for Goyal et al. samples came from identifiying which classifiers points were randomly chosen to be plotted in classifierCrossExperimentPlots.R
FMClassifierList_sample1 = ["28", "27", "71", "42", "44", "33", "47", "16", "20", "92"]
FMClassifierList_sample2 = ["39", "35", "69", "97", "30", "85", "76", "41", "9", "88"]

for count in FMClassifierList_sample2 :
    sample1Classifier = load(classifiers_dir + f'/{dataset_classifier}_{count}_fit.joblib')
    # predict sample 2 outcome using sample 1's classifier
    preds_proba = sample1Classifier.predict_proba(features_scrambled)[:,1]  # Get probabilities of the positive class (multiplets- 1)
    auroc = roc_auc_score(labels_encoded, preds_proba) # Calculate AUROC
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(labels_encoded, preds_proba) # Calculate AUPRC
    print(f"AUPRC: {auprc}")
    y_preds = sample1Classifier.predict(features_scrambled) # Predict labels on the test set
    accuracy = accuracy_score(labels_encoded, y_preds) # Calculate accuracy
    print(f"Accuracy: {accuracy}")
    summary_dict = {"dataset": dataset_predict, "condition": dataset_classifier, "auroc": auroc, "auprc": auprc, "accuracy": accuracy}  
    summary_scrambled = pd.concat([summary_scrambled, pd.DataFrame([summary_dict])], ignore_index=True)

summary_scrambled.to_csv(os.path.join(summaries_dir, dataset_main, f"{dataset_predict}Scrambled_{dataset_classifier}Classifier.csv"), index = False)


/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/classifiers/FateMap
Index([], dtype='object')
Index(['barcode', 'label', 'sample'], dtype='object')
{'doublet': 489, 'singlet': 4401}
AUROC: 0.5199415544617347
AUPRC: 0.1019402802787604
Accuracy: 0.12249488752556237
AUROC: 0.5090184467278073
AUPRC: 0.10371919621402345
Accuracy: 0.1310838445807771
AUROC: 0.5269368506599866
AUPRC: 0.1111185799670817
Accuracy: 0.4768916155419223


  summary_scrambled = pd.concat([summary_scrambled, pd.DataFrame([summary_dict])], ignore_index=True)


AUROC: 0.5090474882776689
AUPRC: 0.10551955713110982
Accuracy: 0.12740286298568507
AUROC: 0.512489725099659
AUPRC: 0.10460181871601035
Accuracy: 0.1554192229038855
AUROC: 0.49951279895952255
AUPRC: 0.10010035110362053
Accuracy: 0.17873210633946832
AUROC: 0.5106034183530515
AUPRC: 0.10098740969360868
Accuracy: 0.11697341513292434
AUROC: 0.5297708412616764
AUPRC: 0.10737775739888533
Accuracy: 0.28445807770961146
AUROC: 0.5061651725370094
AUPRC: 0.10005280424928553
Accuracy: 0.13680981595092023
AUROC: 0.5021502363517494
AUPRC: 0.0984297586246487
Accuracy: 0.15807770961145196


In [21]:
### classifier fit to sample 1 data to predict sample 2 outcome
dataset_main = "TREX_minusCluster"

classifiers_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/classifiers", dataset_main)
print(classifiers_dir)

# loading and formatting sample 2 data

dataset = "sample2"
counts_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
labels_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
results_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/predictions/"
summaries_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/datasetSummaries/"
features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)

summary_df = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

for count in range(10):
    sample1Classifier = load(classifiers_dir + f'/sample1_{count}_fit.joblib') #using the classifier that was fit to sample 1
    # predict sample 2 outcome using sample 1's classifier
    preds_proba = sample1Classifier.predict_proba(features)[:,1]  # Get probabilities of the positive class (multiplets- 1)
    auroc = roc_auc_score(labels_encoded, preds_proba) # Calculate AUROC
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(labels_encoded, preds_proba) # Calculate AUPRC
    print(f"AUPRC: {auprc}")
    y_preds = sample1Classifier.predict(features) # Predict labels on the test set
    accuracy = accuracy_score(labels_encoded, y_preds) # Calculate accuracy
    print(f"Accuracy: {accuracy}")  
    summary_dict = {"dataset": dataset, "condition": "sample1", "auroc": auroc, "auprc": auprc, "accuracy": accuracy}  
    summary_df = pd.concat([summary_df, pd.DataFrame([summary_dict])], ignore_index=True)

#loading and formatting sample 2 data

dataset = "sample1"
counts_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
labels_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
results_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/predictions/"
summaries_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/datasetSummaries/"
features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)

for count in range(10):
    sample1Classifier = load(classifiers_dir + f'/sample2_{count}_fit.joblib') #using the classifier that was fit to sample 1
    # predict sample 2 outcome using sample 1's classifier
    preds_proba = sample1Classifier.predict_proba(features)[:,1]  # Get probabilities of the positive class (multiplets- 1)
    auroc = roc_auc_score(labels_encoded, preds_proba) # Calculate AUROC
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(labels_encoded, preds_proba) # Calculate AUPRC
    print(f"AUPRC: {auprc}")
    y_preds = sample1Classifier.predict(features) # Predict labels on the test set
    accuracy = accuracy_score(labels_encoded, y_preds) # Calculate accuracy
    print(f"Accuracy: {accuracy}")  
    summary_dict = {"dataset": dataset, "condition": "sample2", "auroc": auroc, "auprc": auprc, "accuracy": accuracy}  
    summary_df = pd.concat([summary_df, pd.DataFrame([summary_dict])], ignore_index=True)

# loading and formatting sample 3 data

# dataset = "TREX1_A_mouse"
# counts_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
# labels_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
# results_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/predictions/"
# summaries_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/datasetSummaries/"
# features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)

# for count in range(10):
#     sample2Classifier = load(classifiers_dir + f'/B_mouse_{count}_fit.joblib')
#     # predict sample 1 outcome using sample 2's classifier
#     preds_proba = sample2Classifier.predict_proba(features)[:,1]  # Get probabilities of the positive class (multiplets- 1)
#     auroc = roc_auc_score(labels_encoded, preds_proba) # Calculate AUROC
#     print(f"AUROC: {auroc}")
#     auprc = average_precision_score(labels_encoded, preds_proba) # Calculate AUPRC
#     print(f"AUPRC: {auprc}")
#     y_preds = sample2Classifier.predict(features) # Predict labels on the test set
#     accuracy = accuracy_score(labels_encoded, y_preds) # Calculate accuracy
#     print(f"Accuracy: {accuracy}")  
#     summary_dict = {"dataset": dataset, "condition": "B_mouse", "auroc": auroc, "auprc": auprc, "accuracy": accuracy}  
#     summary_df = pd.concat([summary_df, pd.DataFrame([summary_dict])], ignore_index=True)
    
summary_df.to_csv(os.path.join(summaries_dir, dataset_main, f"summary_s1s2_bothClassifiers.csv"), index = False)


/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/classifiers/TREX_minusCluster
Index([], dtype='object')
Index(['barcode', 'label', 'sample'], dtype='object')
{'doublet': 261, 'singlet': 1896}
AUROC: 0.9510766768514478
AUPRC: 0.7707232602093118
Accuracy: 0.9318497913769124
AUROC: 0.9185662091598364
AUPRC: 0.6838144774997007
Accuracy: 0.9225776541492814
AUROC: 0.9545120196582441
AUPRC: 0.8114914466093169
Accuracy: 0.943439962911451
AUROC: 0.8617395767657661
AUPRC: 0.5252227961227719
Accuracy: 0.8525730180806675
AUROC: 0.9137668331797534
AUPRC: 0.724575773592798
Accuracy: 0.8451553082985628


  summary_df = pd.concat([summary_df, pd.DataFrame([summary_dict])], ignore_index=True)


AUROC: 0.9290925036778375
AUPRC: 0.7522399018261122
Accuracy: 0.9188687992582291
AUROC: 0.9535824563105227
AUPRC: 0.8142084184347304
Accuracy: 0.9397311080203987
AUROC: 0.8995788673876846
AUPRC: 0.6184635104884548
Accuracy: 0.9114510894761243
AUROC: 0.9421670142425271
AUPRC: 0.7482209475989734
Accuracy: 0.9049605934167826
AUROC: 0.9176972695087056
AUPRC: 0.6721937329922801
Accuracy: 0.8915159944367177
Index([], dtype='object')
Index(['barcode', 'label', 'sample'], dtype='object')
{'doublet': 285, 'singlet': 2591}
AUROC: 0.7310271046199056
AUPRC: 0.2776028345167208
Accuracy: 0.7576495132127955
AUROC: 0.6885331816612159
AUPRC: 0.19388688748460592
Accuracy: 0.7593880389429764
AUROC: 0.7325627848084124
AUPRC: 0.252638909611973
Accuracy: 0.7340055632823366
AUROC: 0.6836227968609289
AUPRC: 0.2012268288029272
Accuracy: 0.7868567454798331
AUROC: 0.7638749517560787
AUPRC: 0.2674988583599915
Accuracy: 0.8306675938803895
AUROC: 0.7679199929580802
AUPRC: 0.2724065236103098
Accuracy: 0.820931849791

In [44]:
# loading and formatting sample 3 data for sample 1 classifier (can only do when 1-4 are integrated together)

dataset_main = "SPLINTR"
classifiers_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/classifiers", dataset_main)

dataset = "sample3_s1s2s3s4"
counts_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
labels_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
results_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/predictions/"
summaries_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/datasetSummaries/"
features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)

summary_df = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

for count in range(10):
    sample1Classifier = load(classifiers_dir + f'/sample1_s1s2s3s4_{count}_fit.joblib') #using the classifier that was fit to sample 1
    # predict sample 2 outcome using sample 1's classifier
    preds_proba = sample1Classifier.predict_proba(features)[:,1]  # Get probabilities of the positive class (multiplets- 1)
    auroc = roc_auc_score(labels_encoded, preds_proba) # Calculate AUROC
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(labels_encoded, preds_proba) # Calculate AUPRC
    print(f"AUPRC: {auprc}")
    y_preds = sample1Classifier.predict(features) # Predict labels on the test set
    accuracy = accuracy_score(labels_encoded, y_preds) # Calculate accuracy
    print(f"Accuracy: {accuracy}")  
    summary_dict = {"dataset": dataset, "condition": "sample1_s1s2s3s4", "auroc": auroc, "auprc": auprc, "accuracy": accuracy}  
    summary_df = pd.concat([summary_df, pd.DataFrame([summary_dict])], ignore_index=True)

# loading and formatting sample 1 data for sample 3 classifier

dataset = "sample1_s1s2s3s4"
counts_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
labels_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment", dataset_main, "10X", dataset)
results_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/predictions/"
summaries_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/crossExperiment/results/datasetSummaries/"
features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)

for count in range(10):
    sample3Classifier = load(classifiers_dir + f'/sample3_s1s2s3s4_{count}_fit.joblib')
    # predict sample 1 outcome using sample 2's classifier
    preds_proba = sample3Classifier.predict_proba(features)[:,1]  # Get probabilities of the positive class (multiplets- 1)
    auroc = roc_auc_score(labels_encoded, preds_proba) # Calculate AUROC
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(labels_encoded, preds_proba) # Calculate AUPRC
    print(f"AUPRC: {auprc}")
    y_preds = sample3Classifier.predict(features) # Predict labels on the test set
    accuracy = accuracy_score(labels_encoded, y_preds) # Calculate accuracy
    print(f"Accuracy: {accuracy}")  
    summary_dict = {"dataset": dataset, "condition": "sample3_s1s2s3s4", "auroc": auroc, "auprc": auprc, "accuracy": accuracy}  
    summary_df = pd.concat([summary_df, pd.DataFrame([summary_dict])], ignore_index=True)
    
summary_df.to_csv(os.path.join(summaries_dir, dataset_main, f"summary_s1s3_bothClassifiers.csv"), index = False)

Index([], dtype='object')
Index(['barcode', 'label', 'sample'], dtype='object')
{'doublet': 135, 'singlet': 1215}
AUROC: 0.8515165371132449
AUPRC: 0.5098618964226356
Accuracy: 0.9096296296296297
AUROC: 0.8658802011888432
AUPRC: 0.5072343603720617
Accuracy: 0.9044444444444445
AUROC: 0.8347508001828989
AUPRC: 0.34345829549838924
Accuracy: 0.9014814814814814
AUROC: 0.8767931717725956
AUPRC: 0.5058655729883778
Accuracy: 0.9007407407407407
AUROC: 0.8410973936899863
AUPRC: 0.4090397503696277
Accuracy: 0.9007407407407407
AUROC: 0.7955067825026673
AUPRC: 0.36807780587320094
Accuracy: 0.9051851851851852


  summary_df = pd.concat([summary_df, pd.DataFrame([summary_dict])], ignore_index=True)


AUROC: 0.8939003200731596
AUPRC: 0.5099632387185461
Accuracy: 0.9081481481481481
AUROC: 0.8743789056546258
AUPRC: 0.5357379806832681
Accuracy: 0.902962962962963
AUROC: 0.8952903520804756
AUPRC: 0.538431626399674
Accuracy: 0.9044444444444445
AUROC: 0.8758847736625516
AUPRC: 0.47777191050308376
Accuracy: 0.9022222222222223
Index([], dtype='object')
Index(['barcode', 'label', 'sample'], dtype='object')
{'doublet': 96, 'singlet': 864}
AUROC: 0.8838252314814815
AUPRC: 0.5555276873563401
Accuracy: 0.9104166666666667
AUROC: 0.854275173611111
AUPRC: 0.48182607364034485
Accuracy: 0.8645833333333334
AUROC: 0.8705994405864197
AUPRC: 0.5382235540883269
Accuracy: 0.890625
AUROC: 0.8704306520061728
AUPRC: 0.48657110356699407
Accuracy: 0.8104166666666667
AUROC: 0.8851996527777779
AUPRC: 0.5629344182617926
Accuracy: 0.8677083333333333
AUROC: 0.8966169945987654
AUPRC: 0.5243092648565193
Accuracy: 0.85625
AUROC: 0.8794126157407408
AUPRC: 0.6042074574858988
Accuracy: 0.80625
AUROC: 0.9078534915123456
AUP