In [12]:
### training a doublet classifier on DNA barcoded "ground truth" singlets for Zhang Melzer et al. 2023
### Created by Madeline E Melzer on 20231110
### Last edited by Madeline E Melzer on 20240120

In [13]:
import os
import xgboost
import sklearn
import shutil
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from scipy.io import mmread
import anndata
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from joblib import dump, load
print(xgboost.__version__)

np.random.seed(23)

2.0.2


In [14]:
### variable doublet rates

dataset_list = ["0.05", "0.08", "0.10", "0.15", "0.20", "0.25"]
results_dir = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/results/"
classifiers_dir = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/classifiers/"

summary = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

for dataset in dataset_list:
    for count in range(10):
        dataset_summary_df = main(dataset)
        summary = pd.concat([summary, dataset_summary_df])
    summary.to_csv(f"/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/results/datasetSummaries/{dataset}.csv", index = False)

summary.to_csv("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/results/summary_variableDoubletRates.csv", index = False)

KeyboardInterrupt: 

In [15]:
### different datasets
np.random.seed(23)

#dataset_list = ["s1nc_positiveControl"]
#dataset_list = ["non_cancer", "s1nc_positiveControl", "Biorxiv", "TREX", "SPLINTR", "ClonMapper"]
#dataset_list = ["sample1", "sample2"]
dataset_list = ["LARRY", "Biorxiv"]
results_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/results/"
classifiers_dir = "/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/classifiers/"

summary = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])
for dataset in dataset_list:
    for count in range(10):
        np.random.seed(count)
        dataset_summary_df = main(dataset)
        summary = pd.concat([summary, dataset_summary_df])
    summary.to_csv(f"/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier/results/datasetSummaries/datasets/{dataset}.csv", index = False)

#summary.to_csv("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/results/summary_LARRYWatermelonCellTag.csv", index = False)

{'doublet': 217, 'singlet': 1953}
100%|██████████| 10/10 [00:17<00:00,  1.74s/trial, best loss: -0.9962070056369545]
LARRY Best parameters for tree: {'colsample_bytree': 0.9314134225744485, 'gamma': 0.8066099044465401, 'learning_rate': 0.2388349210983473, 'max_depth': 15, 'min_child_weight': 4, 'n_estimators': 38, 'reg_alpha': 0.8990227809632552, 'reg_lambda': 1.719095808882156, 'scale_pos_weight': 6.11968327069455, 'subsample': 0.6926283577703484}
LARRY Best tree score: {'loss': -0.9962070056369545, 'accuracy': 0.9746543778801844, 'status': 'ok', 'auroc': 0.999583655504669, 'auprc': 0.9962070056369545}
LARRY AUROC: 0.9989510489510489
LARRY AUPRC: 0.9915958150429074
LARRY Accuracy: 0.9723502304147466
  dataset condition     auroc     auprc  accuracy  \
0   LARRY     LARRY  0.998951  0.991596   0.97235   

                                         best_params  
0  {'n_estimators': 39, 'max_depth': 16, 'learnin...  


  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:26<00:00,  2.70s/trial, best loss: -0.17598104520638386]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.7462264203267849, 'gamma': 0.7865926693973192, 'learning_rate': 0.5123312652804152, 'max_depth': 15, 'min_child_weight': 1, 'n_estimators': 53, 'reg_alpha': 0.09435844721714315, 'reg_lambda': 1.725615047323557, 'scale_pos_weight': 77.22603855144048, 'subsample': 0.7900253137853039}
LARRY_neg_control_singlets Best tree score: {'loss': -0.17598104520638386, 'accuracy': 0.8652849740932642, 'status': 'ok', 'auroc': 0.5919723371076006, 'auprc': 0.17598104520638386}
LARRY_neg_control_singlets AUROC: 0.43189368770764114
LARRY_neg_control_singlets AUPRC: 0.1282572593308317
LARRY_neg_control_singlets Accuracy: 0.8316062176165803
100%|██████████| 10/10 [00:03<00:00,  2.54trial/s, best loss: -0.4421012849584278]
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.8311109728717065, 'gamma': 0.8832702553106381, 'learn

  summary = pd.concat([summary, dataset_summary_df])


{'doublet': 217, 'singlet': 1953}
100%|██████████| 10/10 [00:22<00:00,  2.28s/trial, best loss: -0.9708771007127481]
LARRY Best parameters for tree: {'colsample_bytree': 0.9222329222955502, 'gamma': 0.58828859868466, 'learning_rate': 0.2022133071330132, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 81, 'reg_alpha': 0.45794542262798454, 'reg_lambda': 1.572547695551997, 'scale_pos_weight': 87.69208152012081, 'subsample': 0.6825541729901274}
LARRY Best tree score: {'loss': -0.9708771007127481, 'accuracy': 0.9838709677419355, 'status': 'ok', 'auroc': 0.9928904428904428, 'auprc': 0.9708771007127481}
LARRY AUROC: 0.9995836555046691
LARRY AUPRC: 0.9963590240780426
LARRY Accuracy: 0.9953917050691244
  dataset condition     auroc     auprc  accuracy  \
0   LARRY     LARRY  0.999584  0.996359  0.995392   

                                         best_params  
0  {'n_estimators': 82, 'max_depth': 3, 'learning...  


  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:20<00:00,  2.01s/trial, best loss: -0.1433695166004305]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.7742578673936483, 'gamma': 0.2546002258001565, 'learning_rate': 0.9616136397722715, 'max_depth': 1, 'min_child_weight': 4, 'n_estimators': 46, 'reg_alpha': 0.7081866399764161, 'reg_lambda': 1.3566956659397942, 'scale_pos_weight': 46.48649107951618, 'subsample': 0.7578144544972563}
LARRY_neg_control_singlets Best tree score: {'loss': -0.1433695166004305, 'accuracy': 0.7046632124352331, 'status': 'ok', 'auroc': 0.5496304834226049, 'auprc': 0.1433695166004305}
LARRY_neg_control_singlets AUROC: 0.491490948538884
LARRY_neg_control_singlets AUPRC: 0.1452825533057234
LARRY_neg_control_singlets Accuracy: 0.7564766839378239
100%|██████████| 10/10 [00:03<00:00,  2.55trial/s, best loss: -0.3658641975308642]
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.5073483455230736, 'gamma': 0.5577649307294584, 'learning_ra

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:24<00:00,  2.41s/trial, best loss: -0.12380084072905273]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.6109098830178765, 'gamma': 0.36014709841358983, 'learning_rate': 0.09619188137560936, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 30, 'reg_alpha': 0.5842437656741174, 'reg_lambda': 2.251449347726057, 'scale_pos_weight': 13.715096525242563, 'subsample': 0.947672969532054}
LARRY_neg_control_singlets Best tree score: {'loss': -0.12380084072905273, 'accuracy': 0.8808290155440415, 'status': 'ok', 'auroc': 0.546748932130992, 'auprc': 0.12380084072905273}
LARRY_neg_control_singlets AUROC: 0.43989423011729606
LARRY_neg_control_singlets AUPRC: 0.1013974301655593
LARRY_neg_control_singlets Accuracy: 0.8704663212435233
100%|██████████| 10/10 [00:03<00:00,  2.67trial/s, best loss: -0.2305728088336784]
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.7706260956728055, 'gamma': 0.6222620435365263, 'learni

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:21<00:00,  2.14s/trial, best loss: -0.13251313400057307]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.6758146274738852, 'gamma': 0.4267392235272057, 'learning_rate': 0.3838857948174676, 'max_depth': 18, 'min_child_weight': 0, 'n_estimators': 15, 'reg_alpha': 0.09250976866610605, 'reg_lambda': 2.740370461638201, 'scale_pos_weight': 98.10015202505329, 'subsample': 0.6277718137094224}
LARRY_neg_control_singlets Best tree score: {'loss': -0.13251313400057307, 'accuracy': 0.7797927461139896, 'status': 'ok', 'auroc': 0.4718285985490542, 'auprc': 0.13251313400057307}
LARRY_neg_control_singlets AUROC: 0.3811105837683911
LARRY_neg_control_singlets AUPRC: 0.08937797301322155
LARRY_neg_control_singlets Accuracy: 0.7564766839378239
100%|██████████| 10/10 [00:04<00:00,  2.35trial/s, best loss: -0.4921144835332478]
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.7563255712318411, 'gamma': 0.9350600367728926, 'learn

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:18<00:00,  1.90s/trial, best loss: -0.12588884223468397]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.769614049784858, 'gamma': 0.3385394585748523, 'learning_rate': 0.6888216191341471, 'max_depth': 13, 'min_child_weight': 0, 'n_estimators': 79, 'reg_alpha': 0.7495653769263544, 'reg_lambda': 1.8890493280065621, 'scale_pos_weight': 17.695019016441904, 'subsample': 0.6410342000851053}
LARRY_neg_control_singlets Best tree score: {'loss': -0.12588884223468397, 'accuracy': 0.8393782383419689, 'status': 'ok', 'auroc': 0.5263407688656858, 'auprc': 0.12588884223468397}
LARRY_neg_control_singlets AUROC: 0.4264695911587226
LARRY_neg_control_singlets AUPRC: 0.09712136282315606
LARRY_neg_control_singlets Accuracy: 0.8393782383419689
100%|██████████| 10/10 [00:04<00:00,  2.27trial/s, best loss: -0.37077922077922076]
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.663901407150939, 'gamma': 0.3165130657388645, 'learn

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:19<00:00,  1.94s/trial, best loss: -0.14160854898902392]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.5103444593436994, 'gamma': 0.4242070727972568, 'learning_rate': 0.3876957064189051, 'max_depth': 10, 'min_child_weight': 8, 'n_estimators': 79, 'reg_alpha': 0.7090960511640536, 'reg_lambda': 1.987322072702891, 'scale_pos_weight': 22.720117212179204, 'subsample': 0.9429635742153433}
LARRY_neg_control_singlets Best tree score: {'loss': -0.14160854898902392, 'accuracy': 0.8756476683937824, 'status': 'ok', 'auroc': 0.5399688114448438, 'auprc': 0.14160854898902392}
LARRY_neg_control_singlets AUROC: 0.4648450742423215
LARRY_neg_control_singlets AUPRC: 0.1408106323403116
LARRY_neg_control_singlets Accuracy: 0.8860103626943006
100%|██████████| 10/10 [00:03<00:00,  2.96trial/s, best loss: -0.267469216937302] 
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.5759293853491132, 'gamma': 0.19751660181930894, 'learn

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:14<00:00,  1.45s/trial, best loss: -0.23214418639918738]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.776169428227975, 'gamma': 0.604620551526313, 'learning_rate': 0.7207989226852309, 'max_depth': 13, 'min_child_weight': 1, 'n_estimators': 2, 'reg_alpha': 0.7670225248667051, 'reg_lambda': 1.3789204872879262, 'scale_pos_weight': 14.626876369774674, 'subsample': 0.6503554123093244}
LARRY_neg_control_singlets Best tree score: {'loss': -0.23214418639918738, 'accuracy': 0.7849740932642487, 'status': 'ok', 'auroc': 0.6005152891721472, 'auprc': 0.23214418639918738}
LARRY_neg_control_singlets AUROC: 0.561902501864533
LARRY_neg_control_singlets AUPRC: 0.14417843165643773
LARRY_neg_control_singlets Accuracy: 0.7694300518134715
100%|██████████| 10/10 [00:04<00:00,  2.37trial/s, best loss: -0.4061327561327561]
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.6593409446456694, 'gamma': 0.39658650544155477, 'learnin

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:20<00:00,  2.06s/trial, best loss: -0.16437542276229675]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.9445552777299923, 'gamma': 0.9390879131112061, 'learning_rate': 0.3288292276384149, 'max_depth': 7, 'min_child_weight': 0, 'n_estimators': 2, 'reg_alpha': 0.48972295028320867, 'reg_lambda': 2.168736952027621, 'scale_pos_weight': 5.231528078758256, 'subsample': 0.9986032762052939}
LARRY_neg_control_singlets Best tree score: {'loss': -0.16437542276229675, 'accuracy': 0.8575129533678757, 'status': 'ok', 'auroc': 0.5841073971116685, 'auprc': 0.16437542276229675}
LARRY_neg_control_singlets AUROC: 0.5160349854227405
LARRY_neg_control_singlets AUPRC: 0.127341677774164
LARRY_neg_control_singlets Accuracy: 0.8341968911917098
100%|██████████| 10/10 [00:03<00:00,  2.72trial/s, best loss: -0.1748308642989494]
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.9615867025503647, 'gamma': 0.11508483511297468, 'learning

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:19<00:00,  1.94s/trial, best loss: -0.12952585938004174]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.5795145653025683, 'gamma': 0.7862995839383856, 'learning_rate': 0.37820965357756625, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 13, 'reg_alpha': 0.7409608518503473, 'reg_lambda': 1.299846354274626, 'scale_pos_weight': 70.98121113467366, 'subsample': 0.9476454318780863}
LARRY_neg_control_singlets Best tree score: {'loss': -0.12952585938004174, 'accuracy': 0.7875647668393783, 'status': 'ok', 'auroc': 0.5331886907586956, 'auprc': 0.12952585938004174}
LARRY_neg_control_singlets AUROC: 0.5107464912875449
LARRY_neg_control_singlets AUPRC: 0.1180792209945208
LARRY_neg_control_singlets Accuracy: 0.7953367875647669
100%|██████████| 10/10 [00:04<00:00,  2.12trial/s, best loss: -0.29682820731414084]
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.6104541487613253, 'gamma': 0.8476182112460512, 'learni

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:13<00:00,  1.37s/trial, best loss: -0.12944445145286687]
LARRY_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.7372120279292902, 'gamma': 0.943452841507474, 'learning_rate': 0.5146113140995613, 'max_depth': 10, 'min_child_weight': 5, 'n_estimators': 39, 'reg_alpha': 6.51273783398576e-05, 'reg_lambda': 1.700429833929366, 'scale_pos_weight': 72.80583485847436, 'subsample': 0.8688334550756527}
LARRY_neg_control_singlets Best tree score: {'loss': -0.12944445145286687, 'accuracy': 0.8238341968911918, 'status': 'ok', 'auroc': 0.48579564716251955, 'auprc': 0.12944445145286687}
LARRY_neg_control_singlets AUROC: 0.4605735982100481
LARRY_neg_control_singlets AUPRC: 0.1086653853026118
LARRY_neg_control_singlets Accuracy: 0.8186528497409327
100%|██████████| 10/10 [00:04<00:00,  2.01trial/s, best loss: -0.6091269841269841]
LARRY_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.8967174116996974, 'gamma': 0.10972919823542647, 'lear

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:30<00:00,  3.05s/trial, best loss: -0.39668054711255696]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.9060772811227649, 'gamma': 0.5903633737823762, 'learning_rate': 0.9791362145806073, 'max_depth': 13, 'min_child_weight': 7, 'n_estimators': 12, 'reg_alpha': 0.882455000371911, 'reg_lambda': 2.3857770681555497, 'scale_pos_weight': 14.028155765322389, 'subsample': 0.6944155127958811}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.39668054711255696, 'accuracy': 0.8769230769230769, 'status': 'ok', 'auroc': 0.7185960591133005, 'auprc': 0.39668054711255696}
Biorxiv_neg_control_singlets AUROC: 0.43130434782608695
Biorxiv_neg_control_singlets AUPRC: 0.1136060732192266
Biorxiv_neg_control_singlets Accuracy: 0.7615384615384615
100%|██████████| 10/10 [00:06<00:00,  1.46trial/s, best loss: -0.22549019607843135]
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.9821801167739901, 'gamma': 0.67039478063

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:31<00:00,  3.13s/trial, best loss: -0.20884962943177393]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.5940256178248907, 'gamma': 0.8071992433455206, 'learning_rate': 0.6340044842117066, 'max_depth': 8, 'min_child_weight': 4, 'n_estimators': 11, 'reg_alpha': 0.4458022624892528, 'reg_lambda': 2.758329252213735, 'scale_pos_weight': 71.24079637879221, 'subsample': 0.8923004923525581}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.20884962943177393, 'accuracy': 0.7923076923076923, 'status': 'ok', 'auroc': 0.5362318840579711, 'auprc': 0.20884962943177393}
Biorxiv_neg_control_singlets AUROC: 0.4722906403940887
Biorxiv_neg_control_singlets AUPRC: 0.11495734366621846
Biorxiv_neg_control_singlets Accuracy: 0.7615384615384615
100%|██████████| 10/10 [00:07<00:00,  1.39trial/s, best loss: -0.5666666666666667]
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.9307317240449791, 'gamma': 0.59966437594280

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:31<00:00,  3.13s/trial, best loss: -0.20099373018345099]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.9823701470004937, 'gamma': 0.26872894104067485, 'learning_rate': 0.3544935587129869, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 9, 'reg_alpha': 0.23932052125997905, 'reg_lambda': 2.8428887398967055, 'scale_pos_weight': 14.123329178058334, 'subsample': 0.7600304772124287}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.20099373018345099, 'accuracy': 0.8384615384615385, 'status': 'ok', 'auroc': 0.6065270935960592, 'auprc': 0.20099373018345099}
Biorxiv_neg_control_singlets AUROC: 0.4820289855072464
Biorxiv_neg_control_singlets AUPRC: 0.1540162700461019
Biorxiv_neg_control_singlets Accuracy: 0.8615384615384616
100%|██████████| 10/10 [00:03<00:00,  2.99trial/s, best loss: -0.29166666666666663]
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.8686190497945965, 'gamma': 0.2974704807

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:43<00:00,  4.31s/trial, best loss: -0.20642496691081566]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.9863362573036014, 'gamma': 0.30632324763329183, 'learning_rate': 0.5214271498565392, 'max_depth': 7, 'min_child_weight': 6, 'n_estimators': 83, 'reg_alpha': 0.4737884884602732, 'reg_lambda': 1.1672616407215033, 'scale_pos_weight': 54.05049365797987, 'subsample': 0.5273936661387503}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.20642496691081566, 'accuracy': 0.8307692307692308, 'status': 'ok', 'auroc': 0.5899014778325122, 'auprc': 0.20642496691081566}
Biorxiv_neg_control_singlets AUROC: 0.5414492753623188
Biorxiv_neg_control_singlets AUPRC: 0.18645581961536478
Biorxiv_neg_control_singlets Accuracy: 0.8153846153846154
100%|██████████| 10/10 [00:06<00:00,  1.56trial/s, best loss: -0.11764705882352941]
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.5358483592814305, 'gamma': 0.47801635965

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:33<00:00,  3.33s/trial, best loss: -0.26794034768273894]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.8378157769277234, 'gamma': 0.49818358106178584, 'learning_rate': 0.4674676682212254, 'max_depth': 17, 'min_child_weight': 4, 'n_estimators': 27, 'reg_alpha': 0.6570629118457422, 'reg_lambda': 1.0413602850442794, 'scale_pos_weight': 15.103365265029202, 'subsample': 0.8736489087431681}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.26794034768273894, 'accuracy': 0.8615384615384616, 'status': 'ok', 'auroc': 0.7149014778325122, 'auprc': 0.26794034768273894}
Biorxiv_neg_control_singlets AUROC: 0.6631884057971015
Biorxiv_neg_control_singlets AUPRC: 0.18357217214376823
Biorxiv_neg_control_singlets Accuracy: 0.8692307692307693
100%|██████████| 10/10 [00:06<00:00,  1.60trial/s, best loss: -0.14285714285714285]
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.9219813381229529, 'gamma': 0.148515195

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:37<00:00,  3.74s/trial, best loss: -0.17161109344674527]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.7506722627515718, 'gamma': 0.6603878397539192, 'learning_rate': 0.9236284278566311, 'max_depth': 15, 'min_child_weight': 1, 'n_estimators': 56, 'reg_alpha': 0.021582557437423877, 'reg_lambda': 2.374306243333277, 'scale_pos_weight': 59.1052024661156, 'subsample': 0.9770429271722247}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.17161109344674527, 'accuracy': 0.8538461538461538, 'status': 'ok', 'auroc': 0.5292753623188405, 'auprc': 0.17161109344674527}
Biorxiv_neg_control_singlets AUROC: 0.47044334975369456
Biorxiv_neg_control_singlets AUPRC: 0.1683789371968072
Biorxiv_neg_control_singlets Accuracy: 0.8076923076923077
100%|██████████| 10/10 [00:07<00:00,  1.37trial/s, best loss: -0.34090909090909094]
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.7954063553300925, 'gamma': 0.85222948962

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:39<00:00,  3.95s/trial, best loss: -0.2748660023127987]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.6401761506103982, 'gamma': 0.11053778623464189, 'learning_rate': 0.8552811680750824, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 85, 'reg_alpha': 0.7288655646054588, 'reg_lambda': 2.741234340839938, 'scale_pos_weight': 33.28484780459508, 'subsample': 0.5256436143430818}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.2748660023127987, 'accuracy': 0.7769230769230769, 'status': 'ok', 'auroc': 0.5431034482758621, 'auprc': 0.2748660023127987}
Biorxiv_neg_control_singlets AUROC: 0.6452173913043479
Biorxiv_neg_control_singlets AUPRC: 0.271797442823559
Biorxiv_neg_control_singlets Accuracy: 0.7846153846153846
100%|██████████| 10/10 [00:05<00:00,  1.77trial/s, best loss: -0.25]              
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.9409319220980839, 'gamma': 0.48758277405730555

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:42<00:00,  4.29s/trial, best loss: -0.2161101270854895]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.71313055461554, 'gamma': 0.3798814921344426, 'learning_rate': 0.19764855531882217, 'max_depth': 8, 'min_child_weight': 5, 'n_estimators': 75, 'reg_alpha': 0.659408765070108, 'reg_lambda': 1.5958866946847055, 'scale_pos_weight': 2.2692357748055576, 'subsample': 0.6517361747333503}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.2161101270854895, 'accuracy': 0.8769230769230769, 'status': 'ok', 'auroc': 0.5472463768115942, 'auprc': 0.2161101270854895}
Biorxiv_neg_control_singlets AUROC: 0.5615763546798029
Biorxiv_neg_control_singlets AUPRC: 0.12629400779025374
Biorxiv_neg_control_singlets Accuracy: 0.8923076923076924
100%|██████████| 10/10 [00:05<00:00,  2.00trial/s, best loss: -0.25]              
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.8185553060958448, 'gamma': 0.30138576235456493

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:39<00:00,  3.95s/trial, best loss: -0.1953543801450139]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.7492707969229404, 'gamma': 0.17610317708208625, 'learning_rate': 0.016223005195040847, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 93, 'reg_alpha': 0.7767634285930651, 'reg_lambda': 2.1259541529908583, 'scale_pos_weight': 95.68186692962028, 'subsample': 0.6956477876132772}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.1953543801450139, 'accuracy': 0.33076923076923076, 'status': 'ok', 'auroc': 0.5681159420289854, 'auprc': 0.1953543801450139}
Biorxiv_neg_control_singlets AUROC: 0.3435960591133005
Biorxiv_neg_control_singlets AUPRC: 0.08399594235932466
Biorxiv_neg_control_singlets Accuracy: 0.27692307692307694
100%|██████████| 10/10 [00:06<00:00,  1.66trial/s, best loss: -0.3088235294117647]
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.5156421550713514, 'gamma': 0.65125661625

  summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)


100%|██████████| 10/10 [00:36<00:00,  3.66s/trial, best loss: -0.2630350979895547]
Biorxiv_neg_control_singlets Best parameters for tree: {'colsample_bytree': 0.9588407302957535, 'gamma': 0.7502564552952196, 'learning_rate': 0.40446449099803594, 'max_depth': 13, 'min_child_weight': 2, 'n_estimators': 41, 'reg_alpha': 0.5434392286053031, 'reg_lambda': 2.4775903512852855, 'scale_pos_weight': 12.134283448092953, 'subsample': 0.742116930495117}
Biorxiv_neg_control_singlets Best tree score: {'loss': -0.2630350979895547, 'accuracy': 0.8769230769230769, 'status': 'ok', 'auroc': 0.7278325123152709, 'auprc': 0.2630350979895547}
Biorxiv_neg_control_singlets AUROC: 0.6359420289855073
Biorxiv_neg_control_singlets AUPRC: 0.21235928921043032
Biorxiv_neg_control_singlets Accuracy: 0.8692307692307693
100%|██████████| 10/10 [00:04<00:00,  2.11trial/s, best loss: -0.3333333333333333]
Biorxiv_neg_control_doublets Best parameters for tree: {'colsample_bytree': 0.8356434987456498, 'gamma': 0.29423667530889

In [2]:
def main(dataset):
    if dataset.startswith("0"): # denotes variable doublet rate datasets, where the dataset name is the doublet rate
        counts_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/FM01/variable_doublet_rates_2", dataset)
        labels_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/FM01/")
    else:
        counts_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier", dataset, "10X_doublets_2/")
        labels_dir = os.path.join("/Volumes/fsmresfiles/Basic_Sciences/CDB/GoyalLab/People/MadelineMelzer/ZhangMelzerEtAl/data/classifier", dataset)

        #counts_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/s1s2", dataset, "10X_doublets_2/")
        #labels_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/s1s2", dataset)


    summary_df = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

    features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)
    #FM01_dict = use_FM01_classifier(features, labels_encoded, barcodes_1, results_dir, classifiers_dir)
    #summary_df = pd.concat([summary_df, pd.DataFrame([FM01_dict])], ignore_index=True)
    #print(summary_df)
    self_dict = train_classifier(features, labels_encoded, barcodes_1, dataset)
    summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)
    print(summary_df)
    singlets_dict, doublets_dict, shuffled_dict, scrambled_dict = negative_controls(features, labels_encoded, barcodes_1, dataset, matrix_array, genes)
    summary_df = pd.concat([summary_df, pd.DataFrame([singlets_dict])], ignore_index=True)
    summary_df = pd.concat([summary_df, pd.DataFrame([doublets_dict])], ignore_index=True)
    summary_df = pd.concat([summary_df, pd.DataFrame([shuffled_dict])], ignore_index=True)
    summary_df = pd.concat([summary_df, pd.DataFrame([scrambled_dict])], ignore_index=True)
    print(summary_df)

    return summary_df
    

In [3]:
def load_and_preprocess_data(counts_dir, labels_dir):
    os.chdir(counts_dir)

    # Read the .mtx file
    matrix = scipy.io.mmread("matrix.mtx")
    matrix = matrix.transpose()
    matrix_array = matrix.toarray()

    # Read the features and barcodes files
    genes = pd.read_csv("features.tsv.gz", header=None, sep="\t")
    genes_list = genes.index.tolist()
    first_column = genes.columns[0]
    genes = genes.set_index(first_column) #this is to ensure the anndata object is created correctly and that there are no extra columns in the features or barcodes dfs
    barcodes = pd.read_csv("barcodes.tsv.gz", header=None, sep="\t")
    first_column = barcodes.columns[0]
    barcodes = barcodes.set_index(first_column)

    # Create the AnnData object
    data = anndata.AnnData(X=matrix_array, var=genes_list, obs=barcodes)

    # getting singlet and multiplet labels
    os.chdir(labels_dir)
    labels_df = pd.read_csv(f'labels_2.csv') #for "variable doublet rates", indicate _{dataset} here

    ############## Preprocessing data

    # combining features matrix and labels
    data.obs.index = data.obs.index.rename('barcode')
    merged = data.obs.merge(labels_df, on='barcode', how='inner')
    #print(merged.head()) #checking what the merged looks like

    # Extract the features matrix and labels
    features = data.X
    labels = merged['label'].values
    unique_labels, counts = np.unique(labels, return_counts=True)
    print(dict(zip(unique_labels, counts))) #checking the number of singlets and multiplets

    # Encode labels
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels)
    labels_encoded = 1 - labels_encoded #switching the labels so that 1s are multiplets and 0s are singlets, so correclty identified 1s are considered true positives
    counts = np.bincount(labels_encoded)
    #print(counts) #checking that the number of singlets and multiplets is the same as above

    barcodes_1 = data.obs.index.to_numpy() #getting the barcodes for the features matrix to identify the cells that are being classified

    return features, labels_encoded, barcodes_1, matrix_array, genes

In [4]:
def use_FM01_classifier(features, labels_encoded, barcodes_1, results_dir, classifiers_dir):
    os.chdir(classifiers_dir + "FM01_control/")
    unfit = load(classifiers_dir + 'FM01_1_tree_stratified_unfit.joblib')
    X_train, X_test, y_train, y_test, barcodes_train, barcodes_test = train_test_split(features, labels_encoded, barcodes_1, test_size=0.2, random_state=23, shuffle=True, stratify=labels_encoded) # 20% test set, 80% training set
    classifier = unfit.fit(X_train, y_train)

    preds_proba = classifier.predict_proba(X_test)[:,1]  # Get probabilities of the positive class (multiplets- 1)
    auroc = roc_auc_score(y_test, preds_proba) # Calculate AUROC
    print(f"{dataset} trained on FM01 AUROC: {auroc}")
    auprc = average_precision_score(y_test, preds_proba) # Calculate AUPRC
    print(f"{dataset} trained on FM01 AUPRC: {auprc}")
    y_preds = classifier.predict(X_test) # Predict labels on the test set
    accuracy = accuracy_score(y_test, y_preds) # Calculate accuracy
    print(f"{dataset} trained on FM01 Accuracy: {accuracy}")

    results_fm01 = pd.DataFrame({
        'barcode': barcodes_test,
        'prediction probability': preds_proba,
        'predicted': y_preds,
        'actual': y_test
    })
    results_fm01.to_csv(os.path.join(results_dir, f"{dataset}_predicted_actual_FM01params.csv"))

    condition = "FM01_params"

    FM01_dict = {"dataset": dataset, "condition": condition, "auroc": auroc, "auprc": auprc, "accuracy": accuracy, "best_params": "FM01 parameters"}

    return FM01_dict

In [5]:
def train_classifier(features, labels_encoded, barcodes_1, condition):
    # split into training, testing, and validation sets
    X_train, X_test_temp, y_train, y_test_temp, barcodes_train, barcodes_test = train_test_split(features, labels_encoded, barcodes_1, test_size=0.4, random_state=count, shuffle=True, stratify=labels_encoded) # 40% test set, 60% training set
    X_test, X_valid, y_test, y_valid, barcodes_test, barcodes_valid = train_test_split(X_test_temp, y_test_temp, barcodes_test, test_size=0.5, random_state=count, shuffle=True, stratify=y_test_temp) # Split 50% of the test set into a validation set
    #print the first 10 rows of the training set
    #print(X_train[:10])
    #print the first 10 rows of the test_temp set
    #print(X_test_temp[:10])


    # Define the hyperparameter space
    space_tree = {
        'n_estimators': hp.choice('n_estimators', range(1, 100)),
        'max_depth': hp.choice('max_depth', range(1, 20)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 1),
        'objective': 'binary:logistic',
        'min_child_weight': hp.choice('min_child_weight', range(1, 10)),
        'gamma': hp.uniform('gamma', 0.1, 1.0),
        'subsample': hp.uniform('subsample', 0.5, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
        'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': hp.uniform('reg_lambda', 1.0, 3.0),
        'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 100),
        'booster': 'gbtree'
    }

    # Define objective function
    def objective(params):
        bst = XGBClassifier(**params, random_state=count)
        bst.fit(X_train, y_train)
        preds = bst.predict(X_valid)
        preds_proba = bst.predict_proba(X_valid)[:, 1]
        accuracy = accuracy_score(y_valid, preds)
        auroc = roc_auc_score(y_valid, preds_proba)  # Calculate AUROC
        auprc = average_precision_score(y_valid, preds_proba)  # Calculate AUPRC
        return {'loss': -auprc, 'accuracy': accuracy, 'status': STATUS_OK, 'auroc': auroc, 'auprc': auprc}

    # Run the hyperparameter optimization
    trials_tree = Trials()
    best_tree = fmin(fn=objective, space=space_tree, algo=tpe.suggest, max_evals=10, trials=trials_tree)
    print(f"{condition} Best parameters for tree: {best_tree}")

    # Summary of the success of the hyperparameter optimization
    best_tree_score = min(trials_tree.results, key=lambda x: x['loss'])
    print(f"{condition} Best tree score: {best_tree_score}")

    # Adjusting the hyperparameters
    best_params_tree = {
        'n_estimators': best_tree['n_estimators'] + 1,  # +1 because hp.choice returns an index
        'max_depth': best_tree['max_depth'] + 1,        # +1 for the same reason
        'learning_rate': best_tree['learning_rate'],
        'objective': 'binary:logistic',
        'min_child_weight': best_tree['min_child_weight'] + 1,  # Adjust if needed
        'gamma': best_tree['gamma'],
        'subsample': best_tree['subsample'],
        'reg_alpha': best_tree['reg_alpha'],
        'reg_lambda': best_tree['reg_lambda'],
        'scale_pos_weight': best_tree['scale_pos_weight'],
        'booster': 'gbtree'
    }

    # Retrain the classifier with the best hyperparameters
    bst_best = XGBClassifier(**best_params_tree, random_state=23)
    if condition == dataset:
        dump(bst_best, classifiers_dir + f'{dataset}_{count}_unfit.joblib') #saving unfit classifier
    bst_best.fit(X_train, y_train)
    if condition == dataset:
        dump(bst_best, classifiers_dir + f'{dataset}_{count}_fit.joblib') #saving fit classifier

    preds_proba = bst_best.predict_proba(X_test)[:,1]  # Get probabilities of the positive class (multiplets- 1)
    auroc = roc_auc_score(y_test, preds_proba) # Calculate AUROC
    print(f"{condition} AUROC: {auroc}")
    auprc = average_precision_score(y_test, preds_proba) # Calculate AUPRC
    print(f"{condition} AUPRC: {auprc}")
    y_preds = bst_best.predict(X_test) # Predict labels on the test set
    accuracy = accuracy_score(y_test, y_preds) # Calculate accuracy
    print(f"{condition} Accuracy: {accuracy}")
    results = pd.DataFrame({
        'barcode': barcodes_test,
        'prediction probability': preds_proba,
        'predicted': y_preds,
        'actual': y_test
    })
    results.to_csv(os.path.join(results_dir, "predictions" ,f"{condition}_sample2_predicted_actual.csv"))

    summary_dict = {"dataset": dataset, "condition": condition, "auroc": auroc, "auprc": auprc, "accuracy": accuracy, "best_params": best_params_tree}

    return summary_dict

In [6]:
def negative_controls(features, labels_encoded, barcodes_1, dataset, matrix_array, genes):
    total_cells = len(features)
    num_doublets = int((len(features)*0.1)/0.9)
    num_singlets = int(total_cells - num_doublets)

    ############### singlets only ###############

    # selecting only singlets as a control
    features_singlets = features[:(num_singlets), :] #singlets always first in the features matrix
    # creating fake doublets from those singlets
    labels_singlets = np.zeros(num_singlets)
    fakedoubletsforsinglets = int((len(features_singlets)*0.1)/0.9)
    # making 10% of the singlets into fake doublets
    labels_singlets[0:fakedoubletsforsinglets] = 1 

    barcodes_singlets = barcodes_1[:num_singlets]

    singlets_dict = train_classifier(features_singlets, labels_singlets, barcodes_singlets, f"{dataset}_neg_control_singlets")
    
    ############### doublets only ###############
    
    # selecting only doublets as a control
    features_doublets = features[-num_doublets:, :] #doublets always last in the features matrix
    # creating fake singlets from those doublets
    labels_doublets = np.ones(num_doublets)
    fakesingletsfordoublets = int(num_doublets - ((len(features_doublets)*0.1)/0.9))
    # making 90% of the doublets into fake singlets
    labels_doublets[0:fakesingletsfordoublets] = 0

    barcodes_doublets = barcodes_1[-num_doublets:]
    
    doublets_dict = train_classifier(features_doublets, labels_doublets, barcodes_doublets, f"{dataset}_neg_control_doublets")
 
    ############### shuffling features ###############

    shuffled_indices = np.random.permutation(matrix_array.shape[1]) # Generate a shuffled index
    matrix_array_shuffled = matrix_array[:, shuffled_indices] # Shuffle the columns of the matrix
    genes_shuffled = genes.iloc[shuffled_indices] # Reorder the features DataFrame to match the new column order (NOTE: this doesnt get used here)

    shuffled_data = anndata.AnnData(X=matrix_array_shuffled, var=genes_shuffled, obs=barcodes_1) #(NOTE: this doesnt get used here-- should i use it? probably unnecessary to specify gene names)

    features_shuffled = shuffled_data.X

    shuffled_dict = train_classifier(features_shuffled, labels_encoded, barcodes_1, f"{dataset}_shuffled")

    ############### scrambling features ###############

    flattened_matrix = matrix_array.flatten()
    np.random.shuffle(flattened_matrix)
    scrambled_matrix_array = flattened_matrix.reshape(matrix_array.shape)

    features_scrambled = scrambled_matrix_array

    scrambled_dict = train_classifier(features_scrambled, labels_encoded, barcodes_1, f"{dataset}_neg_control_scrambled")

    return singlets_dict, doublets_dict, shuffled_dict, scrambled_dict

In [None]:
dataset = "s1nc_positiveControl"
condition = f"{dataset}_standard"
results_dir = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/results/"
classifiers_dir = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/classifiers/"

counts_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier", dataset, "10X_doublets_2/")
#counts_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/FM01/variable_doublet_rates", dataset)
labels_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier", dataset)
#labels_dir = os.path.join("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/classifier/FM01/")

summary_df = pd.DataFrame(columns = ["dataset", "condition", "auroc", "auprc", "accuracy", "best_params"])

features, labels_encoded, barcodes_1, matrix_array, genes = load_and_preprocess_data(counts_dir, labels_dir)
#FM01_dict = use_FM01_classifier(features, labels_encoded, barcodes_1, results_dir, classifiers_dir)
#summary_df = pd.concat([summary_df, pd.DataFrame([FM01_dict])], ignore_index=True)
#print(summary_df)
self_dict = train_classifier(features, labels_encoded, barcodes_1, condition)
summary_df = pd.concat([summary_df, pd.DataFrame([self_dict])], ignore_index=True)
print(summary_df)
singlets_dict, doublets_dict, shuffled_dict, scrambled_dict = negative_controls(features, labels_encoded, barcodes_1, condition, matrix_array, genes)
summary_df = pd.concat([summary_df, pd.DataFrame([singlets_dict])], ignore_index=True)
summary_df = pd.concat([summary_df, pd.DataFrame([doublets_dict])], ignore_index=True)
summary_df = pd.concat([summary_df, pd.DataFrame([shuffled_dict])], ignore_index=True)
summary_df = pd.concat([summary_df, pd.DataFrame([scrambled_dict])], ignore_index=True)
print(summary_df)
