In [None]:
# Column-shuffle control: permute the order of the columns of the matrix
# (and the matching rows of the features table) to see if the structure of the
# data is meaningful for the classifier.
#
# The permutation comes from a locally seeded generator so that re-running the
# notebook reproduces the same column order, without touching NumPy's global
# random state that other cells seed themselves.
#
# NOTE(review): `features` must still be the features DataFrame when this cell
# runs (it is indexed with .iloc below) -- confirm no earlier-executed cell has
# rebound it to an array.
shuffled_indices = np.random.default_rng(23).permutation(matrix_array.shape[1])  # reproducible shuffled column index
matrix_array_shuffled = matrix_array[:, shuffled_indices]  # reorder the columns of the matrix
features_shuffled = features.iloc[shuffled_indices]  # reorder the features DataFrame to match the new column order
data_shuffled = anndata.AnnData(X=matrix_array_shuffled, var=features_shuffled, obs=barcodes)  # AnnData object built from the shuffled data

In [None]:
# checking randomness in the shuffle (10 repeats)

# Move into the directory that holds the count matrix file
os.chdir(counts_dir)

# Load the Matrix Market file, transpose it, and convert it to a dense array
matrix = scipy.io.mmread("matrix.mtx").transpose()
matrix_array = matrix.toarray()

# 1-D copy of the dense matrix; the repeat loop below shuffles these values
# in place and reshapes them back to the original shape
flattened_matrix = matrix_array.ravel().copy()

# Scrambled-matrix control, repeated 10 times.
#
# For every repeat the values of the count matrix are shuffled, an AnnData
# object is built from the *scrambled* matrix, the rows are split into train
# and test sets, XGBoost hyperparameters are tuned with hyperopt, and the
# retrained classifier is scored. Scores for all repeats are collected in
# `repeat_scores`.

# Imported here so the cell does not rely on an import made elsewhere in the
# notebook; the split below is what defines X_train / X_test / y_train / y_test.
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Inputs that are identical for every repeat: load and prepare them once
# ---------------------------------------------------------------------------

os.chdir(counts_dir)
# The first column becomes the index so that the AnnData object is created
# correctly and there are no extra columns in the features or barcodes tables.
gene_features = pd.read_csv("features.tsv.gz", header=None, sep="\t")
gene_features = gene_features.set_index(gene_features.columns[0])
barcodes = pd.read_csv("barcodes.tsv.gz", header=None, sep="\t")
barcodes = barcodes.set_index(barcodes.columns[0])

# getting singlet and multiplet labels
os.chdir(labels_dir)
labels_df = pd.read_csv('labels_sample1.csv')

# Attach a label to every barcode, keeping the barcode order of the matrix rows.
merged = barcodes.rename_axis('barcode').merge(labels_df, on='barcode', how='inner')
print(merged.head())  # checking what the merged table looks like

# The matrix rows are paired with the labels purely by position, so every
# barcode must have received exactly one label.
if len(merged) != len(barcodes):
    raise ValueError(
        f"labels do not match the matrix rows one-to-one: "
        f"{len(barcodes)} barcodes but {len(merged)} merged rows"
    )

labels = merged['label'].values
unique_labels, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique_labels, counts)))  # checking the number of singlets and multiplets

# Encode labels, then flip them so that 1s are multiplets and 0s are singlets
# (correctly identified 1s are then counted as true positives).
# NOTE(review): the flip assumes exactly two classes, with the multiplet class
# sorting first -- confirm against the label values printed above.
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_encoded = 1 - labels_encoded
counts = np.bincount(labels_encoded)
print(counts)  # checking that the number of singlets and multiplets is the same as above

# Define the hyperparameter space
space_tree = {
    'n_estimators': hp.choice('n_estimators', range(1, 100)),
    'max_depth': hp.choice('max_depth', range(1, 20)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 1),
    'objective': 'binary:logistic',
    'min_child_weight': hp.choice('min_child_weight', range(1, 10)),
    'gamma': hp.uniform('gamma', 0.1, 1.0),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 1.0, 3.0),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 100),
    'booster': 'gbtree'
}


def objective(params):
    """Hyperopt objective: fit an XGBClassifier with `params` and score it.

    Reads the module-level X_train, y_train, X_test and y_test, which the
    repeat loop rebinds before each call to fmin.

    Returns a dict with 'loss' set to the negative AUPRC (hyperopt minimises
    the loss), the hyperopt 'status', and the accuracy, AUROC and AUPRC
    measured on X_test / y_test.
    """
    bst = XGBClassifier(**params, random_state=23)
    bst.fit(X_train, y_train)
    preds = bst.predict(X_test)
    preds_proba = bst.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, preds)
    auroc = roc_auc_score(y_test, preds_proba)  # Calculate AUROC
    auprc = average_precision_score(y_test, preds_proba)  # Calculate AUPRC
    return {'loss': -auprc, 'accuracy': accuracy, 'status': STATUS_OK, 'auroc': auroc, 'auprc': auprc}


# ---------------------------------------------------------------------------
# Repeats
# ---------------------------------------------------------------------------

repeat_scores = []  # one dict of metrics per repeat

for repeat in range(10):
    print(f"--- scramble repeat {repeat} ---")

    # Shuffle the flattened values in place and view them in the matrix shape.
    # The shuffle is cumulative: each repeat reshuffles the previous result.
    np.random.seed(repeat)
    np.random.shuffle(flattened_matrix)
    scrambled_matrix_array = flattened_matrix.reshape(matrix_array.shape)

    # Create the AnnData object from the SCRAMBLED matrix (not matrix_array),
    # otherwise the classifier would be trained on the original data.
    data = anndata.AnnData(X=scrambled_matrix_array, var=gene_features, obs=barcodes)
    data.obs.index = data.obs.index.rename('barcode')

    # Features matrix and the barcodes identifying the cells being classified
    features = data.X
    barcodes_1 = data.obs.index.to_numpy()

    # Split this repeat's scrambled matrix; the fixed random_state keeps the
    # same cells in train/test for every repeat, so only the values differ.
    # NOTE(review): test_size / stratify / random_state are assumptions --
    # align them with the split used for the unscrambled classifier.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels_encoded, test_size=0.2, stratify=labels_encoded, random_state=23
    )

    # Run the hyperparameter optimization
    # NOTE(review): `objective` scores on X_test, the same split used for the
    # final metrics below, so the reported scores are optimistically biased.
    trials_tree = Trials()
    best_tree = fmin(fn=objective, space=space_tree, algo=tpe.suggest, max_evals=10, trials=trials_tree)
    print(f"Best parameters for tree: {best_tree}")

    # Summary of the success of the hyperparameter optimization
    best_tree_score = min(trials_tree.results, key=lambda x: x['loss'])
    print(f"Best tree score: {best_tree_score}")

    # Adjusting the hyperparameters: fmin reports hp.choice parameters as an
    # index into their range, and every range above starts at 1, hence the +1.
    best_params_tree = {
        'n_estimators': best_tree['n_estimators'] + 1,
        'max_depth': best_tree['max_depth'] + 1,
        'learning_rate': best_tree['learning_rate'],
        'objective': 'binary:logistic',
        'min_child_weight': best_tree['min_child_weight'] + 1,
        'gamma': best_tree['gamma'],
        'subsample': best_tree['subsample'],
        'colsample_bytree': best_tree['colsample_bytree'],  # tuned above, so it must be carried over
        'reg_alpha': best_tree['reg_alpha'],
        'reg_lambda': best_tree['reg_lambda'],
        'scale_pos_weight': best_tree['scale_pos_weight'],
        'booster': 'gbtree'
    }

    # Retrain the classifier with the best hyperparameters
    bst_best = XGBClassifier(**best_params_tree, random_state=23)
    # NOTE(review): every repeat writes to the same path, so only the last
    # repeat's (unfit) estimator remains on disk -- confirm this is intended
    # for the scrambled control.
    dump(bst_best, classifiers_dir + f'{dataset}.joblib') #saving unfit classifier
    bst_best.fit(X_train, y_train)

    preds_proba = bst_best.predict_proba(X_test)[:, 1]  # Probabilities of the positive class (multiplets - 1)
    auroc = roc_auc_score(y_test, preds_proba)  # Calculate AUROC
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(y_test, preds_proba)  # Calculate AUPRC
    print(f"AUPRC: {auprc}")
    y_preds = bst_best.predict(X_test)  # Predict labels on the test set
    accuracy = accuracy_score(y_test, y_preds)  # Calculate accuracy
    print(f"Accuracy: {accuracy}")

    repeat_scores.append({'repeat': repeat, 'auroc': auroc, 'auprc': auprc, 'accuracy': accuracy})