In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, make_scorer # type: ignore
from rdkit import Chem
from rdkit.Chem import AllChem, rdFingerprintGenerator, MACCSkeys
import numpy as np



# Input preparation

## Read fingerprint file and experimental file

In [3]:
# Load the data
fingerprints = pd.read_csv(
    "../data/PsychLight/PsychLight_MACCS_fingerprints.csv"
)
labels = pd.read_csv("../data/PsychLight/PsychLight_TrainingSet.csv")
print(fingerprints.head())
print(labels['Class'].head())

X = fingerprints.drop(columns=['ID'])
y = labels['Class']

FileNotFoundError: [Errno 2] No such file or directory: '../data/PsychLight/PsychLight_MACCS_fingerprints.csv'

## Training

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Optuna

In [5]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, jaccard_score
from sklearn.metrics import (
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix,
)
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# define the objective function
def objective(trial):
    # define the hyperparameters to optimize
    n_estimators = trial.suggest_int("n_estimators", 10, 500)
    max_depth = trial.suggest_int("max_depth", 5, 30)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4)
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])
    
    # create a random forest classifier with the specified hyperparameters
    rf_classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42
    )
    
    # use cross-validation to evaluate the model
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=cv, scoring='accuracy')
    
    # return the mean cross-validation score
    return cv_scores.mean()

# use Optuna to optimize the hyperparameters
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200) 

# output the best hyperparameters   
print("Best Hyperparameters:", study.best_params)

# train a random forest classifier with the best hyperparameters
best_params = study.best_params
best_rf_classifier = RandomForestClassifier(**best_params, random_state=42)
best_rf_classifier.fit(X_train, y_train)

# make predictions on the test set
y_test_pred = best_rf_classifier.predict(X_test)
y_test_pred_proba = best_rf_classifier.predict_proba(X_test)[:, 1]  # get the probabilities for the positive class




[I 2024-12-09 13:48:22,268] A new study created in memory with name: no-name-50b72ed4-9536-40ce-81b9-23bb8e733961
[I 2024-12-09 13:48:23,245] Trial 0 finished with value: 0.8071428571428572 and parameters: {'n_estimators': 425, 'max_depth': 22, 'min_samples_split': 4, 'min_samples_leaf': 4, 'bootstrap': False}. Best is trial 0 with value: 0.8071428571428572.
[I 2024-12-09 13:48:23,924] Trial 1 finished with value: 0.7821428571428573 and parameters: {'n_estimators': 321, 'max_depth': 14, 'min_samples_split': 9, 'min_samples_leaf': 4, 'bootstrap': False}. Best is trial 0 with value: 0.8071428571428572.
[I 2024-12-09 13:48:24,686] Trial 2 finished with value: 0.7821428571428573 and parameters: {'n_estimators': 250, 'max_depth': 26, 'min_samples_split': 3, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 0 with value: 0.8071428571428572.
[I 2024-12-09 13:48:25,496] Trial 3 finished with value: 0.7821428571428573 and parameters: {'n_estimators': 389, 'max_depth': 17, 'min_samples_sp

Best Hyperparameters: {'n_estimators': 432, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 1, 'bootstrap': False}


In [1]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix,
    cohen_kappa_score,
    matthews_corrcoef,
)

# train the model
best_rf_classifier.fit(X_train, y_train)

# make predictions on the training set
y_train_pred = best_rf_classifier.predict(X_train)
y_train_pred_proba = best_rf_classifier.predict_proba(X_train)[
    :, 1
]  # get the probabilities for the positive class

# make predictions on the test set
y_test_pred = best_rf_classifier.predict(X_test)
y_test_pred_proba = best_rf_classifier.predict_proba(X_test)[
    :, 1
]  # get the probabilities for the positive class

# calculate metrics for the training set
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average="micro")
train_roc_auc = roc_auc_score(y_train, y_train_pred_proba)
train_precision = precision_score(y_train, y_train_pred, average="micro")
train_recall = recall_score(y_train, y_train_pred, average="micro")
train_cohen_kappa = cohen_kappa_score(y_train, y_train_pred)
train_mcc = matthews_corrcoef(y_train, y_train_pred)

# confusion matrix for the training set
train_tn, train_fp, train_fn, train_tp = confusion_matrix(y_train, y_train_pred).ravel()
train_specificity = train_tn / (train_tn + train_fp)

# calculate metrics for the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average="micro")
test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)
test_precision = precision_score(y_test, y_test_pred, average="micro")
test_recall = recall_score(y_test, y_test_pred, average="micro")
test_cohen_kappa = cohen_kappa_score(y_test, y_test_pred)
test_mcc = matthews_corrcoef(y_test, y_test_pred)

# confusion matrix for the test set
test_tn, test_fp, test_fn, test_tp = confusion_matrix(y_test, y_test_pred).ravel()
test_specificity = test_tn / (test_tn + test_fp)

# print metrics for the training set
print("\n--- Training Set Metrics ---")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"F1 Score: {train_f1:.2f}")
print(f"ROC-AUC Score: {train_roc_auc:.2f}")
print(f"Precision: {train_precision:.2f}")
print(f"Recall: {train_recall:.2f}")
print(f"Cohen's Kappa: {train_cohen_kappa:.2f}")
print(f"Matthews Correlation Coefficient (MCC): {train_mcc:.2f}")
print(f"Specificity: {train_specificity:.2f}")
print(f"True Negatives (TN): {train_tn}")
print(f"False Positives (FP): {train_fp}")
print(f"False Negatives (FN): {train_fn}")
print(f"True Positives (TP): {train_tp}")

# print metrics for the test set
print("\n--- Test Set Metrics ---")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"F1 Score: {test_f1:.2f}")
print(f"ROC-AUC Score: {test_roc_auc:.2f}")
print(f"Precision: {test_precision:.2f}")
print(f"Recall: {test_recall:.2f}")
print(f"Cohen's Kappa: {test_cohen_kappa:.2f}")
print(f"Matthews Correlation Coefficient (MCC): {test_mcc:.2f}")
print(f"Specificity: {test_specificity:.2f}")
print(f"True Negatives (TN): {test_tn}")
print(f"False Positives (FP): {test_fp}")
print(f"False Negatives (FN): {test_fn}")
print(f"True Positives (TP): {test_tp}")

# Plot ROC curve for the test set
fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.legend(loc="lower right")
plt.show()




NameError: name 'best_rf_classifier' is not defined

## inner-loop, outer-loop

In [7]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np


# Define the inner-loop objective function for hyperparameter tuning
def objective(trial, X_train_inner, y_train_inner):
    # Define hyperparameters to tune
    n_estimators = trial.suggest_int("n_estimators", 10, 500)
    max_depth = trial.suggest_int("max_depth", 5, 30)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4)
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])

    # Create Random Forest Classifier
    rf_classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42,
    )

    # Perform cross-validation in the inner loop
    cv_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(
        rf_classifier, X_train_inner, y_train_inner, cv=cv_inner, scoring="roc_auc"
    )

    # Return the mean ROC-AUC score
    return scores.mean()


# Nested cross-validation
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
outer_results = []  # Store outer-loop results for statistical evaluation

for train_idx, test_idx in outer_cv.split(X, y):
    # Split the dataset into training and testing for the outer loop
    X_train_outer, X_test_outer = X.iloc[train_idx], X.iloc[test_idx]
    y_train_outer, y_test_outer = y.iloc[train_idx], y.iloc[test_idx]

    # Inner loop: Hyperparameter tuning
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective(trial, X_train_outer, y_train_outer), n_trials=50
    )

    # Get the best hyperparameters
    best_params = study.best_params

    # Train the final model using the best hyperparameters
    best_rf_classifier = RandomForestClassifier(**best_params, random_state=42)
    best_rf_classifier.fit(X_train_outer, y_train_outer)

    # Evaluate on the outer loop's test set
    y_test_pred = best_rf_classifier.predict(X_test_outer)
    y_test_proba = best_rf_classifier.predict_proba(X_test_outer)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test_outer, y_test_pred)
    f1 = f1_score(y_test_outer, y_test_pred)
    roc_auc = roc_auc_score(y_test_outer, y_test_proba)

    outer_results.append({"accuracy": accuracy, "f1": f1, "roc_auc": roc_auc})

# Compute overall performance metrics from the outer loop
outer_accuracy = np.mean([result["accuracy"] for result in outer_results])
outer_f1 = np.mean([result["f1"] for result in outer_results])
outer_roc_auc = np.mean([result["roc_auc"] for result in outer_results])

print(f"Nested 5-Fold Cross-Validation Results:")
print(f"Accuracy: {outer_accuracy:.4f}")
print(f"F1 Score: {outer_f1:.4f}")
print(f"ROC-AUC: {outer_roc_auc:.4f}")


KeyError: "None of [Index([ 2,  3,  5,  6,  7,  8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,\n       22, 24, 25, 27, 28, 29, 30, 32, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,\n       45, 47, 48, 49, 50, 51, 52],\n      dtype='int64')] are in the [columns]"

## GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


# define the random forest model
rf_model = RandomForestClassifier(random_state=42)

# define the hyperparameters to optimize
param_grid = {
    'n_estimators': [10, 100, 300],  
    'max_depth': [None, 10, 30, 50],  
    'min_samples_split': [2, 10, 20],  
    'min_samples_leaf': [1, 4, 6, 8],      
    'bootstrap': [True, False]                
}

# define the grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='f1_samples', n_jobs=-1)

# perform the grid search
grid_search.fit(X_train, y_train)

# output the best hyperparameters
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)




Traceback (most recent call last):
  File "/Users/xiaomuou620/opt/miniconda3/envs/Project1/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/xiaomuou620/opt/miniconda3/envs/Project1/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/xiaomuou620/opt/miniconda3/envs/Project1/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/Users/xiaomuou620/opt/miniconda3/envs/Project1/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 214, in wrapper
    return func(*args, **kwargs)
  File "/Users/xiaomuou620/opt/miniconda3/envs/Project1/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 1239, in f1_score
    return fbeta_score(
  

Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
Best Cross-Validation Score: nan


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan na

In [None]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
hamming_loss_score_grid = accuracy_score(y_test, y_pred)
print("Test Score with Best Model:", hamming_loss_score_grid)

report = classification_report(y_test, y_pred, target_names=y.columns)
print(report)

Test Score with Best Model: 0.7647058823529411


AttributeError: 'Series' object has no attribute 'columns'

In [None]:
# 

[[0.01545635 0.98454365]
 [0.01545635 0.98454365]
 [0.01833333 0.98166667]
 [0.03382143 0.96617857]
 [0.03296429 0.96703571]
 [0.01545635 0.98454365]
 [0.0141746  0.9858254 ]
 [0.01545635 0.98454365]
 [0.05557143 0.94442857]
 [0.07083333 0.92916667]
 [0.03382143 0.96617857]
 [0.00333333 0.99666667]]
