## Imports

In [1]:
import pandas as pd

## Preprocessed data

In [2]:
# Locations of the preprocessed feature table and target table
X_path = 'Data/X_preprocessed.csv'
y_path = 'Data/y_preprocessed.csv'

# Read both CSVs into DataFrames (features first, then target)
X, y = (pd.read_csv(csv_path) for csv_path in (X_path, y_path))

In [None]:
# Shared results table: evaluate_sklearn_model appends one row of scores per evaluated model
metrics_df = pd.DataFrame(
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'],
)

def _as_1d_target(y):
    """Return ``y`` as 1-D labels when it is a single-column 2-D table; otherwise unchanged."""
    shape = getattr(y, 'shape', None)
    if shape is not None and len(shape) == 2 and shape[1] == 1:
        # DataFrame -> Series via .iloc; ndarray-like objects via plain slicing
        return y.iloc[:, 0] if hasattr(y, 'iloc') else y[:, 0]
    return y


def evaluate_sklearn_model(model, X, y, params=None, cv=5):
    """
    Evaluate a binary classifier with cross-validation and record its scores.

    Optionally tunes hyperparameters with GridSearchCV first, then obtains out-of-fold
    predictions, plots the confusion matrix and the ROC curve side by side, saves the
    figure to 'Results/results_<ModelClassName>.png', and appends one row of metrics to
    the module-level ``metrics_df``.

    The positive-class probability is read from column 1 of ``predict_proba`` and the
    precision/recall/F1 calls use their default settings, so the function is written
    for binary targets.

    :param model: The classification model (e.g., LogisticRegression, RandomForestClassifier).
                  It must support ``predict_proba``.
    :param X: Feature dataset.
    :param y: Target labels. A single-column DataFrame / 2-D array (as produced by
              ``pd.read_csv``) is flattened to 1-D before use.
    :param params: Parameter grid for GridSearchCV (optional). Skipped when empty or None.
    :param cv: Number of cross-validation folds (default is 5).
    :return: Tuple ``(model, accuracy, metrics_df)``: the evaluated model (the refit
             best estimator when a grid was searched, otherwise the model as passed in,
             which this function does not fit on the full data), its cross-validated
             accuracy, and the shared metrics table including the new row.
    """
    # scikit-learn expects 1-D class labels. A single-column 2-D target triggers a
    # DataConversionWarning and can break cross_val_predict(method='predict_proba'),
    # which treats a 2-D y as a label-indicator matrix.
    y = _as_1d_target(y)

    # If parameter grid is provided, perform GridSearchCV
    if params:
        grid_search = GridSearchCV(model, param_grid=params, cv=cv, scoring='accuracy', verbose=1)
        grid_search.fit(X, y)
        model = grid_search.best_estimator_  # Update model to the best found by GridSearchCV
        print(f"Best Parameters: {grid_search.best_params_}")
        # NOTE(review): the scores below are computed on the same data used for tuning,
        # so they are optimistically biased; use nested CV or a held-out set for an
        # unbiased estimate.

    # Out-of-fold predictions. Each call re-fits a clone of the model per fold, so the
    # labels and the probabilities come from two separate rounds of fitting.
    y_pred = cross_val_predict(model, X, y, cv=cv, method='predict')
    y_pred_proba = cross_val_predict(model, X, y, cv=cv, method='predict_proba')[:, 1]

    # Compute the confusion matrix and ROC statistics
    conf = confusion_matrix(y, y_pred)
    roc_auc = roc_auc_score(y, y_pred_proba)
    fpr, tpr, _ = roc_curve(y, y_pred_proba)

    # Create a figure with two subplots (1 row, 2 columns)
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Plot the confusion matrix on the first subplot
    sns.heatmap(conf, annot=True, fmt='d', cmap='Blues', ax=axes[0])
    axes[0].set_title(f'Confusion Matrix - {cv}-Fold Cross-Validation')
    axes[0].set_xlabel('Predicted')
    axes[0].set_ylabel('Actual')

    # Plot the ROC AUC curve on the second subplot, with the chance diagonal for reference
    axes[1].plot(fpr, tpr, label=f'ROC AUC = {roc_auc:.2f}')
    axes[1].plot([0, 1], [0, 1], linestyle='--')
    axes[1].set_xlabel('False Positive Rate')
    axes[1].set_ylabel('True Positive Rate')
    axes[1].set_title(f'ROC AUC Curve - {cv}-Fold Cross-Validation')
    axes[1].legend(loc='lower right')

    # Ensure the 'Results' directory exists
    os.makedirs('Results', exist_ok=True)

    # Save the figure to the 'Results' directory with a dynamic name based on the model
    model_name = model.__class__.__name__
    plot_path = os.path.join('Results', f'results_{model_name}.png')
    fig.tight_layout()
    fig.savefig(plot_path)
    plt.show()
    # Release the figure so repeated evaluations do not accumulate open figures
    plt.close(fig)

    # Calculate additional metrics
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    # Append the metrics to the shared DataFrame (in-place item assignment, so no
    # `global` declaration is needed)
    metrics_df.loc[len(metrics_df)] = [model_name, accuracy, precision, recall, f1, roc_auc]

    return model, accuracy, metrics_df