In [None]:
# Step 1: Import Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the pre-split feature matrices and targets from the working directory.
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
# squeeze("columns") turns a single-column target frame into a 1-D Series, the
# shape scikit-learn expects for `y`; passing an (n, 1) frame instead makes the
# estimators emit a DataConversionWarning on every fit. A frame with more than
# one column is returned unchanged, so this is a no-op in that case.
y_train = pd.read_csv('y_train.csv').squeeze("columns")
y_test = pd.read_csv('y_test.csv').squeeze("columns")

In [None]:
# Step 2: Define Models and Hyperparameters for Tuning
# Baseline models, keyed by the display name printed in the evaluation report.
# The tree-based estimators are randomized (split tie-breaking, bootstrapping,
# feature sub-sampling), so they get a fixed random_state to keep the reported
# metrics identical across "Restart Kernel -> Run All".
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

In [None]:
# Hyperparameters for Random Forest
# Search space sampled by RandomizedSearchCV; each key is a
# RandomForestClassifier constructor argument and each list holds the
# candidate values for it.
rf_params = {
    'max_depth': [5, 8, 15, None, 10],
    # "sqrt" replaces the former "auto": for classifiers "auto" meant sqrt(n_features),
    # it was deprecated in scikit-learn 1.1 and removed in 1.3, where it makes every
    # candidate that samples it fail to fit.
    # NOTE(review): the integer options assume the data has at least 8 features - confirm.
    'max_features': [5, 7, "sqrt", 8],
    'min_samples_split': [2, 8, 15, 20],
    'n_estimators': [100, 200, 500, 1000]
}

In [None]:
# List of models for hyperparameter tuning
# Each entry is a (short name, estimator to tune, parameter search space) tuple.
randomcv_models = [
    # random_state pins the forest's own randomness so that cross-validation
    # scores (and therefore the selected best parameters) are repeatable.
    ("RF", RandomForestClassifier(random_state=42), rf_params),
]

In [None]:
# Hyperparameter tuning using RandomizedSearchCV
# For every (name, estimator, search space) entry: sample 100 candidate settings,
# score each with 3-fold cross-validation on the training data, and remember the
# best combination under the entry's name. No `scoring` is passed, so the
# estimator's default scorer is used.
model_param = {}
for name, model, params in randomcv_models:
    # Named `random_search` rather than `random` so the result never shadows the
    # standard-library `random` module inside this notebook's namespace.
    random_search = RandomizedSearchCV(estimator=model,
                                       param_distributions=params,
                                       n_iter=100,
                                       cv=3,
                                       verbose=2,
                                       n_jobs=-1,
                                       # Fixed seed: the same candidates are sampled on every run.
                                       random_state=42)
    random_search.fit(X_train, y_train)
    model_param[name] = random_search.best_params_


In [None]:
# Report the best hyperparameter combination recorded for each tuned model
for tuned_name, best_params in model_param.items():
    banner = f"---------------- Best Params for {tuned_name} -------------------"
    print(banner)
    print(best_params)

In [None]:
# Step 3: Train Models and Evaluate Performance
# Assuming X_train, X_test, y_train, y_test are already defined
def _evaluate(model, X, y):
    """Compute the reported metrics for a fitted binary classifier on one data split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X : feature matrix of the split to score.
    y : true labels of the split.

    Returns
    -------
    dict
        Metric display label -> value, in the order the report prints them.
    """
    y_pred = model.predict(X)
    # ROC AUC must be computed from continuous scores. Feeding it hard 0/1
    # predictions collapses the curve to a single operating point and
    # under-reports the model's ranking quality.
    y_score = model.predict_proba(X)[:, 1]
    return {
        'Accuracy': accuracy_score(y, y_pred),
        # NOTE(review): F1 is class-weighted while precision/recall below use the
        # default (positive-class) averaging - confirm this mix is intentional.
        'F1 score': f1_score(y, y_pred, average='weighted'),
        'Precision': precision_score(y, y_pred),
        'Recall': recall_score(y, y_pred),
        'Roc AUC Score': roc_auc_score(y, y_score),
    }


def _print_metrics(split_name, metrics):
    """Print one split's metrics as a header line plus one '- label: value' line each."""
    print('Model performance for {} set'.format(split_name))
    for label, value in metrics.items():
        print('- {}: {:.4f}'.format(label, value))


# Fit every baseline model and report train vs. test metrics side by side so
# over-fitting (large train/test gap) is easy to spot.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    train_metrics = _evaluate(model, X_train, y_train)
    test_metrics = _evaluate(model, X_test, y_test)

    print(model_name)
    _print_metrics('Training', train_metrics)
    print('----------------------------------')
    _print_metrics('Test', test_metrics)
    print('='*35)
    print('\n')


In [None]:
# Step 4: Plot ROC Curve for Random Forest
# Add the models to the list for ROC plot
auc_models = [
    {
        'label': 'Random Forest Classifier',
        # random_state keeps the fitted forest, and therefore the curve, repeatable.
        'model': RandomForestClassifier(n_estimators=1000, min_samples_split=2, max_features=7,
                                        max_depth=None, random_state=42),
        # Filled in below from the fitted model's test-set scores, so the legend
        # always matches the curve instead of a number copied from an old run.
        'auc': None
    },
]

# Draw one ROC curve per configured model on a shared axes
fig, ax = plt.subplots()
for algo in auc_models:
    model = algo['model']  # Select the model
    model.fit(X_train, y_train)  # Train the model
    # Positive-class probabilities drive both the curve and its area
    y_score = model.predict_proba(X_test)[:, 1]
    # Compute False Positive Rate and True Positive Rate (thresholds are not needed)
    fpr, tpr, _ = roc_curve(y_test, y_score)
    algo['auc'] = roc_auc_score(y_test, y_score)
    # Plot ROC curve
    ax.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (algo['label'], algo['auc']))

# Custom settings for the plot
ax.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('1-Specificity (False Positive Rate)')
ax.set_ylabel('Sensitivity (True Positive Rate)')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
fig.savefig('roc_curve.png')