In [2]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import joblib
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from xgboost import XGBClassifier

# Load preprocessed data
X_train_smote = pd.read_csv('../data/processed/X_train_smote.csv')
y_train_smote = pd.read_csv('../data/processed/y_train_smote.csv').values.ravel()
X_test = pd.read_csv('../data/processed/X_test.csv')
y_test = pd.read_csv('../data/processed/y_test.csv').values.ravel()

# Metrics function to print results
def print_metrics(y_true, y_pred, model_name):
    print(f"{model_name} Metrics:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}")
    print(f"F-beta Score (beta=2): {fbeta_score(y_true, y_pred, beta=2.0):.4f}")
    print("\n")
# Logistic Regression
model = LogisticRegression()
model.fit(X_train_smote, y_train_smote)
y_pred = model.predict(X_test)
print_metrics(y_test, y_pred, "Logistic Regression")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_smote, y_train_smote)
y_pred_rf = rf_model.predict(X_test)
print_metrics(y_test, y_pred_rf, "Random Forest")

# XGBoost
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train_smote, y_train_smote)
y_pred_xgb = xgb_model.predict(X_test)
print_metrics(y_test, y_pred_xgb, "XGBoost")

# Dynamic comparison of models based on F-beta score
scores = {
    'Logistic Regression': fbeta_score(y_test, y_pred, beta=2),
    'Random Forest': fbeta_score(y_test, y_pred_rf, beta=2),
    'XGBoost': fbeta_score(y_test, y_pred_xgb, beta=2)
}
best_model = max(scores, key=scores.get)
print(f"Best model by F2 score: {best_model} ({scores[best_model]:.4f})")

# Saving predictions to CSV 
pd.DataFrame({
    'y_test': y_test,
    'y_pred_logistic': y_pred,
    'y_pred_rf': y_pred_rf,
    'y_pred_xgb': y_pred_xgb
}).to_csv('../data/processed/model_predictions.csv', index=False)

# Saving models
os.makedirs('../models', exist_ok=True)
joblib.dump(rf_model, '../models/random_forest.pkl')
joblib.dump(xgb_model, '../models/xgboost.pkl')
print("Models saved successfully.")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Metrics:
Accuracy: 0.9792
Precision: 0.0657
Recall: 0.8632
F1 Score: 0.1221
F-beta Score (beta=2): 0.2518


Random Forest Metrics:
Accuracy: 0.9995
Precision: 0.9136
Recall: 0.7789
F1 Score: 0.8409
F-beta Score (beta=2): 0.8026


XGBoost Metrics:
Accuracy: 0.9994
Precision: 0.8523
Recall: 0.7895
F1 Score: 0.8197
F-beta Score (beta=2): 0.8013


Best model by F2 score: Random Forest (0.8026)
Models saved successfully.
