In [2]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import joblib
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, precision_recall_curve, auc, make_scorer
from xgboost import XGBClassifier

# Load the preprocessed feature and target CSVs from ../data/processed.
# Each target frame is flattened to a 1-D array before being handed to the
# estimators and metric functions below.
# NOTE(review): the "_smote" suffix suggests the training split was resampled
# upstream -- confirm against the preprocessing step.
X_train_smote = pd.read_csv('../data/processed/X_train_smote.csv')
y_train_smote = pd.read_csv('../data/processed/y_train_smote.csv').to_numpy().ravel()
X_test = pd.read_csv('../data/processed/X_test.csv')
y_test = pd.read_csv('../data/processed/y_test.csv').to_numpy().ravel()

# Helper that prints the evaluation metrics for one model's predictions
def print_metrics(y_true, y_pred, model_name):
    """Print accuracy, precision, recall, F1 and F2 for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.
        model_name: Label used in the header line of the report.

    Every score is printed with four decimal places, followed by a blank
    separator. Nothing is returned.
    """
    print(f"{model_name} Metrics:")
    # (label, metric callable) pairs, evaluated lazily in printing order.
    metric_table = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
        ("F-beta Score (beta=2)", lambda t, p: fbeta_score(t, p, beta=2.0)),
    )
    for label, metric in metric_table:
        print(f"{label}: {metric(y_true, y_pred):.4f}")
    print("\n")
# Logistic Regression: fit on the training split, report metrics on the test split
model = LogisticRegression(max_iter=100, random_state=42).fit(X_train_smote, y_train_smote)
y_pred = model.predict(X_test)
print_metrics(y_test, y_pred, "Logistic Regression")

# 5-fold stratified cross-validation (shuffled, seed 42) scored with F2
# NOTE(review): if the training split was resampled before this point, folds
# built from it may share synthetic samples between train and validation and
# overstate the CV score -- confirm, and consider resampling inside each fold.
f2_scorer = make_scorer(fbeta_score, beta=2)
lr_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    model,
    X_train_smote,
    y_train_smote,
    cv=lr_folds,
    scoring=f2_scorer,
)
lr_cv_mean, lr_cv_std = cv_scores.mean(), cv_scores.std()
print(f"Logistic Regression CV F2 Scores: {cv_scores}")
print(f"Logistic Regression CV F2 Score Mean: {lr_cv_mean:.4f} ± {lr_cv_std:.4f}")

# Random Forest: fit on the training split, report metrics on the test split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_smote, y_train_smote)
y_pred_rf = rf_model.predict(X_test)
print_metrics(y_test, y_pred_rf, "Random Forest")

# 5-fold stratified cross-validation (shuffled, seed 42) scored with F2
# NOTE(review): if the training split was resampled before this point, folds
# built from it may share synthetic samples between train and validation and
# overstate the CV score -- confirm, and consider resampling inside each fold.
f2_scorer = make_scorer(fbeta_score, beta=2)
rf_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_rf = cross_val_score(
    rf_model,
    X_train_smote,
    y_train_smote,
    cv=rf_folds,
    scoring=f2_scorer,
)
rf_cv_mean, rf_cv_std = cv_scores_rf.mean(), cv_scores_rf.std()
print(f"Random Forest CV F2 Scores: {cv_scores_rf}")
print(f"Random Forest CV F2 Score Mean: {rf_cv_mean:.4f} ± {rf_cv_std:.4f}")

# XGBoost: fit on the training split, report metrics on the test split
xgb_model = XGBClassifier(n_estimators=100, random_state=42).fit(X_train_smote, y_train_smote)
y_pred_xgb = xgb_model.predict(X_test)
print_metrics(y_test, y_pred_xgb, "XGBoost")

# 5-fold stratified cross-validation (shuffled, seed 42) scored with F2
# NOTE(review): if the training split was resampled before this point, folds
# built from it may share synthetic samples between train and validation and
# overstate the CV score -- confirm, and consider resampling inside each fold.
f2_scorer = make_scorer(fbeta_score, beta=2)
xgb_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_xgb = cross_val_score(
    xgb_model,
    X_train_smote,
    y_train_smote,
    cv=xgb_folds,
    scoring=f2_scorer,
)
xgb_cv_mean, xgb_cv_std = cv_scores_xgb.mean(), cv_scores_xgb.std()
print(f"XGBoost CV F2 Scores: {cv_scores_xgb}")
print(f"XGBoost CV F2 Score Mean: {xgb_cv_mean:.4f} ± {xgb_cv_std:.4f}")
# Rank the three models by their test-split F2 score and report the winner
scores = {
    name: fbeta_score(y_test, predictions, beta=2)
    for name, predictions in (
        ('Logistic Regression', y_pred),
        ('Random Forest', y_pred_rf),
        ('XGBoost', y_pred_xgb),
    )
}
# max() keeps the first entry on ties, so insertion order decides a draw.
best_model, best_f2 = max(scores.items(), key=lambda entry: entry[1])
print(f"Best model by F2 score: {best_model} ({best_f2:.4f})")
# Threshold optimization for Random Forest: pick the probability cut-off that
# maximizes F2 along the precision-recall curve, then re-score with it.
# NOTE(review): the threshold is selected on the same test split that the
# "optimized" metrics are reported on, so those metrics are optimistic --
# consider tuning on a separate validation split.
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test, y_proba_rf)
# F-beta with beta=2; the 1e-10 term guards against division by zero.
f2_scores_rf = (1 + 2**2) * (precision_rf * recall_rf) / (2**2 * precision_rf + recall_rf + 1e-10)
# precision/recall carry one extra trailing point (precision=1, recall=0) that
# has no matching threshold, so it is excluded from the argmax to keep the
# index valid for thresholds_rf.
best_threshold_rf = thresholds_rf[np.argmax(f2_scores_rf[:-1])]
print(f"Optimal threshold for Random Forest: {best_threshold_rf:.4f}")
y_pred_rf_opt = (y_proba_rf >= best_threshold_rf).astype(int)
print_metrics(y_test, y_pred_rf_opt, "Random Forest (Optimized Threshold)")


# Write the test labels and each model's predictions side by side to CSV
# (one column per model, no index column).
predictions_frame = pd.DataFrame(
    {
        'y_test': y_test,
        'y_pred_logistic': y_pred,
        'y_pred_rf': y_pred_rf,
        'y_pred_xgb': y_pred_xgb,
    }
)
predictions_frame.to_csv('../data/processed/model_predictions.csv', index=False)

# Persist the fitted Random Forest and XGBoost models under ../models,
# creating the directory if it does not exist yet.
os.makedirs('../models', exist_ok=True)
for fitted_model, target_path in (
    (rf_model, '../models/random_forest.pkl'),
    (xgb_model, '../models/xgboost.pkl'),
):
    joblib.dump(fitted_model, target_path)
print("Models saved successfully.")

Logistic Regression Metrics:
Accuracy: 0.9722
Precision: 0.0503
Recall: 0.8737
F1 Score: 0.0952
F-beta Score (beta=2): 0.2045


Logistic Regression CV F2 Scores: [0.93411245 0.93334878 0.93490123 0.93335324 0.93625285]
Logistic Regression CV F2 Score Mean: 0.9344 ± 0.0011
Random Forest Metrics:
Accuracy: 0.9994
Precision: 0.9091
Recall: 0.7368
F1 Score: 0.8140
F-beta Score (beta=2): 0.7659


Random Forest CV F2 Scores: [0.99998676 0.99998235 0.99996028 0.99994705 0.99994705]
Random Forest CV F2 Score Mean: 1.0000 ± 0.0000
XGBoost Metrics:
Accuracy: 0.9994
Precision: 0.8444
Recall: 0.8000
F1 Score: 0.8216
F-beta Score (beta=2): 0.8085


XGBoost CV F2 Scores: [0.99998676 0.99992057 0.99993822 0.99991616 0.99994263]
XGBoost CV F2 Score Mean: 0.9999 ± 0.0000
Best model by F2 score: XGBoost (0.8085)
Optimal threshold for Random Forest: 0.3300
Random Forest (Optimized Threshold) Metrics:
Accuracy: 0.9994
Precision: 0.8298
Recall: 0.8211
F1 Score: 0.8254
F-beta Score (beta=2): 0.8228


Models