In [6]:
# --- 1. IMPORT NECESSARY LIBRARIES ---
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, f1_score
import os
import joblib # Used for saving the trained models
import time # To time the grid search process
import numpy as np

# --- 2. SETUP AND LOOP THROUGH EACH CLASS ---
# This script assumes you have a 'data' folder with the 4 CSV files from Step 1.

# Create a directory to save the trained models
if not os.path.exists('trained_models'):
    os.makedirs('trained_models')

# Loop through each of the 4 classes
for i in range(1, 5):
    start_time = time.time()
    print(f"--- Processing Class {i} ---")
    
    # --- A. Load the Data ---
    file_path = f'data/class_{i}_modeling_data.csv'
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: Could not find the file {file_path}. Please ensure it exists.")
        continue

    # --- B. Separate Features (X) and Target (y) ---
    X = df.drop('udpyilal', axis=1)
    y = df['udpyilal']
    
    # --- C. Split Data into Training and Testing Sets ---
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # --- D. Handle Class Imbalance ---
    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    print(f"Calculated scale_pos_weight for class imbalance: {scale_pos_weight:.2f}")

    # --- E. Define the Parameter Grid for GridSearchCV ---
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 200, 300],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.5]
    }

    # --- F. Initialize the XGBoost Model and GridSearchCV ---
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        scale_pos_weight=scale_pos_weight,
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42
    )
    
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=3,
        verbose=1,
        n_jobs=-1
    )
    
    print("Starting GridSearchCV... This may take a significant amount of time.")
    grid_search.fit(X_train, y_train)
    
    # --- G. Get the Best Model and Evaluate ---
    print("\nGridSearchCV complete.")
    print(f"Best parameters found: {grid_search.best_params_}")
    
    best_model = grid_search.best_estimator_
    
    print("\n--- Evaluating the best model on the test set (Default 0.5 Threshold) ---")
    y_pred_default = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print("\nClassification Report (Default Threshold):")
    print(classification_report(y_test, y_pred_default))
    
    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC-ROC Score: {auc_score:.4f}")

    # --- NEW: H. Adjust Prediction Threshold for Better Precision ---
    # Find the optimal threshold that maximizes the F1-score, which is a balance
    # between precision and recall. This often gives more balanced results.
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    # Calculate F1 score for each threshold, ignoring the last value
    fscore = (2 * precision * recall) / (precision + recall)
    # Find the index of the best F1 score
    ix = np.argmax(fscore)
    best_threshold = thresholds[ix]
    
    print(f"\n--- Evaluating with New, Optimized Threshold: {best_threshold:.2f} ---")
    
    # Apply the new threshold to the probabilities to get new predictions
    y_pred_new_threshold = (y_pred_proba >= best_threshold).astype(int)
    
    print("\nClassification Report (Optimized Threshold):")
    print(classification_report(y_test, y_pred_new_threshold))
    
    # --- I. Save the Trained Model ---
    model_filename = f'trained_models/xgb_model_class_{i}.joblib'
    joblib.dump(best_model, model_filename)
    print(f"\nOptimized model for Class {i} saved to: {model_filename}")

    end_time = time.time()
    print(f"Total time for Class {i}: {(end_time - start_time) / 60:.2f} minutes\n")

print("--- Step 2 Complete. All optimized models have been trained and saved. ---")


--- Processing Class 1 ---
Calculated scale_pos_weight for class imbalance: 12.10
Starting GridSearchCV... This may take a significant amount of time.
Fitting 3 folds for each of 729 candidates, totalling 2187 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



GridSearchCV complete.
Best parameters found: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}

--- Evaluating the best model on the test set (Default 0.5 Threshold) ---

Classification Report (Default Threshold):
              precision    recall  f1-score   support

           0       0.96      0.67      0.79      9040
           1       0.14      0.66      0.24       747

    accuracy                           0.67      9787
   macro avg       0.55      0.67      0.51      9787
weighted avg       0.90      0.67      0.75      9787

AUC-ROC Score: 0.7330

--- Evaluating with New, Optimized Threshold: 0.89 ---

Classification Report (Optimized Threshold):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      9040
           1       0.00      0.00      0.00       747

    accuracy                           0.92      9787
   macro avg       0.46      0.50      0.48      

  fscore = (2 * precision * recall) / (precision + recall)


Calculated scale_pos_weight for class imbalance: 6.17
Starting GridSearchCV... This may take a significant amount of time.
Fitting 3 folds for each of 729 candidates, totalling 2187 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



GridSearchCV complete.
Best parameters found: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}

--- Evaluating the best model on the test set (Default 0.5 Threshold) ---

Classification Report (Default Threshold):
              precision    recall  f1-score   support

           0       0.91      0.63      0.75     15098
           1       0.21      0.61      0.32      2445

    accuracy                           0.63     17543
   macro avg       0.56      0.62      0.53     17543
weighted avg       0.81      0.63      0.69     17543

AUC-ROC Score: 0.6699

--- Evaluating with New, Optimized Threshold: 0.86 ---

Classification Report (Optimized Threshold):


  fscore = (2 * precision * recall) / (precision + recall)


              precision    recall  f1-score   support

           0       0.86      1.00      0.92     15098
           1       0.00      0.00      0.00      2445

    accuracy                           0.86     17543
   macro avg       0.43      0.50      0.46     17543
weighted avg       0.74      0.86      0.80     17543


Optimized model for Class 2 saved to: trained_models/xgb_model_class_2.joblib
Total time for Class 2: 6.75 minutes

--- Processing Class 3 ---
Calculated scale_pos_weight for class imbalance: 11.53
Starting GridSearchCV... This may take a significant amount of time.
Fitting 3 folds for each of 729 candidates, totalling 2187 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



GridSearchCV complete.
Best parameters found: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}

--- Evaluating the best model on the test set (Default 0.5 Threshold) ---

Classification Report (Default Threshold):
              precision    recall  f1-score   support

           0       0.96      0.64      0.77     14116
           1       0.14      0.66      0.23      1224

    accuracy                           0.64     15340
   macro avg       0.55      0.65      0.50     15340
weighted avg       0.89      0.64      0.72     15340

AUC-ROC Score: 0.6999

--- Evaluating with New, Optimized Threshold: 0.62 ---

Classification Report (Optimized Threshold):
              precision    recall  f1-score   support

           0       0.94      0.86      0.90     14116
           1       0.19      0.37      0.25      1224

    accuracy                           0.82     15340
   macro avg       0.56      0.61      0.57     1

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



GridSearchCV complete.
Best parameters found: {'colsample_bytree': 1.0, 'gamma': 0.5, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}

--- Evaluating the best model on the test set (Default 0.5 Threshold) ---

Classification Report (Default Threshold):
              precision    recall  f1-score   support

           0       0.98      0.70      0.82     13231
           1       0.10      0.67      0.17       654

    accuracy                           0.70     13885
   macro avg       0.54      0.68      0.49     13885
weighted avg       0.94      0.70      0.79     13885

AUC-ROC Score: 0.7384

--- Evaluating with New, Optimized Threshold: 0.89 ---

Classification Report (Optimized Threshold):
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     13231
           1       0.00      0.00      0.00       654

    accuracy                           0.95     13885
   macro avg       0.48      0.50      0.49    

  fscore = (2 * precision * recall) / (precision + recall)
