In [1]:
# --- 1. IMPORT NECESSARY LIBRARIES ---
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
import os
import joblib # Used for saving the trained models
import time # To time the grid search process

# --- 2. SETUP AND LOOP THROUGH EACH CLASS ---
# This script assumes you have a 'data' folder with the 4 CSV files from Step 1.

# Create the directory that will hold the trained Random Forest models.
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('trained_rf_models', exist_ok=True)

# Train, tune, evaluate and save one Random Forest per class (classes 1-4).
# Each class reads its own CSV from the 'data' folder; a missing file is
# reported and that class is skipped so the remaining classes still run.

# Name of the label column inside every per-class CSV.
TARGET_COLUMN = 'udpyilal'

# --- Parameter grid for GridSearchCV ---
# The search space is identical for every class, so it is built once here
# instead of on every loop iteration. 2 * 3 * 3 * 2 = 36 candidate settings.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],  # None means nodes are expanded until all leaves are pure
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
}

# Bookkeeping so the final summary reflects what actually happened.
trained_classes = []
skipped_classes = []

for i in range(1, 5):
    # perf_counter() is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments during a multi-minute grid search.
    start_time = time.perf_counter()
    print(f"--- Processing Class {i} with Random Forest ---")

    # --- A. Load the Data ---
    file_path = f'data/class_{i}_modeling_data.csv'
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: Could not find the file {file_path}. Please ensure it exists.")
        skipped_classes.append(i)
        continue

    # --- B. Separate Features (X) and Target (y) ---
    X = df.drop(columns=TARGET_COLUMN)
    y = df[TARGET_COLUMN]

    # --- C. Split Data into Training and Testing Sets ---
    # 80/20 split, stratified on the target, with a fixed seed for repeatability.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # --- D. Initialize the Random Forest Model and GridSearchCV ---
    # class_weight='balanced' adjusts weights inversely proportional to class
    # frequencies, which is how the class imbalance is handled here.
    model = RandomForestClassifier(
        class_weight='balanced',
        random_state=42
    )

    # GridSearchCV tests all combinations in param_grid using 3-fold
    # cross-validation, ranking them by ROC AUC and using all CPU cores.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=3,
        verbose=1,
        n_jobs=-1
    )

    print("Starting GridSearchCV for Random Forest... This may take some time.")
    grid_search.fit(X_train, y_train)

    # --- E. Get the Best Model and Evaluate ---
    print("\nGridSearchCV complete.")
    print(f"Best parameters found: {grid_search.best_params_}")

    # The best model found by the search
    best_model = grid_search.best_estimator_

    print("\nEvaluating the best model on the test set...")
    y_pred = best_model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score for AUC.
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f"AUC-ROC Score: {auc_score:.4f}")

    # --- F. Save the Trained Model ---
    model_filename = f'trained_rf_models/rf_model_class_{i}.joblib'
    joblib.dump(best_model, model_filename)
    print(f"\nOptimized Random Forest model for Class {i} saved to: {model_filename}")
    trained_classes.append(i)

    end_time = time.perf_counter()
    print(f"Total time for Class {i}: {(end_time - start_time) / 60:.2f} minutes\n")

# Only claim complete success when no class was skipped; otherwise report
# exactly which classes were trained and which were missing their data file.
if skipped_classes:
    print(
        f"--- Finished: trained classes {trained_classes}; "
        f"skipped classes {skipped_classes} (data file not found). ---"
    )
else:
    print("--- All Random Forest models have been trained and saved. ---")


--- Processing Class 1 with Random Forest ---
Starting GridSearchCV for Random Forest... This may take some time.
Fitting 3 folds for each of 36 candidates, totalling 108 fits

GridSearchCV complete.
Best parameters found: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 200}

Evaluating the best model on the test set...

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.69      0.80      9040
           1       0.14      0.63      0.23       747

    accuracy                           0.68      9787
   macro avg       0.55      0.66      0.52      9787
weighted avg       0.90      0.68      0.76      9787

AUC-ROC Score: 0.7258

Optimized Random Forest model for Class 1 saved to: trained_rf_models/rf_model_class_1.joblib
Total time for Class 1: 2.71 minutes

--- Processing Class 2 with Random Forest ---
Starting GridSearchCV for Random Forest... This may take some time.
Fitting 3 folds for each o