I ran the model on Google Colab using a T100 GPU, so I am submitting the .ipynb version.

In [26]:
# Mount Google Drive so the notebook can read its inputs and write its outputs.
from google.colab import drive

MOUNT_POINT = '/content/drive'

# Flush and drop any existing mount first so the mount below starts fresh.
drive.flush_and_unmount()
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [27]:
import os

# Project folder on the mounted Drive. Anchored to the absolute mount point
# ('/content/drive', the path passed to drive.mount) instead of os.getcwd(),
# so the path stays valid even if the working directory is changed (e.g. %cd).
root = os.path.join('/content/drive', 'MyDrive', 'train_model')

# Show the folder contents as a quick check that the mount worked.
os.listdir(root)

['processed_data.csv',
 'train_model.ipynb',
 'features_config.json',
 'model_params.json']

In [38]:
# --- Inputs (all located directly inside `root`) ---
data_csv = os.path.join(root, 'processed_data.csv')
features_cfg = os.path.join(root, 'features_config.json')
model_cfg = os.path.join(root, 'model_params.json')

# --- Outputs ---
# Trained models are written to <root>/model. Create the folder up front:
# joblib.dump does not create missing parent directories.
model_dir = os.path.join(root, 'model')
os.makedirs(model_dir, exist_ok=True)

# The per-fold XGBoost paths are built inside the fold loop of the training function.
save_lr_pkl = os.path.join(model_dir, 'LogisticRegression.pkl')
save_rf_pkl = os.path.join(model_dir, 'RandomForestClassifier.pkl')

# Hold-out rows exported by the training function.
save_inference_csv = os.path.join(root, 'test_data_with_actuals.csv')

In [37]:
import pandas as pd
import numpy as np
import json
import joblib
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold

# XGBoost is an optional dependency: when it cannot be imported, the name is
# bound to None so later code can test for availability instead of crashing.
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBClassifier = None

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')


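The function below trains Logistic Regression and Random Forest on the training split, and trains XGBoost with 5-fold cross-validation.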

In [39]:
def train_models_with_5folds():
    """Train the configured classifiers, evaluate them on a hold-out split, and save them.

    Steps:
      1. Load the processed CSV (``data_csv``) and the feature / model
         configuration JSON files (``features_cfg``, ``model_cfg``).
      2. Hold out the last 20% of rows (file order, no shuffling) as the test set.
      3. Fit Logistic Regression and Random Forest on the training split when
         they appear in the model configuration; pickle each to its own path
         (``save_lr_pkl``, ``save_rf_pkl``).
      4. If XGBoost is configured and importable, fit one model per fold of an
         unshuffled 5-fold split of the training data, pickle every fold model
         under ``<root>/model``, and keep the last fold's model for comparison.
      5. Score every kept model on the test split, pickle the most accurate one
         to ``<root>/model/best_model.pkl``, and write the test rows to
         ``save_inference_csv``.

    Reads the notebook-level names ``root``, ``data_csv``, ``features_cfg``,
    ``model_cfg``, ``save_lr_pkl``, ``save_rf_pkl`` and ``save_inference_csv``.

    Returns:
        None. If any input file is missing, an error is printed and the
        function returns early without training anything.

    NOTE(review): the folds are contiguous (``shuffle=False``), so most folds
    train on rows that come after their validation rows. If the rows are
    time-ordered, ``TimeSeriesSplit`` may be more appropriate - confirm.
    """
    print("Loading processed data and configuration...")
    try:
        df = pd.read_csv(data_csv)
        with open(features_cfg, 'r') as f:
            feat_config = json.load(f)
        with open(model_cfg, 'r') as f:
            model_params = json.load(f)
    except FileNotFoundError as e:
        print(f"Error: {e}. Run feature_engineering.py first.")
        return

    features = feat_config['features']
    label = feat_config['label']

    X = df[features]
    y = df[label]

    # no-shuffle split: the test set is the final 20% of rows
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

    # joblib.dump does not create parent directories, so make sure the model
    # folder exists before anything is written into it.
    model_dir = os.path.join(root, 'model')
    os.makedirs(model_dir, exist_ok=True)
    best_model_path = os.path.join(model_dir, 'best_model.pkl')

    models = {}

    # Logistic Regression
    if "LogisticRegression" in model_params:
        print("Training Logistic Regression...")
        lr = LogisticRegression(**model_params["LogisticRegression"])
        lr.fit(X_train, y_train)
        joblib.dump(lr, save_lr_pkl)
        models["LogisticRegression"] = lr

    # Random Forest
    if "RandomForestClassifier" in model_params:
        print("Training Random Forest...")
        # Defaults first, config second: a value set in the JSON wins instead
        # of raising "got multiple values for keyword argument".
        rf_kwargs = {'random_state': 42, **model_params["RandomForestClassifier"]}
        rf = RandomForestClassifier(**rf_kwargs)
        rf.fit(X_train, y_train)
        # Persist the forest itself (previously the logistic model was dumped here).
        joblib.dump(rf, save_rf_pkl)
        models["RandomForestClassifier"] = rf

    # 5-fold CV
    if "XGBClassifier" in model_params:
        if XGBClassifier is not None:
            print("Training XGBoost with 5-fold CV...")

            n_folds = 5
            xgb_kwargs = {
                'use_label_encoder': False,
                'eval_metric': 'logloss',
                'random_state': 42,
                **model_params["XGBClassifier"],
            }
            kf = KFold(n_splits=n_folds, shuffle=False)

            fold_models = []
            fold_accuracies = []

            for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), start=1):
                print(f"  Fold {fold}/{n_folds}")

                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

                xgb = XGBClassifier(**xgb_kwargs)
                xgb.fit(X_tr, y_tr)

                # validation accuracy
                val_acc = accuracy_score(y_val, xgb.predict(X_val))
                print(f"    Fold {fold} accuracy: {val_acc:.4f}")

                fold_models.append(xgb)
                fold_accuracies.append(val_acc)
                joblib.dump(xgb, os.path.join(model_dir, f'XGBClassifier_fold{fold}.pkl'))

            print(f"XGBoost 5-fold mean accuracy: {np.mean(fold_accuracies):.4f}")

            # Only the last fold's model takes part in the final comparison.
            models["XGBClassifier"] = fold_models[-1]

        else:
            print("Skipping XGBoost. Cannot import XGBClassifier")

    # Evaluation on the hold-out split
    best_model = None
    best_acc = float('-inf')  # so a model scoring 0.0 can still be selected

    print("Model Evaluation...")
    for name, model in models.items():
        acc = accuracy_score(y_test, model.predict(X_test))
        print(f"{name} Accuracy: {acc:.4f}")

        if acc > best_acc:
            best_acc = acc
            best_model = model

    # Compare against None explicitly: ensemble estimators define __len__, so
    # plain truthiness is not a reliable "was a model selected" check.
    if best_model is not None:
        joblib.dump(best_model, best_model_path)
        print(f"\nBest model ({type(best_model).__name__}) saved to {best_model_path}")

    # X_test.index holds index labels, so select with .loc rather than .iloc.
    test_data_export = df.loc[X_test.index].copy()
    test_data_export.to_csv(save_inference_csv, index=False)

In [40]:
# Run the full training, evaluation and export pipeline.
train_models_with_5folds()

Loading processed data and configuration...
Training samples: 984, Test samples: 246
Training Logistic Regression...
Training Random Forest...
Training XGBoost with 5-fold CV...
  Fold 1/5
    Fold 1 accuracy: 0.4772
  Fold 2/5
    Fold 2 accuracy: 0.5279
  Fold 3/5
    Fold 3 accuracy: 0.5431
  Fold 4/5
    Fold 4 accuracy: 0.5431
  Fold 5/5
    Fold 5 accuracy: 0.4847
XGBoost 5-fold mean accuracy: 0.5152
Model Evaluation...
LogisticRegression Accuracy: 0.5041
RandomForestClassifier Accuracy: 0.5041
XGBClassifier Accuracy: 0.5447

Best model (XGBClassifier) saved to best_model.pkl
