In [2]:
# 04. Model Training (FINAL FIXED VERSION)

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import joblib

# -----------------------------
# LOAD PREPROCESSED DATA
# -----------------------------
train = pd.read_csv("train_processed.csv")

# The "Churn" column is the target; every remaining column is a feature.
y = train["Churn"]
X = train.drop(columns="Churn")

# Object-dtype columns that are still present (preprocessing is expected to
# have left none).
categorical_cols = list(X.select_dtypes(include="object").columns)

# Encode on a copy so the raw feature frame `X` stays untouched. Each fitted
# encoder is kept, keyed by column name, so it can be saved later.
X_encoded = X.copy()
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Cast to str first so the encoder sees one uniform type per column.
    as_text = X[col].astype(str)
    X_encoded[col] = le.fit_transform(as_text)
    label_encoders[col] = le

# -----------------------------
# SCALE NUMERIC FEATURES
# -----------------------------
# Fit the scaler on the encoded features, then standardise that same frame.
# The fitted scaler object is kept so it can be persisted afterwards.
scaler = StandardScaler().fit(X_encoded)
X_scaled = scaler.transform(X_encoded)

# -----------------------------
# BALANCE THE DATA
# -----------------------------
# Oversample with SMOTE using a fixed seed so the resampling is repeatable.
sm = SMOTE(random_state=42)
X_bal, y_bal = sm.fit_resample(X=X_scaled, y=y)

# -----------------------------
# TRAIN MODEL
# -----------------------------
# LightGBM classifier fitted on the scaled, SMOTE-resampled training data.
lgb_model = lgb.LGBMClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=-1,  # no explicit depth limit; tree size is bounded by num_leaves
    num_leaves=50,
    subsample=0.8,
    # LightGBM only applies `subsample` when bagging is enabled through a
    # positive `subsample_freq` (default 0 = disabled). Without this line the
    # 0.8 row-sampling above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    # NOTE(review): after SMOTE the two classes have equal counts, so
    # "balanced" weights should be effectively neutral here; kept as a
    # safeguard in case the resampling step is changed or removed.
    class_weight="balanced",
    random_state=42,
)

lgb_model.fit(X_bal, y_bal)


# -----------------------------
# SAVE EVERYTHING FOR INFERENCE
# -----------------------------

# Output file -> object to persist, written in this order.
artifacts = {
    # NOTE(review): the file is called "rf_model.pkl" but it stores the
    # LightGBM classifier trained above; the name is left as-is because
    # downstream loading code presumably expects it.
    "rf_model.pkl": lgb_model,
    # Fitted StandardScaler
    "scaler.pkl": scaler,
    # Per-column LabelEncoders (an empty dict when nothing needed encoding)
    "label_encoders.pkl": label_encoders,
    # Column order of the feature frame used during training
    "feature_names.pkl": list(X_encoded.columns),
}

for filename, obj in artifacts.items():
    joblib.dump(obj, filename)

print("Model training complete. All artifacts saved.")

[LightGBM] [Info] Number of positive: 249999, number of negative: 249999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2571
[LightGBM] [Info] Number of data points in the train set: 499998, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Model training complete. All artifacts saved.
