In [1]:
# ---------------------------------------------------
# Import Required Libraries
# ---------------------------------------------------
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

In [2]:
   
df = pd.read_csv("../data/processed/train_fe.csv")
print("Shape of FE dataset:", df.shape)

# Hedef ve feature ayrımı
y = df["TARGET"]
X = df.drop(columns=["TARGET"])

# ---------------------------------------------------
# 2) Kategorik kolonları Label Encode et
# ---------------------------------------------------
cat_cols = X.select_dtypes(include=["object"]).columns

le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    le_dict[col] = le

print("Categorical columns encoded:", list(cat_cols))

# ---------------------------------------------------
# 3) Train / Validation split (Stratified)
# ---------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape, "Val shape:", X_val.shape)

# ---------------------------------------------------
# 4) FE'li Baseline LightGBM (Hızlı)
# ---------------------------------------------------
baseline_model = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

baseline_model.fit(X_train, y_train)
baseline_preds = baseline_model.predict_proba(X_val)[:, 1]
baseline_auc = roc_auc_score(y_val, baseline_preds)
print(f"Baseline with FE ROC-AUC: {baseline_auc:.4f}")

# ---------------------------------------------------
# 5) Hyperparameter Search (RandomizedSearchCV)
# ---------------------------------------------------
param_dist = {
    "num_leaves": [16, 31, 63, 127],
    "max_depth": [-1, 4, 6, 8, 10],
    "learning_rate": [0.01, 0.03, 0.05, 0.07],
    "n_estimators": [200, 400, 600, 800],
    "min_child_samples": [10, 20, 30, 50],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0]
}

opt_model = LGBMClassifier(
    objective="binary",
    random_state=42,
    n_jobs=-1
)

random_search = RandomizedSearchCV(
    estimator=opt_model,
    param_distributions=param_dist,
    n_iter=30,               # İstersen 50 yapabilirsin
    scoring="roc_auc",
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

print("Starting RandomizedSearchCV...")
random_search.fit(X_train, y_train)

print("Best ROC-AUC from CV:", random_search.best_score_)
print("Best Params:", random_search.best_params_)

# ---------------------------------------------------
# 6) En iyi parametrelerle modeli yeniden eğit
# ---------------------------------------------------
best_params = random_search.best_params_

final_model = LGBMClassifier(
    **best_params,
    objective="binary",
    random_state=42,
    n_jobs=-1
)

final_model.fit(X_train, y_train)

val_preds = final_model.predict_proba(X_val)[:, 1]
final_auc = roc_auc_score(y_val, val_preds)
print(f"FINAL MODEL ROC-AUC (Validation): {final_auc:.4f}")

# ---------------------------------------------------
# 7) Modeli kaydet
# ---------------------------------------------------
joblib.dump(final_model, "../models/final_model.pkl")
print("Final model saved to ../models/final_model.pkl")


Shape of FE dataset: (307511, 134)
Categorical columns encoded: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
Train shape: (246008, 133) Val shape: (61503, 133)
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14554
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 129
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Baseline with FE ROC-AUC: 0.7673
Starting RandomizedSe