In [2]:
# --- 1. CORE LIBRARIES AND UTILITIES ---
# Standard library
import json
import os
import sys
from time import time

# Standard data manipulation and visualization libraries
import numpy as np
import pandas as pd

# Model persistence (installed as a scikit-learn dependency)
import joblib

# --- 2. SCIKIT-LEARN: MODEL SELECTION & METRICS ---
# Tools for splitting data, cross-validation, and performance evaluation
from sklearn.model_selection import (
    train_test_split, 
    KFold, 
    cross_val_score,
    StratifiedKFold, 
    RandomizedSearchCV
)
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    roc_auc_score
)
from sklearn.pipeline import Pipeline # For creating robust modeling workflows
from sklearn.impute import SimpleImputer # Tool to handle NaNs
from sklearn.preprocessing import FunctionTransformer, StandardScaler

import lightgbm as lgb

In [3]:
# Root folder of the project on the mounted Google Drive (Colab layout).
PROJECT_BASE_PATH = '/content/drive/MyDrive/Project_01'

# Folder that holds the project's importable modules.
SRC_PATH = os.path.join(PROJECT_BASE_PATH, 'src')

# Put 'src' at the front of the import search path, but only once, so that
# re-running this cell does not keep prepending duplicate entries.
if not (SRC_PATH in sys.path):
    sys.path.insert(0, SRC_PATH)
    print("✅ Successfully added 'src' directory to Python path.")

# `config` lives under SRC_PATH, so it can only be imported after the path
# has been registered above.
from config import Paths

✅ Successfully added 'src' directory to Python path.


In [4]:
# Build the shared path-configuration object (`cfg`) that later cells read
# their directory and file locations from.
try:
    from config import Paths
except ImportError:
    print("❌ Error: Could not import Paths from config module.")
    # Later cells depend on `cfg`; re-raise so the notebook stops here with
    # the real cause instead of failing further down with a NameError.
    raise

# Instantiate the configuration under a unique name (cfg).
# Kept outside the `try` so that an ImportError raised while building the
# object is not misreported as a failure to import `Paths`.
cfg = Paths(PROJECT_BASE_PATH)
# NOTE(review): presumably creates the project's directory tree - confirm in config.py.
cfg.create_dirs()

print("\n✅ Project configuration (Paths) initialized successfully.")
print(f"Raw Data Path check: {cfg.TRAIN_RAW_FILE}")


✅ Project configuration (Paths) initialized successfully.
Raw Data Path check: /content/drive/MyDrive/Project_01/data/raw/application_train.csv


In [5]:
# Paths of the fully encoded train/test datasets inside the processed-data folder.
FINAL_TRAIN_FILE = os.path.join(
    cfg.DATA_PROCESSED_DIR, 'train_final_encoded.parquet'
)
FINAL_TEST_FILE = os.path.join(
    cfg.DATA_PROCESSED_DIR, 'test_final_encoded.parquet'
)

try:
    # Load both datasets before reporting anything, so a missing file
    # produces only the error message below.
    df_train_final = pd.read_parquet(FINAL_TRAIN_FILE)
    df_test_final = pd.read_parquet(FINAL_TEST_FILE)
except FileNotFoundError:
    print("❌ ERROR: Parquet files not found. Ensure Block 13 was executed successfully and saved the files.")
    # The rest of the notebook cannot run without this data: propagate the error.
    raise
else:
    # Both reads succeeded: report the shapes as a quick sanity check.
    print(f"✅ Loaded Encoded Train Data. Shape: {df_train_final.shape}")
    print(f"✅ Loaded Encoded Test Data. Shape: {df_test_final.shape}")

✅ Loaded Encoded Train Data. Shape: (307511, 135)
✅ Loaded Encoded Test Data. Shape: (48744, 134)


In [6]:
print("--- Defining bases and universal Imputer ---")

# Name of the label column in the encoded training frame.
TARGET = "TARGET"

# Feature matrix: every column except the label.
X = df_train_final.drop(columns=[TARGET], errors="ignore")
# Label vector used to fit and score the models.
y = df_train_final[TARGET]

# Report the dimensions of the modelling matrix (columns, then rows).
print(f"Feature count: {len(X.columns)}")
print(f"Number of training samples: {len(X)}")

--- Defining bases and universal Imputer ---


Feature count: 134
Number of training samples: 307511


In [None]:
# --- Randomized Search for LightGBM ---
#
# Samples N_ITER hyperparameter combinations for a LightGBM binary classifier,
# scores each one with 5-fold stratified cross-validated ROC-AUC, and then
# persists the winning parameters (JSON) and the refit best estimator (joblib).

# ================================================================
# Randomized Search
# ================================================================

# Resolve the output locations and make sure the directory exists BEFORE the
# long-running search, so a path problem fails immediately instead of after
# all the fits have completed.
os.makedirs(cfg.MODEL_DIR, exist_ok=True)
params_path = os.path.join(cfg.MODEL_DIR, "best_params.json")
best_model_path = os.path.join(cfg.MODEL_DIR, "best_lgbm_random.pkl")

# NOTE(review): both the estimator and the search below request all cores
# (n_jobs=-1). This can oversubscribe the CPU (workers x LightGBM threads);
# consider setting one of the two to 1 after measuring on the target machine.
lgbm = lgb.LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    random_state=42,
    n_jobs=-1,
)

# Hyperparameter distributions (Wide but efficient)
param_dist = {
    "num_leaves": [15, 31, 50, 70, 90],
    "learning_rate": [0.005, 0.01, 0.02, 0.03, 0.05],
    "n_estimators": [300, 500, 700, 900, 1100],
    "max_depth": [-1, 6, 8, 12],
    "min_child_samples": [5, 10, 20, 30, 50],
    "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    # LightGBM only applies row subsampling (bagging) when subsample_freq > 0;
    # with the default of 0 the `subsample` values above would be ignored.
    # Listing it here (single value) also records it in best_params_, so the
    # saved JSON reproduces the tuned configuration.
    "subsample_freq": [1],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
}

# Number of parameter sets to try
N_ITER = 40   # Author's estimate: the sweet spot for Colab (≈ 5–10 min)

# Stratified folds keep the class ratio of `y` in every split; the fixed seed
# makes the fold assignment reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"\n=== Starting RandomizedSearchCV with {N_ITER} iterations ===")

random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=N_ITER,
    scoring="roc_auc",
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

random_search.fit(X, y)

# ================================================================
# Results
# ================================================================

print("\n=== Randomized Search Completed ===")
print(f"Best ROC-AUC: {random_search.best_score_:.5f}")
print("Best params:")
print(random_search.best_params_)

# Convert any NumPy scalars to plain Python values so the parameters are
# always JSON-serializable (e.g. if distributions replace the lists later).
best_params = {
    name: (value.item() if hasattr(value, "item") else value)
    for name, value in random_search.best_params_.items()
}

# Save best params for Notebook 06
with open(params_path, "w", encoding="utf-8") as f:
    json.dump(best_params, f, indent=4)

print(f"Saved best params to {params_path}")

# Save raw best estimator (not final model)
joblib.dump(random_search.best_estimator_, best_model_path)

print(f"Saved best LightGBM estimator to {best_model_path}")



=== Starting RandomizedSearchCV with 40 iterations ===
Fitting 5 folds for each of 40 candidates, totalling 200 fits


