In [None]:
# Optional one-time environment setup, kept commented out: install the OpenMP
# development package, clone LightGBM with its submodules, build it with the
# USE_GPU / USE_OPENCL CMake flags, then install the Python package.
# Uncomment the lines below to run them.
# NOTE(review): each `!` line presumably runs in its own subshell, so the `cd build`
# on the mkdir line would not persist; after `%cd LightGBM` the relative path
# `../python-package` looks like it should be `python-package` — confirm before re-enabling.
#!sudo apt-get install -y libomp-dev
#!git clone --recursive https://github.com/microsoft/LightGBM
#%cd LightGBM
#!mkdir build && cd build && cmake -DUSE_GPU=1 -DUSE_OPENCL=1 .. && make -j$(nproc)
#!cd ../python-package && python setup.py install

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libllvm14 libomp-14-dev libomp5-14
Suggested packages:
  libomp-14-doc
The following NEW packages will be installed:
  libllvm14 libomp-14-dev libomp-dev libomp5-14
0 upgraded, 4 newly installed, 0 to remove and 41 not upgraded.
Need to get 24.7 MB of archives.
After this operation, 118 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libllvm14 amd64 1:14.0.0-1ubuntu1.1 [24.0 MB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp5-14 amd64 1:14.0.0-1ubuntu1.1 [389 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp-14-dev amd64 1:14.0.0-1ubuntu1.1 [347 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libomp-dev amd64 1:14.0-55~exp2 [3,074 B]
Fetched 24.7 MB in 3s (9,409 kB/s)
debconf: unable to initialize frontend: 

In [14]:
# --- 1. CORE LIBRARIES AND UTILITIES ---
# Standard data manipulation libraries and a timing helper
import numpy as np
import pandas as pd
from time import time

# --- 2. SCIKIT-LEARN: MODEL SELECTION & METRICS ---
# Tools for splitting data, cross-validation, and performance evaluation
from sklearn.model_selection import (
    train_test_split, 
    KFold, 
    cross_val_score,
    StratifiedKFold, 
    RandomizedSearchCV
)
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    roc_auc_score
)
from sklearn.pipeline import Pipeline # For creating robust modeling workflows
from sklearn.impute import SimpleImputer # Tool to handle NaNs
from sklearn.preprocessing import FunctionTransformer, StandardScaler

import lightgbm as lgb

import os
import sys
import json
import joblib


In [2]:
# Root folder of the project on the mounted Drive (Colab layout)
PROJECT_BASE_PATH = '/content/drive/MyDrive/Project_01'

# Folder that holds the project's own Python modules
SRC_PATH = os.path.join(PROJECT_BASE_PATH, 'src')

# Register 'src' at the front of the import path, but only once,
# so re-running this cell does not add duplicate entries
if sys.path.count(SRC_PATH) == 0:
    sys.path.insert(0, SRC_PATH)
    print("✅ Successfully added 'src' directory to Python path.")

# With 'src' importable, pull in the Paths class from the config module
from config import Paths

✅ Successfully added 'src' directory to Python path.


In [3]:
# Build the path configuration object (`cfg`) that later cells read their
# directories and file locations from.
try:
    from config import Paths

    # Instantiate the configuration under a unique name (cfg)
    cfg = Paths(PROJECT_BASE_PATH)
    # create_dirs() is defined in config.Paths; presumably it makes sure the
    # project folders exist — confirm against src/config.py
    cfg.create_dirs()

    print("\n✅ Project configuration (Paths) initialized successfully.")
    print(f"Raw Data Path check: {cfg.TRAIN_RAW_FILE}")

except ImportError:
    print("❌ Error: Could not import Paths from config module.")
    # Re-raise instead of swallowing: without `cfg`, the following cells would
    # fail with an unrelated NameError that hides the real cause.
    raise


✅ Project configuration (Paths) initialized successfully.
Raw Data Path check: /content/drive/MyDrive/Project_01/data/raw/application_train.csv


In [4]:
# Locations of the encoded train / test tables inside the processed-data directory
FINAL_TRAIN_FILE = os.path.join(cfg.DATA_PROCESSED_DIR, 'train_final_encoded.parquet')
FINAL_TEST_FILE = os.path.join(cfg.DATA_PROCESSED_DIR, 'test_final_encoded.parquet')

try:
    # Load both Parquet tables; only the reads themselves are guarded
    df_train_final = pd.read_parquet(FINAL_TRAIN_FILE)
    df_test_final = pd.read_parquet(FINAL_TEST_FILE)
except FileNotFoundError:
    # Explain the likely cause, then propagate: the notebook cannot continue without the data
    print("❌ ERROR: Parquet files not found. Ensure Block 13 was executed successfully and saved the files.")
    raise
else:
    # Reached only when both reads succeeded
    print(f"✅ Loaded Encoded Train Data. Shape: {df_train_final.shape}")
    print(f"✅ Loaded Encoded Test Data. Shape: {df_test_final.shape}")

✅ Loaded Encoded Train Data. Shape: (307511, 135)
✅ Loaded Encoded Test Data. Shape: (48744, 134)


In [5]:
print("--- Defining bases and universal Imputer ---")

# Name of the label column in the encoded training table
TARGET = "TARGET"

# Feature matrix: every column except the label.
# errors="ignore" keeps the drop from raising if the label column is absent.
X = df_train_final.drop(labels=[TARGET], axis=1, errors="ignore")

# Label vector taken straight from the training table
y = df_train_final[TARGET]

# Quick sanity report on the resulting dimensions
print(f"Feature count: {X.shape[1]}")
print(f"Number of training samples: {X.shape[0]}")

--- Defining bases and universal Imputer ---


Feature count: 134
Number of training samples: 307511


In [15]:
# --- LightGBM hyperparameter tuning with RandomizedSearchCV ---

# ================================================================
# Search setup
# ================================================================

# How many parameter combinations are sampled from `param_dist`
N_ITER = 3

# Candidate values for each tuned hyperparameter
param_dist = {
    "num_leaves": [31, 50, 70],
    "learning_rate": [0.01, 0.02],
    "n_estimators": [300, 500, 700],
    "max_depth": [-1, 8],
}

# Shuffled 5-fold stratified splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base binary GBDT classifier; the search supplies the tuned parameters
lgbm = lgb.LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    random_state=42
)

print(f"\n=== Starting RandomizedSearchCV with {N_ITER} iterations ===")

# Candidates are scored by ROC-AUC; fits run in a single job (n_jobs=1)
random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=N_ITER,
    cv=cv,
    scoring="roc_auc",
    random_state=42,
    n_jobs=1,
    verbose=2,
)

random_search.fit(X, y)

# ================================================================
# Report
# ================================================================

print(
    "\n=== Randomized Search Completed ===",
    f"Best ROC-AUC: {random_search.best_score_:.5f}",
    "Best params:",
    random_search.best_params_,
    sep="\n",
)

# ================================================================
# Persist artifacts
# ================================================================

# Winning parameter set as JSON, for reuse in Notebook 06
params_path = os.path.join(cfg.MODEL_DIR, "best_params.json")
with open(params_path, "w") as params_file:
    json.dump(random_search.best_params_, params_file, indent=4)

print(f"Saved best params to {params_path}")

# Best estimator found by the search, stored as-is (not the final model)
best_model_path = os.path.join(cfg.MODEL_DIR, "best_lgbm_random.pkl")
joblib.dump(random_search.best_estimator_, best_model_path)

print(f"Saved best LightGBM estimator to {best_model_path}")



=== Starting RandomizedSearchCV with 3 iterations ===
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.230372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12809
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 124
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[CV] END learning_rate=0.02, max_depth=8, n_estimators=700, num_leaves=70; total time= 1.9min
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.297324 seconds.
You can set `force_row_wise=true` to remove the overh