In [8]:
# --- 1. CORE LIBRARIES AND UTILITIES ---
# Standard library
import os
import sys
from time import time

# Standard data manipulation and visualization libraries
import numpy as np
import pandas as pd

# Model persistence (used when saving the trained model to disk)
import joblib

# --- 2. SCIKIT-LEARN: MODEL SELECTION & METRICS ---
# Tools for splitting data, cross-validation, and performance evaluation
from sklearn.model_selection import (
    train_test_split, 
    KFold, 
    cross_val_score, 
    GridSearchCV
)
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    roc_auc_score
)
from sklearn.pipeline import Pipeline # For creating robust modeling workflows
from sklearn.impute import SimpleImputer # Tool to handle NaNs
from sklearn.preprocessing import FunctionTransformer, StandardScaler

import lightgbm as lgb

# Project root on the mounted Google Drive (Colab).
PROJECT_BASE_PATH = '/content/drive/MyDrive/Project_01'

# The project's own modules live under <project>/src; put that directory on
# sys.path so that `config` can be imported below.
SRC_PATH = os.path.join(PROJECT_BASE_PATH, 'src')

# Guard against inserting the same entry again when the cell is re-run.
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
    print("✅ Successfully added 'src' directory to Python path.")

# Import the Paths class from the project's config module.
# The import happens only here, inside the try, so the friendly message is
# actually reachable. The error is re-raised because everything below needs
# `cfg`; continuing would only fail later with a less helpful NameError.
try:
    from config import Paths
except ImportError:
    print("❌ Error: Could not import Paths from config module.")
    raise

# Single configuration instance shared by the rest of the notebook.
cfg = Paths(PROJECT_BASE_PATH)
cfg.create_dirs()  # presumably creates the project's output directories — see config.Paths

print("\n✅ Project configuration (Paths) initialized successfully.")
print(f"Raw Data Path check: {cfg.TRAIN_RAW_FILE}")

# Locations of the encoded train/test tables inside the processed-data directory.
FINAL_TRAIN_FILE = os.path.join(
    cfg.DATA_PROCESSED_DIR, 'train_final_encoded.parquet'
)
FINAL_TEST_FILE = os.path.join(
    cfg.DATA_PROCESSED_DIR, 'test_final_encoded.parquet'
)

# Load both tables; the shapes are reported only once both reads succeeded.
try:
    df_train_final = pd.read_parquet(FINAL_TRAIN_FILE)
    df_test_final = pd.read_parquet(FINAL_TEST_FILE)
except FileNotFoundError:
    print("❌ ERROR: Parquet files not found. Ensure that the parquet was executed successfully in Feature_Eng notebook and saved the files.")
    # Nothing downstream can run without these files, so let the error propagate.
    raise
else:
    print(f"✅ Loaded Encoded Train Data. Shape: {df_train_final.shape}")
    print(f"✅ Loaded Encoded Test Data. Shape: {df_test_final.shape}")


✅ Project configuration (Paths) initialized successfully.
Raw Data Path check: /content/drive/MyDrive/Project_01/data/raw/application_train.csv


✅ Loaded Encoded Train Data. Shape: (307511, 135)
✅ Loaded Encoded Test Data. Shape: (48744, 134)


In [None]:
# Final LightGBM Training with Optimized Hyperparameters
#
# This cell relies on the imports, the `cfg` object and the
# `df_train_final` / `df_test_final` dataframes created by the setup cell above.

print("--- Starting: Final Model Training for Deployment ---")

# 1. Prepare Features (X) and Target (y)
# The identifier and the label are not model inputs.
EXCLUDED_COLS = ['SK_ID_CURR', 'TARGET']
features = [col for col in df_train_final.columns if col not in EXCLUDED_COLS]

y = df_train_final['TARGET']

# Final NaN/Inf Handling (Crucial)
# Turn +/-inf into NaN *before* computing any statistic: a column mean taken
# while inf is still present is itself inf (or NaN), and filling with it would
# put non-finite values straight back into the matrix.
X = df_train_final[features].replace([np.inf, -np.inf], np.nan)
X_test = df_test_final[features].replace([np.inf, -np.inf], np.nan)

# Impute both matrices with the TRAINING means so the test set goes through
# exactly the same transformation the model was fitted on.
train_means = X.mean()
X = X.fillna(train_means)
X_test = X_test.fillna(train_means)
print(f"✅ Feature matrices prepared and cleaned. Training shape: {X.shape}")


# 2. Configure Final Model using Optimized Parameters
# NOTE: these are placeholders; substitute the ACTUAL best parameters found in
# the '04_Model_Tuning_GridSearch' notebook.
# Key order is preserved because the dict is echoed in the log line below.
OPTIMAL_PARAMS = dict(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=63,
    metric='auc',
    boosting_type='gbdt',
    n_jobs=-1,
    seed=42,
    verbose=-1,
)

final_model = lgb.LGBMClassifier(**OPTIMAL_PARAMS)

print(f"--- Training final model using optimized parameters: {OPTIMAL_PARAMS} ---")

# 3. Final Training on FULL Dataset
# No hold-out split here: the model is fitted on every training row and the
# wall-clock duration of the fit is reported.
start_time = time()
final_model.fit(X, y)
end_time = time()

print(f"✅ Final training completed in {(end_time - start_time):.2f} seconds.")


# 4. MLOps Artifact 1: Save the Optimized Model
# This file is the core component for the API endpoint (e.g., FastAPI/Flask).
# `joblib` comes from the notebook's import cell.
model_file_path = os.path.join(cfg.MODEL_DIR, 'final_lgbm_optimized_model.pkl')

# NOTE: the artifact is pickle-based; only load it back from trusted locations.
joblib.dump(final_model, model_file_path)

# os.path.join returns a plain str (no `.name` attribute), so the file name
# is extracted with os.path.basename.
print(f"✅ MLOps Artifact: Optimized Model saved to: {os.path.basename(model_file_path)}")


# 5. MLOps Artifact 2: Submission File Creation
# One row per test ID with the predicted probability of the positive class
# (column 1 of predict_proba).
submission_ids = df_test_final['SK_ID_CURR'].astype(int)

df_submission = pd.DataFrame({
    'SK_ID_CURR': submission_ids,
    'TARGET': final_model.predict_proba(X_test)[:, 1]
})

submission_file_path = os.path.join(cfg.SUBMISSION_DIR, 'submission_lgbm_final.csv')

df_submission.to_csv(submission_file_path, index=False)

print("\n=======================================================")
print("✅ MLOps Artifact: Final Submission file saved successfully!")
# os.path.join returns a plain str (no `.name` attribute), so the file name
# is extracted with os.path.basename.
print(f"File: {os.path.basename(submission_file_path)}")
print("=======================================================")

--- Starting: Final Model Training for Deployment ---


✅ Feature matrices prepared and cleaned. Training shape: (307511, 133)
--- Training final model using optimized parameters: {'n_estimators': 1000, 'learning_rate': 0.01, 'num_leaves': 63, 'metric': 'auc', 'boosting_type': 'gbdt', 'n_jobs': -1, 'seed': 42, 'verbose': -1} ---
