In [11]:
# scikeras not itstalled in the colab environment
!pip install scikeras



In [12]:
# --- 1. CORE LIBRARIES AND UTILITIES ---
# Standard data manipulation libraries and a wall-clock timer
import numpy as np
import pandas as pd
from time import time

# --- 2. SCIKIT-LEARN: MODEL SELECTION & METRICS ---
# Tools for splitting data, cross-validation, and performance evaluation
from sklearn.model_selection import (
    train_test_split, 
    KFold, 
    cross_val_score, 
    GridSearchCV
)
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    roc_auc_score
)
from sklearn.pipeline import Pipeline # For creating robust modeling workflows
from sklearn.impute import SimpleImputer # Tool to handle NaNs
from sklearn.preprocessing import FunctionTransformer, StandardScaler

# --- 3. SCIKIT-LEARN: BASE CLASSIFIERS ---
# Linear Models, Instance-Based, and Probabilistic Models
from sklearn.linear_model import LogisticRegression # Linear Model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # Discriminant Model
from sklearn.naive_bayes import GaussianNB # Probabilistic Model
from sklearn.neighbors import KNeighborsClassifier # Instance-based Model
from sklearn.tree import DecisionTreeClassifier # Tree-based Model

# Note: SVC (Support Vector Machine) is often excluded from rapid exploration 
# due to its high computational cost on large datasets.
from sklearn.svm import SVC 
from sklearn.neural_network import MLPClassifier # Simple Neural Network

# --- 4. SCIKIT-LEARN: ENSEMBLE MODELS ---
# Advanced classifiers for improved performance (often the top performers)
from sklearn.ensemble import (
    AdaBoostClassifier, 
    GradientBoostingClassifier,
    RandomForestClassifier, 
    ExtraTreesClassifier
)
# modern boosting libraries that often outperform Scikit-learn ensembles:
import lightgbm as lgb
import xgboost as xgb


# --- 5. DEEP LEARNING (Keras/TensorFlow) ---
# Note: Using SciKeras for Scikit-learn compatibility is the modern best practice.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout # Added Dropout for robustness
from scikeras.wrappers import KerasClassifier # Modern replacement
from tensorflow.keras.optimizers import SGD, Adam # Adam is usually preferred

In [13]:
import os
import sys

In [14]:
# Root folder of the project (Colab path under the mounted Drive)
PROJECT_BASE_PATH = '/content/drive/MyDrive/Project_01' 

# Folder holding the project's own importable modules
SRC_PATH = os.path.join(PROJECT_BASE_PATH, 'src')

# Register 'src' at the front of the module search path, but only once,
# so that re-running this cell does not add duplicate entries.
if sys.path.count(SRC_PATH) == 0:
    sys.path.insert(0, SRC_PATH)
    print("✅ Successfully added 'src' directory to Python path.")

# Paths is provided by the project-local config module found via SRC_PATH
from config import Paths

In [15]:
# Build the project path configuration object used by the following cells.
try:
    from config import Paths

    # Initialise the instance under a unique name (cfg)
    cfg = Paths(PROJECT_BASE_PATH)
    cfg.create_dirs()

    print("\n✅ Project configuration (Paths) initialized successfully.")
    print(f"Raw Data Path check: {cfg.TRAIN_RAW_FILE}")

except ImportError:
    print("❌ Error: Could not import Paths from config module.")
    # Re-raise after the friendly message: without cfg the next cells would
    # fail with a NameError that hides the real cause.
    raise


✅ Project configuration (Paths) initialized successfully.
Raw Data Path check: /content/drive/MyDrive/Project_01/data/raw/application_train.csv


In [16]:

# Locations of the encoded train/test datasets inside the processed-data directory
FINAL_TRAIN_FILE, FINAL_TEST_FILE = (
    os.path.join(cfg.DATA_PROCESSED_DIR, parquet_name)
    for parquet_name in ('train_final_encoded.parquet', 'test_final_encoded.parquet')
)

try:
    # Load both Parquet files; a missing file aborts the cell
    df_train_final = pd.read_parquet(FINAL_TRAIN_FILE)
    df_test_final = pd.read_parquet(FINAL_TEST_FILE)
except FileNotFoundError:
    print("❌ ERROR: Parquet files not found. Ensure Block 13 was executed successfully and saved the files.")
    # The data is critical for everything below, so propagate the error
    raise
else:
    print(f"✅ Loaded Encoded Train Data. Shape: {df_train_final.shape}")
    print(f"✅ Loaded Encoded Test Data. Shape: {df_test_final.shape}")

✅ Loaded Encoded Train Data. Shape: (307511, 135)
✅ Loaded Encoded Test Data. Shape: (48744, 134)


In [17]:
print("--- Starting Comprehensive Model Cross-Validation on SIMPLIFIED Data ---")

# 1. Prepare Features (X) and Target (y)
# The row identifier and the label itself are excluded from the predictors.
EXCLUDED_COLS = ['SK_ID_CURR', 'TARGET'] 
features = [col for col in df_train_final.columns if col not in EXCLUDED_COLS]

X = df_train_final[features]
y = df_train_final['TARGET']

# 2. Define Preprocessing Steps
# Only mean imputation of residual NaNs is applied before modelling.
# NOTE: no 'Inf' replacement step is included; this assumes the encoded
# features contain no infinities — TODO confirm.


def make_imputer():
    """Return a fresh, unfitted preprocessing pipeline that mean-imputes NaNs."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
    ])


def make_model_pipeline(classifier, scale=False):
    """Wrap a classifier in the common preprocessing pipeline.

    Parameters
    ----------
    classifier : estimator
        The final estimator, stored under the step name 'classifier'.
    scale : bool, default False
        When True, a StandardScaler step named 'scaler' is inserted between
        the imputer and the classifier.

    Returns
    -------
    Pipeline
        Steps: 'preprocessor' -> ['scaler'] -> 'classifier'. Every call
        creates its own imputer, so no fitted state is shared between models.
    """
    steps = [('preprocessor', make_imputer())]
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('classifier', classifier))
    return Pipeline(steps)


# Stand-alone imputer kept under its original name for any cell that refers to it.
universal_imputer = make_imputer()


# 3. Model List (Applying Imputation and Scaling strategically)
models = [
    # A. Linear & Scaled Models (Need Imputer + Scaler)
    ('Logistic Regression (Scaled)', make_model_pipeline(
        LogisticRegression(solver='liblinear', random_state=42, C=0.01, max_iter=200),
        scale=True,
    )),
    ('K-Nearest Neighbors (Scaled)', make_model_pipeline(
        KNeighborsClassifier(n_neighbors=5),
        scale=True,
    )),

    # B. Tree Models (Need Imputer for safety/consistency with base scikit-learn)
    ('Decision Tree (Clean)', make_model_pipeline(
        DecisionTreeClassifier(random_state=42),
    )),
    ('Random Forest (Clean)', make_model_pipeline(
        RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42, n_jobs=-1),
    )),

    # C. Boosting Models (Imputer kept for consistency, although they handle NaNs)
    ('LightGBM (Base)', make_model_pipeline(
        lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1, n_estimators=500),
    )),
    # The deprecated `use_label_encoder` argument is no longer passed; on
    # current XGBoost releases it is unused and only produces warnings.
    ('XGBoost (Base)', make_model_pipeline(
        xgb.XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1, n_estimators=500),
    )),
]

# 4. Set up Cross-Validation strategy
# NOTE(review): plain KFold does not preserve the class ratio in each fold;
# consider StratifiedKFold if TARGET is imbalanced. Kept as KFold so the
# scores stay comparable with the results already recorded.
NFOLDS = 3 
kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# Parallel lists holding one entry per evaluated model
cv_results = []
model_names = []
training_times = []


# 5. Training Loop with Cross-Validation

print(f"--- Running {NFOLDS}-Fold Cross-Validation for {len(models)} models ---")

for name, model in models:
    start_time = time()

    try:
        # Perform Cross-Validation using ROC AUC (Area Under the Curve)
        # NOTE(review): folds run in parallel (n_jobs=-1) while some estimators
        # also set n_jobs=-1; this may oversubscribe CPU cores — confirm.
        scores = cross_val_score(model, X, y, cv=kfold, scoring='roc_auc', n_jobs=-1)

        end_time = time()
        elapsed = end_time - start_time  # wall-clock time for all folds of this model

        # Append results
        cv_results.append(scores)
        model_names.append(name)
        training_times.append(elapsed)

        # Print interim results
        print("--------------------------------------------------")
        print(f"Model: {name}")
        print(f"Mean AUC: {scores.mean():.4f} (Std Dev: {scores.std():.4f})")
        print(f"Training Time: {elapsed:.2f} seconds")

    except Exception as e:
        # Best-effort benchmark: report the failure and record NaN placeholders
        # so the remaining models are still evaluated and the summary stays aligned.
        print(f"❌ Error encountered for model {name}: {e}")
        cv_results.append([np.nan] * NFOLDS)
        model_names.append(name)
        training_times.append(np.nan)


# 6. Final Summary
# One row per model, best mean AUC first (failed models carry NaN).

results_df = pd.DataFrame({
    'Model': model_names,
    'Mean_AUC': [np.mean(res) for res in cv_results],
    'Std_Dev_AUC': [np.std(res) for res in cv_results],
    'Time_Seconds': training_times
}).sort_values(by='Mean_AUC', ascending=False)

print("\n--- Summary of Model Performance (Ranked by Mean AUC) ---")
print(results_df)

--- Starting Comprehensive Model Cross-Validation on SIMPLIFIED Data ---


--- Running 3-Fold Cross-Validation for 6 models ---
--------------------------------------------------
Model: Logistic Regression (Scaled)
Mean AUC: 0.7459 (Std Dev: 0.0021)
Training Time: 55.63 seconds
--------------------------------------------------
Model: K-Nearest Neighbors (Scaled)
Mean AUC: 0.5748 (Std Dev: 0.0003)
Training Time: 876.44 seconds
--------------------------------------------------
Model: Decision Tree (Clean)
Mean AUC: 0.5395 (Std Dev: 0.0025)
Training Time: 99.57 seconds
--------------------------------------------------
Model: Random Forest (Clean)
Mean AUC: 0.7264 (Std Dev: 0.0018)
Training Time: 111.14 seconds




--------------------------------------------------
Model: LightGBM (Base)
Mean AUC: 0.7586 (Std Dev: 0.0027)
Training Time: 199.39 seconds
--------------------------------------------------
Model: XGBoost (Base)
Mean AUC: 0.7211 (Std Dev: 0.0095)
Training Time: 129.56 seconds

--- Summary of Model Performance (Ranked by Mean AUC) ---
                          Model  Mean_AUC  Std_Dev_AUC  Time_Seconds
4               LightGBM (Base)  0.758578     0.002664    199.387240
0  Logistic Regression (Scaled)  0.745944     0.002082     55.631243
3         Random Forest (Clean)  0.726444     0.001812    111.135025
5                XGBoost (Base)  0.721120     0.009476    129.557973
1  K-Nearest Neighbors (Scaled)  0.574846     0.000316    876.444904
2         Decision Tree (Clean)  0.539471     0.002540     99.566021


LightGBM was the best-performing model, but it is worth noting that Logistic Regression also performed well on the scaled data. LightGBM's running time is roughly four times that of Logistic Regression (about 199 s versus 56 s). However, since LightGBM handles non-linear relationships better and is trained on the data without scaling, I will run a grid search on it to test some hyperparameters and possibly increase its Mean_AUC.