In [4]:
# scikeras not itstalled in the colab environment
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [26]:
# --- 1. CORE LIBRARIES AND UTILITIES ---
# Standard data manipulation libraries and a timing utility
import numpy as np
import pandas as pd
from time import time

# --- 2. SCIKIT-LEARN: MODEL SELECTION & METRICS ---
# Tools for splitting data, cross-validation, and performance evaluation
from sklearn.model_selection import (
    train_test_split, 
    KFold, 
    cross_val_score, 
    GridSearchCV
)
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    roc_auc_score
)
from sklearn.pipeline import Pipeline # For creating robust modeling workflows
from sklearn.impute import SimpleImputer # Tool to handle NaNs
from sklearn.preprocessing import FunctionTransformer, StandardScaler

# --- 3. SCIKIT-LEARN: BASE CLASSIFIERS ---
# Linear Models, Instance-Based, and Probabilistic Models
from sklearn.linear_model import LogisticRegression # Linear Model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # Discriminant Model
from sklearn.naive_bayes import GaussianNB # Probabilistic Model
from sklearn.neighbors import KNeighborsClassifier # Instance-based Model
from sklearn.tree import DecisionTreeClassifier # Tree-based Model

# Note: SVC (Support Vector Machine) is often excluded from rapid exploration 
# due to its high computational cost on large datasets.
from sklearn.svm import SVC 
from sklearn.neural_network import MLPClassifier # Simple Neural Network

# --- 4. SCIKIT-LEARN: ENSEMBLE MODELS ---
# Advanced classifiers for improved performance (often the top performers)
from sklearn.ensemble import (
    AdaBoostClassifier, 
    GradientBoostingClassifier,
    RandomForestClassifier, 
    ExtraTreesClassifier
)
# modern boosting libraries that often outperform Scikit-learn ensembles:
import lightgbm as lgb
import xgboost as xgb


# --- 5. DEEP LEARNING (Keras/TensorFlow) ---
# Note: Using SciKeras for Scikit-learn compatibility is the modern best practice.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout # Added Dropout for robustness
from scikeras.wrappers import KerasClassifier # Modern replacement
from tensorflow.keras.optimizers import SGD, Adam # Adam is usually preferred

In [14]:
import os
import sys

In [15]:
# Root folder of the project on the mounted Google Drive (Colab).
PROJECT_BASE_PATH = '/content/drive/MyDrive/Project_01'

# Folder holding the project's own Python modules.
SRC_PATH = os.path.join(PROJECT_BASE_PATH, 'src')

# Register SRC_PATH at the front of the import search path, but only once,
# so re-running this cell does not add duplicate entries.
if not (SRC_PATH in sys.path):
    sys.path[:0] = [SRC_PATH]
    print("✅ Successfully added 'src' directory to Python path.")

# With 'src' on the path, the project-local config module can be imported.
from config import Paths

✅ Successfully added 'src' directory to Python path.


In [16]:
# Build the project path configuration object (`cfg`) used by the cells below.
try:
    from config import Paths

    # Instantiate the configuration under a distinct name (cfg) and call
    # its create_dirs() method.
    cfg = Paths(PROJECT_BASE_PATH)
    cfg.create_dirs()

    print("\n✅ Project configuration (Paths) initialized successfully.")
    print(f"Raw Data Path check: {cfg.TRAIN_RAW_FILE}")

except ImportError:
    print("❌ Error: Could not import Paths from config module.")
    # Later cells read `cfg` unconditionally; stop here with the real cause
    # instead of letting them fail with a NameError.
    raise


✅ Project configuration (Paths) initialized successfully.
Raw Data Path check: /content/drive/MyDrive/Project_01/data/raw/application_train.csv


In [19]:

# Locations of the encoded train/test tables inside the processed-data folder.
FINAL_TRAIN_FILE = os.path.join(cfg.DATA_PROCESSED_DIR, 'train_final_encoded.parquet')
FINAL_TEST_FILE = os.path.join(cfg.DATA_PROCESSED_DIR, 'test_final_encoded.parquet')

try:
    # Load both Parquet tables into DataFrames.
    df_train_final = pd.read_parquet(FINAL_TRAIN_FILE)
    df_test_final = pd.read_parquet(FINAL_TEST_FILE)
except FileNotFoundError:
    # The data is required by everything downstream: report and propagate.
    print("❌ ERROR: Parquet files not found. Ensure Block 13 was executed successfully and saved the files.")
    raise
else:
    # Report the shapes only when both reads succeeded.
    print(f"✅ Loaded Encoded Train Data. Shape: {df_train_final.shape}")
    print(f"✅ Loaded Encoded Test Data. Shape: {df_test_final.shape}")

✅ Loaded Encoded Train Data. Shape: (307511, 266)
✅ Loaded Encoded Test Data. Shape: (48744, 265)


In [33]:
print("--- Running Data Diagnostics on Features (X) ---")

# 1. Build the feature matrix (X) and the target vector (y).
#    The columns listed in EXCLUDED_COLS are left out of the feature matrix.
EXCLUDED_COLS = ['SK_ID_CURR', 'TARGET', 'TIME_INDEX']
features = [col for col in df_train_final.columns if col not in EXCLUDED_COLS]

y = df_train_final['TARGET']
X = df_train_final[features]

# 2. Look for object/category columns, which would mean encoding was incomplete.
non_numeric_cols = X.select_dtypes(include=['object', 'category']).columns

if len(non_numeric_cols) == 0:
    print("✅ All features appear to be numerical (int/float).")
else:
    print("❌ CRITICAL ERROR: Found non-numerical columns in the feature matrix X.")
    print("   Please fix the following columns in Block 13 (Encoding):")
    print(non_numeric_cols)
    # Abort here so the problem is not discovered later during training.
    raise ValueError("Non-numerical features found. Check Block 13.")

# 3. Count invalid values (NaN / Inf) in the raw features.
#    Informational only: nothing is modified at this point.
total_infs = np.isinf(X).sum().sum()
total_nans = X.isna().sum().sum()

if total_nans == 0 and total_infs == 0:
    print("✅ No NaNs or Infs found in raw features (before pipeline).")
else:
    print(f"⚠️ WARNING: X contains {total_nans} NaN values and {total_infs} Inf values.")
    print("   The Universal Preprocessor Pipeline should handle these.")

# 4. Hand scikit-learn a plain NumPy array instead of a DataFrame.
#    From here on X no longer carries column names.
X = X.values
print("✅ X converted to NumPy array for training.")

# (The KFold setup and the training loop follow in the next cell.)


--- Running Data Diagnostics on Features (X) ---


✅ All features appear to be numerical (int/float).
   The Universal Preprocessor Pipeline should handle these.
✅ X converted to NumPy array for training.


In [34]:
# --- 0. Preprocessing building blocks ---

def _replace_inf_with_nan(X):
    """Return a copy of ``X`` in which +inf and -inf are replaced by NaN.

    NaN values stay NaN, so the imputer that follows handles every invalid
    value in one place. A named function is used instead of a lambda because a
    FunctionTransformer wrapping a lambda cannot be pickled.
    """
    return np.nan_to_num(X, nan=np.nan, posinf=np.nan, neginf=np.nan)


def _make_preprocessor():
    """Build a fresh, unfitted cleanup pipeline: inf -> NaN, then mean imputation."""
    return Pipeline([
        ('inf_handler', FunctionTransformer(_replace_inf_with_nan, validate=False)),
        ('imputer', SimpleImputer(strategy='mean')),
    ])


def _make_model_pipeline(classifier, scale=False):
    """Wrap ``classifier`` in a pipeline with its own preprocessor.

    Parameters
    ----------
    classifier : estimator
        The final estimator of the pipeline (step name ``'classifier'``).
    scale : bool, default False
        When True, a StandardScaler step (``'scaler'``) is inserted between
        the preprocessor and the classifier.

    Returns
    -------
    Pipeline
        An unfitted pipeline that does not share any step instance with other
        pipelines built by this function.
    """
    steps = [('preprocessor', _make_preprocessor())]
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('classifier', classifier))
    return Pipeline(steps)


# Stand-alone instances kept under their original names for any later cell
# that refers to them; the model pipelines below each get their own copy.
universal_preprocessor = _make_preprocessor()
replace_inf = universal_preprocessor.named_steps['inf_handler']


# --- 1. Configuration and Model List ---
models = [
    # A. Linear & distance-based models (need preprocessor + scaler)
    ('Logistic Regression (Scaled)', _make_model_pipeline(
        LogisticRegression(solver='liblinear', random_state=42, C=0.01, max_iter=200),
        scale=True,
    )),
    ('K-Nearest Neighbors (Scaled)', _make_model_pipeline(
        KNeighborsClassifier(n_neighbors=5),
        scale=True,
    )),

    # B. Tree models (preprocessor only, for NaN handling)
    ('Decision Tree (Clean)', _make_model_pipeline(
        DecisionTreeClassifier(random_state=42),
    )),
    ('Random Forest (Clean)', _make_model_pipeline(
        RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42, n_jobs=-1),
    )),

    # C. Boosting models (run through the preprocessor for consistency)
    ('LightGBM (Clean)', _make_model_pipeline(
        lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1, n_estimators=500),
    )),
    # `use_label_encoder` is no longer passed: it is deprecated in current
    # XGBoost releases and only produces an "unused parameter" warning.
    ('XGBoost (Clean)', _make_model_pipeline(
        xgb.XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1, n_estimators=500),
    )),
]

# Cross-validation strategy: shuffled K-fold with a fixed seed so every model
# is scored on the same splits.
NFOLDS = 3
kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# Per-model results, filled in the same order as `models`
cv_results = []
model_names = []
training_times = []


# --- 2. Training Loop with Cross-Validation ---

print("--- Starting Model Cross-Validation ---")

for name, model in models:
    start_time = time()

    try:
        # Score the pipeline with ROC AUC on each fold.
        # NOTE(review): folds run in parallel (n_jobs=-1) while several
        # classifiers also request all cores; consider limiting one of the two
        # levels if the machine becomes oversubscribed.
        scores = cross_val_score(model, X, y, cv=kfold, scoring='roc_auc', n_jobs=-1)
        elapsed = time() - start_time

        cv_results.append(scores)
        model_names.append(name)
        training_times.append(elapsed)

        # Interim report for this model
        print("--------------------------------------------------")
        print(f"Model: {name}")
        print(f"Mean AUC: {scores.mean():.4f} (Std Dev: {scores.std():.4f})")
        print(f"Training Time: {elapsed:.2f} seconds")

    except Exception as e:
        # Best-effort comparison: a failing model is reported and recorded as
        # NaN so the remaining models are still evaluated.
        print(f"❌ Error encountered for model {name}: {e}")
        cv_results.append([np.nan] * NFOLDS)
        model_names.append(name)
        training_times.append(np.nan)


# --- 3. Final Summary ---

# One row per model, best mean AUC first (failed models, with NaN, sort last).
results_df = pd.DataFrame({
    'Model': model_names,
    'Mean_AUC': [np.mean(res) for res in cv_results],
    'Std_Dev_AUC': [np.std(res) for res in cv_results],
    'Time_Seconds': training_times
}).sort_values(by='Mean_AUC', ascending=False)

print("\n--- Summary of Model Performance (Ranked by Mean AUC) ---")
print(results_df)

--- Starting Model Cross-Validation ---


: 