In [1]:
# Install category_encoders if not already installed in the environment
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m85.9/85.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0


In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from sklearn.preprocessing import LabelEncoder

# Define project Path in Colab
PROJECT_BASE_PATH = '/content/drive/MyDrive/Project_01' 

# ADD 'src' DIRECTORY TO PYTHON PATH
SRC_PATH = os.path.join(PROJECT_BASE_PATH, 'src')

# verify if SRC_PATH is already in sys.path
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
    print("‚úÖ Successfully added 'src' directory to Python path.")

# IMPORT Paths CLASS FROM config MODULE
from config import Paths

‚úÖ Successfully added 'src' directory to Python path.


In [3]:
try:
    from config import Paths
    
    # 3. Inicialize a inst√¢ncia com um nome √∫nico (cfg)
    cfg = Paths(PROJECT_BASE_PATH) # <-- Mudan√ßa aqui
    cfg.create_dirs() 
    
    print("\n‚úÖ Project configuration (Paths) initialized successfully.")
    print(f"Raw Data Path check: {cfg.TRAIN_RAW_FILE}")
    
except ImportError:
    print("‚ùå Error: Could not import Paths from config module.")


‚úÖ Project configuration (Paths) initialized successfully.
Raw Data Path check: /content/drive/MyDrive/Project_01/data/raw/application_train.csv


In [4]:
DATA_DIR_PROCESSED = os.path.join(PROJECT_BASE_PATH, 'data', 'processed')
TRAIN_PROCESSED_FILE = os.path.join(DATA_DIR_PROCESSED, 'train_enriched.csv')
TEST_PROCESSED_FILE = os.path.join(DATA_DIR_PROCESSED, 'test_enriched.csv')

In [5]:
try:
    # --- LOAD TRAIN DATA USING CONFIG PATHS ---
    df_train_final = pd.read_csv(cfg.TRAIN_PROCESSED_FILE)
    
    # ESSENTIAL FIX: Ensures TIME_INDEX is of temporal type after loading from CSV
    df_train_final['TIME_INDEX'] = pd.to_datetime(df_train_final['TIME_INDEX'])
    
    print(f"‚úÖ Loaded final training data. Shape: {df_train_final.shape}")

    # --- LOAD TEST DATA USING CONFIG PATHS ---
    df_test_final = pd.read_csv(cfg.TEST_PROCESSED_FILE)
    
    # Apply the same temporal conversion to the Test Set
    df_test_final['TIME_INDEX'] = pd.to_datetime(df_test_final['TIME_INDEX'])
    
    print(f"‚úÖ Loaded final testing data. Shape: {df_test_final.shape}")
except FileNotFoundError:
    print(f"‚ùå ERROR: Final processed file not found. Check if Block 9 was run and files exist at: {cfg.DATA_PROCESSED_DIR}")
    df_train_final = None
    df_test_final = None
if df_train_final is not None and df_test_final is not None:
    print("--- Starting Feature Engineering ---")
    
    # --- 1. Create Lagged and Dynamic Macro Features (TRAIN SET) ---
    
    # KeyError/SetIndex Fix: Ensures TIME_INDEX is a regular column before setting
    if 'TIME_INDEX' not in df_train_final.columns and 'TIME_INDEX' in df_train_final.index.names:
        df_train_final = df_train_final.reset_index(level='TIME_INDEX')
        print("üí° TRAIN: TIME_INDEX restored from index to column.")
    
    # Temporarily set TIME_INDEX as the index for time-series operations
    df_train_final = df_train_final.set_index('TIME_INDEX')
    
    # Use the correct column names
    macro_features_to_engineer = ['SELIC', 'IPCA'] 
    
    for col in macro_features_to_engineer:
        # A) Lag (Previous Month's Value)
        df_train_final[f'{col}_LAG1'] = df_train_final.groupby('SK_ID_CURR')[col].shift(1)

        # B) Change (Current Month - Previous Month)
        df_train_final[f'{col}_CHANGE'] = df_train_final[col] - df_train_final[f'{col}_LAG1']
        
        # C) Trend (3-Month Rolling Mean)
        df_train_final[f'{col}_ROLLING_MEAN3'] = df_train_final.groupby('SK_ID_CURR')[col].transform(
            lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
        )
        
    print(f"‚úÖ TRAIN: Created dynamic features: SELIC/IPCA LAGs, Changes, and Rolling Means.")

    # --- 2. Temporal Features from TIME_INDEX (TRAIN SET) ---
    
    # Extract month and year using the already defined index
    df_train_final['MONTH_OF_YEAR'] = df_train_final.index.to_series().dt.month
    df_train_final['YEAR'] = df_train_final.index.to_series().dt.year
    
    print("‚úÖ TRAIN: Created cyclical temporal features (MONTH_OF_YEAR, YEAR).")

    # Restore TIME_INDEX as a regular column
    df_train_final = df_train_final.reset_index()

    print(f"\nTraining set shape after Macro Feature Engineering: {df_train_final.shape}")
    
    
    # --------------------------------------------------------------------------------
    # --- REPEAT FOR TEST SET ---
    # --------------------------------------------------------------------------------
    
    # KeyError/SetIndex Fix for the Test Set
    if 'TIME_INDEX' not in df_test_final.columns and 'TIME_INDEX' in df_test_final.index.names:
        df_test_final = df_test_final.reset_index(level='TIME_INDEX')
        print("üí° TEST: TIME_INDEX restored from index to column.")
    
    # Temporarily set TIME_INDEX as the index for time-series operations
    df_test_final = df_test_final.set_index('TIME_INDEX')
    
    for col in macro_features_to_engineer:
        df_test_final[f'{col}_LAG1'] = df_test_final.groupby('SK_ID_CURR')[col].shift(1)
        df_test_final[f'{col}_CHANGE'] = df_test_final[col] - df_test_final[f'{col}_LAG1']
        df_test_final[f'{col}_ROLLING_MEAN3'] = df_test_final.groupby('SK_ID_CURR')[col].transform(
            lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
        )
        
    print(f"‚úÖ TEST: Created dynamic features: SELIC/IPCA LAGs, Changes, and Rolling Means.")

    # Temporal Features (TEST SET)
    df_test_final['MONTH_OF_YEAR'] = df_test_final.index.to_series().dt.month
    df_test_final['YEAR'] = df_test_final.index.to_series().dt.year
    
    print("‚úÖ TEST: Created cyclical temporal features (MONTH_OF_YEAR, YEAR).")

    # Restore TIME_INDEX as a regular column
    df_test_final = df_test_final.reset_index()

    print(f"\nTesting set shape after Macro Feature Engineering: {df_test_final.shape}")

‚úÖ Loaded final training data. Shape: (307511, 125)
‚úÖ Loaded final testing data. Shape: (48744, 124)
--- Starting Feature Engineering ---
‚úÖ TRAIN: Created dynamic features: SELIC/IPCA LAGs, Changes, and Rolling Means.
‚úÖ TRAIN: Created cyclical temporal features (MONTH_OF_YEAR, YEAR).

Training set shape after Macro Feature Engineering: (307511, 133)
‚úÖ TEST: Created dynamic features: SELIC/IPCA LAGs, Changes, and Rolling Means.
‚úÖ TEST: Created cyclical temporal features (MONTH_OF_YEAR, YEAR).

Testing set shape after Macro Feature Engineering: (48744, 132)


In [6]:
print("--- Starting Micro Feature Engineering ---")

# Treat DAYS_EMPLOYED anomaly (365243 days ‚âà 1000 years) as missing (NaN) and flag it

#------------------------------------------------------------
# --- IMPORTANT DATA CLEANING NOTE: DAYS_EMPLOYED ANOMALY ---
# The value 365243 in DAYS_EMPLOYED is a known data anomaly specific to the Home Credit dataset.
# It represents approximately 1000 years, and is used by the client/bank to code for
# applicants who are currently **unemployed** or whose employment status is unverified.
#
# Best Practice Treatment:
# 1. Replace the anomalous value (365243) with **NaN** to treat it as a missing value.
# 2. Create a new **binary feature** (DAYS_EMPLOYED_ANOM) to allow the model (like LightGBM)
#    to explicitly learn the predictive power of this specific 'unemployed/anomaly' group.
# This prevents the model from interpreting 365243 as a literal, extremely long employment history.
# -----------------------------------------------------------
# --- 1. Anomaly Treatment and Transformation ---

# 1. Fix the DAYS_EMPLOYED Anomaly
# Replace the extreme positive value with NaN for proper imputation later.
DAYS_EMPLOYED_ANOMALY = 365243 
df_train_final['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_ANOMALY: np.nan})
df_test_final['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_ANOMALY: np.nan})
print("‚úÖ DAYS_EMPLOYED anomaly fixed and replaced with NaN.")

# 2. Create Simple Ratio Features (Crucial for risk assessment)
# These are highly predictive and do not increase dimensionality.
df_train_final['CREDIT_INCOME_RATIO'] = df_train_final['AMT_CREDIT'] / df_train_final['AMT_INCOME_TOTAL']
df_test_final['CREDIT_INCOME_RATIO'] = df_test_final['AMT_CREDIT'] / df_test_final['AMT_INCOME_TOTAL']

df_train_final['ANNUITY_INCOME_RATIO'] = df_train_final['AMT_ANNUITY'] / df_train_final['AMT_INCOME_TOTAL']
df_test_final['ANNUITY_INCOME_RATIO'] = df_test_final['AMT_ANNUITY'] / df_test_final['AMT_INCOME_TOTAL']

df_train_final['PAYMENT_RATE'] = df_train_final['AMT_ANNUITY'] / df_train_final['AMT_CREDIT']
df_test_final['PAYMENT_RATE'] = df_test_final['AMT_ANNUITY'] / df_test_final['AMT_CREDIT']

print("‚úÖ Created 3 core ratio features.")

# Handle TIME_INDEX

if 'TIME_INDEX' in df_train_final.columns:
    # Convert 'YYYY-MM' string to YYYY * 12 + MM, or simply the Year (simple numerical feature)
    try:
        df_train_final['YEAR'] = pd.to_datetime(df_train_final['TIME_INDEX']).dt.year
        df_test_final['YEAR'] = pd.to_datetime(df_test_final['TIME_INDEX']).dt.year
        
        # Drop the original string TIME_INDEX to prevent errors in Block 13
        df_train_final = df_train_final.drop(columns=['TIME_INDEX'])
        df_test_final = df_test_final.drop(columns=['TIME_INDEX'])
        
        print("‚úÖ TIME_INDEX converted to numerical YEAR and original index removed.")
    except Exception:
        print("‚ùå WARNING: Could not convert TIME_INDEX to datetime. Dropping TIME_INDEX.")
        df_train_final = df_train_final.drop(columns=['TIME_INDEX'])
        df_test_final = df_test_final.drop(columns=['TIME_INDEX'])


print(f"\nTraining set shape after revised feature engineering: {df_train_final.shape}")

--- Starting Micro Feature Engineering ---
‚úÖ DAYS_EMPLOYED anomaly fixed and replaced with NaN.
‚úÖ Created 3 core ratio features.


‚úÖ TIME_INDEX converted to numerical YEAR and original index removed.

Training set shape after revised feature engineering: (307511, 135)


In [7]:
# Target Encoding

from category_encoders import TargetEncoder
from sklearn.model_selection import KFold
import re
import joblib

In [8]:
# Target Encoding and Final Cleanup

print("--- Starting: K-Fold Target Encoding (Leakage-safe) ---")

# 1. Identify categorical features (raw, before encoding)
categorical_features = df_train_final.select_dtypes(include=['object', 'category']).columns.tolist()
TARGET_COLUMN = 'TARGET'

print(f"Number of categorical features: {len(categorical_features)}")
print(f"Categorical features: {categorical_features}")

# Optional: save categorical features list as an artifact
categorical_features_df = pd.DataFrame(categorical_features, columns=['cat_feat'])
categorical_features_path = os.path.join(cfg.REPORT_DIR, 'categorical_features.csv')
categorical_features_df.to_csv(categorical_features_path, index=False)
print(f"‚úÖ Categorical features list saved to: {categorical_features_path}")

# 2. Out-of-Fold Target Encoding for the TRAIN set
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# DataFrame to hold OOF encoded values (same index as df_train_final)
oof_encoded = pd.DataFrame(index=df_train_final.index, columns=categorical_features, dtype=float)

for fold, (train_idx, val_idx) in enumerate(kf.split(df_train_final), start=1):
    print(f"  -> Fitting TargetEncoder on fold {fold}/{N_FOLDS}...")
    
    te_fold = TargetEncoder(cols=categorical_features, smoothing=0.3)
    te_fold.fit(
        df_train_final.loc[train_idx, categorical_features],
        df_train_final.loc[train_idx, TARGET_COLUMN]
    )
    
    # Transform validation fold and store only encoded columns
    encoded_val = te_fold.transform(df_train_final.loc[val_idx, categorical_features])
    # Keep only the encoded categorical columns
    encoded_val = encoded_val[categorical_features]
    
    oof_encoded.iloc[val_idx] = encoded_val.values

# 3. Fit a FINAL TargetEncoder on the FULL TRAIN set (for TEST + INFERENCE)
print("\n--- Fitting final TargetEncoder on the full training set (for test + API) ---")
final_target_encoder = TargetEncoder(cols=categorical_features, smoothing=0.3)
final_target_encoder.fit(
    df_train_final[categorical_features],
    df_train_final[TARGET_COLUMN]
)

# Handle any remaining NaNs in OOF encoded data (e.g., if some rows were not covered properly)
encoded_full_train = final_target_encoder.transform(df_train_final[categorical_features])[categorical_features]
oof_encoded = oof_encoded.fillna(encoded_full_train)

# Rename encoded columns with a clear suffix
oof_encoded.columns = [f"{col}_TARGET_ENC" for col in categorical_features]

print(f"‚úÖ OOF Target Encoding completed. Encoded TRAIN shape: {oof_encoded.shape}")

# 4. Apply TargetEncoder to the TEST set using the final encoder
encoded_test = final_target_encoder.transform(df_test_final[categorical_features])[categorical_features]
encoded_test.columns = [f"{col}_TARGET_ENC" for col in categorical_features]

print(f"‚úÖ TEST Target Encoding completed. Encoded TEST shape: {encoded_test.shape}")

# 5. Drop raw categorical features and append encoded ones (TRAIN and TEST)
df_train_final = pd.concat(
    [
        df_train_final.drop(columns=categorical_features, errors='ignore'),
        oof_encoded
    ],
    axis=1
)

df_test_final = pd.concat(
    [
        df_test_final.drop(columns=categorical_features, errors='ignore'),
        encoded_test
    ],
    axis=1
)

print(f"\nShapes after adding encoded features and dropping raw categoricals:")
print(f"  > Train: {df_train_final.shape}")
print(f"  > Test:  {df_test_final.shape}")

# 6. Clean column names (remove special characters) ‚Äì same function used before
def clean_names(df: pd.DataFrame) -> pd.DataFrame:
    cols = df.columns
    new_cols = []
    for col in cols:
        new_col = re.sub(r'[^A-Za-z0-9_]+', '', col)
        new_cols.append(new_col)
    df.columns = new_cols
    return df

df_train_final = clean_names(df_train_final)
df_test_final = clean_names(df_test_final)

print(f"\n‚úÖ Target Encoding and name cleaning complete. Final Feature Count:")
print(f"  > Train Shape: {df_train_final.shape}")
print(f"  > Test Shape:  {df_test_final.shape}")

# 7. Save FINAL TargetEncoder as MLOps artifact (used later by the API)
encoder_path = os.path.join(cfg.MODEL_DIR, 'final_target_encoder.pkl')
joblib.dump(final_target_encoder, encoder_path)
print(f"‚úÖ MLOps Artifact: Final TargetEncoder saved to: {encoder_path}")

--- Starting: K-Fold Target Encoding (Leakage-safe) ---
Number of categorical features: 16
Categorical features: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']


‚úÖ Categorical features list saved to: /content/drive/MyDrive/Project_01/reports/categorical_features.csv
  -> Fitting TargetEncoder on fold 1/5...
  -> Fitting TargetEncoder on fold 2/5...
  -> Fitting TargetEncoder on fold 3/5...
  -> Fitting TargetEncoder on fold 4/5...
  -> Fitting TargetEncoder on fold 5/5...

--- Fitting final TargetEncoder on the full training set (for test + API) ---
‚úÖ OOF Target Encoding completed. Encoded TRAIN shape: (307511, 16)
‚úÖ TEST Target Encoding completed. Encoded TEST shape: (48744, 16)

Shapes after adding encoded features and dropping raw categoricals:
  > Train: (307511, 135)
  > Test:  (48744, 134)

‚úÖ Target Encoding and name cleaning complete. Final Feature Count:
  > Train Shape: (307511, 135)
  > Test Shape:  (48744, 134)
‚úÖ MLOps Artifact: Final TargetEncoder saved to: /content/drive/MyDrive/Project_01/models/final_target_encoder.pkl


In [9]:
print("\n--- Saving Final Processed DataFrames (Parquet Format) ---")

# Define the save paths using the 'cfg' instance (assuming cfg.DATA_PROCESSED_DIR exists)
FINAL_TRAIN_FILE = os.path.join(cfg.DATA_PROCESSED_DIR, 'train_final_encoded.parquet')
FINAL_TEST_FILE = os.path.join(cfg.DATA_PROCESSED_DIR, 'test_final_encoded.parquet')


# 1. Save the Training DataFrame
# Use index=False as 'TIME_INDEX' should now be a regular column after Block 13
df_train_final.to_parquet(FINAL_TRAIN_FILE, index=False)
print(f"‚úÖ Final Train set saved to: {FINAL_TRAIN_FILE}")

# 2. Save the Test DataFrame
df_test_final.to_parquet(FINAL_TEST_FILE, index=False)
print(f"‚úÖ Final Test set saved to: {FINAL_TEST_FILE}")


--- Saving Final Processed DataFrames (Parquet Format) ---
‚úÖ Final Train set saved to: /content/drive/MyDrive/Project_01/data/processed/train_final_encoded.parquet
‚úÖ Final Test set saved to: /content/drive/MyDrive/Project_01/data/processed/test_final_encoded.parquet
