In [1]:
# --- NOTEBOOK 04: FEATURE ENGINEERING ---

import pandas as pd
import numpy as np
import logging
import gc
from sklearn.preprocessing import LabelEncoder

# Logging: send INFO-and-above records to the root handler with a
# "timestamp - level - message" layout, then create the module-level `logger`
# that the engineering classes and the pipeline function below write to.
# (logging.basicConfig does nothing if the root logger already has handlers.)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class Config:
    """Central settings for this notebook: file locations and stage switches."""

    # --- I/O ---
    # Source table: the clean-but-raw output of NB 01.
    INPUT_PATH: str = 'clean_train.parquet'
    # Destination for the engineered feature table.
    OUTPUT_PATH: str = 'engineered_data.parquet'

    # --- Feature groups ---
    # Each switch turns one stage of the pipeline on or off.
    CREATE_RATIOS: bool = True
    AGGREGATE_EXT_SOURCES: bool = True
    ENCODE_CATEGORICALS: bool = True

# Emit a marker record so the cell output shows that setup (imports, logging,
# Config) ran to completion.
logger.info("Notebook 04 Initialized.")

2026-01-20 07:42:31,525 - INFO - Notebook 04 Initialized.


In [2]:
class DomainEngineer:
    """
    Creates financial ratios used in credit scoring.

    Every feature is an element-wise division of two existing columns. A zero
    denominator makes pandas return +/-inf rather than raise, and infinities
    are rejected by some downstream estimators, so all ratios go through
    `_safe_ratio`, which stores those rows as NaN (missing) instead.
    """

    @staticmethod
    def _safe_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
        """Return numerator / denominator with +/-inf results replaced by NaN."""
        return (numerator / denominator).replace([np.inf, -np.inf], np.nan)

    @staticmethod
    def create_financial_features(df: pd.DataFrame) -> pd.DataFrame:
        """
        Append the NEW_* ratio columns to `df`.

        Args:
            df: Frame holding AMT_CREDIT, AMT_ANNUITY, AMT_INCOME_TOTAL,
                DAYS_EMPLOYED and DAYS_BIRTH. AMT_GOODS_PRICE is optional;
                its ratio is only built when the column is present.

        Returns:
            The same DataFrame object that was passed in (it is modified in
            place), now carrying the additional ratio columns.

        Raises:
            KeyError: if one of the required columns is absent.
        """
        logger.info("Engineering Financial Ratios...")
        ratio = DomainEngineer._safe_ratio

        # 1. CREDIT TERM (How long to pay back?)
        # Credit / Annuity = number of annuity payments needed to repay the
        # principal if interest were 0.
        df['NEW_CREDIT_TO_ANNUITY_RATIO'] = ratio(df['AMT_CREDIT'], df['AMT_ANNUITY'])

        # 2. CREDIT POWER (Loan vs Income)
        # Did you borrow 10x your salary?
        df['NEW_CREDIT_TO_INCOME_RATIO'] = ratio(df['AMT_CREDIT'], df['AMT_INCOME_TOTAL'])

        # 3. DEBT BURDEN (Annuity vs Income)
        # What share of income goes to the loan payment?
        df['NEW_ANNUITY_TO_INCOME_RATIO'] = ratio(df['AMT_ANNUITY'], df['AMT_INCOME_TOTAL'])

        # 4. GOODS VALUATION (Loan vs Goods Price)
        # Values above 1 mean the loan exceeds the price of the goods
        # (e.g. insurance/fees rolled in).
        # Note: AMT_GOODS_PRICE was dropped in NB 02 (Linear), so the column is
        # treated as optional here.
        if 'AMT_GOODS_PRICE' in df.columns:
            df['NEW_CREDIT_TO_GOODS_RATIO'] = ratio(df['AMT_CREDIT'], df['AMT_GOODS_PRICE'])

        # 5. EMPLOYMENT STABILITY
        # What fraction of the applicant's life has been spent employed?
        # NOTE(review): assumes DAYS_EMPLOYED and DAYS_BIRTH are both negative
        # day offsets; a positive placeholder in DAYS_EMPLOYED would yield a
        # negative ratio -- confirm NB 01 removes such values.
        df['NEW_EMPLOYED_TO_BIRTH_RATIO'] = ratio(df['DAYS_EMPLOYED'], df['DAYS_BIRTH'])

        return df

In [3]:
class PolymathEngineer:
    """
    Collapses a family of related columns into per-row summary statistics.
    """
    @staticmethod
    def aggregate_ext_sources(df: pd.DataFrame) -> pd.DataFrame:
        """
        Add row-wise mean, std and min of the three EXT_SOURCE_* columns.

        The statistics use the pandas defaults: missing scores are skipped,
        and the standard deviation is the sample one, so a row with fewer
        than two available scores gets NaN for the std.

        Args:
            df: Frame containing EXT_SOURCE_1, EXT_SOURCE_2 and EXT_SOURCE_3.

        Returns:
            The same DataFrame object, modified in place, with
            NEW_EXT_SOURCES_MEAN / _STD / _MIN appended in that order.
        """
        logger.info("Aggregating External Sources...")

        # Slice the score columns once; all three statistics read from it.
        scores = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]

        # Average level across the available scores.
        df['NEW_EXT_SOURCES_MEAN'] = scores.mean(axis=1)
        # Spread: a high value means the sources disagree about the applicant.
        df['NEW_EXT_SOURCES_STD'] = scores.std(axis=1)
        # Worst case: the lowest of the available scores.
        df['NEW_EXT_SOURCES_MIN'] = scores.min(axis=1)

        return df

In [4]:
class CategoricalEngineer:
    """
    Handles Categorical Encoding for Tree Models (Label Encoding).
    """
    @staticmethod
    def label_encode(df: pd.DataFrame) -> pd.DataFrame:
        """
        Replace every object/category column with integer codes stored as a
        pandas `category` column.

        Missing values (NaN or None) are mapped to the explicit level
        'MISSING' before encoding, so they form one category of their own.
        A code is the position of the value in the lexicographically sorted
        list of the column's distinct strings -- the same numbering that
        LabelEncoder.fit_transform produces for string input.

        Note: the code -> label mapping is neither returned nor stored, so the
        encoding cannot be re-applied to another dataset from this function
        alone.

        Args:
            df: Frame to encode. Numeric columns (flags, integers, floats) are
                not touched.

        Returns:
            The same DataFrame object, modified in place.
        """
        logger.info("Label Encoding Categoricals...")

        # Strictly text-like columns only; 0/1 flags and integers stay as they are.
        obj_cols = df.select_dtypes(include=['object', 'category']).columns

        for col in obj_cols:
            # Fill BEFORE casting to str: astype(str) turns NaN into the string
            # 'nan' (and None into 'None'), after which fillna has nothing left
            # to fill. Going through object dtype first also lets categorical
            # columns accept 'MISSING', which is not one of their categories.
            values = df[col].astype(object)
            values = values.where(values.notna(), 'MISSING').astype(str)

            # Sorted integer codes; no value is missing at this point, so no
            # -1 sentinel can appear.
            codes, _ = pd.factorize(values, sort=True)

            # Store as category dtype so LightGBM treats the codes as nominal.
            df[col] = pd.Categorical(codes)

        return df

In [5]:
def run_feature_foundry():
    """
    Run the feature-engineering pipeline end to end.

    Reads the table at Config.INPUT_PATH, applies each stage whose Config
    switch is on (financial ratios, EXT_SOURCE aggregates, categorical
    encoding -- in that order), writes the result to Config.OUTPUT_PATH
    without the index, and returns the engineered DataFrame.
    """
    # Start from the NB 01 output rather than the VIF-reduced table from NB 02:
    # tree models can cope with redundant columns.
    logger.info(f"Loading {Config.INPUT_PATH}...")
    df = pd.read_parquet(Config.INPUT_PATH)

    # (switch, transform) pairs, applied top to bottom.
    stages = (
        (Config.CREATE_RATIOS, DomainEngineer.create_financial_features),
        (Config.AGGREGATE_EXT_SOURCES, PolymathEngineer.aggregate_ext_sources),
        (Config.ENCODE_CATEGORICALS, CategoricalEngineer.label_encode),
    )
    for enabled, transform in stages:
        if not enabled:
            continue
        df = transform(df)

    # Persist the engineered table.
    logger.info(f"Saving Engineered Data to {Config.OUTPUT_PATH}...")
    df.to_parquet(Config.OUTPUT_PATH, index=False)

    logger.info(f"Final Shape: {df.shape}")
    return df

# Execute: run the whole pipeline (load -> enabled stages -> save) and keep the
# returned frame in `df_engineered`.
df_engineered = run_feature_foundry()

2026-01-20 07:43:37,432 - INFO - Loading clean_train.parquet...
2026-01-20 07:43:37,973 - INFO - Engineering Financial Ratios...
2026-01-20 07:43:37,994 - INFO - Aggregating External Sources...
2026-01-20 07:43:38,217 - INFO - Label Encoding Categoricals...
2026-01-20 07:43:40,210 - INFO - Saving Engineered Data to engineered_data.parquet...
2026-01-20 07:43:41,850 - INFO - Final Shape: (307506, 131)
