In [1]:
# --- NOTEBOOK 07: THE PREVIOUS PLANET ---

import pandas as pd
import numpy as np
import gc
import logging

# Logging: INFO level and above; each record is prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every cell below.
logger = logging.getLogger(__name__)

class Config:
    """File locations used by this notebook (all paths are relative)."""

    # --- Inputs -----------------------------------------------------------
    # Main table: the parquet file written by Notebook 06.
    MAIN_DATA_PATH = 'train_bureau_merged.parquet'

    # Raw history tables that get aggregated per applicant.
    PREV_APP_PATH = 'data/previous_application.csv'
    INSTALLMENTS_PATH = 'data/installments_payments.csv'
    POS_CASH_PATH = 'data/POS_CASH_balance.csv'
    CREDIT_CARD_PATH = 'data/credit_card_balance.csv'

    # --- Output -----------------------------------------------------------
    # Main table with all history features joined on.
    OUTPUT_PATH = 'train_full_merged.parquet'

# Emit a marker so the cell output shows that imports, logging and Config were set up.
logger.info("Notebook 07 Initialized.")

2026-01-20 12:51:48,764 - INFO - Notebook 07 Initialized.


In [3]:
def process_installments():
    """Build per-applicant features from the installment payment history.

    Reads ``Config.INSTALLMENTS_PATH``, derives two per-installment
    "bad behaviour" measures and aggregates everything by ``SK_ID_CURR``:

    * ``DPD`` - ``DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT`` floored at 0, so
      early or on-time payments count as 0 days late.
    * ``AMT_UNDERPAID`` - ``AMT_INSTALMENT - AMT_PAYMENT`` floored at 0, so
      overpayments count as 0.

    Rows where a measure cannot be computed (NaN inputs) contribute 0 for
    that measure.

    Returns:
        pandas.DataFrame: indexed by ``SK_ID_CURR``; columns are named
        ``INS_<source column>_<AGG>`` plus ``INS_COUNT`` (number of
        installment rows for that applicant).
    """
    logger.info("Loading Installments Payments...")
    # Parse only the columns used below; the rest of the file is never touched.
    needed_cols = [
        'SK_ID_CURR',
        'NUM_INSTALMENT_VERSION',
        'DAYS_INSTALMENT',
        'DAYS_ENTRY_PAYMENT',
        'AMT_INSTALMENT',
        'AMT_PAYMENT',
    ]
    ins = pd.read_csv(Config.INSTALLMENTS_PATH, usecols=needed_cols)

    # 1. Feature Engineering (The "Bad Behavior" Flags)
    # Vectorised floor at zero. fillna(0) keeps the earlier convention that a
    # row with a missing value counts as 0 rather than being skipped.
    days_late = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DPD'] = days_late.clip(lower=0).fillna(0)

    shortfall = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    ins['AMT_UNDERPAID'] = shortfall.clip(lower=0).fillna(0)

    # 2. Aggregations by USER (SK_ID_CURR)
    agg_dict = {
        'DPD': ['max', 'mean', 'sum'],
        'AMT_UNDERPAID': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['mean', 'sum'],
        'NUM_INSTALMENT_VERSION': ['nunique']  # Did they change the loan terms? (Refinance)
    }

    logger.info("Aggregating Installments by User...")
    # Group once and reuse the groupby object for both the stats and the count.
    by_user = ins.groupby('SK_ID_CURR')
    ins_agg = by_user.agg(agg_dict)

    # Flatten the (column, aggregation) MultiIndex into INS_<COLUMN>_<AGG>.
    ins_agg.columns = [f"INS_{c[0]}_{c[1].upper()}" for c in ins_agg.columns]

    # Count of installments (History Length)
    ins_agg['INS_COUNT'] = by_user.size()

    logger.info(f"Installments Aggregated. Shape: {ins_agg.shape}")
    del by_user, ins
    gc.collect()
    return ins_agg

# Execute: read the installments CSV and build the per-applicant aggregate
# (one row per SK_ID_CURR, which is the index of the returned frame).
ins_agg = process_installments()

2026-01-20 12:52:54,030 - INFO - Loading Installments Payments...
2026-01-20 12:53:12,424 - INFO - Aggregating Installments by User...
2026-01-20 12:53:14,755 - INFO - Installments Aggregated. Shape: (339587, 10)


In [4]:
def _aggregate_pos_cash():
    """Aggregate the POS cash balance file into one row per SK_ID_CURR."""
    logger.info("Loading POS Cash Balance...")
    # Only these three columns are used; skip parsing the rest of the file.
    pos = pd.read_csv(
        Config.POS_CASH_PATH,
        usecols=['SK_ID_CURR', 'NAME_CONTRACT_STATUS', 'SK_DPD'],
    )

    # One indicator column per contract status; the per-user mean of an
    # indicator is the share of that user's rows carrying the status.
    status_flags = pd.get_dummies(pos['NAME_CONTRACT_STATUS'], prefix='POS')
    pos = pd.concat([pos, status_flags], axis=1)

    agg_dict = {col: ['mean'] for col in status_flags.columns}
    agg_dict['SK_DPD'] = ['max', 'mean']  # Days Past Due on POS

    logger.info("Aggregating POS by User...")
    by_user = pos.groupby('SK_ID_CURR')
    pos_agg = by_user.agg(agg_dict)
    # Column names are kept exactly as before (the indicator columns therefore
    # carry the prefix twice, e.g. POS_POS_<status>_MEAN).
    pos_agg.columns = [f"POS_{c[0]}_{c[1].upper()}" for c in pos_agg.columns]
    pos_agg['POS_COUNT'] = by_user.size()

    del by_user, pos, status_flags
    gc.collect()
    return pos_agg


def _aggregate_credit_card():
    """Aggregate the credit card balance file into one row per SK_ID_CURR."""
    logger.info("Loading Credit Card Balance...")

    # Key Metrics: Balance, Drawings (Cash withdrawals), Limit usage
    cc_agg_dict = {
        'AMT_BALANCE': ['mean', 'max'],
        'AMT_DRAWINGS_ATM_CURRENT': ['mean', 'sum'],  # Taking cash out is risky!
        'AMT_CREDIT_LIMIT_ACTUAL': ['mean'],
        'SK_DPD': ['max', 'mean']
    }
    # Parse just the key plus the columns that are aggregated.
    cc = pd.read_csv(
        Config.CREDIT_CARD_PATH,
        usecols=['SK_ID_CURR', *cc_agg_dict],
    )

    logger.info("Aggregating Credit Card by User...")
    by_user = cc.groupby('SK_ID_CURR')
    cc_agg = by_user.agg(cc_agg_dict)
    cc_agg.columns = [f"CC_{c[0]}_{c[1].upper()}" for c in cc_agg.columns]
    cc_agg['CC_COUNT'] = by_user.size()

    del by_user, cc
    gc.collect()
    return cc_agg


def process_pos_and_cc():
    """Build per-applicant features from the POS cash and credit card files.

    The POS file (``Config.POS_CASH_PATH``) is processed first and released
    before the credit card file (``Config.CREDIT_CARD_PATH``) is loaded, so
    the two raw tables are never held in memory at the same time.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: ``(pos_agg, cc_agg)``, both
        indexed by ``SK_ID_CURR``. ``pos_agg`` holds the mean of each contract
        status indicator, max/mean ``SK_DPD`` and ``POS_COUNT``; ``cc_agg``
        holds balance, ATM drawing, credit limit and ``SK_DPD`` statistics
        plus ``CC_COUNT``.
    """
    pos_agg = _aggregate_pos_cash()
    cc_agg = _aggregate_credit_card()
    return pos_agg, cc_agg

# Execute: build the POS cash and credit card aggregates
# (two frames, each indexed by SK_ID_CURR).
pos_agg, cc_agg = process_pos_and_cc()

2026-01-20 12:53:24,740 - INFO - Loading POS Cash Balance...
2026-01-20 12:53:30,273 - INFO - Aggregating POS by User...
2026-01-20 12:53:33,303 - INFO - Loading Credit Card Balance...
2026-01-20 12:53:38,663 - INFO - Aggregating Credit Card by User...


In [5]:
def process_previous_applications():
    """Build per-applicant features from the previous applications file.

    Reads ``Config.PREV_APP_PATH`` and aggregates by ``SK_ID_CURR``:

    * min / max / mean of the four amount columns, and
    * the mean of each ``NAME_CONTRACT_STATUS`` indicator, i.e. the share of
      the applicant's previous applications that ended in that status.

    Returns:
        pandas.DataFrame: indexed by ``SK_ID_CURR``; columns are named
        ``PREV_<source column>_<AGG>`` plus ``PREV_APP_COUNT`` (number of
        previous application rows for that applicant).
    """
    num_cols = ['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT']

    logger.info("Loading Previous Applications...")
    # Only the key, the status and the amount columns are used below.
    prev = pd.read_csv(
        Config.PREV_APP_PATH,
        usecols=['SK_ID_CURR', 'NAME_CONTRACT_STATUS', *num_cols],
    )

    # 1. Contract Status (Approved vs Refused)
    prev_cats = pd.get_dummies(prev['NAME_CONTRACT_STATUS'], prefix='PREV')
    prev = pd.concat([prev, prev_cats], axis=1)

    # 2. Aggregations
    agg_dict = {col: ['min', 'max', 'mean'] for col in num_cols}
    # Categorical means (e.g., % of time Refused)
    agg_dict.update({cat: ['mean'] for cat in prev_cats.columns})

    logger.info("Aggregating Previous Apps by User...")
    # Group once and reuse the groupby object for both the stats and the count.
    by_user = prev.groupby('SK_ID_CURR')
    prev_agg = by_user.agg(agg_dict)
    # Column names are kept exactly as before (the status indicators therefore
    # carry the prefix twice, e.g. PREV_PREV_<status>_MEAN).
    prev_agg.columns = [f"PREV_{c[0]}_{c[1].upper()}" for c in prev_agg.columns]

    # Count total previous apps
    prev_agg['PREV_APP_COUNT'] = by_user.size()

    logger.info(f"Previous Apps Aggregated. Shape: {prev_agg.shape}")
    del by_user, prev, prev_cats
    gc.collect()
    return prev_agg

# Execute: build the previous-applications aggregate
# (one row per SK_ID_CURR, which is the index of the returned frame).
prev_agg = process_previous_applications()

2026-01-20 12:53:39,352 - INFO - Loading Previous Applications...
2026-01-20 12:53:45,691 - INFO - Aggregating Previous Apps by User...
2026-01-20 12:53:46,430 - INFO - Previous Apps Aggregated. Shape: (338857, 17)


In [6]:
def join_all_planets(ins_agg, pos_agg, cc_agg, prev_agg):
    """Left-join the four history aggregates onto the main table and save it.

    The main table is read from ``Config.MAIN_DATA_PATH``; each aggregate is
    merged on ``SK_ID_CURR`` with a left join, so every row of the main table
    is kept and applicants without history get NaN in the new columns. The
    result is written to ``Config.OUTPUT_PATH`` (parquet, without the index).

    Args:
        ins_agg: Installment features keyed by ``SK_ID_CURR``.
        pos_agg: POS cash features keyed by ``SK_ID_CURR``.
        cc_agg: Credit card features keyed by ``SK_ID_CURR``.
        prev_agg: Previous application features keyed by ``SK_ID_CURR``.

    Returns:
        pandas.DataFrame: the merged table that was written to disk.

    Raises:
        pandas.errors.MergeError: if any feature frame has more than one row
            for the same ``SK_ID_CURR`` (which would duplicate applicant rows).
    """
    logger.info(f"Loading Main Data from {Config.MAIN_DATA_PATH}...")
    df = pd.read_parquet(Config.MAIN_DATA_PATH)

    original_cols = df.shape[1]

    # Sequential Merges (Left Joins), in the same order as before.
    # validate='many_to_one' makes pandas check that the keys on the right are
    # unique, so a badly aggregated frame fails loudly instead of silently
    # multiplying rows of the main table.
    for features in (ins_agg, pos_agg, cc_agg, prev_agg):
        df = df.merge(features, on='SK_ID_CURR', how='left', validate='many_to_one')

    new_cols = df.shape[1]
    logger.info(f"Final Shape: {df.shape}")
    logger.info(f"Added {new_cols - original_cols} features from Previous History.")

    logger.info(f"Saving to {Config.OUTPUT_PATH}...")
    df.to_parquet(Config.OUTPUT_PATH, index=False)

    return df

# Execute: merge all aggregates onto the main table and write the parquet output.
df_final = join_all_planets(ins_agg, pos_agg, cc_agg, prev_agg)

# Clean up to prevent RAM crash.
# NOTE: after this the four aggregate frames no longer exist in the kernel;
# re-run the cells that build them before calling join_all_planets again.
del ins_agg, pos_agg, cc_agg, prev_agg
# gc.collect() is the last expression, so the cell displays its integer return value.
gc.collect()

2026-01-20 12:53:51,129 - INFO - Loading Main Data from train_bureau_merged.parquet...
2026-01-20 12:53:52,550 - INFO - Final Shape: (307506, 317)
2026-01-20 12:53:52,552 - INFO - Added 47 features from Previous History.
2026-01-20 12:53:52,553 - INFO - Saving to train_full_merged.parquet...


0

In [7]:
# Sanity check: (rows, columns) of the merged frame returned by join_all_planets.
df_final.shape

(307506, 317)