In [1]:
# Install category_encoders if not already installed in the environment
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m85.9/85.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0


In [2]:
import os
import sys
import pandas as pd
import numpy as np
import joblib
import re
from category_encoders import TargetEncoder
from typing import Dict, Any, Optional

In [3]:
# Root of the project on the mounted Google Drive (Colab layout).
PROJECT_BASE_PATH = '/content/drive/MyDrive/Project_01'

# The project's own modules are kept under <root>/src.
SRC_PATH = os.path.join(PROJECT_BASE_PATH, 'src')

# Prepend src/ to the import search path only when it is absent, so that
# re-running this cell does not stack duplicate entries.
if sys.path.count(SRC_PATH) == 0:
    sys.path.insert(0, SRC_PATH)
    print("‚úÖ Successfully added 'src' directory to Python path.")

# Path-configuration class of the project; expected to resolve through the
# src/ entry added above.
from config import Paths

‚úÖ Successfully added 'src' directory to Python path.


In [4]:
try:
    # Imported inside the try so a missing/broken config module produces the
    # readable message below.
    from config import Paths

    # cfg exposes the project's file and directory locations; create_dirs()
    # is called so the directories it manages exist before anything is written.
    cfg = Paths(PROJECT_BASE_PATH)
    cfg.create_dirs()

    print("\n‚úÖ Project configuration (Paths) initialized successfully.")
    print(f"Raw Data Path check: {cfg.TRAIN_RAW_FILE}")

except ImportError:
    print("‚ùå Error: Could not import Paths from config module.")
    # Every later cell reads `cfg`. Stop here with the real cause instead of
    # letting the notebook continue and fail later with a NameError.
    raise


‚úÖ Project configuration (Paths) initialized successfully.
Raw Data Path check: /content/drive/MyDrive/Project_01/data/raw/application_train.csv


In [5]:
# Locations of the processed training data, derived from the project root.
# NOTE(review): the load cell that follows reads cfg.TRAIN_PROCESSED_FILE, not
# these constants - confirm both point at the same file or drop one of them.
TRAIN_PROCESSED_FILE = os.path.join(
    PROJECT_BASE_PATH, 'data', 'processed', 'train_enriched.csv'
)
DATA_DIR_PROCESSED = os.path.dirname(TRAIN_PROCESSED_FILE)

In [6]:
# Read the enriched training set from the location given by the project config.
try:
    df_train_final = pd.read_csv(cfg.TRAIN_PROCESSED_FILE)
except FileNotFoundError:
    print("‚ùå ERROR: Final processed file not found. Check your PROJECT_BASE_PATH.")
    # Downstream feature-engineering checks for None before running.
    df_train_final = None
else:
    # A CSV round-trip stores TIME_INDEX as text; convert it back to a
    # datetime column so the time-based steps below can use it.
    df_train_final['TIME_INDEX'] = pd.to_datetime(df_train_final['TIME_INDEX'])

    print(f"‚úÖ Loaded final training data. Shape: {df_train_final.shape}")

‚úÖ Loaded final training data. Shape: (307511, 125)


In [7]:
# Adds lag / change / rolling-mean columns for the macro indicators and
# MONTH_OF_YEAR / YEAR columns derived from TIME_INDEX.
# The cell rebinds df_train_final, so it is not idempotent: running it twice
# on the same kernel recomputes the features on the already-extended frame.
if df_train_final is not None:
    print("--- Starting Feature Engineering ---")
    
    # --- 1. Create Lagged and Dynamic Macro Features (TRAIN SET) ---
    
    # If a previous (partial) run left TIME_INDEX in the index instead of the
    # columns, move it back so the set_index call below does not raise KeyError.
    if 'TIME_INDEX' not in df_train_final.columns and 'TIME_INDEX' in df_train_final.index.names:
        df_train_final = df_train_final.reset_index(level='TIME_INDEX')
        print("üí° TRAIN: TIME_INDEX restored from index to column.")
    
    # Temporarily set TIME_INDEX as the index for time-series operations
    # (it is restored to a regular column at the end of this cell).
    df_train_final = df_train_final.set_index('TIME_INDEX')
    
    # Macro indicator columns that receive lag/change/rolling features.
    macro_features_to_engineer = ['SELIC', 'IPCA'] 
    
    # NOTE(review): every operation below is computed per SK_ID_CURR group and
    # in the frame's current row order (no sort is applied here). If SK_ID_CURR
    # is unique per row, each group has a single row and all three derived
    # columns are entirely NaN - confirm this is the intended grouping key.
    for col in macro_features_to_engineer:
        # A) Lag: value of the previous row within the same SK_ID_CURR group.
        #    This is "previous month" only if each group holds one row per
        #    month in chronological order - TODO confirm.
        df_train_final[f'{col}_LAG1'] = df_train_final.groupby('SK_ID_CURR')[col].shift(1)

        # B) Change: current value minus the lagged value (NaN where no lag exists).
        df_train_final[f'{col}_CHANGE'] = df_train_final[col] - df_train_final[f'{col}_LAG1']
        
        # C) Trend: rolling mean over up to 3 rows (min_periods=1), then shifted
        #    by one row so the current row's own value is not part of its mean.
        df_train_final[f'{col}_ROLLING_MEAN3'] = df_train_final.groupby('SK_ID_CURR')[col].transform(
            lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
        )
        
    print(f"‚úÖ TRAIN: Created dynamic features: SELIC/IPCA LAGs, Changes, and Rolling Means.")
    # --- 2. Temporal Features from TIME_INDEX (TRAIN SET) ---
    
    # Extract month and year from the datetime index. These are stored as plain
    # integers; no sin/cos (cyclical) encoding is applied despite the message below.
    df_train_final['MONTH_OF_YEAR'] = df_train_final.index.to_series().dt.month
    df_train_final['YEAR'] = df_train_final.index.to_series().dt.year
    
    print("‚úÖ TRAIN: Created cyclical temporal features (MONTH_OF_YEAR, YEAR).")

    # Restore TIME_INDEX as a regular column
    df_train_final = df_train_final.reset_index()

    print(f"\nTraining set shape after Macro Feature Engineering: {df_train_final.shape}")

--- Starting Feature Engineering ---
‚úÖ TRAIN: Created dynamic features: SELIC/IPCA LAGs, Changes, and Rolling Means.
‚úÖ TRAIN: Created cyclical temporal features (MONTH_OF_YEAR, YEAR).

Training set shape after Macro Feature Engineering: (307511, 133)


In [8]:
# 1. Fix the DAYS_EMPLOYED Anomaly
# The extreme positive value 365243 is treated as an anomaly marker; turn it
# into NaN so it does not distort the imputation means computed later.
DAYS_EMPLOYED_ANOMALY = 365243

# Series.replace returns a NEW Series. The result must be assigned back to the
# frame, otherwise the anomaly value is still present in df_train_final.
# Re-running this cell is harmless: once replaced, there is nothing left to match.
df_train_final['DAYS_EMPLOYED'] = df_train_final['DAYS_EMPLOYED'].replace(
    DAYS_EMPLOYED_ANOMALY, np.nan
)

# Last expression of the cell: display the corrected column.
df_train_final['DAYS_EMPLOYED']

Unnamed: 0,DAYS_EMPLOYED
0,-637.0
1,-1188.0
2,-225.0
3,-3039.0
4,-3038.0
...,...
307506,-236.0
307507,
307508,-7921.0
307509,-4786.0


In [9]:
# 2. Core ratio features
# Three element-wise ratios between the credit amount, the annuity and the
# applicant's total income. A zero denominator yields inf/NaN here; infinities
# are converted to NaN when the imputation means are computed further down.
df_train_final = df_train_final.assign(
    CREDIT_INCOME_RATIO=lambda frame: frame['AMT_CREDIT'] / frame['AMT_INCOME_TOTAL'],
    ANNUITY_INCOME_RATIO=lambda frame: frame['AMT_ANNUITY'] / frame['AMT_INCOME_TOTAL'],
    PAYMENT_RATE=lambda frame: frame['AMT_ANNUITY'] / frame['AMT_CREDIT'],
)

print("‚úÖ Created 3 core ratio features.")

‚úÖ Created 3 core ratio features.


In [10]:
# Replace the TIME_INDEX column with a numeric YEAR feature. TIME_INDEX is
# removed whether or not the conversion works, so no datetime/text column is
# left in the frame for the later modelling steps.
if 'TIME_INDEX' in df_train_final.columns:
    try:
        df_train_final['YEAR'] = pd.to_datetime(df_train_final['TIME_INDEX']).dt.year
    except Exception:
        print("‚ùå WARNING: Could not convert TIME_INDEX to datetime. Dropping TIME_INDEX.")
    else:
        print("‚úÖ TIME_INDEX converted to numerical YEAR and original index removed.")

    # Common to both outcomes above.
    df_train_final = df_train_final.drop(columns=['TIME_INDEX'])


print(f"\nTraining set shape after revised feature engineering: {df_train_final.shape}")

‚úÖ TIME_INDEX converted to numerical YEAR and original index removed.

Training set shape after revised feature engineering: (307511, 135)


In [18]:
# Build a two-column table (feature_name, data_type) describing the training
# frame. It is saved as a reference for writing schemas.py for the endpoint.
df_train_schema = (
    df_train_final.dtypes
    .to_frame(name="data_type")
    .reset_index()
    .rename(columns={'index': 'feature_name'})
)

# Written with the default to_csv settings, so the positional row index is
# stored as an unnamed first column of the file.
df_train_schema.to_csv(os.path.join(cfg.REPORT_DIR, "data_train_schema.csv"))

print("‚úÖ DataFrame created successfully and saved at REPORT_DIR as data_train_schema.csv")

‚úÖ DataFrame created successfully and saved at REPORT_DIR as data_train_schema.csv


In [None]:
print("--- Starting: Saving Preprocessing Artifacts ---")

# --- 1. Data Preparation ---
# Identifier and label columns never receive an imputation value.
EXCLUDED_COLS = ['SK_ID_CURR', 'TARGET']
y_train = df_train_final['TARGET']


# --- 2. Artifact 1: Saving Imputation Means Map ---

# Every numeric (int/float) column of the training frame is a candidate.
numerical_cols = df_train_final.select_dtypes(include=[np.number]).columns.tolist()

# Per-column training mean, computed after turning +/-inf into NaN so that
# infinities (e.g. from ratio features) cannot poison the mean.
imputation_maps = {
    col: df_train_final[col].replace([np.inf, -np.inf], np.nan).mean()
    for col in numerical_cols
    if col not in EXCLUDED_COLS
}

# Stored next to the means so inference can look up the DAYS_EMPLOYED anomaly
# value; this key is not a column mean.
imputation_maps['DAYS_EMPLOYED_ANOMALY'] = 365243

imputation_map_path = os.path.join(cfg.MODEL_DIR, 'imputation_means_map.pkl')
joblib.dump(imputation_maps, imputation_map_path)
print(f"‚úÖ Artifact 1 (Imputation Map) saved to: {imputation_map_path}")


# --- 3. Artifact 2: Saving the TargetEncoder Map ---

# All object/category columns are target-encoded.
categorical_features = df_train_final.select_dtypes(include=['object', 'category']).columns.tolist()

# One final encoder fitted on the full training set (no cross-validation).
# It is fitted on the categorical columns ONLY, so at inference it must be
# given exactly those columns.
final_target_encoder = TargetEncoder(cols=categorical_features)
final_target_encoder.fit(df_train_final[categorical_features], y_train)

target_encoder_path = os.path.join(cfg.MODEL_DIR, 'final_target_encoder.pkl')
joblib.dump(final_target_encoder, target_encoder_path)
print(f"‚úÖ Artifact 2 (Target Encoder Map) saved to: {target_encoder_path}")

--- Starting: Saving Preprocessing Artifacts ---
‚úÖ Artifact 1 (Imputation Map) saved to: /content/drive/MyDrive/Project_01/models/imputation_means_map.pkl
‚úÖ Artifact 2 (Target Encoder Map) saved to: /content/drive/MyDrive/Project_01/models/final_target_encoder.pkl


In [None]:

class PredictionHandler:
    """
    Handles loading MLOps artifacts and making consistent predictions
    for inputs in a live environment.

    Artifacts (all loaded with joblib):
      * model            - fitted estimator exposing ``feature_name_`` and
                           ``predict_proba``.
      * imputation_maps  - dict of column name -> training mean, plus the
                           special key ``'DAYS_EMPLOYED_ANOMALY'``.
      * target_encoder   - fitted encoder exposing ``cols`` and ``transform``;
                           it was fitted on the categorical columns only.

    If loading fails, ``self.model`` is ``None`` and ``predict_proba`` returns
    the neutral value 0.5 instead of raising.
    """

    # Anything outside letters, digits and underscore is stripped from names.
    _NAME_PATTERN = re.compile(r'[^A-Za-z0-9_]+')

    # 1. Load All Artifacts in __init__
    def __init__(self, model_path: str, mean_map_path: str, encoder_path: str):
        """Load the model, the imputation map and the target encoder.

        Args:
            model_path: path to the pickled model.
            mean_map_path: path to the pickled imputation map.
            encoder_path: path to the pickled target encoder.
        """
        # Defaults first, so every attribute exists even when loading fails.
        self.model = None
        self.imputation_maps: Dict[str, Any] = {}
        self.target_encoder = None
        self.feature_names: list = []

        try:
            # NOTE(security): joblib.load unpickles its input, which can run
            # arbitrary code - only point these paths at trusted files.
            self.model = joblib.load(model_path)
            self.imputation_maps = joblib.load(mean_map_path)   # saved means/anomaly
            self.target_encoder = joblib.load(encoder_path)     # saved encoder object

            # The list of feature names used during training is CRITICAL for alignment
            self.feature_names = list(self.model.feature_name_)
            print("‚úÖ PredictionHandler fully initialized with all MLOps artifacts.")

        except Exception as e:
            print(f"CRITICAL ERROR: Failed to load MLOps artifacts: {e}")
            # A partially loaded handler must not be used for predictions.
            self.model = None

    def _clean_names(self, df: pd.DataFrame) -> pd.DataFrame:
        """Strip every character except letters, digits and '_' from the
        column names of ``df`` (modified in place) and return ``df``."""
        # str() keeps this safe for non-string labels (e.g. integer columns).
        df.columns = [self._NAME_PATTERN.sub('', str(col)) for col in df.columns]
        return df

    def preprocess(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Apply the preprocessing pipeline and align columns to the model.

        Args:
            input_df: raw input rows; it is copied, never modified.

        Returns:
            A frame whose columns are exactly ``self.feature_names``, in that
            order. Features that cannot be derived from the input are 0.

        Raises:
            KeyError: if AMT_CREDIT, AMT_INCOME_TOTAL or AMT_ANNUITY is missing.
        """
        df = input_df.copy()

        # --- A. (Feature Engineering) ---

        # 1. Anomaly Fix (uses the value saved in the imputation map)
        if 'DAYS_EMPLOYED' in df.columns:
            anomaly_value = self.imputation_maps.get('DAYS_EMPLOYED_ANOMALY', 365243)
            # Assign the result back: replace(..., inplace=True) on a column
            # selection is not guaranteed to write through to the frame.
            df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(anomaly_value, np.nan)
            # NOTE(review): the training cells of this notebook do not take the
            # absolute value of DAYS_EMPLOYED, and the saved imputation mean is
            # computed on the un-abs'd column. Kept to preserve the current
            # model input - confirm against the model-training notebook.
            df['DAYS_EMPLOYED'] = np.abs(df['DAYS_EMPLOYED'])

        # 2. Ratio Features - the same three ratios the training cell builds.
        df['CREDIT_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
        df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
        df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

        # 3. Time features - YEAR and MONTH_OF_YEAR, as in training.
        #    TIME_INDEX is dropped whether or not parsing works (training does
        #    the same), so a raw timestamp never reaches the encoder or model.
        if 'TIME_INDEX' in df.columns:
            try:
                timestamps = pd.to_datetime(df['TIME_INDEX'])
                df['YEAR'] = timestamps.dt.year
                df['MONTH_OF_YEAR'] = timestamps.dt.month
            except (ValueError, TypeError) as exc:
                print(f"WARNING: Could not convert TIME_INDEX to datetime: {exc}")
            df = df.drop(columns=['TIME_INDEX'])

        # --- B. (Target Encoding) ---

        # The saved encoder was fitted on the categorical columns only, so it
        # must receive exactly those columns (a categorical column absent from
        # the request is passed as missing). The encoded values replace the
        # raw categories under the same column names and are KEPT: they are
        # the features, not leftovers.
        encoder_cols = list(self.target_encoder.cols)
        if encoder_cols:
            encoded = self.target_encoder.transform(df.reindex(columns=encoder_cols))
            for col in encoder_cols:
                df[col] = encoded[col].to_numpy()

        # --- C. Final Cleaning and Alignment ---

        # 1. Clean Feature Names
        df = self._clean_names(df)

        # 2. Handle NaN/Inf using the means saved from the TRAINING data
        df = df.replace([np.inf, -np.inf], np.nan)
        for raw_name, mean_val in self.imputation_maps.items():
            if raw_name == 'DAYS_EMPLOYED_ANOMALY':
                continue  # configuration entry, not a column mean
            # Columns were just name-cleaned, so look the key up the same way.
            col = self._NAME_PATTERN.sub('', str(raw_name))
            if col in df.columns:
                df[col] = df[col].fillna(mean_val)

        # 3. Align Columns (CRITICAL MLOps Step)
        # Keep only the model's features, in training order. Features that are
        # not available from the request (e.g. lag/rolling features that need
        # history) are added with the constant 0.
        return df.reindex(columns=self.feature_names, fill_value=0)

    def predict_proba(self, raw_input_data: Dict[str, Any]) -> float:
        """
        Receives raw input data (e.g., from a JSON API request) and returns the
        probability of default (Target=1).

        Args:
            raw_input_data: one record as a mapping of field name -> value.

        Returns:
            The positive-class probability as a Python float, or 0.5 when the
            model failed to load.
        """
        if self.model is None:
            # Best-effort fallback kept for callers that rely on it.
            # NOTE(review): callers cannot tell this apart from a real score.
            return 0.5

        # One-row frame, preprocessed and aligned to the training columns.
        processed_df = self.preprocess(pd.DataFrame([raw_input_data]))

        # Row 0, column 1 = probability of the positive class.
        return float(self.model.predict_proba(processed_df)[0][1])