In [None]:
import os
import gc
from glob import glob
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import lightgbm as lgb
import torch
import torch.nn as nn

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool
import xgboost
from collections import Counter

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Input locations: ROOT holds the modelling table, SCHEMA_PATH holds the column/dtype schema file.
ROOT = Path("/kaggle/input") / "home-credit-risk-dataset-corr-dropped-top-2"
SCHEMA_PATH = Path("/kaggle/input") / "schema-home-credit-risk-data"

In [None]:
# Parse the CSV with polars, then convert to pandas, which the rest of the notebook works with.
df = pl.read_csv(ROOT.joinpath('model_abt_pl_corr2_drop.csv')).to_pandas()
df.shape

In [None]:
# Show the first five rows of the loaded table.
df.head(n=5)

In [None]:
def set_data_types(df, schema):
    """
    Cast the columns of ``df`` to the dtypes listed in ``schema``.

    Parameters:
    -----------
        df: pandas.DataFrame
            Frame whose columns are converted; it is modified in place.

        schema: dict
            Maps column name -> dtype. A 'category' entry is cast to 'object'
            rather than to a pandas categorical.

    Returns:
    --------
        df: pandas.DataFrame
            The same frame object that was passed in.
    """
    for column in schema:
        wanted = schema[column]
        df[column] = df[column].astype('object' if wanted == 'category' else wanted)
    return df
    

In [None]:
class Model_Utils:
    """
    Created by: Julie Anne Co, 2024

    Helper functions for modeling LightGBM & CatBoost
    Model evaluation functions
    """

    @staticmethod
    def model_evals(y_true, y_proba, cutoff = 0.5):
        """
        Returns model evaluation metrics for a binary classification model

        Parameters:
        -----------
            y_true: int (0,1)
                Actual binary labels

            y_proba: float (between 0 and 1)
                Probability scores output of model

            cutoff: float, default 0.5
                Scores strictly greater than this value are predicted as 1

        Returns:
        --------
            result: dict
                Dictionary of metrics and their results based on the input
                    - event_rate (% predicted 1's)
                    - acc
                    - roc_auc
                    - pr_auc
                    - recall
                    - precision
                    - f1
                    - lift (recall / event_rate; NaN when nothing is predicted 1)
        """
        from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                                     f1_score, average_precision_score, roc_auc_score)

        # np.asarray lets plain lists be thresholded the same way as arrays.
        y_pred = (np.asarray(y_proba) > cutoff).astype(int)

        event_rate = y_pred.mean()

        accuracy = accuracy_score(y_true, y_pred)
        roc_auc = roc_auc_score(y_true, y_proba)
        pr_auc = average_precision_score(y_true, y_proba)

        # zero_division=0 returns the same 0.0 as the default but without a
        # warning when there are no predicted (or no actual) positives.
        recall = recall_score(y_true, y_pred, zero_division=0)
        precision = precision_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)

        # Lift is undefined when no row is predicted positive (e.g. the
        # all-zeros baseline); report NaN explicitly instead of dividing by zero.
        lift = recall / event_rate if event_rate > 0 else float('nan')

        return {'event_rate': event_rate,
                'acc': accuracy,
                'roc_auc': roc_auc,
                'pr_auc': pr_auc,
                'recall': recall,
                'precision': precision,
                'f1': f1,
                'lift': lift}

    @staticmethod
    def DumbClassifier(y_true):
        """
        Predict 0 (majority class) for all

        Returns a float numpy array of zeros with one entry per element of y_true.
        """
        return np.zeros(len(y_true))

    @staticmethod
    def RandomChanceClassifier(y_true, random_state = 42):
        """
        Predict random 1 based on event rate

        Exactly ``y_true.sum()`` positions, drawn without replacement, are set
        to 1; every other position stays 0.

        Parameters:
        -----------
            y_true: int (0,1)
                Actual binary labels

            random_state: int or None, default 42
                Seed for the random draw. Pass None for a different draw on
                every call.

        Returns:
        --------
            y_pred: numpy array of 0.0 / 1.0, same length as y_true
        """
        n_samples = len(y_true)
        # Clamp so labels that are not strictly 0/1 cannot request more
        # positions than exist.
        n_events = min(max(int(np.asarray(y_true).sum()), 0), n_samples)

        y_pred = Model_Utils.DumbClassifier(y_true)
        rng = np.random.default_rng(random_state)
        # replace=False: sampling with replacement can pick the same index
        # twice, which would produce fewer positives than the event count.
        positives = rng.choice(n_samples, size=n_events, replace=False)
        y_pred[positives] = 1
        return y_pred

    @staticmethod
    def save_model_results(results, schema, filepath):
        """
        Save Model Results in csv

        Appends one row to the csv at ``filepath``, creating the file when it
        does not exist yet.

        Parameters:
        -----------
            results: dict
                Must contain every key of ``schema``; extra keys are ignored.

            schema: dict
                Its keys, in order, define the columns of a newly created file
                and the order of the values in the appended row.

            filepath: str or path
                csv file to create or append to.

        Raises:
        -------
            KeyError when ``results`` lacks a key listed in ``schema``.
        """
        if not os.path.exists(filepath):
            # Take the columns from the caller's schema so the header always
            # matches the row that is built from that same schema below.
            scores = pd.DataFrame(columns = list(schema.keys()))
        else:
            scores = pd.read_csv(filepath)

        row = [results[key] for key in schema.keys()]

        # Positional append: values go in schema order into the next row.
        scores.loc[len(scores)] = row
        scores.to_csv(filepath, index = False)

    @staticmethod
    def LightGBMClassifier_CV(X, y, cat_cols, cv = 5, group = None, params = None):
        """
        K-Fold average performance of a LightGBM estimator

        Parameters:
        -----------
            X: pandas.DataFrame
                Feature matrix

            y: pandas.Series
                Binary target, positionally aligned with X

            cat_cols: list of str
                Columns of X cast to pandas 'category' before fitting

            cv: int, default 5
                Number of folds

            group: array-like or None, default None
                Group labels used to keep a group inside a single fold.
                When None, a plain stratified K-fold is used instead.

            params: dict or None, default None
                Keyword arguments for lightgbm.LGBMClassifier. When None a
                default binary-objective configuration is used.

        Returns:
        --------
            scores: dict
                'params' plus the mean over folds of acc, precision, recall,
                f1, roc_auc, pr_auc and lift. 'f1_score' holds the same value
                as 'f1' and is kept for backward compatibility.
        """
        import lightgbm
        from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold

        # A group-aware splitter only makes sense with real group labels, so
        # fall back to plain stratification when none are given.
        if group is None:
            splitter = StratifiedKFold(n_splits=cv, shuffle = True, random_state = 42)
        else:
            splitter = StratifiedGroupKFold(n_splits=cv, shuffle = True, random_state = 42)

        if params is None:
            params = {'random_state': 42
                     ,'objective': 'binary'
                     ,'verbose': -1
                     ,'n_jobs': -1}

        # Keys must match what model_evals returns (it reports 'f1', not 'f1_score').
        metric_keys = ['acc', 'precision', 'recall', 'f1', 'roc_auc', 'pr_auc', 'lift']
        fold_scores = {key: [] for key in metric_keys}
        category_cast = {col: "category" for col in cat_cols}

        for split, (train_ind, valid_ind) in enumerate(splitter.split(X, y, groups=group), start=1):
            # astype returns new frames, so the caller's X is never modified
            # and no assignment is made on a slice of it.
            X_train = X.iloc[train_ind].astype(category_cast)
            X_valid = X.iloc[valid_ind].astype(category_cast)
            y_train, y_valid = y.iloc[train_ind], y.iloc[valid_ind]

            model = lightgbm.LGBMClassifier(**params)
            model.fit(X_train, y_train)
            print(model.get_params(deep=True))

            y_proba = model.predict_proba(X_valid)[:, 1]
            results = Model_Utils.model_evals(y_valid, y_proba)
            print(f'LightGBM, Val CV{split}: {results}')

            for key in metric_keys:
                fold_scores[key].append(results[key])

        scores = {'params': params}
        for key in metric_keys:
            scores[key] = np.array(fold_scores[key]).mean()
        # Alias under the name this function historically exposed.
        scores['f1_score'] = scores['f1']

        return scores

    @staticmethod
    def train_test_split(X, y, test_size = 0.2):
        """
        Split data into train-test

        Performs a random split (random_state=42), prints the shapes of both
        parts and writes the test part (target column first, then features) to
        'model_abt_test.csv' in the working directory. The train part is not
        written and nothing is returned.

        Parameters:
        -----------
            X: pandas.DataFrame
                Feature matrix

            y: pandas.Series
                Target, aligned with X

            test_size: float, default 0.2
                Passed through to sklearn's train_test_split
        """
        from sklearn.model_selection import train_test_split as sk_train_test_split
        import polars as pl

        X_train, X_test, y_train, y_test = sk_train_test_split(X, y, test_size=test_size, random_state=42)

        print(f"X_train: {X_train.shape}")
        print(f"y_train: {y_train.shape}")
        # Release the train part before the test frame is concatenated and
        # written, to keep peak memory down.
        del X_train
        del y_train
        gc.collect()

        print(f"X_test: {X_test.shape}")
        print(f"y_test: {y_test.shape}")
        pl.from_pandas(pd.concat([y_test, X_test], axis = 1)).write_csv('model_abt_test.csv')
        del X_test
        del y_test
        gc.collect()

In [None]:
# Read the headerless (column name, dtype) file, keep only the columns present in df,
# and collapse it into a {column: dtype} dict.
schema = (
    pd.read_csv(SCHEMA_PATH / "data_schema.csv", names = ['Columns', 'dtype'])
    .loc[lambda table: table['Columns'].isin(df.columns)]
    .set_index('Columns')['dtype']
    .to_dict()
)
# Display the distinct dtype labels that occur in the schema.
set(schema.values())

In [None]:
# Identifier, time and target columns that must not be used as model features.
non_feat = ['case_id', 'WEEK_NUM', 'decision_month', 'decision_weekday', 'target']
# Remaining schema columns, split by their declared dtype.
num_cols = [name for name, kind in schema.items()
            if kind in ('int64', 'float64', 'int8') and name not in non_feat]
cat_cols = [name for name, kind in schema.items()
            if kind == 'category' and name not in non_feat]

In [None]:
# Cast every schema column of df to its declared dtype.
df = set_data_types(df=df, schema=schema)

In [None]:
# Keep the identifier/time/target columns separately from the model inputs.
id_val = df.loc[:, ['case_id', 'WEEK_NUM', 'target', 'decision_month', 'decision_weekday']]
# Features are ordered numeric first, then categorical.
X = df.loc[:, num_cols + cat_cols]
y = df.loc[:, 'target']
# The full table is no longer needed once these three pieces exist.
del df

#### Split Data into Train & Test
We hold out 20% of the data as a test set and tune our model on the remaining 80%.

In [None]:
# Prints the split shapes and writes the holdout rows to model_abt_test.csv; the call returns nothing.
Model_Utils.train_test_split(X=X, y=y)

In [None]:
# Read the holdout file back and show its first five rows.
pl.read_csv('model_abt_test.csv').head(n=5)

##### We define two baselines: a Random Chance Classifier (predicts positives at random, at the observed event rate) and a Dumb Classifier (always predicts the majority class). Any optimized model should beat both.


In [None]:
# Column name -> dtype label for the saved score tables: three text columns
# describing the model, followed by the float metric columns.
score_schema = {
    **dict.fromkeys(('model_name', 'model', 'params'), 'str'),
    **dict.fromkeys(('acc', 'precision', 'recall', 'f1', 'roc_auc', 'pr_auc', 'lift'), 'float'),
}

##### Dumb Classifier


In [None]:
# Score the all-zeros prediction against every label in y.
results = Model_Utils.model_evals(y, Model_Utils.DumbClassifier(y))
print(results)
# Attach the descriptive fields that score_schema expects, then append the row to baseline.csv.
results.update({'model_name': 'Dumb Classifier',
                'model': 'Predict All Major',
                'params': None})
Model_Utils.save_model_results(results, score_schema, "baseline.csv")
del results
gc.collect()

##### Random Chance Classifier

In [None]:
# Score the randomly assigned positives against every label in y.
results = Model_Utils.model_evals(y, Model_Utils.RandomChanceClassifier(y))
print(results)
# Attach the descriptive fields that score_schema expects, then append the row to baseline.csv.
results.update({'model_name': 'Random Chance Classifier',
                'model': 'Randomly Predict at Event Rate',
                'params': None})
Model_Utils.save_model_results(results, score_schema, "baseline.csv")
del results
gc.collect()
