In [1]:
import os
import gc 
from glob import glob
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import lightgbm
import torch
import torch.nn as nn

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool
import xgboost
from collections import Counter

import shap
import pickle
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
ROOT            = Path("/kaggle/input/home-credit-risk-model-train-test")
SCHEMA_PATH            = Path("/kaggle/input/schema-home-credit-risk-data")

In [3]:
def set_data_types(df, schema):
    
    for col, dtype in schema.items():
        if dtype == 'category':
            dtype = 'object'
        df[col] = df[col].astype(dtype, errors = 'ignore')
    return df

In [4]:
X = pl.read_csv(ROOT/'model_abt_train.csv').to_pandas()
X_test = pl.read_csv(ROOT/'model_abt_test.csv').to_pandas()

schema = pd.read_csv(SCHEMA_PATH / "data_schema.csv", names = ['Columns', 'dtype'])
schema = schema[schema['Columns'].isin(X.columns)]
schema = schema.set_index('Columns')['dtype'].to_dict()

X = set_data_types(X, schema)
X_test = set_data_types(X_test, schema)
del schema

In [5]:
id_val = X[['case_id', 'WEEK_NUM']]
id_test_val = X_test[['case_id', 'WEEK_NUM']]

y = X['target']
y_test = X_test['target']

cols = X.drop(['case_id', 'WEEK_NUM', 'target', 'decision_month', 'decision_weekday'], axis = 1).columns

X = X.drop(['case_id', 'WEEK_NUM', 'target', 'decision_month', 'decision_weekday'], axis = 1)
X_test = X_test.drop(['case_id', 'WEEK_NUM', 'target', 'decision_month', 'decision_weekday'], axis = 1)

In [6]:
cat_cols = [x for x, dtypes in X.dtypes.items() if dtypes == 'O']
num_cols = [x for x, dtypes in X.dtypes.items() if dtypes in ['float', 'int']]
X[cat_cols] = X[cat_cols].astype(str)
X_test[cat_cols] = X_test[cat_cols].astype(str)

In [7]:
with open('cat_cols.pkl', 'wb') as f:
    pickle.dump(cat_cols, f)

In [8]:
class Model_Utils:
    @staticmethod
    def model_evals(y_true, y_proba, cutoff = 0.5):
        from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, average_precision_score, roc_auc_score
        """
        Returns model evaluation metrics for a binary classification model

        Parameters:
        -----------
            y_true: int (0,1) 
                Actual binary labels

            y_proba: float (between 0 and 1)
                Probability scores output of model 

        Returns:
        --------
            result: dict
                Dictionary of metrics and their results based on the input
                    - event rate (% predicted 1's)
                    - accuracy
                    - roc_auc
                    - pr_auc
                    - recall
                    - precision
                    - f1
                    - lift
        """

        y_pred = (y_proba > cutoff).astype(int)

        event_rate = y_pred.mean()

        accuracy = accuracy_score(y_true, y_pred)

        roc_auc = roc_auc_score(y_true, y_proba)

        pr_auc = average_precision_score(y_true, y_proba)

        recall = recall_score(y_true, y_pred)

        precision = precision_score(y_true, y_pred)

        f1 = f1_score(y_true, y_pred)

        lift = recall / event_rate

        return {'event_rate': event_rate,
                'acc': accuracy, 
                'precision': precision, 
                'recall': recall, 
                'f1': f1, 
                'roc_auc': roc_auc,
                'pr_auc': pr_auc, 
                'lift': lift}
    
    @staticmethod
    def cutoff_perc(y_actual, y_proba, percent):
        n_cutoff = int(len(y_actual) * percent)

        scores = pd.DataFrame({'actual': y_actual, 'proba': y_proba})
        scores = scores.sort_values(by = 'proba', ascending= False)
        scores['rank'] = scores['proba'].rank(ascending = False)
        scores['ranked_pred'] = np.where(scores['rank']<= n_cutoff, 1, 0)

        print(len(scores[scores['ranked_pred'] == 1]))
        print(scores[scores['ranked_pred'] == 1]['proba'].min())
        return Model_Utils.model_evals(y_actual, y_proba, cutoff = scores[scores['ranked_pred'] == 1]['proba'].min())
    
    @staticmethod
    def DumbClassifier(y_true):
        import numpy as np
        y_pred = np.zeros(len(y_true))
        return y_pred
    
    @staticmethod
    def RandomChanceClassifier(y_true):
        import numpy as np
        y_pred = Model_Utils.DumbClassifier(y_true)
        event = y_true.sum()
        ind = np.random.randint(0, len(y_true), size=event)
        y_pred[ind] = 1
        return y_pred
    
    @staticmethod
    def save_model_results(results, schema, filepath):
        if not os.path.exists(filepath):
            score_schema = {'model_name': 'str', 
                        'model': 'str',
                        'params': 'str',
                        'acc': 'float',
                        'precision': 'float',
                        'recall': 'float',
                        'f1_score': 'float',
                        'roc_auc': 'float',
                        'pr_auc': 'float',
                        'lift': 'float'}
            scores = pd.DataFrame(columns = score_schema.keys()).astype(score_schema)
        else:
            scores = pd.read_csv(filepath)
        row = []
        
        for key in schema.keys():
            row.append(results[key])
        
        scores.loc[len(scores)] = row
        scores.to_csv(filepath, index = False)
        
    @staticmethod
    def LightGBMClassifier_CV(X, y, cat_cols, cv = 5, group = None, params = None):
        import lightgbm
        from sklearn.model_selection import StratifiedGroupKFold
        
        cv = StratifiedGroupKFold(n_splits=cv, shuffle = True, random_state = 42)
        
        if params == None:
            params = {'random_state': 42
                     ,'objective': 'binary'
                     ,'verbose': -1
                     ,'n_jobs': -1}
        
        scores = {'acc': [],
                        'precision': [],
                        'recall': [],
                        'f1': [],
                        'roc_auc': [],
                        'pr_auc': [],
                        'lift': []}

        split = 1
        
        for train_ind, valid_ind in cv.split(X, y, groups=group):
            X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
            X_valid, y_valid = X.iloc[valid_ind], y.iloc[valid_ind]

            X_train[cat_cols] = X_train[cat_cols].astype("category")
            X_valid[cat_cols] = X_valid[cat_cols].astype("category")
            
            lgb = lightgbm.LGBMClassifier(**params)
            lgb.fit(X_train, y_train)

            y_proba = lgb.predict_proba(X_valid)[:, 1]
            results = Model_Utils.model_evals(y_valid, y_proba)
            print(f'LightGBM, Val CV{split}: {results}"')
            
            split += 1
            
            for key in results.keys():
                if key in scores.keys():
                    scores[key].append(results[key])
        
        for key in results.keys():
            if key in scores.keys():
                scores[key] = np.array(scores[key]).mean()
            
        print(f'LightGBM, Average CV: {scores}"')
        return lgb, scores

    
    @staticmethod
    def LightGBMClassifier_pred(X, y, X_test, y_test, cat_cols, params = None):
        import lightgbm
        
        if params == None:
            params = {'random_state': 42
                     ,'objective': 'binary'
                     ,'verbose': -1
                     ,'n_jobs': -1}
        
        scores = {'acc': [],
                        'precision': [],
                        'recall': [],
                        'f1': [],
                        'roc_auc': [],
                        'pr_auc': [],
                        'lift': []}

        
        X[cat_cols] = X[cat_cols].astype("category")
        X_test[cat_cols] = X_test[cat_cols].astype("category")
            
        lgb = lightgbm.LGBMClassifier(**params)
        lgb.fit(X, y)

        y_proba = lgb.predict_proba(X_test)[:, 1]
        results = Model_Utils.model_evals(y_test, y_proba)
        print(f'LightGBM, Test: {results}"')
        
        return lgb, y_proba, results
    
    
    @staticmethod
    def CatBoostClassifier_CV(X, y, cat_cols, cv = 5, group = None, params = None):
        import catboost
        from sklearn.model_selection import StratifiedGroupKFold
        
        cv = StratifiedGroupKFold(n_splits=cv, shuffle = True, random_state = 42)
        
        if params == None:
             params = {'boosting_type' : "Plain",
                      'eval_metric': 'PRAUC',
                      'random_seed': 42,
                      'learning_rate': 0.05,
                      'use_best_model': True,
                      'iterations': 1000}
        
        scores = {'acc': [],
                        'precision': [],
                        'recall': [],
                        'f1': [],
                        'roc_auc': [],
                        'pr_auc': [],
                        'lift': []}

        split = 1
        
        for train_ind, valid_ind in cv.split(X, y, groups=group):
            X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
            X_valid, y_valid = X.iloc[valid_ind], y.iloc[valid_ind]
            
            
            clf = CatBoostClassifier(**params)
        
            train_pool = Pool(X_train, y_train, cat_features=cat_cols)
            val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)
            
            clf.fit(train_pool, eval_set=val_pool, verbose=False)
            
            y_proba = clf.predict_proba(X_valid)[:, 1]
            results = Model_Utils.model_evals(y_valid, y_proba)
            print(f'CatBoost, Val CV{split}: {results}"')
            
            split += 1
            
            for key in results.keys():
                if key in scores.keys():
                    scores[key].append(results[key])
        
        for key in results.keys():
            if key in scores.keys():
                scores[key] = np.array(scores[key]).mean()
            
        print(f'CatBoost, Average CV: {scores}"')
        return clf, scores
    
    
    @staticmethod
    def CatBoostClassifier_pred(X, y, X_test, y_test, cat_cols, params = None):
        import catboost
    
        if params == None:
             params = {'boosting_type' : "Plain",
                      'eval_metric': 'PRAUC',
                      'random_seed': 42,
                      'learning_rate': 0.05,
                      'use_best_model': True,
                      'iterations': 1000}
                
        clf = CatBoostClassifier(**params)
        
        train_pool = Pool(X, y, cat_features=cat_cols)
        test_pool = Pool(X_test, y_test, cat_features=cat_cols)
            
        clf.fit(train_pool, eval_set=test_pool, verbose=False)
            
        y_proba = clf.predict_proba(X_test)[:, 1]
        results = Model_Utils.model_evals(y_test, y_proba)
        print(f'CatBoost, Test: {results}"')
            
        return clf, y_proba, results

    
    @staticmethod
    def train_test_split(X, y, test_size = 0.2):
        from sklearn.model_selection import train_test_split
        import polars as pl
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        pl.from_pandas(pd.concat([y_train, X_train], axis = 1)).write_csv('model_abt_train.csv')
        del X_train
        del y_train
        gc.collect()
        
        pl.from_pandas(pd.concat([y_test, X_test], axis = 1)).write_csv('model_abt_test.csv')
        del X_test
        del y_test
        gc.collect()

In [9]:
params_lgb = {'colsample_bytree': 0.8091363920278533, 
              'gamma': 1.6740930883959255, 
              'learning_rate': 0.0816628382148218, 
              'max_depth': 7, 'min_child_weight': 1.7065771300111194, 
              'n_estimators': 250, 'random_state': 42, 
              'reg_alpha': 7.114616964626139, 'reg_lambda': 8.853463059493436, 
              'subsample': 0.5494975675591438, 'verbose': -1}


params_catboost = {'bagging_temperature': 3.0469773840224983,
                   'boosting_type': 'Ordered', 
                   'border_count': 96, 
                   'colsample_bylevel': 0.5008191566781925, 
                   'depth': 5, 'iterations': 250, 
                   'l2_leaf_reg': 8.0, 
                   'learning_rate': 0.19362752313765308, 
                   'max_ctr_complexity': 5.0, 
                   'one_hot_max_size': 50, 
                   'random_state': 42, 
                   'verbose': -1}


In [10]:
X.shape

(1221327, 450)

In [11]:
X_test.shape

(305332, 450)

In [12]:
#CatBoost
clf, y_pred, cat_scores = Model_Utils.CatBoostClassifier_pred(X, y, X_test, y_test, cat_cols, params_catboost)


CatBoost, Test: {'event_rate': 0.0011757693265036092, 'acc': 0.9687356713348093, 'precision': 0.5097493036211699, 'recall': 0.019156285983460693, 'f1': 0.03692493946731235, 'roc_auc': 0.8518577582368849, 'pr_auc': 0.18576100417804817, 'lift': 16.292554629253537}"


In [13]:
lgb, lgb_pred, lgb_scores = Model_Utils.LightGBMClassifier_pred(X, y, X_test, y_test, cat_cols, params_lgb)

LightGBM, Test: {'event_rate': 0.000985812165118625, 'acc': 0.9687553220756423, 'precision': 0.521594684385382, 'recall': 0.01643462786559196, 'f1': 0.03186523239293688, 'roc_auc': 0.8561347087184998, 'pr_auc': 0.18964341686977695, 'lift': 16.6711554666343}"


In [14]:
mean_vote = [np.mean(k) for k in zip(lgb_pred, y_pred)]
max_vote = [np.max(k) for k in zip(lgb_pred, y_pred)]

In [15]:
Model_Utils.model_evals(y_test, pd.Series(mean_vote))

{'event_rate': 0.0009530609303970759,
 'acc': 0.9687684225695309,
 'precision': 0.5292096219931272,
 'recall': 0.01612059039045326,
 'f1': 0.0312880942706217,
 'roc_auc': 0.8564343500178961,
 'pr_auc': 0.19210358190432258,
 'lift': 16.914543316487542}

In [16]:
Model_Utils.model_evals(y_test, pd.Series(max_vote))

{'event_rate': 0.001506556797191254,
 'acc': 0.9686734439888384,
 'precision': 0.48695652173913045,
 'recall': 0.023448131477022925,
 'f1': 0.044741835613702194,
 'roc_auc': 0.8555645074387427,
 'pr_auc': 0.19049408734010242,
 'lift': 15.564054087266008}

In [17]:
Model_Utils.cutoff_perc(y_test, lgb_pred, 0.1)

30533
0.08083977608976073


{'event_rate': 0.09999606985183342,
 'acc': 0.9004722728046848,
 'precision': 0.1587842263854317,
 'recall': 0.507484559824139,
 'f1': 0.24188599226643387,
 'roc_auc': 0.8561347087184998,
 'pr_auc': 0.18964341686977695,
 'lift': 5.075045055031573}

In [18]:
with open('catboost_champ.joblib', 'wb') as f:
    joblib.dump(clf, f)
    
with open('lgb_champ.joblib', 'wb') as f:
    joblib.dump(lgb, f)

In [19]:
X.shape

(1221327, 450)

In [20]:
X_test.shape

(305332, 450)

In [21]:
X = pd.concat([X, X_test], ignore_index=True)
y = pd.concat([y, y_test], ignore_index = True)

In [22]:
del X_test
del y_test
gc.collect()

0

In [23]:
X.shape

(1526659, 450)

In [24]:
X[cat_cols] = X[cat_cols].astype("category")
lgbf = lightgbm.LGBMClassifier(**params_lgb)
lgbf.fit(X, y)

with open('lgb_champ_full.joblib', 'wb') as f:
    joblib.dump(lgbf, f)

In [25]:
clff = CatBoostClassifier(**params_catboost)
train_pool = Pool(X, y, cat_features=cat_cols)
clff.fit(train_pool, verbose=False)
            
with open('catboost_champ_full.joblib', 'wb') as f:
    joblib.dump(clff, f)

In [26]:
cat_imp = pd.DataFrame(clf.feature_names_)
cat_imp['Importance'] = clf.feature_importances_
cat_imp['perc_imp'] = cat_imp['Importance']/cat_imp['Importance'].sum()
cat_imp = cat_imp.sort_values(by = 'Importance', ascending = False)
cat_imp.head(20)

Unnamed: 0,0,Importance,perc_imp
410,first_birth_259D,3.252309,0.032523
408,first_sex_738L,2.72409,0.027241
127,pmtnum_254L,2.571376,0.025714
259,max_numberofcontrsvalue_358L,2.404486,0.024045
101,mobilephncnt_593L,2.227366,0.022274
119,numrejects9m_859L,2.193117,0.021931
18,requesttype_4525192L,2.186818,0.021868
407,first_incometype_1044T,2.04779,0.020478
437,mean_depth2_pmts_overdue_1140A,2.039812,0.020398
281,max_totalamount_6A,1.953187,0.019532


In [27]:
lgb_imp = pd.DataFrame(lgb.feature_name_)
lgb_imp['Importance'] = lgb.feature_importances_
lgb_imp['perc_imp'] = lgb_imp['Importance']/lgb_imp['Importance'].sum()
lgb_imp = lgb_imp.sort_values(by = 'Importance', ascending = False)
lgb_imp.head(20)

Unnamed: 0,0,Importance,perc_imp
131,price_1097A,168,0.0224
410,first_birth_259D,133,0.017733
25,annuity_780A,128,0.017067
291,min_dateofcredstart_739D,127,0.016933
127,pmtnum_254L,123,0.0164
278,max_residualamount_856A,110,0.014667
66,eir_270L,94,0.012533
17,pmtssum_45A,93,0.0124
326,min_refreshdate_3813885D,85,0.011333
247,max_dateofcredstart_739D,85,0.011333


In [28]:
cat_imp.to_csv('CatBoost_Feature_Imp.csv', index = False)
lgb_imp.to_csv('LightGBM_Feature_Imp.csv', index = False)

In [29]:
catf_imp = pd.DataFrame(clff.feature_names_)
catf_imp['Importance'] = clff.feature_importances_
catf_imp['perc_imp'] = catf_imp['Importance']/catf_imp['Importance'].sum()
catf_imp = catf_imp.sort_values(by = 'Importance', ascending = False)
catf_imp.head(20)

Unnamed: 0,0,Importance,perc_imp
345,mean_numberofoverdueinstlmax_1151L,2.995362,0.029954
410,first_birth_259D,2.706137,0.027061
408,first_sex_738L,2.691655,0.026917
127,pmtnum_254L,2.362397,0.023624
407,first_incometype_1044T,2.361663,0.023617
259,max_numberofcontrsvalue_358L,2.063867,0.020639
101,mobilephncnt_593L,2.038168,0.020382
18,requesttype_4525192L,1.991101,0.019911
291,min_dateofcredstart_739D,1.972814,0.019728
119,numrejects9m_859L,1.956379,0.019564


In [30]:
lgbf_imp = pd.DataFrame(lgbf.feature_name_)
lgbf_imp['Importance'] = lgbf.feature_importances_
lgbf_imp['perc_imp'] = lgbf_imp['Importance']/lgbf_imp['Importance'].sum()
lgbf_imp = lgbf_imp.sort_values(by = 'Importance', ascending = False)
lgbf_imp.head(20)

Unnamed: 0,0,Importance,perc_imp
131,price_1097A,161,0.021467
410,first_birth_259D,140,0.018667
127,pmtnum_254L,133,0.017733
25,annuity_780A,129,0.0172
291,min_dateofcredstart_739D,117,0.0156
278,max_residualamount_856A,110,0.014667
66,eir_270L,81,0.0108
214,mean_outstandingdebt_522A,81,0.0108
17,pmtssum_45A,80,0.010667
55,cntpmts24_3658933L,80,0.010667
