In [None]:
import os
import gc 
from glob import glob
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import lightgbm
import torch
import torch.nn as nn

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
import scikitplot as skplt
from scikitplot.metrics import plot_cumulative_gain
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool
import xgboost
from collections import Counter

import shap
import pickle
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Kaggle input locations used by this notebook.
# ROOT        : train/test modelling tables and the pickled categorical column list.
# SCHEMA_PATH : CSV mapping column names to dtypes (read as data_schema.csv below).
# RFE         : persisted RFE model artefacts loaded with joblib.
# PRED        : prediction export read as Test_predictions.csv.
# NOTE(review): MODEL and PCA are not referenced by any cell visible in this notebook.
ROOT            = Path("/kaggle/input/home-credit-risk-model-train-test")
SCHEMA_PATH     = Path("/kaggle/input/schema-home-credit-risk-data")
MODEL           = Path("/kaggle/input/homecredit-risk-model-stability/other/catboost_lgb_champ/3")
RFE             = Path("/kaggle/input/home-credit-rfe-models/other/rfe_models/1")
PRED            = Path("/kaggle/input/03-model-evaluation")
PCA             = Path("/kaggle/input/05-pca-num-cols")


In [None]:
def set_data_types(df, schema):
    """Cast the columns of ``df`` to the dtypes listed in ``schema``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to retype. It is modified in place and also returned.
    schema : dict
        Mapping of column name -> dtype string. The dtype ``'category'`` is
        applied as ``'object'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the listed columns recast.

    Notes
    -----
    Schema entries whose column is absent from ``df`` are skipped, so a schema
    built from one frame can be applied to another frame that lacks some of
    its columns. Casts that fail are left unchanged (``errors='ignore'``).
    """
    for col, dtype in schema.items():
        if col not in df.columns:
            # The schema may describe columns this particular frame does not carry.
            continue
        target_dtype = 'object' if dtype == 'category' else dtype
        df[col] = df[col].astype(target_dtype, errors = 'ignore')
    return df

In [None]:
# Read both modelling tables with polars, then hand them to pandas.
X = pl.read_csv(ROOT / 'model_abt_train.csv').to_pandas()
X_test = pl.read_csv(ROOT / 'model_abt_test.csv').to_pandas()

# Column -> dtype lookup, restricted to columns present in the training table.
schema = (
    pd.read_csv(SCHEMA_PATH / "data_schema.csv", names = ['Columns', 'dtype'])
    .loc[lambda frame: frame['Columns'].isin(X.columns)]
    .set_index('Columns')['dtype']
    .to_dict()
)

X = set_data_types(X, schema)
X_test = set_data_types(X_test, schema)
del schema

In [None]:
# Columns that are identifiers, the label, or calendar bookkeeping: not features.
non_feature_cols = ['case_id', 'WEEK_NUM', 'target', 'decision_month', 'decision_weekday']

# Keep the identifiers and labels before they are removed from the feature frames.
id_val = X[['case_id', 'WEEK_NUM']]
id_test_val = X_test[['case_id', 'WEEK_NUM']]

y = X['target']
y_test = X_test['target']

X = X.drop(columns = non_feature_cols)
X_test = X_test.drop(columns = non_feature_cols)

# Feature names that remain in the training frame.
cols = X.columns

In [None]:
# Load the list of categorical column names stored next to the data.
# NOTE(security): pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open(ROOT/'cat_cols.pkl', 'rb') as f:
    cat_cols = pickle.load(f)
# Numeric columns are selected by comparing each column's dtype with 'float' / 'int',
# and are computed before the categorical columns are cast to str below.
# NOTE(review): narrower numeric dtypes (e.g. float32) may not match this comparison — confirm.
num_cols = [x for x, dtypes in X.dtypes.items() if dtypes in ['float', 'int']]
# Categorical features are converted to plain strings in both frames.
X[cat_cols] = X[cat_cols].astype(str)
X_test[cat_cols] = X_test[cat_cols].astype(str)

In [None]:
class Model_Utils:
    """Static helpers for scoring, baselining and fitting binary classifiers.

    Every member is a ``@staticmethod``; the class only serves as a namespace.
    """

    # Column layout of the results CSV created by ``save_model_results``.
    SCORE_SCHEMA = {'model_name': 'str',
                    'model': 'str',
                    'params': 'str',
                    'acc': 'float',
                    'precision': 'float',
                    'recall': 'float',
                    'f1_score': 'float',
                    'roc_auc': 'float',
                    'pr_auc': 'float',
                    'lift': 'float'}

    # CSV column name -> key emitted by ``model_evals`` where the two differ.
    _RESULT_ALIASES = {'f1_score': 'f1'}

    @staticmethod
    def _default_lgb_params():
        """Return the LightGBM parameters used when the caller passes none."""
        return {'random_state': 42
               ,'objective': 'binary'
               ,'verbose': -1
               ,'n_jobs': -1}

    @staticmethod
    def _default_catboost_params():
        """Return the CatBoost parameters used when the caller passes none."""
        return {'boosting_type' : "Plain",
                'eval_metric': 'PRAUC',
                'random_seed': 42,
                'learning_rate': 0.05,
                'use_best_model': True,
                'iterations': 1000}

    @staticmethod
    def _new_score_board():
        """Return an empty per-fold container for the metrics averaged in CV."""
        return {'acc': [],
                'precision': [],
                'recall': [],
                'f1': [],
                'roc_auc': [],
                'pr_auc': [],
                'lift': []}

    @staticmethod
    def _score_predictions(y_true, y_pred, y_proba):
        """Build the metric dictionary from hard predictions and scores.

        ``y_pred`` must be a numpy array of 0/1 predictions; ``y_proba`` holds
        the scores used for the ranking metrics (ROC-AUC, PR-AUC).
        """
        from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                                     f1_score, average_precision_score, roc_auc_score)

        event_rate = y_pred.mean()
        accuracy = accuracy_score(y_true, y_pred)
        roc_auc = roc_auc_score(y_true, y_proba)
        pr_auc = average_precision_score(y_true, y_proba)
        # zero_division=0 returns the same 0.0 as the default, without the warning.
        recall = recall_score(y_true, y_pred, zero_division=0)
        precision = precision_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        # Lift is undefined when nothing is predicted positive.
        lift = recall / event_rate if event_rate > 0 else float('nan')

        # Key order matters: downstream cells read the values positionally.
        return {'event_rate': event_rate,
                'acc': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'roc_auc': roc_auc,
                'pr_auc': pr_auc,
                'lift': lift}

    @staticmethod
    def model_evals(y_true, y_proba, cutoff = 0.5):
        """
        Returns model evaluation metrics for a binary classification model

        Parameters:
        -----------
            y_true: int (0,1)
                Actual binary labels

            y_proba: float (between 0 and 1)
                Probability scores output of model

            cutoff: float
                Scores strictly greater than this value are predicted as 1

        Returns:
        --------
            result: dict
                Dictionary of metrics and their results based on the input
                    - event_rate (fraction of predicted 1's)
                    - acc
                    - precision
                    - recall
                    - f1
                    - roc_auc
                    - pr_auc
                    - lift (recall / event_rate; NaN when nothing is predicted 1)
        """
        # Accept lists and Series as well as arrays.
        y_proba = np.asarray(y_proba)
        y_pred = (y_proba > cutoff).astype(int)
        return Model_Utils._score_predictions(y_true, y_pred, y_proba)

    @staticmethod
    def cutoff_perc(y_actual, y_proba, percent):
        """Evaluate the model when the top ``percent`` of scores are flagged as 1.

        Prints the number of flagged rows and the lowest flagged score, then
        returns the same metric dictionary as ``model_evals``.

        Parameters:
        -----------
            y_actual: actual binary labels
            y_proba: model scores, same length as ``y_actual``
            percent: fraction (0-1) of rows to flag, taken from the highest scores
        """
        y_proba = np.asarray(y_proba, dtype=float)
        n_cutoff = int(len(y_proba) * percent)

        # Rank by position on ties so exactly n_cutoff rows are flagged. The
        # predictions come from the rank itself: thresholding with a strict ">"
        # on the lowest flagged score would drop the threshold row(s).
        ranks = pd.Series(y_proba).rank(ascending = False, method = 'first')
        y_pred = (ranks <= n_cutoff).to_numpy().astype(int)

        flagged_scores = y_proba[y_pred == 1]
        print(int(y_pred.sum()))
        print(flagged_scores.min() if flagged_scores.size else float('nan'))
        return Model_Utils._score_predictions(y_actual, y_pred, y_proba)

    @staticmethod
    def DumbClassifier(y_true):
        """Baseline that predicts 0 for every row; returns a float array of zeros."""
        return np.zeros(len(y_true))

    @staticmethod
    def RandomChanceClassifier(y_true, random_state = 42):
        """Baseline that marks as 1 a random set of rows sized to the event count.

        Rows are drawn without replacement, so the number of predicted 1's
        equals ``sum(y_true)``. ``random_state`` seeds the draw; pass ``None``
        for a different sample on every call.
        """
        y_pred = Model_Utils.DumbClassifier(y_true)
        event = int(np.sum(y_true))
        rng = np.random.default_rng(random_state)
        ind = rng.choice(len(y_true), size=event, replace=False)
        y_pred[ind] = 1
        return y_pred

    @staticmethod
    def save_model_results(results, schema, filepath):
        """Append one row of model results to the CSV at ``filepath``.

        Parameters:
        -----------
            results: dict holding a value for every key of ``schema``
            schema: dict whose keys give the order of values in the new row
                (``Model_Utils.SCORE_SCHEMA`` matches the file layout created here)
            filepath: CSV path; created with the SCORE_SCHEMA columns if missing

        Raises:
        -------
            KeyError if ``results`` has no value for a schema key.
        """
        if not os.path.exists(filepath):
            score_schema = Model_Utils.SCORE_SCHEMA
            scores = pd.DataFrame(columns = score_schema.keys()).astype(score_schema)
        else:
            scores = pd.read_csv(filepath)

        row = []
        for key in schema.keys():
            # model_evals reports 'f1' while the CSV column is called 'f1_score'.
            source_key = key if key in results else Model_Utils._RESULT_ALIASES.get(key, key)
            row.append(results[source_key])

        # Values are written positionally, so schema order must match the columns.
        scores.loc[len(scores)] = row
        scores.to_csv(filepath, index = False)

    @staticmethod
    def LightGBMClassifier_CV(X, y, cat_cols, cv = 5, group = None, params = None):
        """Cross-validate a LightGBM classifier with StratifiedGroupKFold.

        Parameters:
        -----------
            X, y: pandas features / labels (indexed positionally via ``iloc``)
            cat_cols: columns cast to ``category`` inside each fold
            cv: number of folds
            group: group labels passed to the splitter
            params: LGBMClassifier keyword arguments (defaults used when None)

        Returns:
        --------
            (model fitted on the last fold, dict of metrics averaged over folds)
        """
        import lightgbm
        from sklearn.model_selection import StratifiedGroupKFold

        splitter = StratifiedGroupKFold(n_splits=cv, shuffle = True, random_state = 42)

        if params is None:
            params = Model_Utils._default_lgb_params()

        fold_scores = Model_Utils._new_score_board()

        for split, (train_ind, valid_ind) in enumerate(splitter.split(X, y, groups=group), start=1):
            # iloc with index arrays yields new frames, so the casts below
            # do not touch the caller's X.
            X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
            X_valid, y_valid = X.iloc[valid_ind], y.iloc[valid_ind]

            X_train[cat_cols] = X_train[cat_cols].astype("category")
            X_valid[cat_cols] = X_valid[cat_cols].astype("category")

            lgb = lightgbm.LGBMClassifier(**params)
            lgb.fit(X_train, y_train)

            y_proba = lgb.predict_proba(X_valid)[:, 1]
            results = Model_Utils.model_evals(y_valid, y_proba)
            print(f'LightGBM, Val CV{split}: {results}"')

            for key in fold_scores:
                fold_scores[key].append(results[key])

        scores = {key: np.array(values).mean() for key, values in fold_scores.items()}

        print(f'LightGBM, Average CV: {scores}"')
        return lgb, scores

    @staticmethod
    def LightGBMClassifier_pred(X, y, X_test, y_test, cat_cols, params = None):
        """Fit LightGBM on (X, y) and evaluate it on (X_test, y_test).

        Side effect: ``cat_cols`` of both ``X`` and ``X_test`` are cast to
        ``category`` in place on the caller's frames.

        Returns:
        --------
            (fitted model, positive-class probabilities for X_test, metric dict)
        """
        import lightgbm

        if params is None:
            params = Model_Utils._default_lgb_params()

        X[cat_cols] = X[cat_cols].astype("category")
        X_test[cat_cols] = X_test[cat_cols].astype("category")

        lgb = lightgbm.LGBMClassifier(**params)
        lgb.fit(X, y)

        y_proba = lgb.predict_proba(X_test)[:, 1]
        results = Model_Utils.model_evals(y_test, y_proba)
        print(f'LightGBM, Test: {results}"')

        return lgb, y_proba, results

    @staticmethod
    def CatBoostClassifier_CV(X, y, cat_cols, cv = 5, group = None, params = None):
        """Cross-validate a CatBoost classifier with StratifiedGroupKFold.

        Each fold's validation part is also passed to CatBoost as ``eval_set``.

        Returns:
        --------
            (model fitted on the last fold, dict of metrics averaged over folds)
        """
        from catboost import CatBoostClassifier, Pool
        from sklearn.model_selection import StratifiedGroupKFold

        splitter = StratifiedGroupKFold(n_splits=cv, shuffle = True, random_state = 42)

        if params is None:
            params = Model_Utils._default_catboost_params()

        fold_scores = Model_Utils._new_score_board()

        for split, (train_ind, valid_ind) in enumerate(splitter.split(X, y, groups=group), start=1):
            X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
            X_valid, y_valid = X.iloc[valid_ind], y.iloc[valid_ind]

            clf = CatBoostClassifier(**params)

            train_pool = Pool(X_train, y_train, cat_features=cat_cols)
            val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

            clf.fit(train_pool, eval_set=val_pool, verbose=False)

            y_proba = clf.predict_proba(X_valid)[:, 1]
            results = Model_Utils.model_evals(y_valid, y_proba)
            print(f'CatBoost, Val CV{split}: {results}"')

            for key in fold_scores:
                fold_scores[key].append(results[key])

        scores = {key: np.array(values).mean() for key, values in fold_scores.items()}

        print(f'CatBoost, Average CV: {scores}"')
        return clf, scores

    @staticmethod
    def CatBoostClassifier_pred(X, y, X_test, y_test, cat_cols, params = None):
        """Fit CatBoost on (X, y) and evaluate it on (X_test, y_test).

        Returns:
        --------
            (fitted model, positive-class probabilities for X_test, metric dict)
        """
        from catboost import CatBoostClassifier, Pool

        if params is None:
            params = Model_Utils._default_catboost_params()

        clf = CatBoostClassifier(**params)

        train_pool = Pool(X, y, cat_features=cat_cols)
        test_pool = Pool(X_test, y_test, cat_features=cat_cols)

        # NOTE(review): the test set doubles as eval_set while the default
        # params set use_best_model=True, so the test data takes part in model
        # selection and the reported test metrics may be optimistic. Consider a
        # separate validation split.
        clf.fit(train_pool, eval_set=test_pool, verbose=False)

        y_proba = clf.predict_proba(X_test)[:, 1]
        results = Model_Utils.model_evals(y_test, y_proba)
        print(f'CatBoost, Test: {results}"')

        return clf, y_proba, results

    @staticmethod
    def train_test_split(X, y, test_size = 0.2):
        """Split (X, y) with a fixed seed and write both parts to CSV.

        Writes ``model_abt_train.csv`` and ``model_abt_test.csv`` to the working
        directory, each with the label column first. Returns nothing.
        """
        from sklearn.model_selection import train_test_split as sk_train_test_split
        import polars as pl

        X_train, X_test, y_train, y_test = sk_train_test_split(X, y, test_size=test_size, random_state=42)

        pl.from_pandas(pd.concat([y_train, X_train], axis = 1)).write_csv('model_abt_train.csv')
        # Free the training half before the test half is materialised.
        del X_train
        del y_train
        gc.collect()

        pl.from_pandas(pd.concat([y_test, X_test], axis = 1)).write_csv('model_abt_test.csv')
        del X_test
        del y_test
        gc.collect()

In [None]:
# Baseline 1: predict the majority class (0) for every training row.
y_dumb = Model_Utils.DumbClassifier(y)
results = Model_Utils.model_evals(y, y_dumb)
print(results)
# Descriptive fields that accompany the metrics when the row is persisted.
results.update({'model_name': 'Dumb Classifier',
                'model': 'Predict All Major',
                'params': None})
#Model_Utils.save_model_results(results, score_schema, "baseline.csv")
del results
gc.collect()

In [None]:
# Baseline 2: flag a random subset of training rows sized to the event count.
y_rand = Model_Utils.RandomChanceClassifier(y)
results = Model_Utils.model_evals(y, y_rand)
print(results)
# Descriptive fields that accompany the metrics when the row is persisted.
results.update({'model_name': 'Random Chance Classifier',
                'model': 'Randomly Predict at Event Rate',
                'params': None})
#Model_Utils.save_model_results(results, score_schema, "baseline.csv")
del results
gc.collect()

In [None]:
# ROC curves of the two baselines, drawn on the same axes.
# NOTE: plot_roc is defined in a later cell of this notebook; that cell has to run
# first (or be moved above this one) for Restart & Run All to succeed.
plot_roc(y, y_rand, 'red', 'Random Chance (ROC-AUC)')
# The "Dumb Classifier" curve must use the all-zero predictions, not y_rand again.
plot_roc(y, y_dumb, 'blue', 'Dumb Classifier (ROC-AUC)')

In [None]:
# Precision-recall curves of the two baselines, drawn on the same axes.
# NOTE: plot_pr is defined in a later cell of this notebook; that cell has to run
# first (or be moved above this one) for Restart & Run All to succeed.
plot_pr(y, y_rand, 'red', 'Random Chance (PR-AUC)')
# The "Dumb Classifier" curve must use the all-zero predictions, not y_rand again.
plot_pr(y, y_dumb, 'blue', 'Dumb Classifier (PR-AUC)')

In [None]:
# Load the persisted RFE model; later cells call predict_proba on it and read its
# feature_name_ attribute (presumably a fitted LightGBM classifier — TODO confirm).
# NOTE(security): joblib.load unpickles the file; only load artefacts from a trusted source.
rfe_95 = joblib.load(RFE/'lgb_train_rfe_num_95.joblib')

In [None]:
# Cast the categorical test features to pandas 'category' dtype before scoring.
X_test[cat_cols] = X_test[cat_cols].astype('category')

In [None]:
# Display the test labels (the 'target' column taken from the test table).
y_test

In [None]:
# Positive-class scores from the RFE model, using only the features the model lists
# in feature_name_, then metrics at the default 0.5 cutoff.
y_95 = rfe_95.predict_proba(X_test[rfe_95.feature_name_])[:,1]
Model_Utils.model_evals(y_test, y_95)

In [None]:
# Load the exported predictions; later cells read its 'target', 'lgb_pred',
# 'first_birth_259D' and 'credamount_770A_y' columns.
champ_predictions = pd.read_csv(PRED/"Test_predictions.csv")

In [None]:
# Preview the first rows of the loaded predictions.
champ_predictions.head()

In [None]:
def plot_roc(y_true, y_proba, color = 'red', label = 'ROC-AUC'):
    """Draw a ROC curve with a shaded area on the current matplotlib axes.

    Parameters
    ----------
    y_true : array-like
        Actual binary labels.
    y_proba : array-like
        Scores for the positive class.
    color : str
        Colour of the curve and of the shaded area.
    label : str
        Legend text; the ROC-AUC value is appended to it.

    Notes
    -----
    ``plt.show()`` is deliberately not called so that several curves can be
    overlaid by calling the function repeatedly in one cell.
    """
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    roc_auc = roc_auc_score(y_true, y_proba)

    # A single f-string: running %-formatting over an interpolated label would
    # fail whenever the label itself contains a '%' character.
    plt.plot(fpr, tpr, color=color, lw=2, label=f'{label} (area = {roc_auc:0.2f})')
    # Diagonal reference line for a random ranking.
    plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
    plt.fill_between(fpr, tpr, alpha=0.2, color=color)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    #plt.show()
    
def plot_pr(y_true, y_proba, color = 'red', label = 'PR-AUC'):
    """Draw a precision-recall curve with a shaded area on the current axes.

    Parameters
    ----------
    y_true : array-like
        Actual binary labels.
    y_proba : array-like
        Scores for the positive class.
    color : str
        Colour of the curve and of the shaded area.
    label : str
        Legend text; the average-precision value is appended to it.

    Notes
    -----
    ``plt.show()`` is deliberately not called so that several curves can be
    overlaid by calling the function repeatedly in one cell.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    pr_auc = average_precision_score(y_true, y_proba)

    # A single f-string: running %-formatting over an interpolated label would
    # fail whenever the label itself contains a '%' character.
    plt.plot(recall, precision, color=color, lw=2, label=f'{label} (area = {pr_auc:0.2f})')
    plt.fill_between(recall, precision, alpha=0.2, color=color)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="upper right")
    #plt.show()

In [None]:
# ROC curve of the RFE model scores on the test labels.
plot_roc(y_test, y_95)

In [None]:
# ROC curve of the 'lgb_pred' scores stored in the predictions export.
plot_roc(champ_predictions['target'], champ_predictions['lgb_pred'])

In [None]:
# Precision-recall curve of the 'lgb_pred' scores stored in the predictions export.
plot_pr(champ_predictions['target'], champ_predictions['lgb_pred'])

In [None]:
# Precision-recall curve of the RFE model scores on the test labels.
plot_pr(y_test, y_95, 'blue')

In [None]:
# Cumulative gain chart for 'lgb_pred'. The scores are stacked into a two-column
# array [1 - p, p] before being passed to plot_cumulative_gain.
plot_cumulative_gain(champ_predictions['target'], np.vstack([1 - champ_predictions['lgb_pred'],  champ_predictions['lgb_pred']]).T)

In [None]:

# Pair every test label with its RFE-model score, highest score first.
data = (
    pd.DataFrame({'y_actual': y_test, 'y_proba': y_95})
    .sort_values(by='y_proba', ascending=False)
)

# Running count of events captured, and its share of all events.
data['cumulative_actual'] = data['y_actual'].cumsum()
data['cumulative_percentage'] = data['cumulative_actual'] / data['y_actual'].sum()

# Share of the population reviewed after each row.
data['percentile'] = np.arange(1, len(data) + 1) / len(data)

# Gains curve of the model against the diagonal expected from random ordering.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(data['percentile'], data['cumulative_percentage'], label='Model')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')

ax.set_xlabel('Percentile')
ax.set_ylabel('Cumulative Gains')
ax.set_title('Cumulative Gains Chart')
ax.legend()
plt.show()

In [None]:
# Display the cumulative gains table built in the previous cell.
data

In [None]:
# Metrics for 'lgb_pred' when the top 20% of scores are treated as predicted positives.
Model_Utils.cutoff_perc(champ_predictions['target'], champ_predictions['lgb_pred'], 0.2)

In [None]:
import shap

# Tree models need the categorical features as pandas 'category' dtype.
X[cat_cols] = X[cat_cols].astype('category')
X_test[cat_cols] = X_test[cat_cols].astype('category')
# Initialize the explainer with the model only (no background data is passed).
# NOTE(review): champ_lgb is not defined in any cell visible in this notebook, so this
# line raises NameError on a fresh kernel; presumably the model should be loaded from
# the MODEL path first — TODO confirm the file name and add the load.
explainer = shap.Explainer(champ_lgb)

# Compute SHAP values for the test data
shap_values = explainer(X_test)
# Summary plot
#shap.summary_plot(shap_values.values, X_test)



In [None]:
# Scatter of the 'first_birth_259D' column against the 'lgb_pred' score.
champ_predictions.plot.scatter(x = 'first_birth_259D', y = 'lgb_pred' )

In [None]:
# Inspect the raw SHAP value array.
shap_values.values

In [None]:
# Shape of the slice that is later passed to summary_plot.
# NOTE(review): assumes the SHAP array carries a trailing per-class axis — TODO confirm.
shap_values.values.T[0].T.shape

In [None]:
# Check whether this feature is among the test feature columns.
'first_income_type_1044T' in X_test.columns

In [None]:
# Mean outstanding debt per target class. 'target' was dropped from X_test when the
# features were separated, so group by the y_test label series (same index) instead.
pd.DataFrame(X_test.groupby(y_test)['mean_outstandingdebt_522A'].mean())

In [None]:
# Case count, number of defaults and default rate per education level.
# 'case_id' and 'target' were dropped from X_test when the features were separated,
# so they are taken from id_test_val and y_test, which share X_test's index.
stat = (
    pd.DataFrame({'min_education_1138M': X_test['min_education_1138M'],
                  'case_id': id_test_val['case_id'],
                  'target': y_test})
    .groupby(['min_education_1138M'])
    .agg(count = pd.NamedAgg(column = 'case_id', aggfunc = 'count'),
         target = pd.NamedAgg(column = 'target', aggfunc = 'sum'))
)
stat['default'] = stat['target']/stat['count']
stat

In [None]:
# FIXME: leftover scratch arithmetic. The original expression `5037/()` divides by an
# empty tuple and always raises TypeError, which stops a Run All. The intended
# denominator is unknown: restore the expression with a real value or delete this cell.

In [None]:
# SHAP summary (top 20 features) for slice 1 of the last axis of the SHAP array.
# NOTE(review): presumably the positive class; assumes a trailing per-class axis — TODO confirm.
shap.summary_plot(shap_values.values.T[1].T, X_test, max_display = 20)

In [None]:
# SHAP summary (top 20 features) for slice 0 of the last axis of the SHAP array.
# NOTE(review): presumably the negative class; assumes a trailing per-class axis — TODO confirm.
shap.summary_plot(shap_values.values.T[0].T, X_test, max_display = 20)

In [None]:
# Empty table that the following cells fill with one row of metrics per cutoff.
cum_lift = pd.DataFrame(columns = ['Cutoff', 'Event Rate', 'Accuracy', 'Precision', 'Recall', 'F1', 'Lift'])
cum_lift

In [None]:
# One row per cutoff (0%, 1%, ..., 99% of rows flagged) for the RFE model scores.
# The metric values are taken in the order model_evals returns them, which matches
# the column order of cum_lift.
for i in np.arange(0,1, 0.01):
    results = Model_Utils.cutoff_perc(y_test, y_95, i)
    row = [i] + [
        val
        for metric, val in results.items()
        if metric in ['event_rate', 'acc', 'precision', 'recall', 'f1', 'lift']
    ]
    cum_lift.loc[len(cum_lift)] = row

In [None]:
# One row per cutoff (0%, 1%, ..., 99% of rows flagged) for the 'lgb_pred' scores.
# NOTE(review): rows are appended to the same cum_lift table as the previous loop and
# nothing marks which model a row belongs to — confirm this mix is intended.
for i in np.arange(0,1, 0.01):
    results = Model_Utils.cutoff_perc(champ_predictions['target'], champ_predictions['lgb_pred'], i)
    row = [i] + [
        val
        for metric, val in results.items()
        if metric in ['event_rate', 'acc', 'precision', 'recall', 'f1', 'lift']
    ]
    cum_lift.loc[len(cum_lift)] = row

In [None]:
# Display the filled cutoff table.
cum_lift

In [None]:
# Persist the cutoff table to the working directory (the row index is written too).
cum_lift.to_csv('gain_95.csv')

In [None]:
# Percentile rank of each score, inverted so that higher 'lgb_pred' values receive
# smaller 'rank_proba_lgb' values (the highest score maps to 0).
champ_predictions['rank_proba_lgb'] = 1-champ_predictions['lgb_pred'].rank(pct=True)

In [None]:
def impact_sizing(df, target_col, rank_col, impact_col, cutoff):
    """Sum ``impact_col`` per confusion-matrix cell for a rank-based cutoff.

    Rows whose ``rank_col`` value is at or below ``cutoff`` are predicted 1.
    The share of actual positives among the rows predicted 0 is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Scored population. Columns ``'pred'`` and ``'tag'`` are written to it.
    target_col : str
        Column with the actual 0/1 label.
    rank_col : str
        Column with the rank value compared against ``cutoff``.
    impact_col : str
        Numeric column summed per tag.
    cutoff : float
        Largest rank value still predicted as 1.

    Returns
    -------
    pandas.DataFrame
        A single row (indexed by ``impact_col``) with one column per tag
        present among ``'TP'``, ``'FP'``, ``'FN'``, ``'TN'``.
    """
    flagged = (df[rank_col] <= cutoff).to_numpy()
    # Use the label column named by the caller, not a hard-coded 'target'.
    actual_pos = (df[target_col] == 1).to_numpy()
    actual_neg = (df[target_col] == 0).to_numpy()

    df['pred'] = flagged.astype(int)

    # Rows whose label is neither 0 nor 1 keep a None tag and drop out of the groupby.
    tag = np.full(len(df), None, dtype=object)
    tag[flagged & actual_pos] = 'TP'
    tag[flagged & actual_neg] = 'FP'
    tag[~flagged & actual_pos] = 'FN'
    tag[~flagged & actual_neg] = 'TN'
    df['tag'] = tag

    # Residual risk: share of actual positives among the rows predicted 0.
    n_pred_neg = int((~flagged).sum())
    n_missed = int((~flagged & actual_pos).sum())
    risk = n_missed / n_pred_neg if n_pred_neg else 0.0
    print(risk)
    return pd.DataFrame(df.groupby('tag')[impact_col].sum()).T

In [None]:
# Credit amount per confusion-matrix cell at cutoff 1. 'rank_proba_lgb' is built as
# 1 - percentile rank, so every ranked row falls at or below this cutoff.
impact_sizing(champ_predictions, 'target', 'rank_proba_lgb', 'credamount_770A_y', 1)

In [None]:
# Rows in the top 10% by score. The trailing empty subscript `[]` was a syntax error;
# add a column list after the filter if only some columns are wanted.
champ_predictions[champ_predictions['rank_proba_lgb']<=0.1]

In [None]:
# List the columns now present on the predictions frame.
champ_predictions.columns