In [None]:
# Load libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
import os
from pandas import read_csv
from pandas import set_option
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Only the training file is loaded; the test-set load stays disabled.
# test_data = pd.read_csv("cstest.csv")
train_data = pd.read_csv("cstrain.csv")

In [None]:
# Print the frame summary (column dtypes and non-null counts).
train_data.info()

In [None]:
# Count the target classes once and report the share of class 1 (the event rate).
target_counts = train_data["SeriousDlqin2yrs"].value_counts()
class_0 = target_counts[0]
class_1 = target_counts[1]
for label, count in (("class_0", class_0), ("class_1", class_1)):
    print("Total number of {}: {}".format(label, count))
event_rate = class_1/(class_0+class_1) *100
print("Event rate: {} %".format(event_rate))

In [None]:
# Display the rows whose recorded age is below 18.
train_data.loc[train_data["age"] < 18]

In [None]:
# Overwrite ages recorded as 0 with the column median
# (the median is computed while the zero rows are still present).
train_data.loc[train_data["age"] == 0, "age"] = train_data.age.median()

In [None]:
# Split the borrowers into working-age [18, 60) and senior (60+) groups and
# take each group's mean MonthlyIncome; the means are used later to impute
# missing incomes.
is_working_age = (train_data["age"] >= 18) & (train_data["age"] < 60)
is_senior_age = train_data["age"] >= 60

age_working = train_data.loc[is_working_age]
age_senior = train_data.loc[is_senior_age]

age_working_impute = age_working["MonthlyIncome"].mean()
age_senior_impute = age_senior["MonthlyIncome"].mean()

In [None]:
# Replace every MonthlyIncome value with its absolute value (NaN stays NaN).
train_data["MonthlyIncome"] = np.absolute(train_data["MonthlyIncome"])

In [None]:
# Mark missing incomes with the sentinel 99999 so they survive the integer cast
# and can be located again by the `== 99999` filters further down.
# NOTE(review): a genuine income of exactly 99999 cannot be told apart from
# this sentinel and would be overwritten by the imputation as well.
train_data["MonthlyIncome"] = train_data["MonthlyIncome"].fillna(99999)

In [None]:
# Store MonthlyIncome as 64-bit integers (possible now that NaNs are filled).
train_data["MonthlyIncome"] = train_data["MonthlyIncome"].astype('int64')

In [None]:
# Replace the 99999 "missing income" sentinel with the mean income of the
# borrower's age group (working-age [18, 60) or senior 60+).
# MonthlyIncome is an int64 column at this point, so the float means are
# truncated to int before assignment: writing a fractional float into an
# integer column is a deprecated silent upcast in recent pandas, and the column
# is cast back to int64 (truncating) later in the notebook anyway.
income_is_sentinel = train_data["MonthlyIncome"] == 99999
in_working_group = (train_data["age"] >= 18) & (train_data["age"] < 60)
in_senior_group = train_data["age"] >= 60

train_data.loc[income_is_sentinel & in_working_group, "MonthlyIncome"] = int(age_working_impute)
train_data.loc[income_is_sentinel & in_senior_group, "MonthlyIncome"] = int(age_senior_impute)

In [None]:
# Display any rows that still carry the 99999 sentinel after imputation.
train_data.loc[train_data["MonthlyIncome"] == 99999]

In [None]:
# Clean NumberOfDependents in one pass: absolute value, missing -> 0, int64.
train_data["NumberOfDependents"] = (
    train_data["NumberOfDependents"]
    .abs()
    .fillna(0)
    .astype("int64")
)

In [None]:
# Total number of past-due events across the three delinquency counters.
train_data["CombinedDefaulted"] = (
    train_data["NumberOfTimes90DaysLate"]
    .add(train_data["NumberOfTime60-89DaysPastDueNotWorse"])
    .add(train_data["NumberOfTime30-59DaysPastDueNotWorse"])
)

In [None]:
# Cap the count at 1 so the column becomes a "was ever past due" flag.
train_data["CombinedDefaulted"] = train_data["CombinedDefaulted"].clip(upper=1)

In [None]:
# Total open credit lines/loans plus real-estate loans/lines per borrower.
train_data["CombinedCreditLoans"] = train_data["NumberOfOpenCreditLinesAndLoans"].add(
    train_data["NumberRealEstateLoansOrLines"]
)




In [None]:
# Binarise the combined count: 0 for five or fewer, 1 for more than five.
# Both masks are taken from the original counts before anything is overwritten.
at_most_five = train_data["CombinedCreditLoans"] <= 5
more_than_five = train_data["CombinedCreditLoans"] > 5
train_data.loc[at_most_five, "CombinedCreditLoans"] = 0
train_data.loc[more_than_five, "CombinedCreditLoans"] = 1

In [None]:
# Flag column derived from NumberOfDependents: any count of 1 or more becomes 1.
train_data["WithDependents"] = train_data["NumberOfDependents"].clip(upper=1)

In [None]:
# Monthly debt payments = DebtRatio * MonthlyIncome, as a non-negative int64.
train_data["MonthlyDebtPayments"] = (
    train_data["DebtRatio"]
    .mul(train_data["MonthlyIncome"])
    .abs()
    .astype("int64")
)



In [None]:
# Make sure age and MonthlyIncome are both stored as int64.
for int_column in ("age", "MonthlyIncome"):
    train_data[int_column] = train_data[int_column].astype("int64")

In [None]:
# Encode the age group: 1 for working age [18, 60), 0 for seniors (60+).
# Rows outside both ranges keep their raw age value in age_map.
working_age_mask = (train_data["age"] >= 18) & (train_data["age"] < 60)
senior_age_mask = train_data["age"] >= 60

train_data["age_map"] = train_data["age"]
train_data.loc[working_age_mask, "age_map"] = 1
train_data.loc[senior_age_mask, "age_map"] = 0

In [None]:
# Turn the numeric age-group codes into labels.
# The code 1 was assigned to ages 18-59 and 0 to ages 60+, so 1 is "working"
# and 0 is "senior" (the labels were previously swapped). A single dict-based
# replace maps both codes in one step.
train_data["age_map"] = train_data["age_map"].replace({1: "working", 0: "senior"})



In [None]:
# One-hot encode age_map into "is_<label>" columns and append them to the frame.
age_group_dummies = pd.get_dummies(train_data["age_map"], prefix="is")
train_data = pd.concat([train_data, age_group_dummies], axis=1)

In [None]:
# Remove the CSV index column, the raw counters that were folded into the
# combined features, and the helper/derived columns that are not modelled.
columns_to_drop = [
    "Unnamed: 0",
    "NumberOfOpenCreditLinesAndLoans",
    "NumberOfTimes90DaysLate",
    "NumberRealEstateLoansOrLines",
    "NumberOfTime60-89DaysPastDueNotWorse",
    "WithDependents",
    "age_map",
    "is_senior",
    "is_working",
    "MonthlyDebtPayments",
]
train_data.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Show which columns remain in the frame.
train_data.columns

In [None]:
# Annotated heatmap of the pairwise correlations between the remaining columns.
corr = train_data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2g", ax=ax)

In [None]:
# ### Clean the test dataset ###

# def cleaned_dataset(dataset):
#     dataset.loc[dataset["age"] <= 18, "age"] = dataset.age.median()

#     age_working = dataset.loc[(dataset["age"] >= 18) & (dataset["age"] < 60)]
#     age_senior = dataset.loc[(dataset["age"] >= 60)]

#     age_working_impute = age_working.MonthlyIncome.mean()
#     age_senior_impute = age_senior.MonthlyIncome.mean()

#     dataset["MonthlyIncome"] = np.absolute(dataset["MonthlyIncome"])
#     dataset["MonthlyIncome"] = dataset["MonthlyIncome"].fillna(99999)
#     dataset["MonthlyIncome"] = dataset["MonthlyIncome"].astype('int64')

#     dataset.loc[((dataset["age"] >= 18) & (dataset["age"] < 60)) & (dataset["MonthlyIncome"] == 99999),\
#                    "MonthlyIncome"] = age_working_impute
#     dataset.loc[(dataset["age"] >= 60) & (dataset["MonthlyIncome"] == 99999), "MonthlyIncome"] = age_senior_impute
#     dataset["NumberOfDependents"] = np.absolute(dataset["NumberOfDependents"])
#     dataset["NumberOfDependents"] = dataset["NumberOfDependents"].fillna(0)
#     dataset["NumberOfDependents"] = dataset["NumberOfDependents"].astype('int64')

#     dataset["CombinedDefaulted"] = (dataset["NumberOfTimes90DaysLate"] + dataset["NumberOfTime60-89DaysPastDueNotWorse"])\
#                                             + dataset["NumberOfTime30-59DaysPastDueNotWorse"]

#     dataset.loc[(dataset["CombinedDefaulted"] >= 1), "CombinedDefaulted"] = 1

#     dataset["CombinedCreditLoans"] = dataset["NumberOfOpenCreditLinesAndLoans"] + \
#                                             dataset["NumberRealEstateLoansOrLines"]
#     dataset.loc[(dataset["CombinedCreditLoans"] <= 5), "CombinedCreditLoans"] = 0
#     dataset.loc[(dataset["CombinedCreditLoans"] > 5), "CombinedCreditLoans"] = 1

#     dataset.drop(["Unnamed: 0","NumberOfOpenCreditLinesAndLoans",\
#                   "NumberOfTimes90DaysLate","NumberRealEstateLoansOrLines","NumberOfTime60-89DaysPastDueNotWorse"], axis=1, inplace=True)

# cleaned_dataset(test_data)

In [None]:
# (rows, columns) of the training frame at this point.
train_data.shape
#test_data.shape

In [None]:
# Number of rows in each target class.
train_data.SeriousDlqin2yrs.value_counts()
# The target classes are imbalanced.
# The class ratio needs to be rebalanced, e.g. by undersampling or oversampling.

In [None]:
# Separate the frame into model inputs (X) and the label to predict (Y).
target = "SeriousDlqin2yrs"

# Every column except the target is used as an input feature.
columns = [name for name in train_data.columns.tolist() if name != target]

# Seeded random state.
state = np.random.RandomState(42)

X = train_data[columns]
Y = train_data[target]

print(X.shape, Y.shape, sep="\n")

In [None]:
# Split the rows by target value: 0 -> good, 1 -> bad, and print both shapes.
target_is_good = train_data["SeriousDlqin2yrs"].eq(0)
target_is_bad = train_data["SeriousDlqin2yrs"].eq(1)
good = train_data.loc[target_is_good]
bad = train_data.loc[target_is_bad]
print(good.shape, bad.shape)

(139974, 9) (10026, 9)


In [None]:
### NATE - Table 5

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import cross_validate

# Import the model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Prepare the data (X: input features, Y: target feature)
# Example: X and Y must be replaced with the real data
# X = pd.DataFrame(...) # data frame made up of the 8 input features
# Y = pd.Series(...)    # target feature

# Classification models to compare.
# Every estimator that accepts a seed gets random_state=42 so the table is
# reproducible across runs; LogisticRegression uses max_iter=1000 to match the
# setting used for it in the grid-search cells.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "K-Nearest Neighbor": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# 5-fold cross-validation with shuffling and a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# matthews_corrcoef is not imported anywhere before this cell, so import it
# here to keep the cell runnable on a fresh kernel.
from sklearn.metrics import make_scorer, matthews_corrcoef

# Evaluation metrics.
# AUC and F1 use scikit-learn's built-in scorer names: they work on every
# scikit-learn version, whereas make_scorer(..., needs_proba=True) is rejected
# by recent releases.
scoring = {
           'accuracy': 'accuracy',
           'AUC': 'roc_auc',
           'MCC': make_scorer(matthews_corrcoef),
           'F1':'f1'
           }

# Column label -> key used in the `scoring` dict, in output-column order.
metric_columns = (
    ('Accuracy', 'accuracy'),
    ('AUC', 'AUC'),
    ('MCC', 'MCC'),
    ('F1', 'F1'),
)

# One summary row per model: mean and standard deviation of each metric
# over the cross-validation folds.
results = []
for name, model in models.items():
    cv_results = cross_validate(model, X, Y, cv=kf, scoring=scoring)
    summary_row = {'Model': name}
    for column_label, scoring_key in metric_columns:
        fold_scores = cv_results['test_' + scoring_key]
        summary_row[column_label + ' Mean'] = np.mean(fold_scores)
        summary_row[column_label + ' Std'] = np.std(fold_scores)
    results.append(summary_row)

# Tabulate and print the comparison.
results_df = pd.DataFrame(results)
print(results_df)


In [None]:
# NATE - Table 6 - SMOTE
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer, matthews_corrcoef, roc_auc_score, f1_score
# from sklearn.utils.fixes import loguniform

from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')  # silence every warning from here on

# Data preparation
# Must be replaced with the real data
# Example: generate random data (8 features, imbalanced target)
from sklearn.datasets import make_classification

# X, Y = make_classification(n_samples=1000, n_features=8,
#                            n_informative=5, n_redundant=2,
#                            n_clusters_per_class=2, weights=[0.93, 0.07],
#                            flip_y=0, random_state=42)

# Class ratios passed to SMOTE as sampling_strategy, one run per value
desired_ratios = [0.15, 0.32, 0.50, 1.00]  # 15%, 32%, 50%, 100%

# Hyper-parameter grid for each model (keyed by the model's display name).
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 1000],
        # 'auto' was an alias of 'sqrt' for classifiers and is rejected by
        # scikit-learn >= 1.3, so only 'sqrt' is searched (same models, half the fits).
        'max_features': ['sqrt'],
        'max_depth': [1, 20],
        'min_samples_split': [2, 5, 10],  # min_samples_split cannot be 1
        'min_samples_leaf': [1, 2, 4, 8],
        'bootstrap': [True, False]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 1000],
        'learning_rate': [0.01, 0.1],
        'max_depth': [1, 20],
        'min_samples_split': [2, 5, 10],  # min_samples_split cannot be 1
        'min_samples_leaf': [1, 2, 4, 8],
        'subsample': [0.8, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 1000],
        'learning_rate': [0.01, 0.1],
        'max_depth': [1, 20],
        'min_child_weight': [2, 5, 10],  # stands in for min_samples_split
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]  # stands in for min_samples_leaf
    },
    # A list of sub-grids so that only solver/penalty pairs LogisticRegression
    # accepts are fitted; invalid pairs would just produce failed fits.
    # 'elasticnet' is omitted because none of the listed solvers supports it.
    # penalty=None (instead of the string 'none') needs scikit-learn >= 1.2.
    'Logistic Regression': [
        {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear']},
        {'penalty': ['l1'], 'solver': ['liblinear']},
        {'penalty': [None], 'solver': ['newton-cg', 'lbfgs']},
        # 'C': loguniform(1e-5, 100),
    ]
}

# Metrics computed for every grid-search candidate.
scoring = dict(
    AUC='roc_auc',
    MCC=make_scorer(matthews_corrcoef),
    F1='f1',
)

# One summary row per (SMOTE ratio, model) pair is appended here.
results = []

# Shuffled, seeded 5-fold cross-validation shared by every search.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# imblearn's Pipeline runs the sampler while fitting only. Putting SMOTE inside
# the pipeline means each CV split oversamples its own training part and is
# scored on untouched validation rows; oversampling before the split would put
# synthetic points (and neighbours of validation rows) into the scoring data.
from imblearn.pipeline import Pipeline as ImbPipeline

CLF_PREFIX = 'clf__'


def _prefix_grid(grid, prefix=CLF_PREFIX):
    """Return `grid` with each parameter name prefixed for the pipeline's classifier step.

    `grid` may be a single dict or a list of dicts; the same shape is returned.
    """
    if isinstance(grid, dict):
        return {prefix + key: values for key, values in grid.items()}
    return [_prefix_grid(sub_grid, prefix) for sub_grid in grid]


# Builds a fresh, seeded estimator for each search (insertion order = run order).
model_factories = {
    'Random Forest': lambda: RandomForestClassifier(random_state=42),
    'Gradient Boosting': lambda: GradientBoostingClassifier(random_state=42),
    'XGBoost': lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'Logistic Regression': lambda: LogisticRegression(random_state=42, max_iter=1000),
}

# Run the grid search for every SMOTE ratio and every model.
for ratio in desired_ratios:
    print(f"\n=== SMOTE 적용: Minority 비율 {int(ratio*100)}% ===")

    # Informational only: class counts SMOTE produces on the full data set.
    # The searches below resample inside each CV training split instead.
    _, resampled_target = SMOTE(sampling_strategy=ratio, random_state=42).fit_resample(X, Y)
    print(f"Resampled dataset shape: {np.bincount(resampled_target)}")

    for model_name, make_model in model_factories.items():
        print(f"\n--- 모델: {model_name} ---")

        pipeline = ImbPipeline([
            ('smote', SMOTE(sampling_strategy=ratio, random_state=42)),
            ('clf', make_model()),
        ])

        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=_prefix_grid(param_grids[model_name]),
            scoring=scoring,
            refit='AUC',  # the best candidate is chosen by AUC
            cv=kf,
            n_jobs=-1,
            verbose=0
        )
        grid_search.fit(X, Y)

        # Mean CV scores of the AUC-best candidate.
        cv_results = grid_search.cv_results_
        best_index = grid_search.best_index_
        best_auc = cv_results['mean_test_AUC'][best_index]
        best_mcc = cv_results['mean_test_MCC'][best_index]
        best_f1 = cv_results['mean_test_F1'][best_index]

        # Report parameters under their plain names (without the step prefix).
        best_parameters = {
            param_name[len(CLF_PREFIX):]: value
            for param_name, value in grid_search.best_params_.items()
        }

        results.append({
            'SMOTE Ratio (%)': int(ratio * 100),
            'Model': model_name,
            'Best AUC': best_auc,
            'Best MCC': best_mcc,
            'Best F1 Score': best_f1,
            'Best Parameters': best_parameters
        })

        print(f"Best AUC: {best_auc:.4f}, Best MCC: {best_mcc:.4f}, Best F1: {best_f1:.4f}")
        print(f"Best Parameters: {best_parameters}")

# Collect the rows into a DataFrame.
results_df = pd.DataFrame(results)

# Print the final table.
print("\n=== 최종 결과 ===")
print(results_df)


In [None]:
# NATE - Table 6 - NearMiss

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import make_scorer, matthews_corrcoef, roc_auc_score, f1_score
# from sklearn.utils.fixes import loguniform

from imblearn.under_sampling import NearMiss

import warnings
warnings.filterwarnings('ignore')  # silence every warning from here on

# Data preparation
# Must be replaced with the real data
# Example: generate random data (8 features, imbalanced target)
from sklearn.datasets import make_classification

# X, Y = make_classification(n_samples=1000, n_features=8,
#                            n_informative=5, n_redundant=2,
#                            n_clusters_per_class=2, weights=[0.93, 0.07],
#                            flip_y=0, random_state=42)

# Class ratios passed to NearMiss as sampling_strategy, one run per value
desired_ratios = [0.15, 0.32, 0.50, 1.00]  # 15%, 32%, 50%, 100%

# Hyper-parameter grid for each model (keyed by the model's display name).
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 1000],
        # 'auto' was an alias of 'sqrt' for classifiers and is rejected by
        # scikit-learn >= 1.3, so only 'sqrt' is searched (same models, half the fits).
        'max_features': ['sqrt'],
        'max_depth': [1, 20],
        'min_samples_split': [2, 5, 10],  # min_samples_split cannot be 1
        'min_samples_leaf': [1, 2, 4, 8],
        'bootstrap': [True, False]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 1000],
        'learning_rate': [0.01, 0.1],
        'max_depth': [1, 20],
        'min_samples_split': [2, 5, 10],  # min_samples_split cannot be 1
        'min_samples_leaf': [1, 2, 4, 8],
        'subsample': [0.8, 1.0]
    },
    'XGBoost': {
        'n_estimators': [100, 1000],
        'learning_rate': [0.01, 0.1],
        'max_depth': [1, 20],
        'min_child_weight': [2, 5, 10],  # stands in for min_samples_split
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]  # stands in for min_samples_leaf
    },
    # A list of sub-grids so that only solver/penalty pairs LogisticRegression
    # accepts are fitted; invalid pairs would just produce failed fits.
    # 'elasticnet' is omitted because none of the listed solvers supports it.
    # penalty=None (instead of the string 'none') needs scikit-learn >= 1.2.
    'Logistic Regression': [
        {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear']},
        {'penalty': ['l1'], 'solver': ['liblinear']},
        {'penalty': [None], 'solver': ['newton-cg', 'lbfgs']},
        # 'C': loguniform(1e-5, 100),
    ]
}

# Metrics computed for every grid-search candidate.
scoring = dict(
    AUC='roc_auc',
    MCC=make_scorer(matthews_corrcoef),
    F1='f1',
)

# One summary row per (NearMiss ratio, model) pair is appended here.
results = []

# Shuffled, seeded 5-fold cross-validation shared by every search.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# imblearn's Pipeline runs the sampler while fitting only. Putting NearMiss
# inside the pipeline means each CV split undersamples its own training part
# and is scored on untouched validation rows; undersampling before the split
# would score the models on a hand-picked subset of the majority class.
from imblearn.pipeline import Pipeline as ImbPipeline

CLF_PREFIX = 'clf__'


def _prefix_grid(grid, prefix=CLF_PREFIX):
    """Return `grid` with each parameter name prefixed for the pipeline's classifier step.

    `grid` may be a single dict or a list of dicts; the same shape is returned.
    """
    if isinstance(grid, dict):
        return {prefix + key: values for key, values in grid.items()}
    return [_prefix_grid(sub_grid, prefix) for sub_grid in grid]


# Builds a fresh, seeded estimator for each search (insertion order = run order).
model_factories = {
    'Random Forest': lambda: RandomForestClassifier(random_state=42),
    'Gradient Boosting': lambda: GradientBoostingClassifier(random_state=42),
    'XGBoost': lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'Logistic Regression': lambda: LogisticRegression(random_state=42, max_iter=1000),
}

# Run the grid search for every NearMiss ratio and every model.
for ratio in desired_ratios:
    print(f"\n=== NearMiss 적용: Minority 비율 {int(ratio*100)}% ===")

    # Informational only: class counts NearMiss produces on the full data set.
    # The searches below resample inside each CV training split instead.
    _, resampled_target = NearMiss(sampling_strategy=ratio).fit_resample(X, Y)
    print(f"Resampled dataset shape: {np.bincount(resampled_target)}")

    for model_name, make_model in model_factories.items():
        print(f"\n--- 모델: {model_name} ---")

        pipeline = ImbPipeline([
            ('nearmiss', NearMiss(sampling_strategy=ratio)),
            ('clf', make_model()),
        ])

        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=_prefix_grid(param_grids[model_name]),
            scoring=scoring,
            refit='AUC',  # the best candidate is chosen by AUC
            cv=kf,
            n_jobs=-1,
            verbose=0
        )
        grid_search.fit(X, Y)

        # Mean CV scores of the AUC-best candidate.
        cv_results = grid_search.cv_results_
        best_index = grid_search.best_index_
        best_auc = cv_results['mean_test_AUC'][best_index]
        best_mcc = cv_results['mean_test_MCC'][best_index]
        best_f1 = cv_results['mean_test_F1'][best_index]

        # Report parameters under their plain names (without the step prefix).
        best_parameters = {
            param_name[len(CLF_PREFIX):]: value
            for param_name, value in grid_search.best_params_.items()
        }

        results.append({
            'NearMiss Ratio (%)': int(ratio * 100),
            'Model': model_name,
            'Best AUC': best_auc,
            'Best MCC': best_mcc,
            'Best F1 Score': best_f1,
            'Best Parameters': best_parameters
        })

        print(f"Best AUC: {best_auc:.4f}, Best MCC: {best_mcc:.4f}, Best F1: {best_f1:.4f}")
        print(f"Best Parameters: {best_parameters}")

# Collect the rows into a DataFrame.
results_df = pd.DataFrame(results)

# Print the final table.
print("\n=== 최종 결과 ===")
print(results_df)


In [None]:
### BEST model - GB using SMOTE - Table 8
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, matthews_corrcoef, roc_auc_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

# # Data preparation
# X, Y = make_classification(n_samples=1000, n_features=8,
#                            n_informative=5, n_redundant=2,
#                            n_clusters_per_class=2, weights=[0.93, 0.07],
#                            flip_y=0, random_state=42)

# SMOTE at a 1.00 ratio, applied inside the cross-validation pipeline.
# imblearn's Pipeline runs the sampler during fit only, so each fold
# oversamples its own training part and is scored on untouched validation
# rows; oversampling X/Y up front would leak synthetic points into the folds
# used for scoring.
from imblearn.pipeline import Pipeline as ImbPipeline

smote = SMOTE(sampling_strategy=1.00, random_state=42)

# Define Gradient Boosting model with best hyperparameters
best_params = {
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'max_depth': 20,
    'min_samples_leaf': 4,
    'min_samples_split': 10,
    'subsample': 0.8,
    'random_state': 42
}
gb_best_model = GradientBoostingClassifier(**best_params)
gb_best_pipeline = ImbPipeline([('smote', smote), ('clf', gb_best_model)])

# K-Fold cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Scoring functions.
# AUC and F1 use the built-in scorer names; make_scorer(..., needs_proba=True)
# is rejected by recent scikit-learn releases.
scoring = {
    'AUC': 'roc_auc',
    'MCC': make_scorer(matthews_corrcoef),
    'F1': 'f1'
}

# Mean CV score and wall-clock time, measured separately for each metric
# (every metric triggers its own full cross-validation run).
results = {}
for metric_name, scorer in scoring.items():
    start_time = time.time()
    scores = cross_val_score(gb_best_pipeline, X, Y, scoring=scorer, cv=kf, n_jobs=-1)
    elapsed_time = time.time() - start_time
    results[metric_name] = {
        'Score': scores.mean(),
        'Time (s)': elapsed_time
    }

# Output the results
print("\n=== Best Model Evaluation Results ===")
for metric, result in results.items():
    print(f"{metric}: {result['Score']:.4f} (Time taken: {result['Time (s)']:.2f} seconds)")


In [None]:
### BEST model - XGB using NearMiss - Table 9
import time
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, matthews_corrcoef, roc_auc_score, f1_score
from imblearn.under_sampling import NearMiss
from sklearn.datasets import make_classification

# # Data preparation (generate synthetic dataset)
# X, Y = make_classification(n_samples=1000, n_features=8,
#                            n_informative=5, n_redundant=2,
#                            n_clusters_per_class=2, weights=[0.93, 0.07],
#                            flip_y=0, random_state=42)

# NearMiss at a 1.00 ratio (full undersampling), applied inside the
# cross-validation pipeline. imblearn's Pipeline runs the sampler during fit
# only, so each fold undersamples its own training part and is scored on
# untouched validation rows rather than on a pre-selected subset.
from imblearn.pipeline import Pipeline as ImbPipeline

nearmiss = NearMiss(sampling_strategy=1.00)

# Define XGBoost model with specified best hyperparameters
best_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'max_depth': 20,
    'min_child_weight': 10,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss'
}
xgb_best_model = XGBClassifier(**best_params)
xgb_best_pipeline = ImbPipeline([('nearmiss', nearmiss), ('clf', xgb_best_model)])

# K-Fold cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Scoring functions.
# AUC and F1 use the built-in scorer names; make_scorer(..., needs_proba=True)
# is rejected by recent scikit-learn releases.
scoring_functions = {
    'AUC': 'roc_auc',
    'MCC': make_scorer(matthews_corrcoef),
    'F1': 'f1'
}

# Mean CV score and wall-clock time, measured separately for each metric
# (every metric triggers its own full cross-validation run).
results = {}
for metric_name, scorer in scoring_functions.items():
    start_time = time.time()
    scores = cross_val_score(xgb_best_pipeline, X, Y, scoring=scorer, cv=kf, n_jobs=-1)
    elapsed_time = time.time() - start_time
    results[metric_name] = {
        'Score': scores.mean(),
        'Time (s)': elapsed_time
    }

# Output the results
print("\n=== Best Model Evaluation Results ===")
for metric, result in results.items():
    print(f"{metric}: {result['Score']:.4f} (Time taken: {result['Time (s)']:.2f} seconds)")


In [None]:
#Random Forest with 50 % bad in total
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# max_features="sqrt" is what "auto" meant for classifiers; "auto" itself is
# rejected by scikit-learn >= 1.3.
forest = RandomForestClassifier(
    random_state=42,
    n_estimators=1000,
    max_depth=20,
    max_features="sqrt",
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight="balanced",
    bootstrap=False
)

# Oversample the minority class up to a 1:1 ratio.
smote = SMOTE(sampling_strategy=1.00, random_state=42)
X_res, Y_res = smote.fit_resample(X, Y)

# Fit on a split of the resampled data (the label variable is Y_res; the
# lower-case y_res used before was never defined).
X_train, X_val, y_train, y_val = train_test_split(X_res, Y_res, random_state=42)
forest.fit(X_train,y_train)

# Rows used by the SHAP cells below.
# NOTE(review): this split is taken from the same X that was fed to SMOTE, so
# X_test2 is not held out from the forest's training data.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, Y, test_size=0.30, random_state=42)

# Class-1 probabilities on the training split.
y_scores_proba = forest.predict_proba(X_train)
y_scores = y_scores_proba[:,1]


In [None]:
# Use the %pip magic so the package is installed into the running kernel's
# environment and the cell does not depend on IPython's automagic setting.
%pip install shap

In [None]:
# Pick one row of X_test2 (position 5) and get the forest's class
# probabilities for it; predict_proba needs a 2-D array, hence the reshape.
row_to_show = 5
data_for_prediction1 = X_test2.iloc[row_to_show]
data_for_prediction_array_notdefaulted = data_for_prediction1.to_numpy().reshape(1, -1)

forest.predict_proba(data_for_prediction_array_notdefaulted)

In [None]:
# Pick one row of X_test2 (position 22) and get the forest's class
# probabilities for it; predict_proba needs a 2-D array, hence the reshape.
row_to_show = 22
data_for_prediction2 = X_test2.iloc[row_to_show]
data_for_prediction_array_defaulted = data_for_prediction2.to_numpy().reshape(1, -1)

forest.predict_proba(data_for_prediction_array_defaulted)

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values for the fitted random forest
explainer = shap.TreeExplainer(forest)

# Calculate Shap values for the single row stored in data_for_prediction1
shap_values = explainer.shap_values(data_for_prediction1)

In [None]:
# SHAP force plot - not defaulted row.
# Index 1 is used for both the expected value and the SHAP values.
# NOTE(review): assumes shap_values is indexed by class first - confirm for the
# installed shap version.
shap.initjs()
class1_base_value = explainer.expected_value[1]
class1_contributions = shap_values[1]
shap.force_plot(class1_base_value, class1_contributions, data_for_prediction1)

In [None]:
# SHAP force plot - defaulted row.
# Index 1 is used for both the expected value and the SHAP values.
# NOTE(review): assumes shap_values2 is indexed by class first - confirm for
# the installed shap version.
shap.initjs()
shap_values2 = explainer.shap_values(data_for_prediction2)
class1_base_value = explainer.expected_value[1]
class1_contributions = shap_values2[1]
shap.force_plot(class1_base_value, class1_contributions, data_for_prediction2)

In [None]:
# SHAP decision plot - not defaulted row.
shap.initjs()
display_range = slice(None, -16, -1)
shap.decision_plot(
    explainer.expected_value[1],
    shap_values[1],
    data_for_prediction1,
    feature_display_range=display_range,
)

In [None]:
# SHAP decision plot - defaulted row.
shap.initjs()
display_range = slice(None, -16, -1)
shap.decision_plot(
    explainer.expected_value[1],
    shap_values2[1],
    data_for_prediction2,
    feature_display_range=display_range,
)

In [None]:
#Global explanation

# Rows whose SHAP values are computed for the global plots.
X_importance = X_test2

# Explain model predictions using shap library.
# TreeExplainer needs the fitted model as its argument; calling it with no
# model raises a TypeError.
explainer = shap.TreeExplainer(forest)
shap_values = explainer.shap_values(X_importance)

In [None]:
# 1. Feature importance bar plot - global interpretability
import shap

forest_explainer = shap.TreeExplainer(forest)
shap_values = forest_explainer.shap_values(X_test2)
shap.summary_plot(shap_values, X_test2, plot_type="bar")

In [None]:
# SHAP summary plot of the previously computed shap_values against X_importance
shap.initjs()
shap.summary_plot(shap_values, X_importance)