# XGBoost with Grid Search

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [2]:
# Read the raw CSV, separate the label column from the features, and make a
# seeded train/test split (default split proportions).
df = pd.read_csv('data/UCI_Credit_Card.csv')
label_col = 'default.payment.next.month'
X, y = df.drop(columns=[label_col]), df[label_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [3]:
def get_metrics(model):
    """Score a fitted classifier on the notebook's current train/test data.

    The function reads the module-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test`` at call time, so the scores reflect whatever state those
    variables are in when the cell calling this function runs.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.

    Returns
    -------
    dict
        Maps ``train_<metric>`` / ``test_<metric>`` to the score, for the
        metrics accuracy, f1, precision and recall (in that order).
    """
    # Predict once per split; train first, then test.
    splits = (
        ('train', y_train, model.predict(X_train)),
        ('test', y_test, model.predict(X_test)),
    )
    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    # Outer loop over metrics, inner loop over splits, so keys come out as
    # train_accuracy, test_accuracy, train_f1, test_f1, ...
    return {
        f'{split}_{metric}': scorer(y_true, y_pred)
        for metric, scorer in scorers
        for split, y_true, y_pred in splits
    }

## Baseline Model

In [4]:
# Baseline: default-parameter XGBoost on the untouched features.
# fit() returns the estimator itself, so construction and training can be chained.
clf0 = XGBClassifier().fit(X_train, y_train)
get_metrics(clf0)

{'train_accuracy': 0.8242666666666667,
 'test_accuracy': 0.8262666666666667,
 'train_f1': 0.4862266112266112,
 'test_f1': 0.4790083966413434,
 'train_precision': 0.6950222882615156,
 'test_precision': 0.6892980437284235,
 'train_recall': 0.37390087929656274,
 'test_recall': 0.36703431372549017}

## Data Cleaning

In [5]:
# Column names grouped by how the notebook treats them.
to_drop = ['ID']
categoricals = ['SEX', 'EDUCATION', 'MARRIAGE']
continuous = (
    ['LIMIT_BAL', 'AGE']
    # The PAY_* series goes PAY_0, PAY_2..PAY_6 (this list has no PAY_1).
    + ['PAY_0'] + [f'PAY_{month}' for month in range(2, 7)]
    + [f'BILL_AMT{month}' for month in range(1, 7)]
    + [f'PAY_AMT{month}' for month in range(1, 7)]
)
target = ['default.payment.next.month']

In [6]:
# Drop the identifier column(s) by reassignment instead of in place:
#  - the frames returned by train_test_split are never mutated, and
#  - errors='ignore' lets the cell be re-run without a KeyError once the
#    columns are already gone.
X_train = X_train.drop(columns=to_drop, errors='ignore')
X_test = X_test.drop(columns=to_drop, errors='ignore')

# Floor the PAY_* columns at 0: every value <= 0 becomes 0, positive values are
# kept unchanged. clip(lower=0) is the vectorized equivalent of mapping
# `lambda x: 0 if x <= 0 else x` over each column.
pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
X_train[pay_cols] = X_train[pay_cols].clip(lower=0)
X_test[pay_cols] = X_test[pay_cols].clip(lower=0)

In [7]:
# Refit a default-parameter model on the current X_train / y_train so its
# scores can be compared with the previous model's.
clf1 = XGBClassifier().fit(X_train, y_train)
get_metrics(clf1)

{'train_accuracy': 0.8235555555555556,
 'test_accuracy': 0.8268,
 'train_f1': 0.4856180357605597,
 'test_f1': 0.4814371257485031,
 'train_precision': 0.6904937361827561,
 'test_precision': 0.6907216494845361,
 'train_recall': 0.3745003996802558,
 'test_recall': 0.3694852941176471}

## Creating Dummy Variables

In [8]:
# Cast the categorical codes to strings so get_dummies one-hot encodes them
# (numeric columns are passed through untouched).
as_str = {col: str for col in categoricals}
X_train = X_train.astype(as_str)
X_test = X_test.astype(as_str)

# The training split defines the feature set: one-hot encode it and drop the
# first level of each categorical as the baseline.
X_train = pd.get_dummies(X_train, drop_first=True, dtype=int)

# Encode the test split WITHOUT drop_first and then align it to the training
# columns. Encoding both splits independently with drop_first=True breaks as
# soon as a level is missing from one split: the column sets differ, or a
# different level is dropped as the baseline. Reindexing removes the baseline
# and any level unseen in training, and adds all-zero columns for levels that
# do not occur in the test split.
X_test = pd.get_dummies(X_test, dtype=int).reindex(
    columns=X_train.columns, fill_value=0
)

In [9]:
# Refit a default-parameter model on the current X_train / y_train so its
# scores can be compared with the previous model's.
clf2 = XGBClassifier().fit(X_train, y_train)
get_metrics(clf2)

{'train_accuracy': 0.8233333333333334,
 'test_accuracy': 0.8268,
 'train_f1': 0.4843689194448048,
 'test_f1': 0.48019207683073234,
 'train_precision': 0.6902033271719039,
 'test_precision': 0.6920415224913494,
 'train_recall': 0.373101518784972,
 'test_recall': 0.36764705882352944}

## Transforming Continuous Variables

In [10]:
# Min-max scale every feature column. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = MinMaxScaler()

# fit_transform/transform return plain arrays; rebuild the DataFrames with the
# original column names AND the original row index. Without index=..., the
# frames get a fresh 0..n-1 index and no longer line up with y_train / y_test
# by label.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [11]:
# Refit a default-parameter model on the current X_train / y_train so its
# scores can be compared with the previous model's.
clf3 = XGBClassifier().fit(X_train, y_train)
get_metrics(clf3)

{'train_accuracy': 0.8233333333333334,
 'test_accuracy': 0.8268,
 'train_f1': 0.4843689194448048,
 'test_f1': 0.48019207683073234,
 'train_precision': 0.6902033271719039,
 'test_precision': 0.6920415224913494,
 'train_recall': 0.373101518784972,
 'test_recall': 0.36764705882352944}

In [12]:
# Apply log(1 + x) to every cell of both splits.
# np.log1p operates on the whole DataFrame at once and returns a DataFrame with
# the same index and columns. It replaces DataFrame.applymap, which is
# deprecated since pandas 2.1 and calls a Python lambda once per cell.
X_train = np.log1p(X_train)
X_test = np.log1p(X_test)

In [13]:
# Refit a default-parameter model on the current X_train / y_train so its
# scores can be compared with the previous model's.
clf4 = XGBClassifier().fit(X_train, y_train)
get_metrics(clf4)

{'train_accuracy': 0.8233333333333334,
 'test_accuracy': 0.8268,
 'train_f1': 0.4843689194448048,
 'test_f1': 0.48019207683073234,
 'train_precision': 0.6902033271719039,
 'test_precision': 0.6920415224913494,
 'train_recall': 0.373101518784972,
 'test_recall': 0.36764705882352944}

## Addressing Class Imbalance

In [15]:
# Class counts for: the full dataset's target column, the training labels, and
# the test labels (displayed as a 3-tuple, in that order).
tuple(labels.value_counts() for labels in (df[target[0]], y_train, y_test))

(0    23364
 1     6636
 Name: default.payment.next.month, dtype: int64,
 1    17496
 0    17496
 dtype: int64,
 0    5868
 1    1632
 Name: default.payment.next.month, dtype: int64)

In [14]:
# Oversample the training split with SMOTE; the test split is left untouched.
# random_state is fixed (same seed as the train/test split) so the synthetic
# samples, and every score computed after this cell, are reproducible across
# Restart & Run All.
X_train_smote, y_train_smote = SMOTE(random_state=0).fit_resample(X_train, y_train)

# Re-wrap the resampled data so downstream cells keep the feature names.
# X_train / y_train are overwritten on purpose: get_metrics reads these globals.
X_train = pd.DataFrame(X_train_smote, columns=X_train.columns)
y_train = pd.Series(y_train_smote)

In [16]:
# Refit a default-parameter model on the current X_train / y_train so its
# scores can be compared with the previous model's.
clf5 = XGBClassifier().fit(X_train, y_train)
get_metrics(clf5)

{'train_accuracy': 0.7828360768175583,
 'test_accuracy': 0.7882666666666667,
 'train_f1': 0.761809234241294,
 'test_f1': 0.5080545229244113,
 'train_precision': 0.8434788644408968,
 'test_precision': 0.5137844611528822,
 'train_recall': 0.6945587562871514,
 'test_recall': 0.5024509803921569}

## Grid Search

In [19]:
# Hyper-parameter candidates for the grid search: 2 x 1 x 2 x 2 x 1 = 8 combinations.
param_grid = dict(
    learning_rate=[0.1, 0.2],
    max_depth=[6],
    min_child_weight=[1, 2],
    subsample=[0.5, 0.7],
    n_estimators=[100],
)

In [20]:
# Exhaustive search over param_grid, scored by accuracy. cv=None selects
# GridSearchCV's default cross-validation; n_jobs=1 runs the fits serially.
# NOTE(review): X_train has already been oversampled by an earlier cell, so the
# CV folds are cut from resampled data and the CV scores may be optimistic.
# Consider resampling inside each fold instead; confirm before relying on them.
clf = XGBClassifier()
grid_clf = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=None,
    n_jobs=1,
).fit(X_train, y_train)  # fit() returns the fitted search object
grid_clf.best_params_

{'learning_rate': 0.2,
 'max_depth': 6,
 'min_child_weight': 1,
 'n_estimators': 100,
 'subsample': 0.7}

In [21]:
# Train the final model with the hyper-parameters selected by the grid search.
# Taking them from grid_clf.best_params_ (rather than a hand-copied set of
# literals) keeps this cell in sync with the search if the grid or data changes.
clf6 = XGBClassifier(**grid_clf.best_params_)
clf6.fit(X_train, y_train)
get_metrics(clf6)

{'train_accuracy': 0.8976909007773205,
 'test_accuracy': 0.8050666666666667,
 'train_f1': 0.8924214195564638,
 'test_f1': 0.48119233498935415,
 'train_precision': 0.9408820174882778,
 'test_precision': 0.5716694772344013,
 'train_recall': 0.848708276177412,
 'test_recall': 0.41544117647058826}