# Bayes Classification

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Read the raw table, separate the label column from the features, then make a
# seeded train/test split so the partition is the same on every run.
df = pd.read_csv('data/UCI_Credit_Card.csv')
X = df.drop(columns='default.payment.next.month')
y = df['default.payment.next.month']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [3]:
def get_metrics(model, data=None):
    """Score a fitted classifier on a train split and a test split.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    data : tuple, optional
        ``(X_train, X_test, y_train, y_test)`` to score against. When omitted,
        the notebook-level variables with those names are read at call time,
        which is what the original single-argument version always did.

    Returns
    -------
    dict
        Maps ``'<split>_<metric>'`` to the score, for split in
        ``train``/``test`` and metric in ``accuracy``/``f1``/``precision``/
        ``recall``. Insertion order is train then test within each metric.
    """
    if data is None:
        # Looked up when the function is called, so the scores always reflect
        # whatever the notebook currently holds in these four names.
        data = (X_train, X_test, y_train, y_test)
    features_train, features_test, labels_train, labels_test = data

    # Predict once per split and reuse the predictions for every metric.
    splits = (
        ('train', labels_train, model.predict(features_train)),
        ('test', labels_test, model.predict(features_test)),
    )
    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )

    metrics = {}
    for metric_name, scorer in scorers:
        for split_name, y_true, y_pred in splits:
            metrics[f'{split_name}_{metric_name}'] = scorer(y_true, y_pred)
    return metrics

## Baseline Model

In [4]:
# Baseline: Gaussian Naive Bayes on the split exactly as loaded
# (the ID column has not been dropped yet, so it is still a feature here).
clf0 = GaussianNB().fit(X_train, y_train)
get_metrics(clf0)

{'train_accuracy': 0.3655111111111111,
 'test_accuracy': 0.3605333333333333,
 'train_f1': 0.3865589549673427,
 'test_f1': 0.38004136504653574,
 'train_precision': 0.24622290343770528,
 'test_precision': 0.2408256880733945,
 'train_recall': 0.898880895283773,
 'test_recall': 0.9007352941176471}

## Data Cleaning

In [5]:
# Column groups used by the cleaning and encoding cells below.
to_drop = ['ID']
categoricals = ['SEX', 'EDUCATION', 'MARRIAGE']
# The PAY_* status columns skip index 1 (PAY_0, then PAY_2..PAY_6);
# the BILL_AMT* and PAY_AMT* columns run 1..6.
continuous = (
    ['LIMIT_BAL', 'AGE']
    + ['PAY_0'] + [f'PAY_{i}' for i in range(2, 7)]
    + [f'BILL_AMT{i}' for i in range(1, 7)]
    + [f'PAY_AMT{i}' for i in range(1, 7)]
)
target = ['default.payment.next.month']

In [6]:
# Rebind rather than drop in place: the frames come out of train_test_split,
# and an in-place drop raises KeyError if this cell is executed a second time.
# errors='ignore' makes the drop a no-op once the columns are already gone.
X_train = X_train.drop(columns=to_drop, errors='ignore')
X_test = X_test.drop(columns=to_drop, errors='ignore')

# Collapse every non-positive value in the PAY_* columns to 0 and leave
# positive values unchanged (vectorised equivalent of `0 if x <= 0 else x`).
pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
X_train[pay_cols] = X_train[pay_cols].clip(lower=0)
X_test[pay_cols] = X_test[pay_cols].clip(lower=0)

In [7]:
# Refit on the cleaned features to see what the cleaning step alone changes.
clf1 = GaussianNB().fit(X_train, y_train)
get_metrics(clf1)

{'train_accuracy': 0.3616,
 'test_accuracy': 0.35786666666666667,
 'train_f1': 0.3862587591864639,
 'test_f1': 0.3803396809058157,
 'train_precision': 0.24565217391304348,
 'test_precision': 0.24071661237785016,
 'train_recall': 0.903277378097522,
 'test_recall': 0.9056372549019608}

## Creating Dummy Variables

In [8]:
# One-hot encode the categorical columns with category levels taken from the
# TRAINING split and shared with the test split. Encoding each split on its
# own would give the two frames different columns whenever a level is missing
# from one of them, and drop_first could then drop a different level in each.
for col in categoricals:
    train_labels = X_train[col].map(str)
    levels = sorted(train_labels.unique())
    X_train[col] = pd.Categorical(train_labels, categories=levels)
    # A level seen only in the test split becomes NaN, i.e. an all-zero dummy row.
    X_test[col] = pd.Categorical(X_test[col].map(str), categories=levels)
X_train = pd.get_dummies(X_train, drop_first=True, dtype=int)
X_test = pd.get_dummies(X_test, drop_first=True, dtype=int)

In [9]:
# Refit after one-hot encoding the categorical columns.
clf2 = GaussianNB().fit(X_train, y_train)
get_metrics(clf2)

{'train_accuracy': 0.36186666666666667,
 'test_accuracy': 0.35813333333333336,
 'train_f1': 0.38630535134210975,
 'test_f1': 0.38027806385169927,
 'train_precision': 0.24570465419747717,
 'test_precision': 0.24071056062581486,
 'train_recall': 0.9030775379696243,
 'test_recall': 0.9050245098039216}

## Transforming Continuous Variables

In [10]:
# Min-max scale every column. The scaler is fitted on the training split only
# and then applied to the test split, so no test information leaks into it.
# The scaler returns bare arrays; wrap them with the original column names and
# row index so the frames stay labelled and aligned with y_train / y_test.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [11]:
# Refit on the min-max scaled features.
clf3 = GaussianNB().fit(X_train, y_train)
get_metrics(clf3)

{'train_accuracy': 0.7517333333333334,
 'test_accuracy': 0.7577333333333334,
 'train_f1': 0.515860634425377,
 'test_f1': 0.5158539834798828,
 'train_precision': 0.4554637281910009,
 'test_precision': 0.4563884959924564,
 'train_recall': 0.5947242206235012,
 'test_recall': 0.5931372549019608}

In [12]:
# Element-wise log(x + 1), applied to the whole frame in one vectorised call
# instead of a Python lambda per cell (DataFrame.applymap is deprecated in
# recent pandas). The arithmetic is unchanged.
# NOTE(review): any scaled test value <= -1 would produce -inf/NaN here.
X_train = np.log(X_train + 1)
X_test = np.log(X_test + 1)

In [13]:
# Refit on the log-transformed features.
clf4 = GaussianNB().fit(X_train, y_train)
get_metrics(clf4)

{'train_accuracy': 0.7519111111111111,
 'test_accuracy': 0.7594666666666666,
 'train_f1': 0.5154513888888889,
 'test_f1': 0.5191897654584222,
 'train_precision': 0.4556476365868631,
 'test_precision': 0.45943396226415095,
 'train_recall': 0.5933253397282174,
 'test_recall': 0.5968137254901961}

## Addressing Class Imbalance

In [14]:
# Label counts for the full data set, the training split and the test split.
tuple(labels.value_counts() for labels in (df[target[0]], y_train, y_test))

(0    23364
 1     6636
 Name: default.payment.next.month, dtype: int64,
 0    17496
 1     5004
 Name: default.payment.next.month, dtype: int64,
 0    5868
 1    1632
 Name: default.payment.next.month, dtype: int64)

In [15]:
# Oversample the training split only; the test split is left untouched.
# Seeded so the synthetic samples, and the metrics that follow, are
# reproducible (same seed as the train/test split).
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

In [16]:
# Refit on the resampled training data. X_train / y_train now hold the SMOTE
# output, so the train_* scores below are computed on the resampled set.
clf5 = GaussianNB().fit(X_train, y_train)
get_metrics(clf5)

{'train_accuracy': 0.5931355738454503,
 'test_accuracy': 0.4149333333333333,
 'train_f1': 0.6903990431662498,
 'test_f1': 0.39906874828814026,
 'train_precision': 0.5571975148302853,
 'test_precision': 0.25696649029982366,
 'train_recall': 0.9072930955647005,
 'test_recall': 0.8927696078431373}