In [1]:
import base64
from datetime import date, datetime
import json
import os
import random
import re
import time

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import seaborn as sns
plt.rcParams['axes.unicode_minus'] = False
# Jupyter Notebook
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
from tqdm import tqdm

# Set random seed for reproducibility
def set_seeds(seed=777):
    """Seed Python's ``random`` module and NumPy's legacy global RNG.

    Also exports ``seed`` as the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): per the Python docs that variable is read at interpreter
    start-up, so setting it here should only matter for processes launched
    afterwards -- confirm if hash reproducibility is needed in this kernel.

    Parameters
    ----------
    seed : int, default 777
        Value handed to every seeding call.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

SEED = 777
set_seeds(SEED)
print(f'Random seed set to: {SEED}')

# Sanity check: drawing, re-seeding and drawing again should print the
# same three numbers twice.
draw_before_reseed = np.random.rand(3)
print(draw_before_reseed)
set_seeds(SEED)
draw_after_reseed = np.random.rand(3)
print(draw_after_reseed)

# Display entire DataFrame
def print_all(df):
    """Display ``df`` without row or column truncation.

    The pandas display options are overridden only for the duration of the
    call; floats are rendered with thousands separators and 4 decimals.
    """
    full_view_options = (
        'display.max_rows', None,
        'display.max_columns', None,
        'display.float_format', '{:,.4f}'.format,
    )
    with pd.option_context(*full_view_options):
        display(df)
# Display entire columns
def print_cols(df, n=5):
    """Print ``df.shape`` and display its first ``n`` rows with every column.

    Column truncation is disabled and floats use 4 decimals with thousands
    separators, but only while this call is running.
    """
    print(df.shape)
    leading_rows = df[:n]
    all_columns_options = (
        'display.max_columns', None,
        'display.float_format', '{:,.4f}'.format,
    )
    with pd.option_context(*all_columns_options):
        display(leading_rows)

Random seed set to: 777
[0.15266373 0.30235661 0.06203641]
[0.15266373 0.30235661 0.06203641]


In [3]:
# ===================================================================
# Configurations

# Paths
# Working directory of the kernel (not the location of a script file)
CURRENT_DIR = os.getcwd()

# Directory one level above the working directory
PARENT_DIR = os.path.dirname(CURRENT_DIR)

# Data directory, expected next to the notebook's working directory
DATA_ROOT = os.path.join(CURRENT_DIR, 'data')

# Pre-scaled train / test CSV files
TRAIN_DATA_PATH, TEST_DATA_PATH = (
    os.path.join(DATA_ROOT, 'train_scaled.csv'),
    os.path.join(DATA_ROOT, 'test_scaled.csv'),
)

# Data Load

In [5]:
# Name of the label column
target_col = 'def_pay'

# Read both CSVs and store the label column as a pandas categorical
label_dtype = {target_col: 'category'}
train_df = pd.read_csv(TRAIN_DATA_PATH).astype(label_dtype)
test_df = pd.read_csv(TEST_DATA_PATH).astype(label_dtype)

# Preview: shape plus the first two rows of each frame, all columns shown
print_cols(train_df, n=2)
print_cols(test_df, n=2)

(24000, 27)


Unnamed: 0,limit_bal,sex_female,age,pay_1,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,edu_1,edu_2,edu_3,marr_1,marr_2,def_pay
0,-0.6761,1,-1.1345,-0.4695,-0.4,-0.3889,-0.3446,-0.3127,-0.3195,-0.0815,-0.0624,-0.0387,0.0062,0.0393,0.0865,-0.2138,-0.1676,-0.1818,-0.2166,-0.0966,-0.2932,0,1,0,0,1,0
1,0.9401,1,1.4672,-0.4695,-0.4,-0.3889,-0.3446,-0.3127,-0.3195,0.0198,0.0663,0.099,0.1729,0.2424,0.2854,-0.1873,-0.1752,-0.1831,-0.1844,-0.1866,-0.1795,0,1,0,0,0,0


(6000, 27)


Unnamed: 0,limit_bal,sex_female,age,pay_1,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,edu_1,edu_2,edu_3,marr_1,marr_2,def_pay
0,-0.1374,1,-0.9177,-0.4695,-0.4,-0.3889,-0.3446,-0.3127,-0.3195,-0.6753,-0.6666,-0.5963,-0.6015,0.4911,0.4874,-0.2319,-0.0118,-0.0353,4.3094,-0.1187,-0.125,0,1,0,0,1,0
1,0.2474,0,-0.3757,2.1586,-0.4,-0.3889,-0.3446,-0.3127,-0.3195,1.4848,1.6005,0.5041,-0.6737,-0.6642,-0.6534,0.1158,-0.0854,-0.2882,-0.3152,-0.3228,-0.2932,0,1,0,0,1,1


In [7]:
# Split target(y), features(X), train, test datasets

# Columns that must not reach the models as features:
#   - the target itself
#   - 'Unnamed: 0', which in the previews above holds 0, 1, ... and looks like
#     the row index written out when the CSVs were saved (NOTE(review): confirm
#     against the preprocessing step). A row counter carries no signal and
#     would be fitted as just another Gaussian feature.
# errors='ignore' keeps this cell working if the CSVs are re-exported without
# the index column; a missing target still fails on the y_* lines below.
NON_FEATURE_COLS = [target_col, 'Unnamed: 0']

X_train = train_df.drop(columns=NON_FEATURE_COLS, errors='ignore')
y_train = train_df[target_col]

X_test = test_df.drop(columns=NON_FEATURE_COLS, errors='ignore')
y_test = test_df[target_col]

# Both models are fitted on X_train and scored on X_test, so the two frames
# must expose the same features in the same order.
assert list(X_train.columns) == list(X_test.columns), 'train/test feature mismatch'

# Gaussian Naive Bayes (GNB)

In [12]:
from model_GNB import SimpleGaussianNB

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, log_loss

# Make sure target is integer (0/1)
y_train_int = y_train.astype(int)
y_test_int = y_test.astype(int)


def print_metrics(title, y_true, y_pred, y_score):
    """Print the evaluation metrics for one binary classifier.

    Parameters
    ----------
    title : str
        Heading printed before the metric lines.
    y_true : array-like
        True labels (0/1).
    y_pred : array-like
        Predicted class labels (0/1).
    y_score : array-like
        Predicted probability of the positive class (default).
    """
    print(title)
    # % of correct predictions (0 or 1)
    print(f'Accuracy: {accuracy_score(y_true, y_pred):.4f}')
    # balance of precision & recall
    print(f'F1 Score: {f1_score(y_true, y_pred):.4f}')
    # how well model ranks defaulters higher
    print(f'ROC-AUC: {roc_auc_score(y_true, y_score):.4f}')
    # precision-focused AUC; strong on imbalanced data
    print(f'PR-AUC: {average_precision_score(y_true, y_score):.4f}')
    # measures how confident and correct predictions are; penalty for wrong confidence
    print(f'Log Loss: {log_loss(y_true, y_score):.4f}')


# GaussianNB built from scratch
model = SimpleGaussianNB()
# Fit on the integer labels, the same target the sklearn benchmark below uses
# (the original call passed the categorical y_train despite its comment).
model.fit(X_train, y_train_int)

# Predict class labels (0 or 1)
my_preds = model.predict(X_test)

# Predict probability of default (class 1)
my_proba = model.predict_proba(X_test)
my_proba = my_proba[:, 1] # second column = positive class probability P(default)

print_metrics('GaussianNB (from scratch)', y_test_int, my_preds, my_proba)

# sklearn GaussianNB (benchmark)
sklearn_gnb = GaussianNB()
sklearn_gnb.fit(X_train, y_train_int)

# Predict class labels and probability
sklearn_preds = sklearn_gnb.predict(X_test)
sklearn_proba = sklearn_gnb.predict_proba(X_test)[:, 1]

# Leading newline separates this report from the previous one
print_metrics('\nGaussianNB (sklearn)', y_test_int, sklearn_preds, sklearn_proba)

GaussianNB (from scratch)
Accuracy: 0.7713
F1 Score: 0.5206
ROC-AUC: 0.7471
PR-AUC: 0.4918
Log Loss: 1.6005

GaussianNB (sklearn)
Accuracy: 0.7713
F1 Score: 0.5206
ROC-AUC: 0.7471
PR-AUC: 0.4915
Log Loss: 1.6016
