# Bayesian classifier  for Credit Card Default Dataset

In [None]:
## TODO ?
# check the correlation between variables and if it is possible to reduce the dimensionality ?


In [43]:
## IMPORTING LIBRARIES  ##
import math
import numpy as np
import matplotlib.pyplot as pyplot

## IMPORTING DATASET ##
## source: https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset/data ##
default_credit_card_dataset=np.genfromtxt('UCI_Credit_Card.csv',delimiter=',')

# Remove the first line of the data set, which only contain the column names
dataset = default_credit_card_dataset[1:-1]

In [44]:
## WARMING UP (will not be in the final delivery of the project)  ##
## TRYING TO REPRODUCE MAHYAR'S VISUAL EXPLORATORY DATA ANALYSIS USING NUMPY INSTEAD OF PANDAS ##

# Select rows where last column (default_payment_next_month) is greater than 0 and only keep the first column (LIMIT_BAL)
# print dataset[dataset[:,-1] > 0][:,1]

bins = 30
pyplot.hist(dataset[:,1], bins = bins, color='m',label = 'Total',alpha=0.5)
pyplot.hist(dataset[dataset[:,-1] > 0][:,1], bins = bins, color='b',label = 'Default')
pyplot.xlabel('Credit Limit (NT dollar)')
pyplot.ylabel('Number of Accounts')
pyplot.title('Fig.1 : Credit Limit ',fontweight="bold", size=12)
pyplot.legend()
pyplot.show()

In [45]:
class gauss_density_estimator:
    def __init__(self,n_dims):
        self.mu = np.zeros((1,n_dims))
        self.n_dims = n_dims
        self.sigma_sq = np.ones(n_dims)
        
    def train(self, train_data):
        self.mu = np.mean(train_data, axis = 0)
        self.sigma_sq =  np.sum((train_data - self.mu) ** 2.0, axis = 0) / train_data.shape[0]
        
    def compute_predictions(self, test_data):
        c = -self.n_dims * np.log(2*np.pi)/2.0 - np.log(np.prod(self.sigma_sq))/2.0
        log_prob = c - np.sum((test_data -  self.mu)**2.0/ (2.0 * self.sigma_sq),axis=1)
        return log_prob

In [46]:
class classif_bayes:

    def __init__(self,modeles_mv, priors):
        self.modeles_mv = modeles_mv
        self.priors = priors
        if len(self.modeles_mv) != len(self.priors):
            print 'Le nombre de modeles MV doit etre egale au nombre de priors!'
        
        self.n_classes = len(self.modeles_mv)
                                                            
    # Retourne une matrice de taille nb. ex. de test x nombre de classes contenant les log
    # probabilités de chaque exemple de test sous chaque modèle MV. 
    def compute_predictions(self, test_data, eval_by_group=False):
        log_pred = np.empty((test_data.shape[0],self.n_classes))

        for i in range(self.n_classes):
            # ici il va falloir utiliser modeles_mv[i] et priors pour remplir
            # chaque colonne de log_pred (c'est plus efficace de faire tout une
            # colonne a la fois)
            
            log_pred[:,i] = self.modeles_mv[i].compute_predictions(test_data) +  np.log(self.priors[i])

        return log_pred

In [47]:
## CREATING TEST AND TRAINING GROUPS ##
# 1- separate the two classes
dataset_default_payment_records = dataset[dataset[:,-1] > 0]
dataset_nodefault_payment_records = dataset[dataset[:,-1] < 1]

# 2- remove the last column (which is the label)
# dataset_default_payment_records = np.delete(dataset_default_payment_records, -1, axis=1)
# dataset_nodefault_payment_records = np.delete(dataset_nodefault_payment_records, -1, axis=1)

# 2- shuffle each class dataset
np.random.seed(123)
np.random.shuffle(dataset_default_payment_records)
np.random.shuffle(dataset_nodefault_payment_records)

# 3- divide each class in 60-40 proportion (for training and testing purposes)
default_dataset_number_of_records = len(dataset_default_payment_records)
nodefault_dataset_number_of_records = len(dataset_nodefault_payment_records)

split_index_default = int(default_dataset_number_of_records * 0.6)
split_index_nodefault = int(nodefault_dataset_number_of_records * 0.6)

dataset_default_for_training = dataset_default_payment_records[:split_index_default]
dataset_default_for_testing = dataset_default_payment_records[split_index_default:]

dataset_nodefault_for_training = dataset_nodefault_payment_records[:split_index_nodefault]
dataset_nodefault_for_testing = dataset_nodefault_payment_records[split_index_nodefault:]

In [48]:
## TRAINING THE DIAGONAL GAUSSIAN MODELS ##

# Reminder : shape return a tuple (number of rows, number of columns)
# minus 2 because the first column (ID) and the last (labels) are irrelevants when training
number_of_dimensions = dataset_default_for_training.shape[1] - 2

# Initializing Gaussian kernel
default_payment_diagonal_gaussian_model = gauss_density_estimator(number_of_dimensions)
nodefault_payment_diagonal_gaussian_model = gauss_density_estimator(number_of_dimensions)

# Training - we don't keep the first and the last column (respectively ID and labels) for the training
default_payment_diagonal_gaussian_model.train(dataset_default_for_training[:,1:-1])
nodefault_payment_diagonal_gaussian_model.train(dataset_nodefault_for_training[:,1:-1])

In [49]:
## BAYES CLASSIFIER INITIALIZATION ##

# 1- Calculating priors for each class
total_number_of_records = dataset.shape[0]
# if there the division is made only by int numbers, the result will be 0
# need to convert at least one int to float to avoid that
total_number_of_records = float(total_number_of_records)

priors_nodefault_payment = nodefault_dataset_number_of_records / total_number_of_records
priors_default_payment = default_dataset_number_of_records / total_number_of_records

# 2- Initializing bayes classifier
models_array = [default_payment_diagonal_gaussian_model, nodefault_payment_diagonal_gaussian_model]
priors_array = [priors_default_payment, priors_nodefault_payment]
default_payment_bayes_classifier = classif_bayes(models_array, priors_array)

In [50]:
## BAYES CLASSIFIER ERROR RATES 24 DIMENSIONS ##

# 1- Get the total dataset training
dataset_train = np.concatenate([dataset_default_for_training, dataset_nodefault_for_training])
dataset_test = np.concatenate([dataset_default_for_testing, dataset_nodefault_for_testing])

log_prob_train=default_payment_bayes_classifier.compute_predictions(dataset_train[:,1:-1])
log_prob_test=default_payment_bayes_classifier.compute_predictions(dataset_test[:,1:-1])

classesPred_train = log_prob_train.argmax(1)+1
classesPred_test = log_prob_test.argmax(1)+1

print "Error Rate (training) %.2f%%" % ((1-(classesPred_train==dataset_train[:,-1]).mean())*100.0)
print "Error Rate (test) %.2f%%" % ((1-(classesPred_test==dataset_test[:,-1]).mean())*100.0)

Error Rate (training) 84.24%
Error Rate (test) 84.51%
