In [None]:
# BErnoulli Naive Bayes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score

df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')


to_encode = []
for column in df.columns:
    if column == 'LoanID':
        continue
    if df[column].dtype == 'object':
        to_encode.append(column)
    elif df[column].dtype not in ['float64', 'int64', 'float', 'int']:
        to_encode.append(column)

# for column in to_encode:
#     le = LabelEncoder() #for now let's use labelEncoder
#     # le = OneHotEncoder()
#     print('column is ', column)
#     df[column] = le.fit_transform(df[column])
#     test_df[column] = le.transform(test_df[column])

# one hot encoding
df = pd.get_dummies(df, columns=to_encode, drop_first=True, dtype=int)
test_df = pd.get_dummies(test_df, columns=to_encode, drop_first=True, dtype=int)
print('encoding done')

#turns out it doesn't make a difference with the new encoder


bnb_clf = BernoulliNB(binarize=True)
train_df, validation_df = train_test_split(df, test_size=0.25, random_state=17)
# , stratify=df['Default'] if we add this somehow we get lower validation score

train_df = train_df.drop(columns=['LoanID'])
validation_df = validation_df.drop(columns=['LoanID'])

x_train_df = train_df.drop(columns=['Default'])
y_train_df = train_df['Default']


bnb_clf.fit(x_train_df, y_train_df)

x_validation_df = validation_df.drop(columns=['Default'])
y_validation_df = validation_df['Default']

y_validation_pred = bnb_clf.predict(x_validation_df)

train_acc = accuracy_score(y_train_df, bnb_clf.predict(x_train_df))
valid_acc = accuracy_score(y_validation_df, y_validation_pred)

print(f'Training accuracy {train_acc}')
print(f'Validation accuracy {valid_acc}')


newdf = pd.DataFrame({"LoanID": test_df['LoanID'], "Default": bnb_clf.predict(X=test_df.drop(columns=['LoanID']))})
newdf.to_csv('./csv_submissions/bernoulli_naive_bayes_out.csv', index=False)

encoding done
Training accuracy 0.8833930564530341
Validation accuracy 0.8847072645388682


In [None]:
# Gaussian Naive Bayes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score

df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

categorical_cols.remove('LoanID') # we don't want to encode this

print(categorical_cols)

df = df.drop(columns='LoanID')

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True, dtype=int)




#both have been one hot encoded
train_df, validation_df = train_test_split(df, test_size=0.2, random_state=17)

x_train_df = train_df.drop(columns=['Default'])
y_train_df = train_df['Default']
x_validation_df = validation_df.drop(columns=['Default'])
y_validation_df = validation_df['Default']

ids = test_df['LoanID']
test_df = test_df.drop(columns=['LoanID'])

scaler = StandardScaler()
scaler.fit_transform(x_train_df)
scaler.transform(x_validation_df)
scaler.transform(test_df)

cnb_clf = CategoricalNB(priors=[0.5, 0.5])
cnb_clf.fit(x_train_df, y_train_df)


y_validation_pred = cnb_clf.predict(x_validation_df)

train_acc = accuracy_score(y_train_df, cnb_clf.predict(x_train_df))
valid_acc = accuracy_score(y_validation_df, y_validation_pred)

print(f'Training accuracy {train_acc}')
print(f'Validation accuracy {valid_acc}')


newdf = pd.DataFrame({"LoanID": ids, "Default": cnb_clf.predict(X=test_df)})
newdf.to_csv('./csv_submissions/gaussian_naive_bayes_out.csv', index=False)

# pretty obvious that this would perform terribly as the data features don't form a gaussian distribution

['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
Training accuracy 0.6589238837113958
Validation accuracy 0.6599520266301155


In [None]:
# Categorical Naive Bayes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score

df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

others = []
for column in df.columns:
    if column not in categorical_cols:
        others.append(column)

categorical_cols.remove('LoanID') # we don't want to encode this

print(categorical_cols)

df = df.drop(columns='LoanID')

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True, dtype=int)




#both have been one hot encoded
train_df, validation_df = train_test_split(df, test_size=0.3, random_state=17)

x_train_df = train_df.drop(columns=['Default'])
y_train_df = train_df['Default']
x_validation_df = validation_df.drop(columns=['Default'])
y_validation_df = validation_df['Default']

ids = test_df['LoanID']
test_df = test_df.drop(columns=['LoanID'])

scaler = StandardScaler()
scaler.fit_transform(x_train_df)
scaler.transform(x_validation_df)
scaler.transform(test_df)

cnb_clf = CategoricalNB(fit_prior=False)
cnb_clf.fit(x_train_df, y_train_df)


y_validation_pred = cnb_clf.predict(x_validation_df)

train_acc = accuracy_score(y_train_df, cnb_clf.predict(x_train_df))
valid_acc = accuracy_score(y_validation_df, y_validation_pred)

print(f'Training accuracy {train_acc}')
print(f'Validation accuracy {valid_acc}')


newdf = pd.DataFrame({"LoanID": ids, "Default": cnb_clf.predict(X=test_df)})
newdf.to_csv('categorical_naive_bayes_out.csv', index=False)

# this also doesn't work that well as we have categorical columns mixed with continuous columns like income and it overfits training data

#On similar lines, we expect multinomial naive bayes to fail...

['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
Training accuracy 0.9169120166721448
Validation accuracy 0.5923895307094837
