### BAYESIAN METHODS
## Bernoulli Naive Bayes
# Steps
- Encode categorical columns using pd.getdummies(one-hot)
- Create an instance of BernoulliNB with binarize = True so that all features are mapped to binary values
- Simply fit the training data and then predict on the test data.
- Scaling is not required in any of the below methods as no distance measure is used

In [33]:
# Bernoulli Naive Bayes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score

df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')


to_encode = []
for column in df.columns:
    if column == 'LoanID':
        continue
    if df[column].dtype == 'object':
        to_encode.append(column)
    elif df[column].dtype not in ['float64', 'int64', 'float', 'int']:
        to_encode.append(column)

# for column in to_encode:
#     le = LabelEncoder() #for now let's use labelEncoder
#     # le = OneHotEncoder()
#     print('column is ', column)
#     df[column] = le.fit_transform(df[column])
#     test_df[column] = le.transform(test_df[column])

# one hot encoding
df = pd.get_dummies(df, columns=to_encode, drop_first=True, dtype=int)
test_df = pd.get_dummies(test_df, columns=to_encode, drop_first=True, dtype=int)
print('encoding done')

#turns out it doesn't make a difference with the new encoder


bnb = BernoulliNB()
param_grid = {
    "binarize": [True],
    "fit_prior": [True, False]
}
bnb_clf = RandomizedSearchCV(estimator=bnb, param_distributions= param_grid)

# bnb_clf = BernoulliNB(binarize=True, fit_prior=False)
train_df, validation_df = train_test_split(df, test_size=0.25, random_state=17)
# , stratify=df['Default'] if we add this somehow we get lower validation score

train_df = train_df.drop(columns=['LoanID'])
validation_df = validation_df.drop(columns=['LoanID'])

x_train_df = train_df.drop(columns=['Default'])
y_train_df = train_df['Default']


bnb_clf.fit(x_train_df, y_train_df)

x_validation_df = validation_df.drop(columns=['Default'])
y_validation_df = validation_df['Default']

y_validation_pred = bnb_clf.predict(x_validation_df)

train_acc = accuracy_score(y_train_df, bnb_clf.predict(x_train_df))
valid_acc = accuracy_score(y_validation_df, y_validation_pred)

print(f'Training accuracy {train_acc}')
print(f'Validation accuracy {valid_acc}')

print(f'Best params are {bnb_clf.best_params_}')


newdf = pd.DataFrame({"LoanID": test_df['LoanID'], "Default": bnb_clf.predict(X=test_df.drop(columns=['LoanID']))})
newdf.to_csv('./csv_submissions/bernoulli_naive_bayes_out.csv', index=False)

encoding done




Training accuracy 0.8833930564530341
Validation accuracy 0.8847072645388682
Best params are {'fit_prior': True, 'binarize': True}


## Gaussian Naive Bayes
- Similar steps like bernoulli except use GaussianNB this time

In [37]:
# Gaussian Naive Bayes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PowerTransformer

df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

categorical_cols.remove('LoanID') # we don't want to encode this

print(categorical_cols)

df = df.drop(columns=['LoanID'])

ids = test_df['LoanID']
test_df = test_df.drop(columns=['LoanID'])

for column in categorical_cols:
    le = LabelEncoder() #for now let's use labelEncoder
    df[column] = le.fit_transform(df[column])
    test_df[column] = le.transform(test_df[column])


# df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)
# test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True, dtype=int)


# pt = PowerTransformer()
# df = pt.fit_transform(df.drop(columns=['Default']))

#both have been one hot encoded
train_df, validation_df = train_test_split(df, test_size=0.2, random_state=17, stratify=df['Default'])

print(train_df.head())
print('dissapear')
x_train_df = train_df.drop(columns=['Default'])
y_train_df = train_df['Default']
x_validation_df = validation_df.drop(columns=['Default'])
y_validation_df = validation_df['Default']



# scaler = StandardScaler()
# scaler.fit_transform(x_train_df)
# scaler.transform(x_validation_df)
# scaler.transform(test_df)

pt = PowerTransformer()
x_train_df = pt.fit_transform(x_train_df)
x_validation_df = pt.transform(x_validation_df)
test_df = pt.transform(test_df)

gnb_clf = GaussianNB(priors=[0.5, 0.5])
gnb_clf.fit(x_train_df, y_train_df)


y_validation_pred = gnb_clf.predict(x_validation_df)

train_acc = accuracy_score(y_train_df, gnb_clf.predict(x_train_df))
valid_acc = accuracy_score(y_validation_df, y_validation_pred)

print(f'Training accuracy {train_acc}')
print(f'Validation accuracy {valid_acc}')


newdf = pd.DataFrame({"LoanID": ids, "Default": gnb_clf.predict(X=test_df)})
newdf.to_csv('./csv_submissions/gaussian_naive_bayes_out.csv', index=False)

# pretty obvious that this would perform terribly 
# as the data features don't form a gaussian distribution as seen in eda, 
# they form something more like a uniform distribution



['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
        Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  \
156247   22  146848      122264          427             102               1   
6305     43  106772      133332          342              16               3   
201675   59   17794       31479          409             118               2   
33277    29  114495        6845          395              67               4   
74647    25  107260      204081          598              31               4   

        InterestRate  LoanTerm  DTIRatio  Education  EmploymentType  \
156247         10.47        48      0.14          0               0   
6305            5.05        60      0.17          0               2   
201675         15.95        12      0.21          0               3   
33277           6.74        36      0.71          1               0   
74647           4.74        60      0.37          0  

## Categorical Naive Bayes


In [47]:
# Categorical Naive Bayes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score

df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

others = []
for column in df.columns:
    if column not in categorical_cols:
        others.append(column)


categorical_cols.remove('LoanID') # we don't want to encode this




# for column in others:
#     req_mean = df[column].mean()
#     def f1(el):
#         if el >= req_mean:
#             return 1
#         else:
#             return 0
#     df[column] = df[column].apply(f1)

print(categorical_cols)

df = df.drop(columns='LoanID')

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True, dtype=int)

assert df.columns.all() == test_df.columns.all()


#both have been one hot encoded
train_df, validation_df = train_test_split(df, test_size=0.3, random_state=17)

x_train_df = train_df.drop(columns=['Default'])
y_train_df = train_df['Default']
x_validation_df = validation_df.drop(columns=['Default'])
y_validation_df = validation_df['Default']

ids = test_df['LoanID']
test_df = test_df.drop(columns=['LoanID'])

# scaler = StandardScaler()
# scaler.fit_transform(x_train_df)
# scaler.transform(x_validation_df)
# scaler.transform(test_df)


param_grid = {
    "fit_prior": [False, True],
    # "alpha": [2.0, 5.0]
}
# cnb_clf = CategoricalNB(fit_prior=False)
# cnb_clf.fit(x_train_df, y_train_df)

cnb_clf = CategoricalNB()
rs = RandomizedSearchCV(cnb_clf, param_distributions=param_grid, cv=10)

rs.fit(x_train_df, y_train_df)
y_validation_pred = rs.predict(x_validation_df)

train_acc = accuracy_score(y_train_df, rs.predict(x_train_df))
valid_acc = accuracy_score(y_validation_df, y_validation_pred)

print(f'Training accuracy {train_acc}')
print(f'Validation accuracy {valid_acc}')

print('best parameters are', rs.best_params_)

newdf = pd.DataFrame({"LoanID": ids, "Default": rs.predict(X=test_df)})
newdf.to_csv('categorical_naive_bayes_out.csv', index=False)

# this also doesn't work that well as we have categorical columns mixed with continuous columns like income and it overfits training data




['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']


Traceback (most recent call last):
  File "/home/prateek/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/prateek/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 415, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/prateek/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "/home/prateek/anaconda3/lib/python3.12/site-packages/sklearn/naive_bayes.py", line 102, in predict
    jll = self._joint_log_likelihood(X)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/prateek/anaconda3/lib/python3.12/site-packages/sklearn/naive_bayes.py", line 1513, in _joint_log_l

Training accuracy 0.9169120166721448
Validation accuracy 0.5923895307094837
best parameters are {'fit_prior': False}
