# Data preparation

## Read csv file

In [None]:
import pandas as pd

# Load the loan records from disk and print a column / dtype / non-null summary.
csv_path = 'assignment/loan_data_set.csv'
df = pd.read_csv(csv_path)
df.info()

## Fill empty cells

In [None]:
# Columns whose missing cells are filled with the column's most frequent value.
mode_filled_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'Credit_History',
    'Loan_Amount_Term',
]
fillna_mapping = {
    column: df[column].mode(dropna=True).iloc[0] for column in mode_filled_columns
}
# LoanAmount is the only column imputed with its mean instead of its mode.
fillna_mapping['LoanAmount'] = df['LoanAmount'].mean(skipna=True)

# Apply every per-column fill value in one pass, then re-check the null counts.
df.fillna(value=fillna_mapping, inplace=True)
df.info()

## Encode labels

In [None]:
# A single lookup table is applied to every column of the frame: any cell whose
# value equals one of these keys is replaced by the corresponding integer code.
label_mapping = {
    'Male': 0,
    'Female': 1,
    'No': 0,
    'Yes': 1,
    '0': 0,
    '1': 1,
    '2': 2,
    '3+': 3,
    'Not Graduate': 0,
    'Graduate': 1,
    'N': 0,
    'Y': 1,
}
# replace() used to downcast fully-numeric object columns to int implicitly;
# pandas 2.2 deprecates that silent downcast, so the dtype inference is requested
# explicitly to keep the encoded columns numeric for the estimators downstream.
df = df.replace(label_mapping).infer_objects()
# cast ApplicantIncome column to float
df['ApplicantIncome'] = df['ApplicantIncome'].astype(float)
df.info()

## Drop Loan_ID column

In [None]:
# Remove the Loan_ID column. errors='ignore' keeps this cell re-runnable: a second
# execution finds the column already gone instead of raising KeyError.
df = df.drop(columns='Loan_ID', errors='ignore')
df.info()

## Prepare model Input/Output

In [None]:
# Model inputs: the ten feature columns, in the order the estimator will see them.
feature_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]
X = df[feature_columns]
# Model output: the Loan_Status column.
Y = df['Loan_Status']

## Class Balance

In [None]:
# Pie chart of how often each target value occurs, labelled with percentages.
class_counts = Y.value_counts()
class_counts.plot.pie(autopct='%1.1f%%')
Y.info()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class so both target values are equally frequent.
# A fixed random_state makes the synthetic rows (and every metric computed after
# this cell) reproducible; 42 matches the seed used for the train/test split.
# NOTE(review): resampling happens before the train/test split, so synthetic rows
# derived from future test rows can land in the training set and inflate the
# scores. Consider splitting first and resampling only the training part.
sm = SMOTE(random_state=42)
X, Y = sm.fit_resample(X, Y)
# Re-draw the class distribution to confirm the classes are now balanced.
Y.value_counts().plot(kind='pie', autopct='%1.1f%%')
Y.info()


## Train/Test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)
# Summarise both feature frames (training part first, then the held-out part).
for split_frame in (X_train, X_test):
    split_frame.info()


# Models

## Utility functions

### Model evaluation

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, \
    roc_auc_score, RocCurveDisplay
import seaborn as sns


def evaluate_model(Y_true, Y_pred, Y_pred_proba):
    """Print classification metrics and draw a confusion matrix and ROC curve.

    Parameters
    ----------
    Y_true : array-like
        Ground-truth labels; the confusion matrix is built for labels 0 and 1.
    Y_pred : array-like
        Labels predicted by the model for the same samples.
    Y_pred_proba : 2-D array
        Per-class scores; column 1 is used as the score for AUC and the ROC curve.

    Notes
    -----
    Precision, recall and F1 are reported with label 0 as the positive class.
    All scores are printed as percentages; nothing is returned.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and F1
    # are symmetric, but with the arguments swapped the confusion matrix is
    # transposed and precision/recall are reported under each other's name.
    cf_matrix = confusion_matrix(Y_true, Y_pred, labels=[0, 1])
    # fmt='d' prints the counts as integers instead of scientific notation.
    heatmap_ax = sns.heatmap(cf_matrix, annot=True, fmt='d')
    heatmap_ax.set(xlabel='Predicted label', ylabel='True label')
    # Accuracy Score
    print ('Accuracy Score :{:.2f}'.format(accuracy_score(Y_true, Y_pred)*100))
    # Precision Score
    print ('Precision Score :{:.2f}'.format(precision_score(Y_true, Y_pred, pos_label=0)*100))
    # Recall Score
    print ('Recall Score :{:.2f}'.format(recall_score(Y_true, Y_pred, pos_label=0)*100))
    # F1 Score
    print ('F1 Score :{:.2f}'.format(f1_score(Y_true, Y_pred, pos_label=0)*100))
    # AUC Score
    print('AUC: {:.2f}'.format(roc_auc_score(Y_true, Y_pred_proba[:, 1])*100))
    # plot ROC curve
    RocCurveDisplay.from_predictions(Y_true, Y_pred_proba[:, 1])

### Saving models

In [None]:
import pickle

def save_model(model, modelname):
    """Pickle ``model`` to ``models/<modelname>``, creating the folder if needed.

    Parameters
    ----------
    model : object
        Any picklable object (here: a fitted estimator).
    modelname : str
        File name, relative to the ``models`` directory, to write the pickle to.
        An existing file with that name is overwritten.
    """
    import os  # local import so this cell does not depend on another cell's imports

    model_path = f"models/{modelname}"
    # open(..., 'wb') does not create missing parent directories, so a fresh
    # checkout without a models/ folder would fail with FileNotFoundError.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

### Using saved model

In [None]:
import pickle
def predict_result(inputvector, modelname):
    """Load the pickled model ``models/<modelname>`` and predict for one sample.

    Parameters
    ----------
    inputvector : sequence
        Feature values of a single sample; it is wrapped in a list so the model
        receives a one-row batch.
    modelname : str
        File name of the pickle inside the ``models`` directory.

    Returns
    -------
    Whatever the loaded model's ``predict`` returns for the one-row batch.

    Notes
    -----
    Unpickling executes code stored in the file, so only load model files that
    come from a trusted source.
    """
    model_path = f"models/{modelname}"
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    single_row_batch = [inputvector]
    return loaded_model.predict(single_row_batch)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import CategoricalNB
# CategoricalNB sizes each feature's probability table from the largest value it
# saw during fit; a larger value appearing only in the held-out rows would index
# past the table and raise IndexError. Sizing the tables from both splits avoids
# that (only the value range is taken from the test rows, not their labels).
category_counts = pd.concat([X_train, X_test]).max().astype(int).to_numpy() + 1
# alpha=0 switches smoothing off, so any (category, class) pair absent from the
# training rows gets probability 0 -> log(0) -> NaN in predict_proba. alpha=1.0
# is standard Laplace smoothing.
# NOTE(review): several inputs (incomes, loan amount) are continuous, so almost
# every value becomes its own category; consider binning them or using another
# Naive Bayes variant for those columns.
bayes_model = CategoricalNB(alpha=1.0, min_categories=category_counts)
# train model
bayes_model.fit(X_train, Y_train)
# predict on test split
Y_pred_bayes = bayes_model.predict(X_test)
Y_pred_bayes_proba = bayes_model.predict_proba(X_test)
# evaluate model
evaluate_model(Y_test, Y_pred_bayes, Y_pred_bayes_proba)

In [None]:
# Persist the fitted model, then reload it from disk to score one hand-written
# sample whose values follow the same column order as the training features.
save_model(bayes_model, 'bayes')
sample_applicant = [0, 1, 2, 1, 0, 4006, 1526, 168, 360, 1]
predict_result(sample_applicant, 'bayes')