In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Read both CSV files from the directory the notebook was started in.
path = os.getcwd()
df_test = pd.read_csv(os.path.join(path, 'test.csv'))
df_train = pd.read_csv(os.path.join(path, 'train.csv'))

# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Preview the first rows of the training frame.
df_train.head()

<h1> Data Overview </h1>

In [None]:
#dimension
# Print the (rows, columns) shape of each frame next to its label.
for label, frame in (('Dimension of Test Data ', df_test),
                     ('Dimension of Train Data', df_train)):
    print(label, frame.shape)

In [None]:
#data type
# Show the dtype pandas inferred for every training column.
df_train.dtypes

In [None]:
#missing value
def missing_data(data):
    """Summarise how many missing values each column of ``data`` contains.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns 'Column Name' and
        'Total Missing Values', ordered from most to fewest missing values.
        The index keeps each column's original position in ``data``.
    """
    null_counts = data.isna().sum()
    # reset_index() moves the column labels out of the index into a regular column.
    summary = null_counts.reset_index()
    summary.columns = ['Column Name', 'Total Missing Values']
    return summary.sort_values(by='Total Missing Values', ascending=False)

# Missing-value counts per column for the training frame.
missing_data(df_train)

In [None]:
#missing data df_test

missing_data(df_test)

In [None]:
#separate categorical and numeric columns
df_train_cat = df_train.select_dtypes(include = ['object'])
df_train_numeric = df_train.select_dtypes(include = ['int64', 'float64']).dropna() #drop rows containing NaN so the distribution graphs can be drawn

In [None]:
#categorical value
# Apply the seaborn theme once; it does not need to be re-applied per figure.
sns.set()

# columns[1:] skips the first object column (presumably the Loan_ID identifier -- confirm).
for column in df_train_cat.columns[1:]:
    # Size the figure when it is created and draw on its axes explicitly
    # instead of relying on pyplot's implicit "current axes".
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.countplot(x=column, data=df_train_cat, ax=ax)

In [None]:
#numeric distribution

# Apply the seaborn theme once; it does not need to be re-applied per figure.
sns.set()

for column in df_train_numeric.columns:
    # Size the figure when it is created and draw on its axes explicitly
    # instead of relying on pyplot's implicit "current axes".
    fig, ax = plt.subplots(figsize=(5, 3))
    # NOTE(review): distplot is deprecated since seaborn 0.11; histplot(..., kde=True)
    # is the replacement once the environment's seaborn version is confirmed.
    sns.distplot(df_train_numeric[column], ax=ax)

In [None]:
#correlation matrix
# Give the heatmap its own figure. A bare `fig` in this cell would refer to the
# last figure left over from the distribution-plot loop, so resizing it would
# not affect the heatmap at all.
fig, ax = plt.subplots(figsize=(15, 5))

# Select the numeric columns explicitly: DataFrame.corr() raises on object
# columns in pandas >= 2.0, while older versions silently skipped them.
sns.heatmap(df_train.select_dtypes(include=['number']).corr(), linewidths=.5, ax=ax)

<p> Key finding: Applicant Income and Co-applicant Income are highly correlated with the amount of loan borrowed. </p>
<b> Recommendations </b>
<ul>
<li> Drop the Applicant and Co-applicant Income columns and generate a new loan-to-income ratio column </li>
<li> Predict <i> Loan Amount </i> based on <i> Applicant and Co-applicant Income </i> with KNN </li>
<li> Drop the remaining rows that contain NA values </li>
<li> Encode all categorical data </li>
<li> Upsample loan status 'zero' </li>
</ul>

<h1> Feature Engineering </h1>

In [None]:
#predict loan amount
# Keep only the single predictor and the target used to impute LoanAmount.
df_loan_predict = df_train[['ApplicantIncome', 'LoanAmount']]
# Rows without a LoanAmount are the ones to predict (target column removed) ...
df_loan_predict_test = df_loan_predict[df_loan_predict['LoanAmount'].isnull()].drop(['LoanAmount'], axis = 1)
# ... and rows that have one are used to fit and evaluate the imputation model.
df_loan_predict_train = df_loan_predict[df_loan_predict['LoanAmount'].notnull()]

In [None]:
# Preview the rows that have a known LoanAmount.
df_loan_predict_train.head()

In [None]:
#create model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split


# Predictor frame (everything except the target) and the LoanAmount target.
x = df_loan_predict_train.drop(columns=['LoanAmount'])
y = df_loan_predict_train['LoanAmount']

# Hold out 20% of the known rows to evaluate the imputation model.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit a default k-nearest-neighbours regressor and predict the held-out rows.
knn = KNeighborsRegressor()
pred_knn = knn.fit(x_train, y_train).predict(x_test)

In [None]:
# R^2 of the imputation model on the held-out split. score() has to be given
# the true targets (y_test); scoring against the model's own predictions
# compares them with themselves and always returns 1.0.
knn.score(x_test, y_test)

In [None]:
#fill missing value with KNN
# Predict from the ApplicantIncome column only. Selecting it explicitly keeps
# this cell re-runnable: after the first run the frame also carries the
# predicted LoanAmount column, which must not be fed back to the model.
prediction_data = knn.predict(df_loan_predict_test[['ApplicantIncome']])
predicted_loan = pd.DataFrame({'LoanAmount': prediction_data})

# Both frames now have a 0..n-1 index, so the assignment lines up row by row.
df_loan_predict_test = df_loan_predict_test.reset_index(drop = True)
df_loan_predict_test['LoanAmount'] = predicted_loan['LoanAmount']

df_loan_predict_test

In [None]:
#merge missing value to the df_train
# Work on a copy so the originally loaded df_train stays untouched.
df_train_test = df_train.copy()

In [None]:
#append table
# Split the copy into rows with and without a recorded LoanAmount, using one
# mask taken from the frame that is actually being sliced.
missing_loan_mask = df_train_test['LoanAmount'].isnull()
df_train_test_null = df_train_test[missing_loan_mask].reset_index(drop = True)
df_train_test_notnull = df_train_test[~missing_loan_mask].reset_index(drop = True)

# Fill the gaps with the predicted amounts. Both frames carry a 0..n-1 index
# over the same rows in the same order, so the values align row by row.
df_train_test_null['LoanAmount'] = df_loan_predict_test['LoanAmount']

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True gives the combined frame unique row labels instead of two
# overlapping 0..n ranges.
df_loan_clean = pd.concat([df_train_test_notnull, df_train_test_null], ignore_index = True)

In [None]:
# Remaining missing values per column after LoanAmount has been filled.
df_loan_clean.isna().sum()

In [None]:
#drop row with na
# dropna() with default arguments removes every row that has at least one missing value.
df_naan_clean = df_loan_clean.dropna()

In [None]:
#generate the loan-to-income column (in percent) and drop the Loan_ID identifier
# assign() returns a new frame, so no column is written into the result of an
# earlier dropna() (which is what triggers SettingWithCopyWarning).
# NOTE(review): a row with ApplicantIncome == 0 yields inf here -- confirm
# whether such rows exist before scaling or modelling on this column.
df_naan_clean = df_naan_clean.assign(
    LoanToIncome = df_naan_clean['LoanAmount'] / df_naan_clean['ApplicantIncome'] * 100
)
df_test = df_test.assign(
    LoanToIncome = df_test['LoanAmount'] / df_test['ApplicantIncome'] * 100
)

df_clean = df_naan_clean.drop(columns = ['Loan_ID'])
# errors='ignore' keeps this cell re-runnable after Loan_ID has already been
# removed from df_test by a previous execution.
df_test = df_test.drop(columns = ['Loan_ID'], errors = 'ignore')

In [None]:
# Preview the cleaned training frame.
df_clean.head()

In [None]:
# Drop test rows with any missing value. The explicit copy makes the result an
# independent frame, so the column assignments done on it in later cells do not
# raise SettingWithCopyWarning.
df_test_naan = df_test.dropna().copy()
df_test_naan.head()

In [None]:
#encode categorical data
# Preview the object-typed columns that still need encoding.
df_clean.select_dtypes(include = ['object']).head()

In [None]:
#encode categorical data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

le = LabelEncoder()

# Encode every categorical feature as integer codes. The encoder is re-fitted
# on the training column before each use and then applied to both frames, so
# the test frame receives the mapping learned from the training frame.
for cat_col in ['Dependents', 'Education', 'Gender', 'Married',
                'Property_Area', 'Self_Employed']:
    le.fit(df_clean[cat_col])
    df_clean[cat_col] = le.transform(df_clean[cat_col])
    df_test_naan[cat_col] = le.transform(df_test_naan[cat_col])

#Loan Status
# Only the training frame is encoded for the target; `le` stays fitted on it.
le.fit(df_clean['Loan_Status'])
df_clean['Loan_Status'] = le.transform(df_clean['Loan_Status'])

In [None]:
# Preview the training frame after label encoding.
df_clean.head()

In [None]:
# Preview the test frame after label encoding.
df_test_naan.head()

In [None]:
#class balance of the target before upsampling
df_clean.Loan_Status.value_counts()

In [None]:
#upsampling 0 loan

from sklearn.utils import resample

# Split the training frame by target class (0 is treated as the minority class).
minority = df_clean[df_clean.Loan_Status == 0]
majority = df_clean[df_clean.Loan_Status == 1]

# Sample the minority class with replacement until it matches the majority
# class. The target size is taken from the data instead of a hard-coded row
# count, so the classes stay balanced if the cleaning steps above change.
# NOTE(review): this runs before the train/test split done in a later cell, so
# copies of the same minority row can end up on both sides of that split.
df_minority = resample(minority, replace = True, n_samples = len(majority), random_state = 303)
df_upsampled = pd.concat([majority, df_minority])
df_upsampled.Loan_Status.value_counts()

In [None]:
#minmax
from sklearn.preprocessing import MinMaxScaler

# Rescale the continuous columns with a scaler fitted on the upsampled frame.
mm = MinMaxScaler()
scaled_columns = ['LoanToIncome', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
df_upsampled[scaled_columns] = mm.fit_transform(df_upsampled[scaled_columns])

In [None]:
# Preview the upsampled frame after scaling.
df_upsampled.head()

In [None]:
#the test data could not be transformed with the fitted scaler, so the call below is left disabled

#df_test_naan[['LoanToIncome', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']] = mm.transform(df_test_naan[['LoanToIncome', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']])

In [None]:
# Preview the (unscaled) test frame.
df_test_naan.head()

In [None]:
#model building
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Feature matrix (everything except the target) and the Loan_Status target.
x = df_upsampled.drop(columns=['Loan_Status'])
y = df_upsampled['Loan_Status']

# Hold out 30% of the upsampled rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

#model function
# Logistic regression with default settings.
logreg = LogisticRegression()
pred_logreg = logreg.fit(x_train, y_train).predict(x_test)

In [None]:
#svc
# Support vector classifier with default settings.
clf = svm.SVC()
pred_clf = clf.fit(x_train, y_train).predict(x_test)

In [None]:
#random forest
# random_state pins the bootstrap and feature sampling so the score is
# reproducible across runs (same seed as the train/test split).
rfc = RandomForestClassifier(n_estimators = 200, random_state = 42)
rfc.fit(x_train, y_train)
pred_rfc = rfc.predict(x_test)

In [None]:
#neural networks
# Three hidden layers of 6 units each. random_state pins the weight
# initialisation so the score is reproducible across runs.
mlpc = MLPClassifier(hidden_layer_sizes = (6, 6, 6), max_iter = 500, random_state = 42)
mlpc.fit(x_train, y_train)
pred_mlpc = mlpc.predict(x_test)

In [None]:
#Perceptron
# Linear perceptron with default settings.
prec = Perceptron()
pred_prec = prec.fit(x_train, y_train).predict(x_test)

In [None]:
#SGDClassifier
# random_state pins the data shuffling so the score is reproducible across runs.
sgdc = SGDClassifier(random_state = 42)
sgdc.fit(x_train, y_train)
pred_sgdc = sgdc.predict(x_test)

In [None]:
#KNN
# NOTE: this rebinds `knn` and `pred_knn`, which the LoanAmount imputation cells
# used for a KNeighborsRegressor. Re-running those imputation cells after this
# one would pick up the classifier instead of the regressor.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
pred_knn = knn.predict(x_test)

In [None]:
#Naive Bayes
# Gaussian naive Bayes with default settings.
gauss = GaussianNB()
pred_gauss = gauss.fit(x_train, y_train).predict(x_test)

In [None]:
#Decision tree
# random_state pins the tie-breaking between equally good splits so the score
# is reproducible across runs.
dstree = DecisionTreeClassifier(random_state = 42)
dstree.fit(x_train, y_train)
pred_dstree = dstree.predict(x_test)

In [None]:
#model evaluation

#accuracy score
# Accuracy of every fitted classifier on the same held-out split.
rfc_score = accuracy_score(y_test, pred_rfc)
mlpc_score = accuracy_score(y_test, pred_mlpc)
logreg_score = accuracy_score(y_test, pred_logreg)
# The SVC model is fitted above as well, so it is scored and listed too.
clf_score = accuracy_score(y_test, pred_clf)
prec_score = accuracy_score(y_test, pred_prec)
sgdc_score = accuracy_score(y_test, pred_sgdc)
knn_score = accuracy_score(y_test, pred_knn)
gauss_score = accuracy_score(y_test, pred_gauss)
dstree_score = accuracy_score(y_test, pred_dstree)

# One row per model; the two lists below must stay in the same order.
modelResult = pd.DataFrame({
    'Model': ['Random Forest', 'Neural Networks', 'Logistic Regression', 'SVC',
              'Perceptron', 'SGDC', 'KNN', 'Naive Bayes', 'Decision Tree'],
    'Score': [rfc_score, mlpc_score, logreg_score, clf_score,
              prec_score, sgdc_score, knn_score, gauss_score, dstree_score]
})

# Best-scoring model first.
modelResult.sort_values(by = 'Score', ascending = False)

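<p> <b> <i> Random Forest </i> and <i> Decision Tree </i> appear to be the top models. </b> Note that the minority class was upsampled before the train/test split, so copies of the same row can appear in both the training and the test set; the accuracy of these models may therefore be optimistic. </p>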

In [None]:
#confusion matrix 
from sklearn.metrics import confusion_matrix

# Confusion matrices of the two best models; labels=[1, 0] puts class 1 in the
# first row and column.
for template, predictions in (("Random Forest {}", pred_rfc),
                              ("Decision Tree {}", pred_dstree)):
    print(template.format(confusion_matrix(y_test, predictions, labels = [1, 0])))

<h3> Hyperparameter Tuning </h3>

In [None]:
#hyperparameter tuning
# Starting point for tuning: list the settings of the fitted random forest.
# No parameter search is performed in this cell.

print("Random Forest Parameter \n")
print(rfc.get_params())