# Importing the modules

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

#Import all the metrics for validation and evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Loading the Dataset

In [2]:
# Read the loan records from the CSV file in the working directory and preview the first rows.
df = pd.read_csv('Loan Prediction.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


# Summary of the dataset

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df['Loan_Status'].value_counts()

# Preprocessing

In [None]:
# Remove the columns that are not used as model inputs.
cols = ['Loan_ID', 'Gender', 'Married', 'Education', 'Loan_Amount_Term']
df = df.drop(columns=cols)
df.head()

## 1. Datatype Conversion

In [None]:
df.dtypes

In [None]:
df.Dependents.unique()

In [None]:
# Remove every literal '+' character from the Dependents column only
# ('[+]' is a regex character class, so the plus sign is matched literally).
# Presumably this turns values such as '3+' into '3' -- confirm against unique() above.
df = df.replace({
    'Dependents':'[+]'
},'',regex=True)

In [None]:
df.Dependents.unique()

In [None]:
df.isna().sum()

In [None]:
# Categorical columns: fill missing entries with the most frequent value of each column.
from sklearn.impute import SimpleImputer

categorical_na_cols = ['Dependents', 'Self_Employed']
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
df[categorical_na_cols] = imputer.fit_transform(df[categorical_na_cols])

In [None]:
df.isna().sum()

In [None]:
# Numerical columns: fill missing entries with the column mean.
# The mean imputer itself must be fitted; fitting `imputer` (the most-frequent
# imputer from the previous cell) would silently replace the strategy.
# NOTE(review): Credit_History looks binary in the preview, so a mean fill yields
# fractional values -- confirm that is acceptable for this column.
numeric_na_cols = ['LoanAmount', 'Credit_History']
imputer1 = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer1 = imputer1.fit(df[numeric_na_cols])
df[numeric_na_cols] = imputer1.transform(df[numeric_na_cols])

In [None]:
df.isna().sum()

In [None]:
# Dependents was read as text; store it as an integer column.
df['Dependents'] = df['Dependents'].astype(int)
df.dtypes

In [None]:
df.head()

In [None]:
# Combine applicant and co-applicant income into a single feature.
df['Total_income'] = df['ApplicantIncome'].add(df['CoapplicantIncome'])
df.head()

In [None]:
# The two source income columns are now represented by Total_income.
cols1 = ['ApplicantIncome', 'CoapplicantIncome']
df = df.drop(columns=cols1)
df.head()

## Finding Duplicate Values

In [None]:
# Count fully duplicated rows and show a sample of every row involved in a duplicate group.
duplicate_count = df.duplicated().sum()
print(f"There are {duplicate_count} duplicate values.")
df[df.duplicated(keep=False)].head(10)

In [None]:
# Keep the first occurrence of each duplicated row and renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Re-count duplicated rows after the clean-up.
remaining_duplicates = df.duplicated().sum()
print(f"There are {remaining_duplicates} duplicate values.")
df.head()

## 2. Dealing with Outliers

In [None]:
#Boxplot to check outliers in income of the applicant
sns.boxplot(x=df['Total_income'])
plt.show()

In [None]:
# Tukey fences for Total_income: 1.5 * IQR below Q1 and above Q3.
Q1, Q3 = (df['Total_income'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
whisker = IQR * 1.5
upperlimit = Q3 + whisker
lowerlimit = Q1 - whisker

In [None]:
# Keep only the rows whose Total_income lies strictly between the lower and upper limits.
inside_fences = (df['Total_income'] > lowerlimit) & (df['Total_income'] < upperlimit)
df_1 = df.loc[inside_fences]
df_1.shape

In [None]:
sns.boxplot(x=df_1['Total_income'])
plt.show()

In [None]:
df_1.head()

In [None]:
df_1.shape

# Exploratory Data Analysis

In [None]:
# One count plot per categorical/discrete column, laid out on a 2x3 grid.
# The column is passed as the keyword `x=`: in recent seaborn releases the first
# positional parameter of countplot is `data`, so a positional column name
# collides with the `data=` keyword and raises a TypeError.
count_columns = ['Loan_Status', 'Self_Employed', 'Property_Area',
                 'Credit_History', 'Dependents']

fig, ax = plt.subplots(2, 3, figsize=(16, 10))
for axis, column in zip(ax.flat, count_columns):
    sns.countplot(x=column, data=df_1, ax=axis)

# Five plots on six axes: hide the unused panel instead of showing an empty frame.
for axis in ax.flat[len(count_columns):]:
    axis.set_visible(False)

plt.show()

# Numerical attributes visualization


In [None]:
# Histogram of Total_income after outlier removal, drawn through an explicit Axes.
xdf = df_1['Total_income']
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(xdf, bins=10, rwidth=0.6)
hist_ax.set_title('Histogram of TotalIncome')
hist_ax.set_xlabel('Total_Income')
hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# # sns.distplot(df["LoanAmount"])
# xdf1 = df_1.LoanAmount
# plt.hist(xdf1,bins=10,rwidth=0.8)
# plt.title('Histogram of LoanAmount')
# plt.xlabel('Loan_Amount')
# plt.ylabel('Frequency')
# plt.show()

In [None]:
# sns.distplot(df_1["Loan_Amount_Term"])

In [None]:
# Distribution of Credit_History.
# sns.distplot is deprecated; histplot with a KDE overlay and a density-scaled
# y-axis is the replacement suggested by seaborn.
sns.histplot(df_1['Credit_History'], kde=True, stat='density')
plt.show()

In [None]:
sns.pairplot(df_1, hue = 'Loan_Status')
plt.show()

## Correlation Matrix

In [None]:
# Pairwise correlation of the numeric columns.
# df_1 still holds text columns (e.g. Loan_Status, Property_Area); recent pandas
# versions raise on them in DataFrame.corr(), so restrict to numeric columns first.
corr = df_1.select_dtypes(include='number').corr()

# Set the figure size before the heatmap creates its figure.
sns.set(rc={'figure.figsize':(15,10)})
sns.heatmap(data=corr, annot=True)

# Save before show(): showing the figure may release it, which would leave an empty file.
plt.savefig("corr.png", bbox_inches="tight")
plt.show()

In [None]:
# Loan_Status is the label; every remaining column of df_1 is a feature.
y = df_1['Loan_Status']
X = df_1.drop(columns='Loan_Status')



In [None]:
X.head()

In [None]:
y.head()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the shuffled split repeatable.
from sklearn.model_selection import train_test_split

X_train, x_test, Y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=True,
    random_state=5,
)

In [None]:
X_train.shape

# Creating Pipeline for the model

In [None]:
#Pipeline chains a sequence of preprocessing steps and a final estimator
from sklearn.pipeline import Pipeline

#Scalers rescale the numeric data so that features with large values do not
#dominate the training (only MinMaxScaler is used in the cells shown below)

#OneHotEncoder converts categorical data to numerical by creating an individual
#indicator column for each option in the categories
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder

In [None]:
#Separate the Categorical and Numerical Columns
#Numeric columns: select every numeric dtype rather than only int64/float64.
#Dependents was produced with astype(int), whose width is platform dependent
#(int32 on some platforms); a hard-coded dtype list would then leave it out of
#both groups and the ColumnTransformer would silently drop it.
numeric_cols = X.select_dtypes(include='number').columns
print(numeric_cols)

#Categorical columns: the remaining text (object) columns
categorical_cols = X.select_dtypes(include=['object']).columns
print(categorical_cols)

In [None]:
numeric_index = [X.columns.get_loc(col) for col in numeric_cols]
numeric_index

In [None]:
categorical_index = [X.columns.get_loc(col) for col in categorical_cols]
categorical_index

In [None]:
#Building the Numeric Transformation Pipeline
numeric_transformer = Pipeline(steps = [
    ('scaler',MinMaxScaler(feature_range = (0,1))) #Rescale each numeric feature to the [0, 1] range
] )

#Building the Categorical Transformation Pipeline
categorical_transformer = Pipeline(steps = [
    ('imputeC',SimpleImputer(strategy='most_frequent')), #Fill missing categorical values with the most frequent value
    ('oneHot',OneHotEncoder(handle_unknown='ignore')) #One-hot encode; categories not seen during fit are ignored at transform time
] )

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of column positions to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_index),
    ('categorical', categorical_transformer, categorical_index),
])

# Using K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# Sweep K from 1 to 39 and record the misclassification rate on the test split.
# NOTE(review): K is selected on the same split that is later used for the final
# score, so that score is optimistic; consider cross-validating on the training data.
k_values = range(1, 40)
error_rate = []

for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn_model = Pipeline(steps=[
        ('prep', preprocessor),
        ('est', knn)
    ])
    knn_model.fit(X_train, Y_train)
    pred_i = knn_model.predict(x_test)
    error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10,6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

# error_rate[0] belongs to K = 1, so the list position must be mapped back to its
# K value; reporting the raw index would be off by one.
best_k_by_error = k_values[int(np.argmin(error_rate))]
print("Minimum error:-", min(error_rate), "at K =", best_k_by_error)

In [None]:

# Sweep K from 1 to 39 and record the accuracy on the test split.
# Will take some time
from sklearn import metrics

k_values = range(1, 40)
acc = []

for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn_model = Pipeline(steps=[
        ('prep', preprocessor),
        ('est', knn)
    ])
    knn_model.fit(X_train, Y_train)
    yhat = knn_model.predict(x_test)
    acc.append(metrics.accuracy_score(y_test, yhat))

plt.figure(figsize=(10,6))
plt.plot(k_values, acc, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title('accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')

# acc[0] belongs to K = 1, so the list position must be mapped back to its K value;
# reporting the raw index would be off by one.
best_k_by_accuracy = k_values[int(np.argmax(acc))]
print("Maximum accuracy:-", max(acc), "at K =", best_k_by_accuracy)

In [None]:
knn = KNeighborsClassifier(n_neighbors = 24)

In [None]:
# Preprocessing followed by the KNN estimator configured above.
knn_steps = [('prep', preprocessor), ('est', knn)]
knn_model = Pipeline(steps=knn_steps)

In [None]:
from sklearn import set_config

# Render estimators as an interactive diagram in the notebook output
set_config(display='diagram')
# Fit the preprocessing steps and the KNN estimator on the training split;
# the fitted pipeline is the cell output, so its diagram is displayed
knn_model.fit(X_train, Y_train)

In [None]:
y_pred = knn_model.predict(x_test)
print(y_pred)

In [None]:
#Summarise the fit of the model
# Take the class order from the fitted model. scikit-learn sorts the labels
# ('N' before 'Y'), so a hand-written target_names=['Y','N'] attaches each row of
# the report to the wrong class.
class_order = list(knn_model.classes_)
report1 = classification_report(y_test, y_pred, labels=class_order)
print("Report : \n{}".format(report1))

#Confusion Matrix (rows = actual class, columns = predicted class)
cm1 = confusion_matrix(y_test, y_pred, labels=class_order)
cm_ax = sns.heatmap(cm1, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_order, yticklabels=class_order)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

# Model Score Checking

In [None]:
knn_model.score(X_train, Y_train)

In [None]:
knn_model.score(x_test, y_test)

# DecisionTreeClassifier

In [None]:
#Create the estimator for training
from sklearn.tree import DecisionTreeClassifier

#Create the model
# A fixed random_state makes the fitted tree, and therefore the reported scores,
# reproducible across runs; 5 matches the seed used for the train/test split.
dtmodel = DecisionTreeClassifier(random_state=5)



In [None]:
# Main pipeline: shared preprocessing followed by the decision tree.
tree_steps = [
    ('preprocessor', preprocessor),
    ('classifier', dtmodel),
]
testmodel = Pipeline(steps=tree_steps)

In [None]:
# Train the decision-tree pipeline
# Render estimators as an interactive diagram in the notebook output
from sklearn import set_config
set_config(display='diagram')
# Fit the preprocessing steps and the classifier on the training split
testmodel.fit(X_train, Y_train)

In [None]:
y_pred1 = testmodel.predict(x_test)
print(y_pred1)

In [None]:
#Summarise the fit of the model
# Take the class order from the fitted model. scikit-learn sorts the labels
# ('N' before 'Y'), so a hand-written target_names=['Y','N'] attaches each row of
# the report to the wrong class.
class_order = list(testmodel.classes_)
report = classification_report(y_test, y_pred1, labels=class_order)
print("Report : \n{}".format(report))

#Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred1, labels=class_order)
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_order, yticklabels=class_order)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

In [None]:
testmodel.score(X_train, Y_train)

In [None]:
testmodel.score(x_test, y_test)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default settings
estimater = LogisticRegression()

In [None]:
# Shared preprocessing followed by the logistic-regression estimator.
logreg_steps = [('prep', preprocessor), ('est', estimater)]
pipelinemodel = Pipeline(steps=logreg_steps)

In [None]:
# Train the logistic-regression pipeline
# Render estimators as an interactive diagram in the notebook output
from sklearn import set_config
set_config(display='diagram')
# Fit the preprocessing steps and the estimator on the training split
pipelinemodel.fit(X_train, Y_train)

In [None]:
y_test.value_counts()

In [None]:
y_pred2 = pipelinemodel.predict(x_test)
print(y_pred2)

In [None]:
#Summarise the fit of the model
# Take the class order from the fitted model. scikit-learn sorts the labels
# ('N' before 'Y'), so a hand-written target_names=['Y','N'] attaches each row of
# the report to the wrong class.
class_order = list(pipelinemodel.classes_)
report3 = classification_report(y_test, y_pred2, labels=class_order)
print("Report : \n{}".format(report3))

#Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred2, labels=class_order)
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_order, yticklabels=class_order)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

In [None]:
pipelinemodel.score(X_train, Y_train)

In [None]:
pipelinemodel.score(x_test,y_test)