In [192]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stat
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report,confusion_matrix,f1_score
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')

In [193]:
# Read the CSV at DATA_PATH into `df` and preview its first rows.
DATA_PATH = '../input/loan-predication/train_u6lujuX_CVtuZ9i (1).csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head()

# Descriptive analysis

In [194]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# ApplicantIncome, CoapplicantIncome, LoanAmount columns are skewed.

In [195]:
# Summary (count, unique, top, freq) of the object-dtype columns.
df.describe(include='object')

In [196]:
# (rows, columns) of the frame as loaded.
df.shape

# Univariate analysis

In [197]:
# Preview the first five rows before the univariate plots.
df.head()

In [198]:
# Checking outliers: one horizontal box plot per income / loan-amount column.
# The series is passed as `x=` explicitly: seaborn 0.11 deprecated positional
# data arguments, and later releases interpret a lone positional as `data`.

plt.figure(figsize=(18,6))
for position, column in zip((131, 132, 133),
                            ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount')):
    plt.subplot(position)
    sns.boxplot(x=df[column], color='b')
plt.suptitle('Checking Outliers',fontsize = 20)
plt.show()

In [199]:
# Impact of categories in variables: one pie chart of value shares per column.
# Each entry is (subplot position, column, extra pie keyword arguments);
# an empty dict means the default colour cycle is used.

pie_panels = (
    (231, 'Gender', {'colors': ['orange', 'y']}),
    (232, 'Married', {}),
    (233, 'Education', {'colors': ['m', 'violet']}),
    (234, 'Self_Employed', {'colors': ['orange', 'y']}),
    (235, 'Property_Area', {}),
    (236, 'Loan_Status', {'colors': ['m', 'violet']}),
)

plt.figure(figsize=(18,10))
for position, column, extra in pie_panels:
    plt.subplot(position)
    shares = df[column].value_counts()
    shares.plot(kind='pie', shadow=True, autopct='%.2f%%', **extra)
plt.suptitle('Univariate Analysis - Impact of categories',fontsize = 20)
plt.show()



In [200]:
# Checking counts of categories in variables.
# Each series is passed as `x=`: seaborn 0.11 deprecated positional data
# arguments and later releases treat a lone positional as `data`, which no
# longer produces a per-category count plot.

count_panels = (
    (231, 'Gender', 'rocket_r'),
    (232, 'Married', 'viridis'),
    (233, 'Education', 'magma'),
    (234, 'Self_Employed', 'rocket_r'),
    (235, 'Property_Area', 'viridis'),
    (236, 'Loan_Status', 'magma'),
)

plt.figure(figsize=(20,10))
for position, column, palette in count_panels:
    plt.subplot(position)
    sns.countplot(x=df[column], palette=palette)
plt.suptitle('Univariate Analysis - Count plot',fontsize = 20)
plt.show()

In [201]:
# Checking Distribution of variables: histogram + KDE for each numeric column.
# NOTE(review): sns.distplot has been deprecated since seaborn 0.11; it is kept
# here so the figures stay unchanged.

dist_panels = (
    (221, 'ApplicantIncome', 'black'),
    (222, 'CoapplicantIncome', 'm'),
    (223, 'LoanAmount', 'g'),
    (224, 'Loan_Amount_Term', 'r'),
)

plt.figure(figsize=(20,10))
for position, column, shade in dist_panels:
    plt.subplot(position)
    sns.distplot(df[column], color=shade)
plt.suptitle('Univariate Analysis - Distplot',fontsize = 20)
plt.show()

# Bivariate Analysis

In [202]:
# Preview the first five rows before the bivariate plots.
df.head()

In [203]:
# Comparing variables with Loan status: category counts split by Loan_Status.
# The category series is passed as `x=`; seaborn releases after 0.11 accept
# only `data` positionally, so the original positional call combined with
# `hue=` no longer works there.

hue_panels = (
    (231, 'Gender', 'rocket_r'),
    (232, 'Married', 'viridis'),
    (233, 'Education', 'cubehelix'),
    (234, 'Self_Employed', 'rocket_r'),
    (235, 'Credit_History', 'viridis'),
    (236, 'Property_Area', 'cubehelix'),
)

plt.figure(figsize=(20,10))
for position, column, palette in hue_panels:
    plt.subplot(position)
    sns.countplot(x=df[column], hue=df['Loan_Status'], palette=palette)
plt.suptitle('Bivariate Analysis - Count plot',fontsize = 20)
plt.show()

In [204]:
# Impact of ApplicantIncome, CoapplicantIncome and LoanAmount on Loan_Status.
# x/y are passed by keyword: seaborn >= 0.12 accepts at most one positional
# argument (`data`), so the two-positional form raises TypeError there.
# The figure title previously repeated 'Checking Outliers' from the univariate
# cell; it now names this bivariate view.

plt.figure(figsize=(18,6))
for position, column in zip((131, 132, 133),
                            ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount')):
    plt.subplot(position)
    sns.boxplot(x=df[column], y=df['Loan_Status'])
plt.suptitle('Bivariate Analysis - Box plot',fontsize = 20)
plt.show()

# Data Cleaning

In [205]:
# Removing the outliers from data.
# Keep only rows below every cap. Comparisons against NaN evaluate to False,
# so rows with a missing value in any of these three columns are dropped too.

print('Before removing outliers {}'.format(df.shape))
within_caps = (
    (df['ApplicantIncome'] < 20000)
    & (df['CoapplicantIncome'] < 10000)
    & (df['LoanAmount'] < 350)
)
df = df[within_caps]
print('After removing outliers {}'.format(df.shape))

In [206]:
# Drop the Loan_ID column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once Loan_ID is already gone.
df = df.drop(columns='Loan_ID', errors='ignore')

# Data Preprocessing

In [207]:
# Number of missing values in each column.
df.isnull().sum()

In [208]:
# Assigning values for null values:
# the first group of columns is filled with its most frequent value (mode),
# the second group with its median.

for column in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

for column in ('Loan_Amount_Term', 'Credit_History'):
    middle_value = df[column].median()
    df[column] = df[column].fillna(middle_value)

In [209]:
# Total number of missing values left across the whole frame.
df.isnull().sum().sum()

In [210]:
# Preview the columns that still have object dtype.
df.select_dtypes(include='object').head()

In [211]:
# Converting categorical to numerical.
# Each column maps to {original label: integer code}; values not listed in a
# mapping are left untouched by Series.replace.

category_codes = {
    'Gender': {'Male': 2, 'Female': 1},
    'Married': {'Yes': 2, 'No': 1},
    'Education': {'Graduate': 2, 'Not Graduate': 1},
    'Self_Employed': {'No': 2, 'Yes': 1},
    'Property_Area': {'Semiurban': 3, 'Urban': 2, 'Rural': 1},
    'Dependents': {'3+': 3, '2': 2, '1': 1, '0': 0},
    'Loan_Status': {'Y': 2, 'N': 1},
}

for column, codes in category_codes.items():
    df[column] = df[column].replace(codes)

In [212]:
# Preview the frame after the categorical columns were replaced with numeric codes.
df.head()

In [213]:
# Separate the dependent variable (Loan_Status) from the independent variables.

y = df['Loan_Status']
x = df.drop(columns=['Loan_Status'])

In [214]:
# Resampling for balancing the data.
# random_state is pinned so the synthetic samples are identical on every run
# (10 matches the seed used for the train/test split).

sm=SMOTE(random_state=10)
x_sm,y_sm = sm.fit_resample(x,y)

In [215]:
# Visualizing dependent variable: class shares before and after resampling.

for label, target in (('Before', y), ('After', y_sm)):
    print('{} Resampling {}'.format(label, target.shape))

plt.figure(figsize=(10,8))
for position, target in ((121, y), (122, y_sm)):
    plt.subplot(position)
    class_counts = target.value_counts()
    class_counts.plot(kind='pie', autopct='%.2f%%', colors=['orange', 'y'])
plt.show()

In [216]:
# Train test split.
# Hold out the test set from the ORIGINAL data first and oversample only the
# training portion. Running SMOTE before the split lets synthetic points that
# were interpolated from future test rows leak into training, which inflates
# the test metrics. stratify=y keeps the class ratio equal in both parts.

xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=10,stratify=y)
xtrain,ytrain=SMOTE(random_state=10).fit_resample(xtrain,ytrain)

In [217]:
# Report the shape of every part of the train/test split.
for part_name, part in (('xtrain', xtrain), ('ytrain', ytrain),
                        ('xtest', xtest), ('ytest', ytest)):
    print('Shape of {} {}'.format(part_name, part.shape))

# Machine Learning Models

In [218]:
# Creating Logistic regression model

def logistic_reg(xtrain,xtest,ytrain,ytest):
    """Fit a liblinear LogisticRegression and print its test-set metrics.

    The model is trained on (xtrain, ytrain); predictions for xtest are
    compared with ytest. Prints the confusion matrix, the classification
    report and the F1 score. Returns None.
    """
    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(xtrain, ytrain)
    predicted = classifier.predict(xtest)

    print('***LogisticRegression***', 'Confusion matrix', sep='\n')
    print(confusion_matrix(ytest, predicted))
    print('Classification report')
    print(classification_report(ytest, predicted))
    # f1_score runs with its defaults (binary average, pos_label=1).
    score = f1_score(ytest, predicted)
    print(f'f1_score : {score}')

In [219]:
# Creating RandomForestClassifier model

def random_forest(xtrain,xtest,ytrain,ytest):
    """Fit a default RandomForestClassifier and print its test-set metrics.

    Trains on (xtrain, ytrain), predicts xtest, then prints the confusion
    matrix, the classification report and the F1 score against ytest.
    Returns None.
    """
    forest = RandomForestClassifier()
    forest.fit(xtrain, ytrain)
    test_predictions = forest.predict(xtest)

    print('***RandomForestClassifier***')
    for heading, metric in (('Confusion matrix', confusion_matrix),
                            ('Classification report', classification_report)):
        print(heading)
        print(metric(ytest, test_predictions))
    print('f1_score : {0}'.format(f1_score(ytest, test_predictions)))

In [220]:
# Creating GradientBoostingClassifier model

def g_boosting(xtrain,xtest,ytrain,ytest):
    """Fit a default GradientBoostingClassifier and print its test-set metrics.

    Trains on (xtrain, ytrain), predicts xtest, then prints the confusion
    matrix, the classification report and the F1 score against ytest.
    Returns None.
    """
    booster = GradientBoostingClassifier()
    booster.fit(xtrain, ytrain)
    guesses = booster.predict(xtest)

    print('***GradientBoostingClassifier***')

    print('Confusion matrix')
    matrix = confusion_matrix(ytest, guesses)
    print(matrix)

    print('Classification report')
    report = classification_report(ytest, guesses)
    print(report)

    print('f1_score : {score}'.format(score=f1_score(ytest, guesses)))

In [221]:
# Creating DecisionTreeClassifier model

def d_tree(xtrain,xtest,ytrain,ytest):
    """Fit a default DecisionTreeClassifier and print its test-set metrics.

    Trains on (xtrain, ytrain), predicts xtest, then prints the confusion
    matrix, the classification report and the F1 score against ytest.
    Returns None.
    """
    # fit() returns the fitted estimator, so training and prediction chain.
    tree_predictions = DecisionTreeClassifier().fit(xtrain, ytrain).predict(xtest)

    print('***DecisionTreeClassifier***', 'Confusion matrix', sep='\n')
    print(confusion_matrix(ytest, tree_predictions))
    print('Classification report')
    print(classification_report(ytest, tree_predictions))
    tree_f1 = f1_score(ytest, tree_predictions)
    print('f1_score : {}'.format(tree_f1))

In [222]:
# Comparing the four models

def compare_model(xtrain,xtest,ytrain,ytest):
    """Run every model report on the same split, separated by a dashed rule.

    Calls logistic_reg, random_forest, g_boosting and d_tree in that order
    with the given split; a line of 100 dashes is printed between
    consecutive reports. Returns None.
    """
    reporters = (logistic_reg, random_forest, g_boosting, d_tree)
    for index, reporter in enumerate(reporters):
        if index > 0:
            print('-'*100)
        reporter(xtrain, xtest, ytrain, ytest)

In [223]:
# Train and report all models on the same train/test split.
compare_model(xtrain,xtest,ytrain,ytest)

# Model Improvement

In [235]:
# Compared with the other models, the random forest model performs best. The dataset is small; with more data the accuracy would likely be higher.

# Trying hyperparameter tuning

from sklearn.model_selection import GridSearchCV, cross_val_score

In [225]:
# Creating parameters for GridSearchCV: every combination of the values below
# is tried (n_estimators covers 100, 200, 300 and 400).

params = dict(
    n_estimators=np.arange(100, 500, 100),
    criterion=['gini', 'entropy'],
    max_depth=[1, 2, 3],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 3],
)

params

In [228]:
# Creating the tuning model: exhaustive grid search over `params` for a
# default random forest, fitted on the training split.

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf, param_grid=params)
grid_search.fit(xtrain, ytrain)

In [229]:
# Finding best parameters: the parameter combination selected by the grid search.

grid_search.best_params_

In [230]:
# Taking the model with the best parameters.
# GridSearchCV refits the winning configuration on the full training data by
# default (refit=True), so best_estimator_ is already trained; fitting it
# again would only retrain the same forest with a different random draw.

rf_gcv=grid_search.best_estimator_
rf_gcv

In [234]:
# Analysing performance of tuned model on the held-out test split.

ypred_gcv = rf_gcv.predict(xtest)
print('***RandomForestClassifier***', 'Confusion matrix', sep='\n')
print(confusion_matrix(ytest, ypred_gcv))
print('Classification report')
print(classification_report(ytest, ypred_gcv))
tuned_f1 = f1_score(ytest, ypred_gcv)
print(f'f1_score : {tuned_f1}')

In [239]:
# 5-fold cross-validation of the default and the tuned forest on x, y
# (the data before resampling), using each estimator's default scorer.
# A dashed rule separates the two reports.
for heading, estimator in (('Score of normal(default) model', rf),
                           ('Score of tuned model', rf_gcv)):
    cv_score = cross_val_score(estimator, x, y, cv=5)
    print(heading, cv_score, np.mean(cv_score), sep='\n')
    if estimator is rf:
        print('-'*100)