
1. Import Relevant Modules

In [None]:
# we import relevant modules

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training split; the path is relative to the notebook's working directory.
train = pd.read_csv('Loan Eligibility Prediction Datasets/train_ctrUa4K.csv')
# Show the frame as the cell output.
train

In [None]:
# Load the test split from the same relative data folder and display it.
test = pd.read_csv('Loan Eligibility Prediction Datasets/test_lAUu6dG.csv')
test

In [None]:
# Keep pristine copies of both frames; `train` and `test` are modified further down.

train_original, test_original = train.copy(), test.copy()

In [None]:
# Column names of the training frame.
train.columns

In [None]:
# Column names of the test frame.
test.columns

In [None]:
# Dtypes and non-null counts per training column.
train.info()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

2. Univariate Analysis

In [None]:
# Start with the target variable: proportion of each Loan_Status value.
train.Loan_Status.value_counts(normalize=True)

In [None]:
# Bar chart of the absolute Loan_Status counts.
train.Loan_Status.value_counts().plot.bar()

In [None]:
# Next the categorical features: relative frequency of every level, one figure per column.

cat_feat = ['Gender','Married','Self_Employed','Credit_History','Loan_Status']

for feature_name in cat_feat:
    level_share = train[feature_name].value_counts(normalize=True).sort_index()
    fig, ax = plt.subplots(figsize=(9, 6))
    level_share.plot.bar(ax=ax, color='steelblue')
    ax.set_title(feature_name + ' counts')
    ax.set_xlabel(feature_name)
    ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Then the ordinal features; this list is consumed by the plotting loop in the next cell.

ord_feat=['Dependents','Education','Property_Area']


In [None]:
# Relative frequency of every level of the ordinal features, one figure per column.
for feature_name in ord_feat:
    level_share = train[feature_name].value_counts(normalize=True).sort_index()
    fig, ax = plt.subplots(figsize=(9, 6))
    level_share.plot.bar(ax=ax, color='steelblue')
    ax.set_title(feature_name)
    ax.set_xlabel(feature_name + ' counts')
    ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Finally the numerical features: a histogram on top and a box plot underneath,
# one two-panel figure per column.

num_feat = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']


for col in num_feat:
    # Drop missing values explicitly instead of relying on the plotting
    # libraries to skip them.
    feature = train[col].dropna()
    fig, ax = plt.subplots(2, 1, figsize=(9, 6))
    ax[0].hist(feature, bins=100)
    ax[0].set_title(col)
    # Pass the series by keyword: handing it over positionally together with
    # x=<column name> is ambiguous and fails on several seaborn versions.
    sns.boxplot(x=feature, ax=ax[1])


# plt.show() renders every open figure; fig.show() only addressed the last
# one and is not supported by the inline backend.
plt.show()

3. Bivariate Analysis of Features Against the Label 

In [None]:
# Loan_Status share within each gender: row-normalised cross-tab drawn as stacked bars.
Gender = pd.crosstab(train['Gender'], train['Loan_Status'])
gender_row_totals = Gender.sum(axis=1).astype(float)
Gender.div(gender_row_totals, axis=0).plot(kind='bar', stacked=True, figsize=(4, 4))
plt.show()

In [None]:
# Loan_Status share within each level of every categorical feature.
# Series.plot.bar ignores x=/y= (they only apply to DataFrames), so a
# cross-tab against the target is needed to get a genuinely bivariate chart.
for col in cat_feat:
    if col == 'Loan_Status':
        # The target cannot be compared with itself.
        continue
    status_counts = pd.crosstab(train[col], train['Loan_Status'])
    # Normalise each row so the bars show proportions rather than raw counts.
    status_share = status_counts.div(status_counts.sum(axis=1), axis=0)
    fig, ax = plt.subplots(figsize=(9, 6))
    status_share.plot.bar(stacked=True, ax=ax)
    ax.set_title('Label by ' + col)
    ax.set_xlabel(col)
    ax.set_ylabel("Loan Status")
plt.show()

In [None]:
def plot_status_share(feature):
    """Plot the Loan_Status share within each level of one feature.

    Builds a cross-tab of ``train[feature]`` against ``train['Loan_Status']``,
    normalises every row to sum to 1 and draws it as a stacked bar chart.

    Parameters
    ----------
    feature : str
        Name of a column in the notebook-level ``train`` frame.
    """
    counts = pd.crosstab(train[feature], train['Loan_Status'])
    shares = counts.div(counts.sum(axis=1).astype(float), axis=0)
    shares.plot(kind='bar', stacked=True, figsize=(4, 4))
    plt.show()


# Categorical features first, then the ordinal ones.
for feature in ['Gender', 'Married', 'Self_Employed', 'Credit_History',
                'Dependents', 'Education', 'Property_Area']:
    plot_status_share(feature)



In [None]:
# Mean ApplicantIncome for each Loan_Status value, as a bar chart.
train.groupby('Loan_Status')['ApplicantIncome'].mean().plot.bar()

In [None]:
# Summary statistics of ApplicantIncome.
train.ApplicantIncome.describe()

In [None]:
# Bucket ApplicantIncome into four labelled bands and plot the Loan_Status
# share inside each band.
bins = [0, 2500, 4000, 6000, 81000]
groups = ['low', 'average', 'high', 'very_high']

train['Income_Bin'] = pd.cut(train['ApplicantIncome'], bins=bins, labels=groups)
Income_Bin = pd.crosstab(train['Income_Bin'], train['Loan_Status'])
income_row_totals = Income_Bin.sum(axis=1).astype(float)
Income_Bin.div(income_row_totals, axis=0).plot(kind='bar', stacked=True)
plt.xlabel('Applicant Income')
plt.ylabel('Percentage')

In [None]:
# Summary statistics of CoapplicantIncome.
train.CoapplicantIncome.describe()

In [None]:
# Bucket CoapplicantIncome into three labelled bands and plot the Loan_Status
# share inside each band.
bins = [0,1000,3000,42000]
groups = ['low','average','high',]

# pd.cut builds right-closed intervals and leaves the lowest edge out by
# default, so an income of exactly 0 would become NaN and silently vanish from
# the cross-tab. include_lowest=True keeps those rows in the 'low' band.
train['Co_Income_Bin'] = pd.cut(x=train['CoapplicantIncome'],bins=bins,labels=groups,
                                include_lowest=True)
Co_Income_Bin = pd.crosstab(train.Co_Income_Bin,train.Loan_Status)
Co_Income_Bin.div(Co_Income_Bin.sum(1).astype(float),axis=0).plot(kind='bar',stacked=True)
plt.xlabel('CoApplicant Income')
plt.ylabel('Percentage')

In [None]:
# Combine applicant and co-applicant income, bucket the total into four
# labelled bands and plot the Loan_Status share inside each band.
bins = [0, 2500, 4000, 6000, 81000]
groups = ['low', 'average', 'high', 'very_high']
train['Total_Income'] = train['ApplicantIncome'] + train['CoapplicantIncome']
train['Total_Income_Bin'] = pd.cut(train['Total_Income'], bins=bins, labels=groups)
Total_Income_Bin = pd.crosstab(train['Total_Income_Bin'], train['Loan_Status'])
total_row_totals = Total_Income_Bin.sum(axis=1).astype(float)
Total_Income_Bin.div(total_row_totals, axis=0).plot(kind='bar', stacked=True)
plt.xlabel('Total Applicant Income')
plt.ylabel('Percentage')

In [None]:
# Summary statistics of LoanAmount.
train.LoanAmount.describe()

In [None]:
# Bucket LoanAmount into three labelled bands and plot the Loan_Status share
# inside each band.
bins = [0, 100, 200, 700]
groups = ['low', 'average', 'high']

train['Loan_Bin'] = pd.cut(train['LoanAmount'], bins=bins, labels=groups)
Loan_Bin = pd.crosstab(train['Loan_Bin'], train['Loan_Status'])
loan_row_totals = Loan_Bin.sum(axis=1).astype(float)
Loan_Bin.div(loan_row_totals, axis=0).plot(kind='bar', stacked=True)
plt.xlabel('Loan Bin')
plt.ylabel('Percentage')

In [None]:
# Column names after the helper bin columns were added above.
train.columns

In [None]:
# Remove the helper columns that were only needed for the bivariate plots.
helper_columns = ['Income_Bin', 'Co_Income_Bin', 'Total_Income',
                  'Total_Income_Bin', 'Loan_Bin']
train = train.drop(columns=helper_columns)

In [None]:
# Encode '3+' dependents as 3 and the target as 1 (Y) / 0 (N).
# Assign the result back to the column: calling replace(..., inplace=True) on
# `frame.column` works on an intermediate object and does not reliably update
# the frame (pandas warns about this and ignores it under Copy-on-Write).
train['Dependents'] = train['Dependents'].replace('3+', 3)
test['Dependents'] = test['Dependents'].replace('3+', 3)
# map() turns any value other than 'Y'/'N' into NaN, which makes unexpected
# labels visible instead of leaving stray strings in the target.
train['Loan_Status'] = train['Loan_Status'].map({'Y': 1, 'N': 0})

In [None]:
# Pairwise correlation of the numeric columns. Select them explicitly:
# DataFrame.corr() raises on string columns in pandas >= 2.0.
train.select_dtypes(include='number').corr()

In [None]:
# Annotated heat map of the numeric correlations. Non-numeric columns are
# excluded explicitly because DataFrame.corr() raises on them in pandas >= 2.0.
sns.heatmap(train.select_dtypes(include='number').corr(),cmap='BuPu',annot=True)

4. Data Cleaning And Handling of Missing Values

In [None]:
# Missing-value count for every training column that has at least one null.

train_null_counts = train.isnull().sum()
train_null_counts[train_null_counts > 0]

In [None]:
# Categorical columns whose missing values are filled with the mode in the next cells.
miss_cat_feat = ['Gender','Married','Dependents','Self_Employed','Credit_History']

In [None]:
# Fill the missing categorical values in train with each column's mode.
# The result is assigned back because fillna(..., inplace=True) on a selected
# column does not reliably write through to the frame (pandas Copy-on-Write).

for col in miss_cat_feat:
    train[col] = train[col].fillna(train[col].mode()[0])

In [None]:
# Same treatment for the test frame, using the test column's own mode.
# Assigned back rather than filled in place so the frame is actually updated
# under pandas Copy-on-Write.
for col in miss_cat_feat:
    test[col] = test[col].fillna(test[col].mode()[0])

In [None]:
# Training columns that still contain nulls, with their counts.
train_null_counts = train.isnull().sum()
train_null_counts[train_null_counts > 0]

In [None]:
# Test columns that still contain nulls, with their counts.
test_null_counts = test.isnull().sum()
test_null_counts[test_null_counts > 0]

In [None]:
# Fill missing loan terms in train with the most frequent term. Assigned back
# because an in-place fillna on a selected column may not update the frame.
train['Loan_Amount_Term'] = train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0])

In [None]:
# Fill missing loan terms in test with the most frequent test term. Assigned
# back because an in-place fillna on a selected column may not update the frame.
test['Loan_Amount_Term'] = test['Loan_Amount_Term'].fillna(test['Loan_Amount_Term'].mode()[0])

In [None]:
# Fill missing LoanAmount values in train with the median. Assigned back
# because an in-place fillna on `train.LoanAmount` may not update the frame.

train['LoanAmount'] = train['LoanAmount'].fillna(train['LoanAmount'].median())

In [None]:
# Fill missing LoanAmount values in test with the test median. Assigned back
# because an in-place fillna on `test.LoanAmount` may not update the frame.
test['LoanAmount'] = test['LoanAmount'].fillna(test['LoanAmount'].median())

In [None]:
# Confirm no missing values remain: null count per training column.

train.isnull().sum()

In [None]:
# Null count per test column.
test.isnull().sum()

5. Handling Outliers

In [None]:
# Horizontal box plot of LoanAmount to show the extent of its outliers.

plt.boxplot(train.LoanAmount,vert=False)

In [None]:
# Log-transform LoanAmount in both frames, stored as a new LoanAmountLog column.
for frame in (train, test):
    frame['LoanAmountLog'] = np.log(frame['LoanAmount'])

In [None]:
# Histogram of the log-transformed loan amount.
train.LoanAmountLog.hist(bins=20)

In [None]:
# Loan_ID is removed from both frames before encoding and modelling.
train = train.drop(columns='Loan_ID')
test = test.drop(columns='Loan_ID')


In [None]:
# Convert the categorical features to dummy variables.

train = pd.get_dummies(train)
test = pd.get_dummies(test)

# The two frames are encoded independently, so a level missing from one of
# them would give different column sets or orders. Align test to the training
# feature columns (everything except the target); absent dummies become 0.
feature_columns = train.columns.drop('Loan_Status')
test = test.reindex(columns=feature_columns, fill_value=0)


In [None]:
# Separate the features from the target variable.
# Use the `columns=` keyword: the positional axis argument of DataFrame.drop
# was removed in pandas 2.0.

X = train.drop(columns='Loan_Status')
y = train.Loan_Status


6. Model Training

In [None]:
# Split the data into training (70%) and validation (30%) sets.

from sklearn.model_selection import train_test_split

# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
# Use an appropriate estimator; logistic regression serves as the starting point.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# C is the inverse of the regularisation strength (1 / 0.01).
model = LogisticRegression(C=1/0.01, solver='liblinear')
model.fit(X_train, y_train)
predictions = model.predict(X_val)

validation_accuracy = accuracy_score(y_val, predictions)
print('Accuracy_score:', validation_accuracy)

In [None]:
# Check performance with several metrics at once: the classification report
# is printed for the validation predictions.

from sklearn.metrics import classification_report

print(classification_report(y_val,predictions))

In [None]:
# Predict Loan_Status for the test frame with the fitted model.
pred_test = model.predict(test)

In [None]:
# Submit first model: load the sample submission as a template.
# Read it from the same relative data folder as the train/test files instead
# of a machine-specific absolute path, so the notebook runs on other machines.
submission = pd.read_csv('Loan Eligibility Prediction Datasets/sample_submission_49d68Cx.csv')


In [None]:
# Rows of the sample submission whose Loan_Status is 'N'.
submission[submission['Loan_Status']=='N']



In [None]:
# Put the predictions and the original loan IDs into the submission frame.
# Item assignment always targets a column; attribute assignment would only
# set a plain Python attribute if the column did not already exist.
submission['Loan_Status'] = pred_test
submission['Loan_ID'] = test_original['Loan_ID']

In [None]:
# Translate the numeric predictions back to the 'N'/'Y' labels. Assigned back
# because replace(..., inplace=True) on `submission.Loan_Status` does not
# reliably update the frame under pandas Copy-on-Write.
submission['Loan_Status'] = submission['Loan_Status'].replace({0: 'N', 1: 'Y'})

In [None]:
# Class probabilities for the validation set; column 1 is used for the ROC curve and AUC below.
y_scores = model.predict_proba(X_val)

7. Metrics Measurement And Scores

In [None]:
from sklearn.metrics import roc_curve

# ROC curve from the second probability column of the validation scores.
fpr, tpr, thresholds = roc_curve(y_val, y_scores[:, 1])

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal reference line
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the validation scores.
auc = roc_auc_score(y_val, y_scores[:, 1])
print(f'AUC: {auc}')

In [None]:
# submit 

In [None]:
# Column names of the encoded training frame; the feature lists in the Model 2 cell name columns from it.
train.columns

In [None]:
# 8. Model 2

In [None]:
# Train Model 2: the same logistic regression, wrapped in a preprocessing pipeline.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import numpy as np

# Numeric columns are standardised with StandardScaler.
numeric_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History','LoanAmountLog']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Categorical columns go through a OneHotEncoder that ignores unseen levels.
# NOTE(review): these names look like the dummy columns already produced by
# pd.get_dummies earlier in the notebook, so encoding them again appears
# redundant — confirm this is intended.
categorical_features = ['Gender_Female', 'Gender_Male', 'Married_No', 'Married_Yes',
       'Dependents_3', 'Dependents_0', 'Dependents_1', 'Dependents_2',
       'Education_Graduate', 'Education_Not Graduate', 'Self_Employed_No',
       'Self_Employed_Yes', 'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Chain preprocessing and the classifier (C = 1/0.01, liblinear solver).
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('logregressor', LogisticRegression(C=1/0.01, solver="liblinear"))])


# Fit the pipeline on the training split. This rebinds `model`, replacing the
# first logistic regression, and prints the fitted pipeline.
model = pipeline.fit(X_train, (y_train))
print (model)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_curve,roc_auc_score

In [None]:
# Validation metrics and ROC curve for the current `model`.

predictions = model.predict(X_val)
y_scores = model.predict_proba(X_val)

cm = confusion_matrix(y_val, predictions)
print('Confusion Matrix:\n', cm, '\n')
print('Accuracy:', accuracy_score(y_val, predictions))
print("Overall Precision:", precision_score(y_val, predictions))
print("Overall Recall:", recall_score(y_val, predictions))

auc = roc_auc_score(y_val, y_scores[:, 1])
print(f'\nAUC: {auc}')

# ROC curve from the second probability column.
fpr, tpr, thresholds = roc_curve(y_val, y_scores[:, 1])

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal reference line
ax.plot(fpr, tpr)               # FPR against TPR for this model
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()

In [None]:
# Submit Model 2
# `model` is the Model 2 pipeline at this point, so regenerate the test
# predictions from it; otherwise the file would still hold the first model's
# predictions. Labels are mapped back to 'N'/'Y' for the submission format.
submission['Loan_Status'] = pd.Series(model.predict(test), index=submission.index).map({0: 'N', 1: 'Y'})

# Write next to the input data using a relative path (a leading '/' points at
# the filesystem root) and leave out the index so the file contains only the
# Loan_ID and Loan_Status columns.
submission[['Loan_ID', 'Loan_Status']].to_csv(
    'Loan Eligibility Prediction Datasets/submission.csv', index=False)