## Problem Statement:
The department wants to build a model that will help them identify the potential customers who have higher probability of purchasing the loan. This will increase the success ratio while at the same time reduce the cost of the campaign.


In [None]:
#import the needed packages

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import classification_report,roc_auc_score, confusion_matrix, accuracy_score,recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split,GridSearchCV
#from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
from sklearn import metrics
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import scipy.stats as stats
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
%matplotlib inline
import warnings

import sklearn
import scipy

import sys
import os




# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Import the data to bank_df
bank_df=pd.read_csv("../input/Bank_Personal_Loan_Modelling.csv")

In [None]:
bank_df.head(10)

## Initial Data Analysis:

Initial data analysis (IDA) is the primary step in data analytics and is mostly covered as part of EDA. As the name suggests, exploratory data analysis (EDA) analyzes each feature in the data set to draw inferences or to form hypotheses.

IDA, on the other hand, is performed to get familiar with the data set and to identify its dependent and independent variables. The IDA step consists of:

    1. Shape of the data. Row and column count.
    2. Get to know the datatypes of the features of the dataset.
    3. Initial descriptive analysis.
    4. Check if missing values are present.
    5. Check if the dataset is balanced or not.

In [None]:
# Report the dimensions of a dataframe
def dataframe_shape(df):
    """Print the number of rows, then the number of columns, of ``df``."""
    n_rows, n_columns = df.shape[0], df.shape[1]
    print("The dataframe has %d rows" % n_rows)
    print("The dataframe has %d columns" % n_columns)

dataframe_shape(bank_df)

In [None]:
# Columns/Feature in dataset
pd.DataFrame(bank_df.columns,index=None,copy=False).T

In [None]:
# First 3 observation
bank_df.head(3) # you can choose any number of rows by changing the number inside head function. Default it shows 5

In [None]:
# Last 3 observation
bank_df.tail(3) # you can choose any number of rows by changing the number inside tail function. Default it shows 5

In [None]:
# Random 3 observation
bank_df.sample(3) # you can choose any number of rows by changing the number inside sample function. Default it shows 1

In [None]:
# Data types present in the dataset
def datatypes_insight(data):
    """Show every column's dtype as a one-row table and plot how often each dtype occurs."""
    column_types = data.dtypes
    display(column_types.to_frame().T)

    # Horizontal bar chart: one bar per dtype, length = number of columns
    type_counts = column_types.value_counts()
    type_counts.plot(kind="barh")

datatypes_insight(bank_df)

In [None]:
# Missing value identification

def Nan_value(data):
    """Display the number of missing values in each column as a one-row table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are checked for null entries.
    """
    # Vectorised count; avoids running the Python built-in sum over every column
    missing_per_column = data.isnull().sum()
    display(missing_per_column.to_frame().T)

Nan_value(bank_df)

In [None]:
# Ploting the NAN values if any.
sns.heatmap(bank_df.isna(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
# Unique values in features
def unique_data(data):
    """Display and plot the number of distinct values in each column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    """
    # dropna=False counts NaN as a distinct value, like len(x.unique()) does.
    # The counts are computed once and reused for both the table and the chart.
    unique_counts = data.nunique(dropna=False)
    display(unique_counts.to_frame().T)
    unique_counts.plot(kind="barh")

unique_data(bank_df)

In [None]:
# Check for class imbalance in the target column "Personal Loan"
fig, ax = plt.subplots(nrows=1, ncols=2, squeeze=True)
fig.set_size_inches(14, 6)

# One row per class with a single "count" column
frequency_colums = pd.crosstab(index=bank_df["Personal Loan"], columns="count")

# rot=0 keeps the class labels horizontal (rot=True was interpreted as 1 degree)
frequency_colums.plot(kind='bar', ax=ax[0], color="c", legend=False, rot=0, fontsize=10)
frequency_colums.plot(kind='pie', ax=ax[1], subplots=True, legend=False, fontsize=10, autopct='%.2f')
ax[0].set_title('Frequency Distribution of Dependent variable: Personal Loan', fontsize=10)
ax[1].set_title('Pie chart representation of Dependent variable: Personal Loan', fontsize=10)

# Annotate every bar with its count, centred just above the bar
rects = ax[0].patches
labels = frequency_colums["count"].values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax[0].text(rect.get_x() + rect.get_width() / 2, height + 1, label, ha='center', va='bottom', fontsize=10)
plt.show()

## Observation/Inference from IDA - Initial Data Analysis:
1. The dataset has 5000 records with 14 features/variables. The data is not huge.
2. The dataset contains mostly integer data types.
3. There are NO missing values in the dataset. That's great.
4. The dataset is likely imbalanced. The ratio of customers who did not opt for a Personal Loan to those who did is about 90:10. We may need to find a way to balance these two classes.

## Data Visualization and Insight-EDA(Exploratory Data Analysis)

It is a good practice to understand the data first and try to gather as many insights from it. EDA is all about making sense of data in hand.

## Univariate Analysis:

In [None]:
#statistical analysis of data set
bank_df.describe().T

##### Observation: 
The minimum of Experience is -3. Experience cannot be a negative value. We will treat this condition later. The assumption is that Experience and Age are related.

### Data Distribution of each feature:

In [None]:
def distploting(df):
    """Plot the distribution of every column of ``df`` on a two-column grid.

    The grid grows with the number of columns, so the frame does not need to
    have exactly 14 columns. Panels left over when the column count is odd
    are hidden.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted; each column is passed to
        ``sns.distplot`` as a plain array.
    """
    column_names = df.columns.values.tolist()
    sns.set(context='notebook', style='whitegrid', palette='dark',
            font='sans-serif', font_scale=1.2, color_codes=True)
    if not column_names:
        return  # nothing to draw, and a zero-row grid is not a valid layout

    n_grid_cols = 2
    n_grid_rows = -(-len(column_names) // n_grid_cols)  # ceiling division
    fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, squeeze=False)
    # 8 x 20 inches for the original 7-row layout; height scales with the row count
    fig.set_size_inches(8, 20 * n_grid_rows / 7)

    panels = axes.ravel()
    for panel, name in zip(panels, column_names):
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # histplot(kde=True) is the suggested successor. Confirm the target version.
        sns.distplot(df[name].values, ax=panel, bins=30, color="c")
        panel.set_title(name, fontsize=17)

    # Hide the spare panel when the number of columns is odd
    for spare in panels[len(column_names):]:
        spare.set_visible(False)

    # Lay out once, after all panels are drawn
    plt.tight_layout()
        
             
distploting(bank_df)

#### These values classify the samples into sets of similar samples. Within categorical features, are the values nominal, ordinal, ratio, or interval based?
Among other things, this helps us select the appropriate plots for visualization.
### Categorical feature:
###### ordinal:
- Family
- Education
###### nominal:
- ID
- Zip Code
- Securities Account
- CD Account
- Online
- Credit Card
### Numerical feature:
- Age
- Experience
- Income
- CCAvg
- Mortgage

# Assumptions based on data analysis
We arrive at following assumptions based on data analysis done so far. We may validate these assumptions further before taking appropriate actions.

#### Correlating.

We want to know how well does each feature correlate with personal loan acceptance. We want to do this early in our project and match these quick correlations with modelled correlations later in the project.

#### Creating.

1. We may want to engineer the Mortgage & Income features to see the ratio of mortgage to income (monthly).
2. We may want to engineer the Income & CCAvg features to see the ratio of CCAvg to income (monthly).
3. We may want to create new features for Age and Experience bands. This turns a continuous numerical feature into an ordinal categorical feature.

## Bi-Variate Analysis - With Pivot Table for Categorical variable:

In [None]:
bank_df[['CreditCard', 'Personal Loan']].groupby(['CreditCard'], as_index=False).mean().sort_values(by='Personal Loan', ascending=False)

In [None]:
bank_df[['Online', 'Personal Loan']].groupby(['Online'], as_index=False).mean().sort_values(by='Personal Loan', ascending=False)

In [None]:
bank_df[['Family', 'Personal Loan']].groupby(['Family'], as_index=False).mean().sort_values(by='Personal Loan', ascending=False)

In [None]:
bank_df[['Education', 'Personal Loan']].groupby(['Education'], as_index=False).mean().sort_values(by='Personal Loan', ascending=False)

In [None]:
bank_df[['CD Account', 'Personal Loan']].groupby(['CD Account'], as_index=False).mean().sort_values(by='Personal Loan', ascending=False)

In [None]:
bank_df[['Securities Account', 'Personal Loan']].groupby(['Securities Account'], as_index=False).mean().sort_values(by='Personal Loan', ascending=False)

##### Observation by analyzing pivoting features:
1. **CD_Account:** We observe a significant correlation (~0.5) between CD_Account=1 and Personal_Loan accepted. We decide to include this feature in our model.

2. **Securities Account/Education/Family:** We observe a mild correlation (>.1) with Personal_Loan accepted. We decide to include these features in our model.

3. **Credit_Card/Online (NetBanking Facility):** We observe little correlation (less than .1) with Personal_Loan accepted. We may want to exclude both of these features from our model.

# Analyze by visualizing data
Now we can continue confirming some of our assumptions using visualizations for analyzing the data.Let us start by understanding correlations between numerical features and our solution goal (Personal loan accepted).

In [None]:
g = sns.FacetGrid(bank_df, col='Personal Loan')
g.map(plt.hist,'Income', bins=20)

In [None]:
g = sns.FacetGrid(bank_df, col='Personal Loan')
g.map(plt.hist,'Mortgage', bins=20)

In [None]:
g = sns.FacetGrid(bank_df, col='Personal Loan')
g.map(plt.hist,'CCAvg', bins=20)

In [None]:
g = sns.FacetGrid(bank_df, col='Personal Loan')
g.map(plt.hist,'Age', bins=20)

In [None]:
g = sns.FacetGrid(bank_df, col='Personal Loan')
g.map(plt.hist,'Experience', bins=20)

##### Observations.

1. Customer having less income (Income<=100K) had high rejection rate.
2. Customer having 0 Mortgage had high rejection rate.
3. Customers having low CCAvg mostly rejected the Personal Loan offer. Customers having CCAvg between 2.5 and 6 have a higher rate of acceptance of the offer.
4. Most Customers are in 35-55 age range.
5. Most Customers are in 15-35 Experience range

##### Decisions.
1. We should consider Income & Mortgage in our model training.
2. We should band age and Experience group may be.

### Correlating numerical and ordinal features
We can combine multiple features for identifying correlations using a single plot. This can be done with numerical and categorical features which have numeric values.

Categorical:Online,CreditCard,Securities Account,CD Account.
Ordinal: Education & Family.

In [None]:
# Income histograms for every (Education, Personal Loan) combination.
# `height` is the current name of the facet-size keyword (formerly `size`).
grid = sns.FacetGrid(bank_df, col='Personal Loan', row='Education', height=2.5, aspect=1.6)
grid.map(plt.hist, 'Income', alpha=.5, bins=20)
grid.add_legend();

##### Observation: 
1. Most customers with less than 100K annual income and an undergraduate education rejected the loan.
2. The majority of the bank's customers have less than 100K annual income and an undergraduate education.

In [None]:
# Income histograms for every (Family, Personal Loan) combination.
# `height` is the current name of the facet-size keyword (formerly `size`).
grid = sns.FacetGrid(bank_df, col='Personal Loan', row='Family', height=2.5, aspect=1.6)
grid.map(plt.hist, 'Income', alpha=.5, bins=20)
grid.add_legend();

**Observation:**
Singles (Family Size 1) and couples (Family Size 2) are the majority of Thera Bank's customers.

In [None]:
# Income histograms for every (Online, Personal Loan) combination.
# `height` is the current name of the facet-size keyword (formerly `size`).
grid = sns.FacetGrid(bank_df, col='Personal Loan', row='Online', height=2.5, aspect=1.6)
grid.map(plt.hist, 'Income', alpha=.5, bins=20)
grid.add_legend();

**Observation:** Customers with a high income (yearly income > 120K) do not have the net-banking (Online) facility.

In [None]:
# Income histograms for every (CreditCard, Personal Loan) combination.
# `height` is the current name of the facet-size keyword (formerly `size`).
grid = sns.FacetGrid(bank_df, col='Personal Loan', row='CreditCard', height=2.5, aspect=1.6)
grid.map(plt.hist, 'Income', alpha=.5, bins=20)
grid.add_legend();

##### Observation: 
1. Most of the bank's customers do not have a credit card.
2. Customers with a high income who do not have a credit card have a higher rate of loan offer acceptance.

In [None]:
# Mortgage histograms for every (Family, Personal Loan) combination.
# `height` is the current name of the facet-size keyword (formerly `size`).
grid = sns.FacetGrid(bank_df, col='Personal Loan', row='Family', height=2.5, aspect=1.6)
grid.map(plt.hist, 'Mortgage', alpha=.5, bins=20)
grid.add_legend();

# Data Cleaning/Wrangle data:

We have collected several assumptions and decisions regarding our datasets and solution requirements. So far we did not have to change a single feature or value to arrive at these. Let us now execute our decisions and assumptions for correcting, creating, and completing goals.

##### Correcting by imputing the Data:
In the Experience feature we saw some negative values. Let's fix that by comparing with Age.

In [None]:
# Compare Age, Education and Experience per customer, youngest first.
# NOTE(review): the filter keeps rows with Experience > 0; if the goal is to
# inspect the negative Experience values, the condition should probably be < 0 - confirm.
(
    bank_df.loc[bank_df["Experience"] > 0, ["Age", "Education", "Experience"]]
    .sort_values("Age")
    .head()
)

In [None]:
# Check whether mean Experience is linearly related to Age
df = bank_df.groupby("Age")["Experience"].mean().reset_index()

# lmplot is figure-level and builds its own figure, so the 20 x 6 inch size is
# requested through height/aspect instead of resizing an unrelated earlier figure.
sns.lmplot(x='Age', y='Experience', data=df, height=6, aspect=20 / 6)
plt.ylabel("Experience(Mean)")
plt.title("Mean Experience by Age")
plt.show()

In [None]:
# The plot above shows a linear relationship between Age and Experience, so the
# negative Experience entries are treated as correct magnitudes recorded with
# the wrong sign: every value is replaced by its absolute value.
bank_df["Experience"] = bank_df["Experience"].abs()

## Create new Feature data:

##### Create monthly income net of credit-card spending (PP_income_M):

In [None]:
# New feature: Income * 1000 / 12 minus CCAvg * 1000 / 12 (presumably Income is a yearly figure in thousands).
# NOTE(review): if CCAvg is already a monthly amount, dividing it by 12 understates the spend - confirm the units against the data dictionary.
bank_df["PP_income_M"] = (((bank_df["Income"]*1000)/12)-((bank_df["CCAvg"]*1000)/12))

In [None]:
g = sns.FacetGrid(bank_df, col='Personal Loan')
g.map(plt.hist,'PP_income_M', bins=20)

## Correcting by dropping features:
This is a good starting goal to execute. By dropping features we are dealing with fewer data points. Speeds up our notebook and eases the analysis.

Based on our assumptions and decisions we want to drop the ID and Zip features.

In [None]:
bank_df = bank_df.drop(['ID','ZIP Code'], axis=1)

# Co-relation Map

In [None]:
# Pairwise correlation between all columns
corr = bank_df.corr()

# Mask the upper triangle (diagonal included); it only mirrors the lower one
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(20, 15))
sns.heatmap(corr, annot=True, square=True, cmap="coolwarm", mask=mask)

In [None]:
# pairplot is figure-level and creates its own figure, so no plt.figure() call
# precedes it (that call only produced an extra empty 20 x 20 inch figure).
sns.pairplot(bank_df, hue="Personal Loan")

# Feature Scaling:

In [None]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler,robust_scale

# Continuous features that are standardised to zero mean and unit variance
colscal = ['Age', 'Experience', 'Income', 'CCAvg', 'PP_income_M']

# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split, so test-set statistics leak into the features. Consider fitting on the
# training rows only.
scaler = StandardScaler()
scaler.fit(bank_df[colscal])
scaled_bank_df = pd.DataFrame(
    scaler.transform(bank_df[colscal]),
    columns=colscal,
    index=bank_df.index,  # keep the row labels so the join below aligns row-for-row
)

# Swap the raw columns for their scaled versions
bank_df = bank_df.drop(colscal, axis=1)
bank_df = scaled_bank_df.join(bank_df)

# Test_Train Split

In [None]:
X=bank_df[['Age','Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Securities Account', 'CD Account', 'Online',
       'CreditCard','PP_income_M']]
y=bank_df["Personal Loan"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=101)

# Model:

### Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

In [None]:
# Predict on the held-out set. X_test is passed as a DataFrame (not .values) so
# its column names match the ones the model was fitted with.
predict = logmodel.predict(X_test)
predictProb = logmodel.predict_proba(X_test)
# accuracy_score takes (y_true, y_pred); the result is stored as a percentage
acc_log = round(metrics.accuracy_score(y_test, predict) * 100, 2)

In [None]:
import pickle

# Persist the fitted logistic-regression model. The with-block closes the file
# even if pickling fails, so no handle is leaked.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(logmodel, model_file)

In [None]:
print("**"*40)
print('The accuracy of the Logistic is',metrics.accuracy_score(predict,y_test))
print("__"*40)
print("confusion_matrix :\n",confusion_matrix(y_test, predict))
print("__"*40)
print("\nclassification_report :\n",classification_report(y_test, predict))
print("__"*40)
print('Recall Score',recall_score(y_test, predict))
print('ROC AUC :', roc_auc_score(y_test, predictProb[:,1]))
print('Accuracy :',accuracy_score(y_test, predict))
print("**"*40)

# Logistic Regression with KFold Cross Validation:

In [None]:
from sklearn.model_selection import cross_validate

# A single 10-fold run scores all five metrics on the same fitted models,
# instead of repeating the whole cross-validation once per metric.
cv_results = cross_validate(
    estimator=logmodel, X=X, y=y, cv=10,
    scoring=["recall", "roc_auc", "accuracy", "f1", "average_precision"],
)
score1 = cv_results["test_recall"]
score2 = cv_results["test_roc_auc"]
score3 = cv_results["test_accuracy"]
score4 = cv_results["test_f1"]
score5 = cv_results["test_average_precision"]

In [None]:
print("**"*40)
print("Logistic Regression Cross Validation:")
print("\nCross Validation Recall :",score1.mean())
print("Cross Validation Roc Auc :",score2.mean())
print("Cross Validation accuracy :",score3.mean())
print("Cross Validation f1 :",score4.mean())
print("Cross Validation average_precision :",score5.mean())
print("**"*40)

# K-Nearest-Neighbors:

In [None]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)

In [None]:
# Predict on the held-out set. X_test is passed as a DataFrame (not .values) so
# its column names match the ones the model was fitted with.
predict = knn.predict(X_test)
predictProb = knn.predict_proba(X_test)
# accuracy_score takes (y_true, y_pred); the result is stored as a percentage
acc_knn = round(metrics.accuracy_score(y_test, predict) * 100, 2)

In [None]:
print("**"*40)
print('The accuracy of the KNN is',metrics.accuracy_score(predict,y_test))
print("__"*40)
print("confusion_matrix :\n",confusion_matrix(y_test, predict))
print("__"*40)
print("\nclassification_report :\n",classification_report(y_test, predict))
print("__"*40)
print('Recall Score',recall_score(y_test, predict))
print('ROC AUC :', roc_auc_score(y_test, predictProb[:,1]))
print("**"*40)

# K-Nearest-Neighbors with KFold Cross Validation

In [None]:
from sklearn.model_selection import cross_validate

# A single 10-fold run scores all five metrics on the same fitted models,
# instead of repeating the whole cross-validation once per metric.
cv_results = cross_validate(
    estimator=knn, X=X, y=y, cv=10,
    scoring=["recall", "roc_auc", "accuracy", "f1", "average_precision"],
)
score1 = cv_results["test_recall"]
score2 = cv_results["test_roc_auc"]
score3 = cv_results["test_accuracy"]
score4 = cv_results["test_f1"]
score5 = cv_results["test_average_precision"]

In [None]:
print("KNN Cross Validation:")
print("**"*40)
print("\nCross Validation Recall :",score1.mean())
print("Cross Validation Roc Auc :",score2.mean())
print("Cross Validation accuracy :",score3.mean())
print("Cross Validation f1 :",score4.mean())
print("Cross Validation average_precision :",score5.mean())
print("**"*40)

# K-Nearest-Neighbors with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
k = np.arange(1,10,1)

In [None]:
parameters = {'n_neighbors': k, 
              'weights': ["uniform","distance"], 
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
             }

acc_scorer = make_scorer(accuracy_score)

In [None]:
grid_obj = GridSearchCV(knn, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

In [None]:
print("**"*40)
print('The accuracy of the KNN is',metrics.accuracy_score(predict,y_test))

In [None]:
# Predict with the refitted best estimator. X_test is passed as a DataFrame
# (not .values) so its column names match the ones seen during fit.
predict = grid_obj.predict(X_test)
predictProb = grid_obj.predict_proba(X_test)

In [None]:
print("**"*40)
print('The accuracy of the KNN with GridSearchCV is',metrics.accuracy_score(y_test,predict))
print("__"*40)
print("confusion_matrix :\n",confusion_matrix(y_test, predict))
print("__"*40)
print("\nclassification_report :\n",classification_report(y_test, predict))
print("__"*40)
print('Recall Score',recall_score(y_test, predict))
print('ROC AUC :', roc_auc_score(y_test, predictProb[:,1]))
print('Accuracy :',accuracy_score(y_test, predict))
print("**"*40)

In [None]:
from sklearn import model_selection

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19
neighbors = list(np.arange(1, 20, 2))

# Mean 10-fold cross-validated accuracy on the training split, one entry per candidate
cv_scores = []

# Loop-local names are used so the notebook-level `knn` estimator and `k` grid
# defined in earlier cells are not overwritten by this sweep.
for n_neighbors in neighbors:
    candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
    scores = model_selection.cross_val_score(candidate, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

In [None]:
# Convert each mean accuracy into a misclassification error (1 - accuracy)
MSE = [1 - score for score in cv_scores]

# Position of the first lowest error selects the matching neighbour count
best_position = min(range(len(MSE)), key=MSE.__getitem__)
optimal_k = neighbors[best_position]
print ("The optimal number of neighbors is %d" % optimal_k)

# Plot misclassification error against k, with an integer tick every 2 neighbours
plt.plot(neighbors, MSE)
x_axis = plt.gca().xaxis
x_axis.set_major_locator(matplotlib.ticker.MultipleLocator(2))
x_axis.set_major_formatter(matplotlib.ticker.StrMethodFormatter("{x:.0f}"))
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()

# Naïve Bayes:

In [None]:
gb = GaussianNB()
gb.fit(X_train, y_train)

In [None]:
predict = gb.predict(X_test)
predictProb = gb.predict_proba(X_test)
acc_nb=round(metrics.accuracy_score(predict,y_test)*100,2)

In [None]:
print("**"*40)
print('The accuracy of the Naïve Bayes is',metrics.accuracy_score(predict,y_test))
print("__"*40)
print("confusion_matrix :\n",confusion_matrix(y_test, predict))
print("__"*40)
print("\nclassification_report :\n",classification_report(y_test, predict))
print("__"*40)
print('Recall Score',recall_score(y_test, predict))
print('ROC AUC :', roc_auc_score(y_test, predictProb[:,1]))
print('Accuracy :',accuracy_score(y_test, predict))
print("**"*40)

# Naïve Bayes with KFold cross validation:

In [None]:
from sklearn.model_selection import cross_validate

# A single 10-fold run scores all five metrics on the same fitted models,
# instead of repeating the whole cross-validation once per metric.
cv_results = cross_validate(
    estimator=gb, X=X, y=y, cv=10,
    scoring=["recall", "roc_auc", "accuracy", "f1", "average_precision"],
)
score1 = cv_results["test_recall"]
score2 = cv_results["test_roc_auc"]
score3 = cv_results["test_accuracy"]
score4 = cv_results["test_f1"]
score5 = cv_results["test_average_precision"]

In [None]:
print("Naïve Bayes Cross Validation:")
print("**"*40)
print("\nCross Validation Recall :",score1.mean())
print("Cross Validation Roc Auc :",score2.mean())
print("Cross Validation accuracy :",score3.mean())
print("Cross Validation f1 :",score4.mean())
print("Cross Validation average_precision :",score5.mean())
print("**"*40)

# Model evaluation
We can now rank our evaluation of all the models to choose the best one for our problem. Only KNN, Logistic Regression and Naive Bayes were trained in this notebook, so the table below ranks those three by their test-set accuracy. Because the classes are imbalanced, the recall and ROC AUC figures reported above should be considered alongside accuracy.

In [None]:
# Test-set accuracy (in %) of each fitted model, best first
model_names = ['KNN', 'Logistic Regression', 'Naive Bayes']
model_scores = [acc_knn, acc_log, acc_nb]
models = pd.DataFrame({'Model': model_names, 'Score': model_scores})
models.sort_values(by='Score', ascending=False)

# References
This notebook has been created based on great work done solving the Titanic competition and other sources.
1. https://www.kaggle.com/startupsci/titanic-data-science-solutions .
2. https://www.kaggle.com/iconoclash/personal-loan-dataset-binary-classification.