# In this Jupyter notebook we demonstrate how to build a Python predictive model with scikit-learn.

The Dataset for personal loan classification is taken from: https://www.kaggle.com/itsmesunil/bank-loan-modelling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns
%matplotlib inline
sns.set(style="ticks")

from scipy.stats import zscore
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection

In [None]:
# Insert Cloud Object Storage credentials and load the dataset here.
# The cells below expect the loaded dataset in a pandas DataFrame named `data`.

In [None]:
# Watson Machine Learning service credentials.
# Preferred: set the WML_APIKEY, WML_INSTANCE_ID and WML_URL environment variables
# before starting the kernel, so that secrets are never saved inside the notebook.
# Otherwise, replace the placeholder defaults below with the values from your service.
import os

wml_credentials = {
    "apikey": os.environ.get("WML_APIKEY", "<api key>"),
    "instance_id": os.environ.get("WML_INSTANCE_ID", "<instance id>"),
    "url": os.environ.get("WML_URL", "<URL>"),
}

In [None]:
# Assign the 14 column labels used throughout the rest of the notebook,
# then display them to confirm the assignment.
data.columns = [
    "ID",
    "Age",
    "Experience",
    "Income",
    "ZIPCode",
    "Family",
    "CCAvg",
    "Education",
    "Mortgage",
    "PersonalLoan",
    "SecuritiesAccount",
    "CDAccount",
    "Online",
    "CreditCard",
]
data.columns

### Exploring the dataset

#### The dataset has 5000 rows of data and 14 attributes

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
data.shape

In [None]:
# Per-column dtype and non-null count summary.
data.info()

#### No columns have null data in the file

In [None]:
# Number of missing values per column (vectorized; same result as summing
# each column's isnull() flags one element at a time).
data.isnull().sum()

#### Eyeballing the data

In [None]:
# Summary statistics, transposed so that each column of `data` is shown as a row.
data.describe().T

#### Counting the unique values in each column

In [None]:
# Number of distinct values per column. dropna=False counts a missing value as
# its own distinct value, matching len(column.unique()).
data.nunique(dropna=False)

#### There are 52 records with negative experience. Before proceeding any further, we need to clean them up

In [None]:
# Count the rows whose Experience is below zero.
(data['Experience'] < 0).sum()

#### Clean the negative Experience values

In [None]:
# Reference rows: customers with strictly positive Experience.
# NOTE(review): rows with Experience == 0 are excluded here as well — confirm this is intended.
dfExp = data[data['Experience'].gt(0)]
# Boolean mask marking the rows whose Experience is negative.
negExp = data['Experience'].lt(0)
column_name = 'Experience'

#### Get the IDs of the customers who have negative experience

In [None]:
# IDs of the customers flagged by the negative-experience mask.
mylist = data.loc[negExp, 'ID'].tolist()

#### There are 52 records with negative experience

In [None]:
# Tally of the boolean mask: True = negative Experience, False = everything else.
negExp.value_counts()

#### So we replace each negative experience value with the median experience of customers with the same age and education level

In [None]:
# Replace every negative Experience value with the median Experience of the
# reference customers (dfExp) who share the same Age and Education.
# If no reference customer matches, the median is NaN and NaN is written.
for customer_id in mylist:
    # A boolean mask selects rows by value, so this works for any index;
    # feeding np.where positions into .loc is only correct on a default RangeIndex.
    row_mask = data['ID'] == customer_id
    age = data.loc[row_mask, 'Age'].iloc[0]
    education = data.loc[row_mask, 'Education'].iloc[0]
    peers = dfExp[(dfExp['Age'] == age) & (dfExp['Education'] == education)]
    data.loc[row_mask, 'Experience'] = peers['Experience'].median()

#### Verify that no records with negative experience remain

In [None]:
# Re-count the rows with Experience below zero after the replacement step.
(data['Experience'] < 0).sum()

In [None]:
# Summary statistics after cleaning, transposed so each column is shown as a row.
data.describe().T

In [None]:
# Income distribution per education level, split by personal-loan status.
sns.boxplot(
    data=data,
    x='Education',
    y='Income',
    hue='PersonalLoan',
)

**Observation** : It seems that customers whose education level is 1 have higher incomes. However, customers who have taken a personal loan have the same income levels.

In [None]:
# Mortgage distribution per education level, split by personal-loan status.
sns.boxplot(
    data=data,
    x="Education",
    y='Mortgage',
    hue="PersonalLoan",
    color='yellow',
)

**Inference** : From the above chart, it seems that both customers who do not have a personal loan and customers who have one have high mortgages.

In [None]:
# Number of customers per SecuritiesAccount value, split by personal-loan status.
sns.countplot(
    data=data,
    x="SecuritiesAccount",
    hue="PersonalLoan",
)

**Observation** : The majority of customers who do not have a loan have a securities account.

In [None]:
# Number of customers per family size, split by personal-loan status.
sns.countplot(
    data=data,
    x='Family',
    hue='PersonalLoan',
    palette='Set1',
)

**Observation** : Family size does not seem to have much impact on personal loans, but families of size 3 appear more likely to take a loan. This association might be useful when planning future campaigns.

In [None]:
# Number of customers per CDAccount value, split by personal-loan status.
sns.countplot(
    data=data,
    x='CDAccount',
    hue='PersonalLoan',
)

**Observation** : Customers who do not have a CD account mostly do not have a loan either; this group appears to be the majority. However, almost all customers who have a CD account have a loan as well.

In [None]:
# Income distribution per family size, split by personal-loan status.
sns.boxplot(
    data=data,
    x='Family',
    y='Income',
    hue='PersonalLoan',
)

**Observation** : Looking at the above plot, families with an income below 100K are less likely to take a loan than families with a high income.

In [None]:
# Median CCAvg for each loan group, multiplied by 1000
# (presumably CCAvg is recorded in thousands — TODO confirm).
print(
    'Credit card spending of Non-Loan customers: ',
    data.loc[data['PersonalLoan'] == 0, 'CCAvg'].median() * 1000,
)
print(
    'Credit card spending of Loan customers    : ',
    data.loc[data['PersonalLoan'] == 1, 'CCAvg'].median() * 1000,
)

### Develop a Naive Bayes Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

Split the data into training (70%) and testing (30%) sets

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# The ID and Experience columns are dropped before splitting.
train_set, test_set = train_test_split(
    data.drop(columns=['ID', 'Experience']),
    test_size=0.3,
    random_state=100,
)

In [None]:
# pop() returns the 'PersonalLoan' column and removes it from the frame in place,
# leaving train_set / test_set with the feature columns only.
# Re-running this cell without re-running the split raises KeyError,
# because the column has already been removed.
train_labels = train_set.pop('PersonalLoan')
test_labels = test_set.pop('PersonalLoan')

#### Train the Model and get Predictions

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training features and labels.
naive_model = GaussianNB().fit(train_set, train_labels)

# Predict the held-out rows; the cell output is the model's score on the test set.
prediction = naive_model.predict(test_set)
naive_model.score(test_set, test_labels)

#### The model scores an accuracy of 88.67%

In [None]:
# Predicted labels for the rows of test_set, as returned by naive_model.predict above.
print(prediction)

# Deploy the model to Watson Machine Learning

In [None]:
!pip install watson-machine-learning-client

In [None]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient

In [None]:
# Create the Watson Machine Learning client from the wml_credentials dictionary defined earlier.
client = WatsonMachineLearningAPIClient(wml_credentials)

In [None]:
# Fetch the details of the service instance
# (presumably this also confirms that the credentials are valid — TODO confirm).
instance_details = client.service_instance.get_details()

In [None]:
# Store the trained model in the WML repository, together with the training
# features and labels it was fitted on.
published_model = client.repository.store_model(
    model=naive_model,
    meta_props={'name': 'Personal Loan Prediction Model'},
    training_data=train_set,
    training_target=train_labels,
)

In [None]:
# Look up the uid of the model stored above, fetch its repository details,
# and pretty-print them as indented JSON.
import json
published_model_uid = client.repository.get_model_uid(published_model)
model_details = client.repository.get_details(published_model_uid)

print(json.dumps(model_details, indent=2))

In [None]:
# List the models held in the repository.
models_details = client.repository.list_models()

In [None]:
# Load the stored model back from the repository by its uid.
loaded_model = client.repository.load(published_model_uid)

In [None]:
# Sanity-check the reloaded model on the first 10 rows of the test set.
test_predictions = loaded_model.predict(test_set.iloc[:10])

In [None]:
# Predictions made by the reloaded model for the first 10 rows of test_set.
print(test_predictions)

In [None]:
# Create a deployment for the stored model. The string is passed as the second
# positional argument (presumably the deployment name — TODO confirm).
created_deployment = client.deployments.create(published_model_uid, 'Deployment of Personal Loan Prediction model')

In [None]:
# Fetch the deployment details, then extract the scoring URL of the deployment created above.
deployments = client.deployments.get_details()
scoring_endpoint = client.deployments.get_scoring_url(created_deployment)

In [None]:
# Show the scoring URL that the scoring request below is sent to.
print(scoring_endpoint)

In [None]:
# Sample customers, with the fields in the same order as the training feature columns:
#   Age  Income  ZIPCode  Family  CCAvg  Education  Mortgage  SecuritiesAccount  CDAccount  Online  CreditCard
#   39   139     95616    3       3.4    1          483       0                  0          1       0
#   29   31      92126    4       0.3    2          0         0                  0          1       0
scoring_payload = {
    "fields": [
        "Age", "Income", "ZIPCode", "Family", "CCAvg", "Education",
        "Mortgage", "SecuritiesAccount", "CDAccount", "Online", "CreditCard",
    ],
    # First sample customer; use [29, 31, 92126, 4, 0.3, 2, 0, 0, 0, 1, 0]
    # instead to score the second one.
    "values": [[39, 139, 95616, 3, 3.4, 1, 483, 0, 0, 1, 0]],
}

In [None]:
# Send the payload to the deployment's scoring URL and keep the response.
predictions = client.deployments.score(scoring_endpoint, scoring_payload)

In [None]:
# Pretty-print the full scoring response as indented JSON.
print(json.dumps(predictions, indent=2))

In [None]:
# First element of the first row of the response's 'values'
# (presumably the predicted PersonalLoan class — TODO confirm against the response printed above).
print(predictions['values'][0][0])