In [54]:
#https://www.analyticsvidhya.com/blog/2017/09/machine-learning-models-as-apis-using-flask/
import os 
import json
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.filterwarnings("ignore")

In [10]:
!ls DATA/

test.csv     training.csv


In [11]:
data = pd.read_csv('data/training.csv')

In [12]:
list(data.columns)

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status']

In [13]:
data.shape

(614, 13)

In [14]:
for _ in data.columns:
    print("The number of null values in:{} == {}".format(_, data[_].isnull().sum()))

The number of null values in:Loan_ID == 0
The number of null values in:Gender == 13
The number of null values in:Married == 3
The number of null values in:Dependents == 15
The number of null values in:Education == 0
The number of null values in:Self_Employed == 32
The number of null values in:ApplicantIncome == 0
The number of null values in:CoapplicantIncome == 0
The number of null values in:LoanAmount == 22
The number of null values in:Loan_Amount_Term == 14
The number of null values in:Credit_History == 50
The number of null values in:Property_Area == 0
The number of null values in:Loan_Status == 0


In [15]:
data.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Here, Loan_status is our target variable, the rest are predictor variables. 
#### Loan_ID wouldn't help much in making predictions about defaulters hence we won't be considering that variable in our final model. Finding out the null/Nan values in the columns:

In [16]:
pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',\
            'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']

X_train, X_test, y_train, y_test = train_test_split(data[pred_var], data['Loan_Status'], \
                                                    test_size=0.25, random_state=42)

In [17]:
X_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
92,Male,Yes,2,Not Graduate,No,3273,1820.0,81.0,360.0,1.0,Urban
304,Male,No,0,Graduate,No,4000,2500.0,140.0,360.0,1.0,Rural
68,Male,Yes,3+,Not Graduate,Yes,7100,0.0,125.0,60.0,1.0,Urban
15,Male,No,0,Graduate,No,4950,0.0,125.0,360.0,1.0,Urban
211,Male,Yes,3+,Graduate,No,3430,1250.0,128.0,360.0,0.0,Semiurban


### Checking out the values for the columns having missing values so that we can fill them up appropriately. Probably only for small dataset

In [18]:
missing_pred = ['Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Gender', 'Married']

for _ in missing_pred:
    print("List of unique labels for {}:::{}".format(_, set(data[_])))

List of unique labels for Dependents:::set(['1', '0', '2', '3+', nan])
List of unique labels for Self_Employed:::set([nan, 'Yes', 'No'])
List of unique labels for Loan_Amount_Term:::set([nan, nan, nan, 12.0, nan, nan, nan, nan, 36.0, 300.0, 180.0, nan, 60.0, nan, nan, nan, 120.0, 84.0, nan, 480.0, nan, 360.0, 240.0, nan])
List of unique labels for Gender:::set([nan, 'Male', 'Female'])
List of unique labels for Married:::set([nan, 'Yes', 'No'])


In [19]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreProcessing(BaseEstimator, TransformerMixin):
    """Custom Pre-Processing estimator for our use-case
    """

    def __init__(self):
        pass

    def transform(self, df):
        """Regular transform() that is a help for training, validation & testing datasets
           (NOTE: The operations performed here are the ones that we did prior to this cell)
        """
        pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome',\
                    'CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']
        
        df = df[pred_var]
        
        df['Dependents'] = df['Dependents'].fillna(0)
        df['Self_Employed'] = df['Self_Employed'].fillna('No')
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)
        df['Credit_History'] = df['Credit_History'].fillna(1)
        df['Married'] = df['Married'].fillna('No')
        df['Gender'] = df['Gender'].fillna('Male')
        df['LoanAmount'] = df['LoanAmount'].fillna(self.amt_mean_)
        
        gender_values = {'Female' : 0, 'Male' : 1} 
        married_values = {'No' : 0, 'Yes' : 1}
        education_values = {'Graduate' : 0, 'Not Graduate' : 1}
        employed_values = {'No' : 0, 'Yes' : 1}
        property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}
        dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
        
        # Here we'e replacign all string values like 'Male', 'Employed', 'Graduate' with corresponding
        # numerical values from dictionaries above
        df.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values, \
                    'Self_Employed': employed_values, 'Property_Area': property_values, \
                    'Dependents': dependent_values}, inplace=True)
        
        return df.as_matrix()

    def fit(self, df, y=None, **fit_params):
        """Fitting the Training dataset & calculating the required values from train
           e.g: We will need the mean of X_train['Loan_Amount_Term'] that will be used in
                transformation of X_test
        """
        
        self.term_mean_ = df['Loan_Amount_Term'].mean()
        self.amt_mean_ = df['LoanAmount'].mean()
        return self

In [20]:
# Convert y_train & y_test to np.array:
y_train = y_train.replace({'Y':1, 'N':0}).as_matrix()
y_test = y_test.replace({'Y':1, 'N':0}).as_matrix()

### We’ll create a pipeline to make sure that all the preprocessing steps that we do are just a single scikit-learn estimator

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(PreProcessing(), RandomForestClassifier())

In [22]:
pipe

Pipeline(steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [23]:
#grid search for optimal parameters
param_grid = {"randomforestclassifier__n_estimators" : [5,10,15,20,25,30],
             "randomforestclassifier__max_depth" : [None, 6, 8, 10,12],
             "randomforestclassifier__max_leaf_nodes": [None, 2,5, 7,10,15,20], 
             "randomforestclassifier__min_impurity_split": [0.1, 0.2, 0.3,0.4]}

In [24]:
from sklearn.model_selection import train_test_split, GridSearchCV

grid = GridSearchCV(pipe, param_grid=param_grid, cv=3)

In [25]:
grid

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'randomforestclassifier__max_leaf_nodes': [None, 2, 5, 7, 10, 15, 20], 'randomforestclassifier__n_estimators': [5, 10, 15, 20, 25, 30], 'randomforestclassifier__max_depth': [None, 6, 8, 10, 12], 'randomforestclassifier__min_impurity_split': [0.1, 0.2, 0.3, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

### Fitting the training data on the pipeline estimator

In [26]:
%%time
grid.fit(X_train, y_train)

CPU times: user 3min 42s, sys: 1.29 s, total: 3min 43s
Wall time: 3min 43s


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'randomforestclassifier__max_leaf_nodes': [None, 2, 5, 7, 10, 15, 20], 'randomforestclassifier__n_estimators': [5, 10, 15, 20, 25, 30], 'randomforestclassifier__max_depth': [None, 6, 8, 10, 12], 'randomforestclassifier__min_impurity_split': [0.1, 0.2, 0.3, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [27]:
print("Best parameters: {}".format(grid.best_params_))

Best parameters: {'randomforestclassifier__max_leaf_nodes': 15, 'randomforestclassifier__n_estimators': 5, 'randomforestclassifier__max_depth': 12, 'randomforestclassifier__min_impurity_split': 0.4}


In [28]:
print("Validation set score: {:.2f}".format(grid.score(X_test, y_test)))

Validation set score: 0.77


In [29]:
test_df = pd.read_csv('data/test.csv', encoding="utf-8-sig")
test_df = test_df.head()

In [30]:
# gs.predict(X_test) is equivalent to gs.best_estimator_.predict(X_test). 
# Using either, X_test will be passed through your entire pipeline and it will return the predictions.
grid.predict(test_df)

array([1, 1, 1, 1, 1])

### Saving Machine Learning Model : Serialization & Deserialization

In [32]:
#In Python, pickling is a standard way to store objects and retrieve them as their original state. 
# To give a simple example:
list_to_pickle = [1, 'here', 123, 'walker']

#Pickling the list
import pickle

list_pickle = pickle.dumps(list_to_pickle)
list_pickle

"(lp0\nI1\naS'here'\np1\naI123\naS'walker'\np2\na."

In [33]:
#Load back
loaded_pickle = pickle.loads(list_pickle)
loaded_pickle

[1, 'here', 123, 'walker']

### Our model is saved into filename

In [36]:
import dill as pickle
filename = 'model_v1.pk'

with open(filename, 'wb') as file:
    pickle.dump(grid, file)

### Now that the model is pickled, creating a Flask wrapper around it would be the next step.
#### Before that, to be sure that our pickled file works fine – let’s load it back and do a prediction:

In [37]:
with open(filename ,'rb') as f:
    loaded_model = pickle.load(f)
    
loaded_model.predict(test_df)

array([1, 1, 1, 1, 1])

### There are three important parts in constructing our wrapper function, <font color=red>apicall()</font>:

-  Getting the <font color=red>request</font> data (for which predictions are to be made)
-  Loading our <font color=red>pickled estimator</font>
-  <font color=red>jsonifytext</font> our predictions and send the response back with <font color=red>status code: 200</font>

HTTP messages are made of a header and a body. As a standard, majority of the body content sent across are in <font color=red>json</font> format. We’ll be sending (<font color=red>POST url-endpoint/</font>) the incoming data as batch to get predictions.

(__NOTE__: You can send plain text, XML, csv or image directly but for the sake of interchangeability of the format, it is advisable to use <font color=red>json</font>)

In [40]:
# type in your shell (assuming u have files server.py and utils.py):   gunicorn --bind 0.0.0.0:8000 server:app

In [50]:
# Let’s generate some prediction data and query the API running locally at https:0.0.0.0:8000/predict
# No need to open this port in browser. It runs in the background

In [44]:
import json
import requests

In [47]:
"""Setting the headers to send and accept json responses"""

header = {'Content-Type': 'application/json', \
                  'Accept': 'application/json'}

"""Reading test batch"""

df = pd.read_csv('data/test.csv', encoding="utf-8-sig")
df = df.head()

"""Converting Pandas Dataframe to json"""

data = df.to_json(orient='records')

In [48]:
data

'[{"Loan_ID":"LP001015","Gender":"Male","Married":"Yes","Dependents":"0","Education":"Graduate","Self_Employed":"No","ApplicantIncome":5720,"CoapplicantIncome":0,"LoanAmount":110.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP001022","Gender":"Male","Married":"Yes","Dependents":"1","Education":"Graduate","Self_Employed":"No","ApplicantIncome":3076,"CoapplicantIncome":1500,"LoanAmount":126.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP001031","Gender":"Male","Married":"Yes","Dependents":"2","Education":"Graduate","Self_Employed":"No","ApplicantIncome":5000,"CoapplicantIncome":1800,"LoanAmount":208.0,"Loan_Amount_Term":360.0,"Credit_History":1.0,"Property_Area":"Urban"},{"Loan_ID":"LP001035","Gender":"Male","Married":"Yes","Dependents":"2","Education":"Graduate","Self_Employed":"No","ApplicantIncome":2340,"CoapplicantIncome":2546,"LoanAmount":100.0,"Loan_Amount_Term":360.0,"Credit_History":null,"Property_Are

In [51]:
"""POST <url>/predict"""
resp = requests.post("http://0.0.0.0:8000/predict", data = json.dumps(data), headers= header)

In [52]:
resp.status_code

200

In [53]:
"""The final response we get is as follows:"""
resp.json()

{u'predictions': u'[{"0":"LP001015","1":1},{"0":"LP001022","1":1},{"0":"LP001031","1":1},{"0":"LP001035","1":1},{"0":"LP001051","1":1}]'}

### Ok, our local API serves some data from our model

### Now, how to deploy this shit somewhere ?