# Notebook code for api


Training model in notebook

# 1)- Import key modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

In [3]:
# checking version


# first install: pip install version_information
%reload_ext version_information
%version_information pandas, numpy, pickle, sklearn

Software,Version
Python,3.7.5 64bit [Clang 4.0.1 (tags/RELEASE_401/final)]
IPython,7.10.2
OS,Darwin 19.4.0 x86_64 i386 64bit
pandas,0.25.3
numpy,1.17.4
pickle,The 'pickle' distribution was not found and is required by the application
sklearn,0.22.2.post1
Sat May 16 17:21:31 2020 CEST,Sat May 16 17:21:31 2020 CEST


# 2)- Load Dataset

Dataset Link: https://www.kaggle.com/ritesaluja/bank-note-authentication-uci-data

In [4]:
df=pd.read_csv("data/BankNote_Authentication.csv")
df.shape

(1372, 5)

In [5]:
df.head()

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [6]:
#check target variable
df['class'].value_counts()

0    762
1    610
Name: class, dtype: int64

# 3)-Model Preprocess

### 3a.Independent and Dependent features

In [7]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [8]:
X.head()

Unnamed: 0,variance,skewness,curtosis,entropy
0,3.6216,8.6661,-2.8073,-0.44699
1,4.5459,8.1674,-2.4586,-1.4621
2,3.866,-2.6383,1.9242,0.10645
3,3.4566,9.5228,-4.0112,-3.5944
4,0.32924,-4.4552,4.5718,-0.9888


In [9]:
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

# 4)- Model implementation

In [11]:
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
### Prediction
y_pred=classifier.predict(X_test)

In [13]:
y_pred[:5]

array([0, 0, 0, 0, 0])

# 5)- Evaluate Model

### 5a. Check Accuracy

In [14]:
score=accuracy_score(y_test,y_pred)
score

0.9941690962099126

### 5b. For predict class

In [15]:
score_predprob=classifier.predict_proba(X_test)
score_predprob[:5]

array([[1.  , 0.  ],
       [0.99, 0.01],
       [0.9 , 0.1 ],
       [1.  , 0.  ],
       [1.  , 0.  ]])

### 5.c.for class 1 i.e authentic notes

In [16]:
score_predprob=classifier.predict_proba(X_test)[:, 1]
score_predprob[:5]

array([0.  , 0.01, 0.1 , 0.  , 0.  ])

### 5d.Classification Report

In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       191
           1       1.00      0.99      0.99       152

    accuracy                           0.99       343
   macro avg       0.99      0.99      0.99       343
weighted avg       0.99      0.99      0.99       343



# 6)- Training on full data 

In [18]:
X.head()

Unnamed: 0,variance,skewness,curtosis,entropy
0,3.6216,8.6661,-2.8073,-0.44699
1,4.5459,8.1674,-2.4586,-1.4621
2,3.866,-2.6383,1.9242,0.10645
3,3.4566,9.5228,-4.0112,-3.5944
4,0.32924,-4.4552,4.5718,-0.9888


In [19]:
print(y[:5])

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64


In [20]:
full_classifier=RandomForestClassifier()
full_classifier.fit(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# 6)- Out of sample data

- variance , skewness , curtosis , entropy
- For query : predict?variance=2&skewness=3&curtosis=2&entropy=1

In [21]:
oos_data=[[2,3,4,1]]

In [22]:
oos_data

[[2, 3, 4, 1]]

In [23]:
full_classifier.predict(oos_data)

array([0])

In [24]:
full_classifier.predict_proba(oos_data)[:, 1]

array([0.])

# 7)-Serialize Model
for use in api

In [25]:
pickle_out = open("train_model/classifier.pkl","wb")
pickle.dump(full_classifier, pickle_out)
pickle_out.close()

# END of Notebook Code