# Notebook 03: Modeling


Need to install this first https://anaconda.org/conda-forge/imbalanced-learn

### Import libraries

In [1]:
# Standard libraries
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# library to handle imbalanced classes
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as imbPipeline

# scikit items
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (train_test_split, learning_curve, StratifiedKFold,
                                    cross_validate, cross_val_score, GridSearchCV)
from sklearn.metrics import (accuracy_score, recall_score, confusion_matrix, roc_curve,
                             auc, make_scorer, precision_score)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.svm import SVC

#Remove deprecation warnings
import warnings
warnings.simplefilter('ignore', category = DeprecationWarning)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Import pickled dataframe

In [2]:
#recall that cw stands for combined wine
with open('./combo_wines_cleaned.pickle', 'rb') as read_file:
    cwraw = pickle.load(read_file)

In [3]:
cwraw.head(2)

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,redness,class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1,0


In [4]:
#rename class to Class to as class is python object
cwraw.rename(columns = {'class': 'Class'}, inplace = True)
cwraw.head()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,redness,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,0


#### Remove `quality` from dataframe

In [5]:
cw = cwraw[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'redness', 'Class']]

In [6]:
cw.head(1)

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,redness,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1,0


### Define X matrix and y vector

In [7]:
X = cw.iloc[:, :-1]
y = cw.iloc[:, -1]

### Splitting Train / Test Sets (with stratification)

In [8]:
X_train, X_test, y_train, y_test = train_test_split( X, y,
                                                   test_size=0.20,
                                                   stratify=y)

In [9]:
#Note the class breakdown in y_train
(y_train.value_counts(normalize=True)*100).values

array([80.35405041, 19.64594959])

In [10]:
#and the breakdown in y_test...
(y_test.value_counts(normalize=True)*100).values

array([80.30769231, 19.69230769])

In [11]:
#List of breakdowns

proportion = []
for each in [y_train, y_test]:
    out = (each.value_counts(normalize=True)*100).values
    proportion.append(out)
    
proportion[0:5]

[array([80.35405041, 19.64594959]), array([80.30769231, 19.69230769])]

In [12]:
#Check that the stratification works.  That is, do both sets have same proportion of classes 0&1?

types_of_y = [y_train, y_test]
proportion = [(each.value_counts(normalize=True)*100).values for each in types_of_y]

pd.DataFrame(np.r_[proportion], index=['% train', '% test'],
            columns = ['Class-0', 'Class-1'])

Unnamed: 0,Class-0,Class-1
% train,80.35405,19.64595
% test,80.307692,19.692308


### Tune performance for several models
-Logistic Regression  
-Random Forest  
-KNN

In [13]:
#NOTE: HAVE TO STRATIFY THE k-fold!
#use a var to hold the stratified object so that it can be called on subsequent models
skf = StratifiedKFold(n_splits=10, random_state=123)

In [14]:
#Set up each model w/ imb pipeline to feed GridsearchCV
pipe_lr = imbPipeline([
                        ('oversample', RandomOverSampler(random_state=0)),
                        ('scaling', StandardScaler()),
                        ('logreg', LogisticRegression())
                        ])
pipe_rf = imbPipeline([
                        ('oversample', RandomOverSampler(random_state=0)),
                        ('RandomForest', RandomForestClassifier())
                        ])
pipe_knn = imbPipeline([
                        ('oversample', RandomOverSampler(random_state=0)), 
                        ('knn', KNeighborsClassifier())
                       ])


In [15]:
# Grid-Searching parameters

param_lr   = {'logreg__penalty'   : ['l1','l2'],
              #'logreg__solver'    : ['liblinear','lbfgs'],
              'logreg__solver'    : ['liblinear'],
              'logreg__max_iter'  : [100, 200, 400],
              'logreg__C'         : np.logspace(0, 4, 10)
#               'logreg__C'         : [1.0, 2.0, 3.0, 4.0, 10, 15 ]
            }

        
param_rf = {'RandomForest__n_estimators'      : [50, 80, 200, 300],
             'RandomForest__max_features'     : ['auto','sqrt','log2'],
             'RandomForest__max_depth'        : [2, 3, 4, 5],
             'RandomForest__min_samples_split': [2, 3, 4],
             'RandomForest__min_samples_leaf' : [1, 2, 4],
             'RandomForest__bootstrap'        : [True, False],
            }

In [16]:
param_knn = {'knn__n_neighbors': [i for i in range(1,25)],
#              'knn__weights'  : ['uniform', 'distance']
                }

scoring = {'REC': make_scorer(recall_score), 
           'ACC': make_scorer(accuracy_score), 
           'PREC': make_scorer(precision_score)}

In [17]:
# Setup grid search

grid_lr = GridSearchCV(pipe_lr,param_grid=param_lr,
                       return_train_score=True, 
                       n_jobs=-1, scoring=scoring,
                       verbose=1, 
                       refit='PREC',
                      cv=skf)
grid_rf = GridSearchCV(pipe_rf, param_grid=param_rf,
                       return_train_score=True,
                       refit='PREC',
                       verbose=1,
                       n_jobs=-1, scoring=scoring,
                      cv=skf)

In [18]:
grid_knn = GridSearchCV(pipe_knn, param_grid=param_knn,
                       return_train_score=True,
                       refit='PREC',
                       verbose=1,
                       n_jobs=-1, scoring=scoring,
                      cv=skf)

In [19]:
# # Run GRID fit
grid_lr.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)

Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   12.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 10 folds for each of 864 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   43.1s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 4976 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 6026 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 7176 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 8426 tasks      | elapsed: 20.7min
[Parallel(n_jobs=-1)]: Done 8640 out of 8640 | elapsed: 21.4min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=123, shuffle=False),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('oversample',
                                        RandomOverSampler(random_state=0,
                                                          ratio=None,
                                                          return_indices=False,
                                                          sampling_strategy='auto')),
                                       ('RandomForest',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth...
                         'RandomForest__max_features': ['auto', 'sqrt', 'log2'],
           

In [20]:
grid_knn.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   29.1s finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=123, shuffle=False),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('oversample',
                                        RandomOverSampler(random_state=0,
                                                          ratio=None,
                                                          return_indices=False,
                                                          sampling_strategy='auto')),
                                       ('knn',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_...
                                    

In [21]:
pipe_knn.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'oversample', 'knn', 'oversample__random_state', 'oversample__ratio', 'oversample__return_indices', 'oversample__sampling_strategy', 'knn__algorithm', 'knn__leaf_size', 'knn__metric', 'knn__metric_params', 'knn__n_jobs', 'knn__n_neighbors', 'knn__p', 'knn__weights'])

In [22]:
# # Instantiate best Estimator
Opt_lr = grid_lr.best_estimator_
Opt_rf = grid_rf.best_estimator_
Opt_knn = grid_knn.best_estimator_

In [23]:
# Print results of the best_estimators

Opt_models= [grid_lr, grid_rf, grid_knn]
model_names = ['LogReg','RandomForest','knn']
for i in range(3):
    print('Grid Search Result for %s' %(model_names[i]))
    print('best parameters:     ', Opt_models[i].best_params_)
    print('best score     : %.2f' %Opt_models[i].best_score_)
    print('mean accuracy  : %.2f' %Opt_models[i].cv_results_['mean_test_ACC'].mean())
    print('mean recall    : %.2f' %Opt_models[i].cv_results_['mean_test_REC'].mean())
    print('mean precision : %.2f' %Opt_models[i].cv_results_['mean_test_PREC'].mean())

Grid Search Result for LogReg
best parameters:      {'logreg__C': 7.742636826811269, 'logreg__max_iter': 100, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear'}
best score     : 0.39
mean accuracy  : 0.72
mean recall    : 0.76
mean precision : 0.39
Grid Search Result for RandomForest
best parameters:      {'RandomForest__bootstrap': False, 'RandomForest__max_depth': 5, 'RandomForest__max_features': 'auto', 'RandomForest__min_samples_leaf': 1, 'RandomForest__min_samples_split': 2, 'RandomForest__n_estimators': 80}
best score     : 0.44
mean accuracy  : 0.74
mean recall    : 0.76
mean precision : 0.41
Grid Search Result for knn
best parameters:      {'knn__n_neighbors': 2}
best score     : 0.52
mean accuracy  : 0.70
mean recall    : 0.67
mean precision : 0.37


In [24]:
# # Pickle objects
with open('./gridsearched_Precision_lr.pkl','wb') as fin:
    pickle.dump(grid_lr, fin)
with open('./gridsearched_Precision_rf.pkl','wb') as fin:
    pickle.dump(grid_rf, fin)
with open('./gridsearched_Precision_knn.pkl','wb') as fin:
    pickle.dump(grid_knn, fin)

In [25]:
# # Pickle objects
with open('./opt_Precision_lr.pkl','wb') as fin:
    pickle.dump(Opt_lr, fin)
with open('./opt_Precision_rf.pkl','wb') as fin:
    pickle.dump(Opt_rf, fin)
with open('./opt_Precision_knn.pkl','wb') as fin:
    pickle.dump(Opt_knn, fin)

In [26]:
grid_knn.best_estimator_

Pipeline(memory=None,
         steps=[('oversample',
                 RandomOverSampler(random_state=0, ratio=None,
                                   return_indices=False,
                                   sampling_strategy='auto')),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=2, p=2,
                                      weights='uniform'))],
         verbose=False)

In [27]:
grid_knn.best_params_

{'knn__n_neighbors': 2}

In [28]:
grid_lr.best_params_

{'logreg__C': 7.742636826811269,
 'logreg__max_iter': 100,
 'logreg__penalty': 'l1',
 'logreg__solver': 'liblinear'}

In [29]:
grid_rf.best_params_

{'RandomForest__bootstrap': False,
 'RandomForest__max_depth': 5,
 'RandomForest__max_features': 'auto',
 'RandomForest__min_samples_leaf': 1,
 'RandomForest__min_samples_split': 2,
 'RandomForest__n_estimators': 80}

In [30]:
grid_lr.cv_results_['mean_test_REC']

array([0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041,
       0.75902041, 0.75902041, 0.75902041, 0.75902041, 0.75902041])