# Persisting `scikit-learn` models

In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import re

from sklearn import ensemble, metrics, model_selection as ms

Load the [Wine Quality](https://archive.ics.uci.edu/ml/datasets/Wine+Quality) data set and normalise the column names to lowercase with underscores in place of whitespace.

In [2]:
# The file is semicolon-delimited rather than comma-delimited.
wines = pd.read_csv('datasets/wine-quality.csv', delimiter=';')

In [3]:
# Lowercase every column label and collapse each run of whitespace into a
# single underscore. The result is rebound to `wines` instead of mutating the
# frame with `inplace=True`, so the cell stays a plain "input -> output" step.
wines = wines.rename(columns=lambda label: re.sub(r'\s+', '_', label.lower()))

In [4]:
# Preview the first five rows to confirm the renamed columns.
wines.head(n=5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


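Tune a random forest regression model with a grid search over `n_estimators` and `max_depth`, scoring each candidate by 5-fold cross-validated (negated) mean squared error.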

In [5]:
# Target: the `quality` column; features: every other column.
y = wines.loc[:, 'quality']
X = wines.drop('quality', axis='columns')

In [6]:
# Five shuffled folds; the fixed seed keeps the shuffle repeatable between runs.
cv = ms.KFold(5, shuffle=True, random_state=42)

In [7]:
# Base learner whose hyper-parameters are tuned below (seeded for repeatability).
estimator = ensemble.RandomForestRegressor(random_state=42)

# Grid search over forest size and maximum tree depth (3 x 3 candidates),
# each scored by negated mean squared error on the folds defined by `cv`.
gs = ms.GridSearchCV(
    estimator,
    dict(n_estimators=[25, 50, 100], max_depth=[5, 10, 25]),
    scoring='neg_mean_squared_error',
    cv=cv,
)

In [8]:
# Run the grid search; the fitted search object is this cell's displayed value.
gs.fit(X, y=y)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
       error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [25, 50, 100], 'max_depth': [5, 10, 25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [9]:
# The search was scored with 'neg_mean_squared_error', so flip the sign to
# read the best cross-validated score as a plain mean squared error.
-1 * gs.best_score_

0.3669921743780488

Persist the feature-name-to-column-index mapping and the best model to disk (see scikit-learn's [model persistence guide](https://scikit-learn.org/stable/modules/model_persistence.html)).

In [10]:
# Map each feature name to its column position in X, so that a consumer of the
# model can arrange inputs in the same column order used for training.
feature_positions = dict(zip(X.columns, range(len(X.columns))))

with open('prediction-service/features.json', 'w') as out_file:
    json.dump(feature_positions, out_file, indent=4, sort_keys=True)

In [11]:
# Serialise only the winning estimator found by the search, not the whole
# GridSearchCV object.
best_model = gs.best_estimator_

with open('prediction-service/model.pkl', 'wb') as out_file:
    pickle.dump(best_model, out_file)