In [1]:
# import packages and modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

In [2]:
# read data set from url
dataset_url = "http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(dataset_url)

In [3]:
# print head of data set
print(data.head())

  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [15]:
# clearly not comma separated, semi-colon separated in reality
# adjust read_csv() call to accomodate separation
data = pd.read_csv(dataset_url, sep = ";")

In [23]:
# display structure of data set
data_df = pd.DataFrame(data=data[0:],columns=list(data))
data = data_df
data.quality = data.quality
data.head()
# print(data)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [65]:
# display count of rows and columns in data frame
print("Number of columns in data frame: %d" %len(data.columns))
print("Column labels: ")
for i in range(len(list(data))):
    print(list(data)[i])
    
def num_rows(data):
    return len(data)
print("")
print("Number of rows in data frame: %d \n" %num_rows(data))
print("Target values: %s" % data.quality.unique())

# observe shape of data (samples and features)
print("")
print("(Rows, Features):")
data.shape
# 1599 rows, 12 features

Number of columns in data frame: 12
Column labels: 
fixed acidity
volatile acidity
citric acid
residual sugar
chlorides
free sulfur dioxide
total sulfur dioxide
density
pH
sulphates
alcohol
quality

Number of rows in data frame: 1599 

Target values: [5 6 7 4 8 3]

(Rows, Features):


(1599, 12)

In [7]:
# display summary stats
print(data.describe())
# data is all numeric, however they have differing scales: data needs standardization

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [8]:
# remove target from data set
y = data.quality
x = data.drop("quality", axis = 1)
# partition data (80/20)
# random_state sets seed for reproduction of results
# startify on target (y): ensures training set and testing set look similar
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 123, stratify = y)

In [9]:
# standardization that can be reproduced on training and testing sets by saving means 
# and standard deviations from training set standardization
scaler = preprocessing.StandardScaler().fit(x_train)
# confirm standardization
x_train_scaled = scaler.transform(x_train)
print(x_train_scaled.mean(axis = 0))
print(x_train_scaled.std(axis = 0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [10]:
# apply transformer to testing set
x_test_scaled = scaler.transform(x_test)
# confirm standardization
print(x_test_scaled.mean(axis = 0))
print(x_test_scaled.std(axis = 0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [11]:
# pipeline with preprocessing and model
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators = 100))

In [12]:
# print tunable hyperparameters
print(pipeline.get_params())

{'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__max_leaf_nodes': None, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_impurity_split': None, 'standardscaler__copy': True, 'randomforestregressor__oob_score': False, 'randomforestregressor__min_weight_fraction_leaf': 0.0, 'randomforestregressor__min_samples_split': 2, 'memory': None, 'randomforestregressor__min_impurity_decrease': 0.0, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor__random_state': None, 'standardscaler__with_mean': True, 'randomforestregressor__criterion': 'mse', 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, ran

In [13]:
# declare desired hyperparameters
hyperparameters = {"randomforestregressor__max_features" : ["auto", "sqrt", "log2"],
                  "randomforestregressor__max_depth": [None, 5, 3, 1]}

In [15]:
# tune model using cross-validation pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv = 10)
# fit and tune model
clf.fit(x_train, y_train)
# print best set of parameters
print(clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}


In [16]:
# refit on entire training set
print(clf.refit)

True


In [17]:
# evaluate model pipeline on test data
# predict new set of data
y_pred = clf.predict(x_test)

In [18]:
# evaluate model performance
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.4682205944669775
0.343143125


In [19]:
# save model for future use
joblib.dump(clf, "rf_regressor.pkl")

['rf_regressor.pkl']

In [20]:
# to load model again
clf2 = joblib.load("rf_regressor.pkl")
# predict data set using loaded model
clf2.predict(x_test)

array([6.55, 5.78, 4.95, 5.63, 6.32, 5.53, 4.98, 4.77, 5.01, 5.99, 5.29,
       5.67, 5.88, 5.14, 5.71, 5.49, 6.59, 5.75, 5.71, 6.91, 5.45, 5.6 ,
       5.05, 6.14, 5.93, 5.  , 5.32, 5.17, 5.95, 5.96, 5.85, 6.46, 5.98,
       5.02, 4.98, 6.  , 5.05, 6.05, 5.01, 5.96, 4.84, 5.81, 6.56, 5.17,
       6.15, 5.35, 5.51, 5.51, 5.12, 6.55, 5.9 , 5.29, 5.91, 5.18, 5.49,
       5.64, 5.36, 5.4 , 5.02, 5.22, 5.26, 5.19, 5.05, 5.82, 5.97, 5.23,
       6.42, 5.  , 5.16, 6.69, 5.72, 5.78, 5.15, 5.05, 5.34, 6.02, 5.47,
       5.12, 5.14, 5.36, 6.38, 5.63, 6.12, 6.45, 5.09, 6.01, 6.4 , 6.39,
       5.72, 5.71, 5.94, 5.27, 6.43, 5.64, 5.73, 5.83, 6.79, 6.65, 5.51,
       6.73, 5.13, 5.39, 5.07, 6.35, 5.05, 4.86, 5.67, 5.13, 5.62, 5.88,
       5.95, 5.42, 6.02, 5.39, 5.1 , 5.26, 5.84, 5.13, 5.05, 6.03, 5.83,
       5.08, 5.79, 6.21, 5.36, 5.27, 5.42, 5.96, 5.48, 5.42, 5.71, 6.46,
       5.18, 5.3 , 5.11, 6.24, 5.03, 5.17, 6.67, 5.53, 5.17, 5.11, 5.65,
       6.07, 5.23, 5.43, 5.14, 6.46, 5.69, 5.12, 5.