In [1]:
import numpy as np
import pandas as pd

#importing Sampling Helper
from sklearn.model_selection import train_test_split

#importing preprocessing modules
from sklearn import preprocessing

#import Random Forest model
from sklearn.ensemble import RandomForestRegressor

#Import Cross Validation Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

#import metrics to evaluate our model
from sklearn.metrics import mean_squared_error, r2_score

#import module to save our model
import joblib



In [None]:
#We’ll be training and tuning a random forest for wine quality (as judged by wine snobs experts) 
# based on traits like acidity, residual sugar, and alcohol concentration.

In [2]:
#importing Dataset
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [7]:
data = pd.read_csv(dataset_url, sep=';')
print (data.head())

In [9]:
print (data.shape)

(1599, 12)


In [10]:
print (data.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [11]:
#separate target feature from other features
y = data.quality
X = data.drop('quality', axis=1)

In [12]:
#Split data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

In [13]:
#Fitting the Transformer API
scaler = preprocessing.StandardScaler().fit(X_train)

In [17]:
#applying transformerto training data
X_train_scaled = scaler.transform(X_train)
 
print (X_train_scaled.mean(axis=0))
print (X_train_scaled.std(axis=0))

#Applying transformer to test data

X_test_scaled = scaler.transform(X_test)
 
print (X_test_scaled.mean(axis=0))
print (X_test_scaled.std(axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [18]:
#pipeline with preprocessing and model
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [20]:
#Tuning HYperparameters

print (pipeline.get_params())

hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor())], 'verbose': False, 'standardscaler': StandardScaler(), 'randomforestregressor': RandomForestRegressor(), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'randomforestregressor__bootstrap': True, 'randomforestregressor__ccp_alpha': 0.0, 'randomforestregressor__criterion': 'mse', 'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__max_leaf_nodes': None, 'randomforestregressor__max_samples': None, 'randomforestregressor__min_impurity_decrease': 0.0, 'randomforestregressor__min_impurity_split': None, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__min_weight_fraction_leaf': 0.0, 'randomforestregressor__n_estimators': 100, 'randomforestregressor__n_jobs': None, 'randomforestregressor__oob_score': F

In [21]:
#Sklearn cross-validation with pipeline

clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor())]),
             param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
                         'randomforestregressor__max_features': ['auto', 'sqrt',
                                                                 'log2']})

In [22]:
print (clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto'}


In [23]:
#Predict the new set of data
y_pred = clf.predict(X_test)

In [25]:
#metrics to evaluate the model
print (r2_score(y_test, y_pred))
 
print (mean_squared_error(y_test, y_pred))


0.4592495913796236
0.348931875


In [26]:
#save model for future use
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [27]:
#loading it again -- Just for reuse
clf2 = joblib.load('rf_regressor.pkl')
 
# Predict data set using loaded model
clf2.predict(X_test)

array([6.59, 5.73, 4.97, 5.57, 6.53, 5.81, 4.8 , 4.58, 5.  , 5.99, 5.29,
       5.72, 5.82, 5.03, 5.89, 5.65, 6.74, 5.69, 5.7 , 6.97, 5.44, 5.56,
       4.99, 6.08, 5.9 , 5.13, 5.51, 5.17, 5.86, 5.91, 5.86, 6.74, 5.99,
       5.03, 4.87, 5.88, 5.03, 5.87, 5.14, 5.9 , 4.93, 5.98, 6.79, 5.05,
       6.13, 5.36, 5.48, 5.5 , 5.12, 6.5 , 5.89, 5.22, 5.85, 5.15, 5.66,
       5.83, 5.24, 5.4 , 4.96, 5.27, 5.31, 5.02, 5.  , 5.86, 6.02, 5.27,
       6.33, 5.04, 5.1 , 6.65, 5.74, 5.49, 5.09, 5.02, 5.31, 5.98, 5.31,
       5.07, 5.33, 5.17, 6.58, 5.57, 6.28, 6.47, 5.16, 5.92, 6.44, 6.02,
       5.33, 5.87, 5.91, 5.33, 6.47, 5.64, 5.78, 5.77, 6.74, 6.84, 5.5 ,
       6.89, 5.04, 5.32, 5.12, 6.7 , 5.02, 4.74, 5.72, 4.9 , 5.62, 5.96,
       5.66, 5.61, 6.16, 5.45, 5.01, 5.2 , 5.93, 5.06, 4.82, 6.07, 5.83,
       5.11, 5.82, 6.09, 5.17, 5.34, 5.27, 5.84, 5.43, 5.45, 5.87, 6.35,
       5.18, 5.31, 5.04, 6.47, 5.02, 5.19, 6.86, 5.38, 5.17, 5.08, 5.97,
       6.11, 5.34, 5.4 , 5.12, 6.74, 5.45, 5.03, 5.