https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

# ------------------------------------------------------------------------------
# Load the data set:

data = pd.read_csv("winequality_red.csv", sep=';')

print("\n >>> data.shape <<< \n")
print(data.shape)

print("\n >>> data.head <<< \n")
print(data.head())

print("\n >>> data.describe <<< \n")
print(data.describe())


 >>> data.shape <<< 

(1599, 12)

 >>> data.head <<< 

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9

In [4]:
# ------------------------------------------------------------------------------
# We'll need to split our data into training and test sets. Splitting the data 
# at the beginning of your modeling workflow is crucial for getting a realistic 
# estimate of your model's performance.

# First, let's separate our target (y) features from our input (X) features:
# This will allow us to take advantage of Scikit-Learn's useful train_test_split 
# function in the next step:

y = data.quality
X = data.drop('quality', axis=1)

# ------------------------------------------------------------------------------
# Next, split data into training and test sets:

# As you can see, we'll set aside 20% of the data as a test set for evaluating 
# our model. We also set an arbitrary "random state" (a.k.a. seed) so that we 
# can reproduce our results. Finally, it's good practice to stratify your sample 
# by the target variable. This will ensure your training set looks similar to 
# your test set, making your evaluation metrics more reliable.

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

# ------------------------------------------------------------------------------
# Data Preprocessing:

# Next, we need to standardize or scale our features. Standardization is the 
# process of subtracting the means from each feature and then dividing by the 
# feature standard deviations. This is very common procedure many ML algorithms
# assume that all features are centered around zero and have approximately the 
# same variance.

# We'll use Scikit-Learn's Transformer API to scale the data. After scaling, the 
# 'scaler' object below will have the saved means and standard deviations for each 
# feature in the training set.

scaler = preprocessing.StandardScaler().fit(X_train)

# Let's confirm that it worked by applying the transformer to the training data.

# Applying transformer to training data.
X_train_scaled = scaler.transform(X_train)

print("\n >>> X_train_scaled.mean <<< \n")
print(X_train_scaled.mean(axis=0))

print("\n >>> X_train_scaled.std <<< \n")
print(X_train_scaled.std(axis=0))

# Applying transformer to test data.
X_test_scaled = scaler.transform(X_test)

print("\n >>> X_test_scaled.mean <<< \n")
print(X_test_scaled.mean(axis=0))

print("\n >>> X_test_scaled.std <<< \n")
print(X_test_scaled.std(axis=0))


 >>> X_train_scaled.mean <<< 

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]

 >>> X_train_scaled.std <<< 

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

 >>> X_test_scaled.mean <<< 

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]

 >>> X_test_scaled.std <<< 

[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


In [5]:
# ------------------------------------------------------------------------------
# Create pipeline and declare hyperparameters to tune:

# This is exactly what it looks like: a modeling pipeline that first transforms 
# the data using StandardScaler() and then fits a model using a random forest regressor.

pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))

# List tunable hyperparameters.
print("\n >>> pipeline.get_params <<< \n")
print(pipeline.get_params())

# Now, let's declare the hyperparameters we want to tune through cross-validation.

hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                    'randomforestregressor__max_depth': [None, 5, 3, 1]}


 >>> pipeline.get_params <<< 

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False))], 'verbose': False, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, 

In [6]:
# ------------------------------------------------------------------------------
# Tune model using a cross-validation pipeline.

# GridSearchCV to setup cross-validation on our pipeline. GridSearchCV essentially 
# performs cross-validation across the entire "grid" (all possible permutations)
# of hyperparameters.

clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model.
clf.fit(X_train, y_train)

# Now, we can see the best set of parameters found using CV:
print("\n >>> clf.best_params_ <<< \n")
print(clf.best_params_)

# ------------------------------------------------------------------------------
# Refit on the entire training set.

# After you've tuned your hyperparameters appropriately using cross-validation, 
# you can generally get a small performance improvement by refitting the model 
# on the entire training set.

# Conveniently, GridSearchCV from sklearn will automatically refit the model 
# with the best set of hyperparameters using the entire training set.

# This functionality is ON by default, but you can confirm it:

# Confirm model will be retrained.
print("\n >>> clf.refit <<< \n")
print(clf.refit)


 >>> clf.best_params_ <<< 

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto'}

 >>> clf.refit <<< 

True


In [7]:
# ------------------------------------------------------------------------------
# Evaluate model pipeline on test data.

# Now, we can predict a new set of data:

y_pred = clf.predict(X_test)

# After we predict, we can use the metrics we imported earlier to evaluate our 
# model performance.

print("\n >>> r2_score(y_test, y_pred) <<< \n")
print(r2_score(y_test, y_pred))

print("\n >>> mean_squared_error(y_test, y_pred) <<< \n")
print(mean_squared_error(y_test, y_pred))


 >>> r2_score(y_test, y_pred) <<< 

0.46062546158968476

 >>> mean_squared_error(y_test, y_pred) <<< 

0.3480440625


In [12]:
# ------------------------------------------------------------------------------
# Save model to .pkl file. for future use.

joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [13]:
# ------------------------------------------------------------------------------
# To use the model in the future - load it from the .pkl file.

clf2 = joblib.load('rf_regressor.pkl')

# Predict data set using loaded model.
clf2.predict(X_test)

# After we predict, we can use the metrics we imported earlier to evaluate our 
# model performance.

print("\n >>> r2_score(y_test, y_pred) <<< \n")
print(r2_score(y_test, y_pred))

print("\n >>> mean_squared_error(y_test, y_pred) <<< \n")
print(mean_squared_error(y_test, y_pred))


 >>> r2_score(y_test, y_pred) <<< 

0.46062546158968476

 >>> mean_squared_error(y_test, y_pred) <<< 

0.3480440625
