In [291]:
# We collect all needed modules here.
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Task

We want to determine the quality of a red/white wine. To do so, we will build a model based on some data with quality labels (which are ranks between 0 and 10). Each red/white wine is described by a list of attributes with values. The model will be built using supervised learning. In other words, we will train the model to make it match the ranks to the corresponding attribute values. The ultimate goal is that the model can correctly determine the rank given a new red/white wine with its attribute values. 

# Data Source

Download the data from UCI: https://archive.ics.uci.edu/dataset/186/wine+quality

# Data Profiling

We would like to have some general ideas about the data.

In [6]:
# Let's take a look at the data files.
! ls -ahl

total 456K
drwxrwxr-x 3 fmeng fmeng 4.0K Aug 25 23:42 .
drwxrwxr-x 6 fmeng fmeng 4.0K Aug 25 22:45 ..
drwxrwxr-x 2 fmeng fmeng 4.0K Aug 25 22:47 .ipynb_checkpoints
-rw-rw-r-- 1 fmeng fmeng 2.4K Aug 25 23:42 ml_hands_on.ipynb
-rwx------ 1 fmeng fmeng 3.3K May 22 22:24 winequality.names
-rwx------ 1 fmeng fmeng  83K May 22 22:24 winequality-red.csv
-rwx------ 1 fmeng fmeng 259K May 22 22:24 winequality-white.csv
-rw-rw-r-- 1 fmeng fmeng  90K Aug 25 22:42 wine+quality.zip


In [18]:
# Read a few lines (5 lines in this example) of each CSV file to understand its format.
! head -5 winequality-red.csv

"fixed acidity";"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;9.8;5
11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58;9.8;6


In [17]:
! head -5 winequality-white.csv

"fixed acidity";"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6
8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6
7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6


- The first line of this CSV file is called the *"header"*. It specifies the column names. Though, a CSV file does not necessarily have the header. In other words, some CSV files simply start their data from the very first row.
- The symbol ";" is called the "*separator*" which separates columns. Various characters can be used as separators, such as "," and the tab character. Usually, we may have to view a few lines to tell which character is used. 

In [4]:
# Load the data into pandas DataFrame.
df_red = pd.read_csv('winequality-red.csv', sep=';')
# Check the type of each column.
print('Red Wine Data Profile:\n', df_red.dtypes)
# Get some basic statistics.
print('Red Wine Data Profile:\n', df_red.describe())

Red Wine Data Profile:
 fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object
Red Wine Data Profile:
        fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000      

# Data Preparation for Machine Learning

### Data Set Split
Typically, the raw dataset needs to be split to at least two subsets: *training set* and *test set*. The training set is used to actually train the model. And, the test set is used to evaluate how well the trained model performs. In some other cases, another subset is needed to tune the machine learning model, and usually we call it *validation set*. 

In [29]:
# We use a friendly tool provided in scikit-learn to split the raw dataset. And, we use 2/3 of the raw data to do the training, and the rest to do the testing.
df_train_set, df_test_set = train_test_split(df_red, test_size=0.33, shuffle=True)
print('df_train_set shape:', df_train_set.shape)
print('df_test_set shape:', df_test_set.shape)

df_train_set shape: (1071, 12)
df_test_set shape: (528, 12)


In [299]:
# To make our training and testing smoother, we need to separate the attributes from the quality ranks in each of the set obtained above. 
# And, instead of using pandas DataFrame, we use NumPy ndarray for convenience in training and testing.
nd_train_data = df_train_set[df_train_set.columns.drop('quality')].to_numpy()
print('nd_train_data shape:', nd_train_data.shape)
nd_train_label = df_train_set['quality'].to_numpy()
print('nd_train_label shape:', nd_train_label.shape)

nd_test_data = df_test_set[df_test_set.columns.drop('quality')].to_numpy()
print('nd_test_data shape:', nd_test_data.shape)
nd_test_label = df_test_set['quality'].to_numpy()
print('nd_test_label shape:', nd_test_label.shape)

nd_train_data shape: (1071, 11)
nd_train_label shape: (1071,)
nd_test_data shape: (528, 11)
nd_test_label shape: (528,)


### Data Scaling

Attributes may have different ranges for their values. Let's take a look at basic statatistics of our data in the following figure.

<img src="wine_attribute_stat.png"/>

The min and max values in the table justify my point. For example, the attribute "Chlorides" has lower values than others. Such differences in scales may impede the training process and eventually impair the performance of model. In practice, we can compute z-scores of the attribute values to offset the differences in scale. 

In [333]:
train_scaler = StandardScaler().fit(nd_train_data)
nd_train_data = train_scaler.transform(nd_train_data)
test_scaler = StandardScaler().fit(nd_test_data)
nd_test_data = test_scaler.transform(nd_test_data)
print('Train Col #2 mean =', np.round(np.mean(nd_train_data[:, 1]), decimals=2))
print('Train Col #2 std =', np.round(np.std(nd_train_data[:, 1]), decimals=2))

Train Col #2 mean = -0.0
Train Col #2 std = 1.0


# Model Traning

Without looking into the models, we simply treat them as blackboxes and train them using our training data. To help you gain better intuition, the model training can be roughly thought of as a procedure correlating the attribute values and the corresponding quality ranks. 

In [317]:
# Model #1: Linear Regression

# Create a Linear Regression model (not trained yet).
lin_reg = linear_model.LinearRegression()
# Train the model. And, the returned model is the trained model.
lin_reg = lin_reg.fit(nd_train_data, nd_train_label)
# Let's take a look at the score of training. In general, this scoring is based on the comparisons between the true labels and the predicted labels.
# scikit-learn provides an algorithm of this scoring. When evaluating the performance of training, various scoring methods could be used. 
# Somehow, again, without looking into the technical details, we simply take the score value as a performance indicator. The best score is 1.0 indicating 
# a perfect performance of training. And, the lower, the worse, and it can be negative.
train_score = lin_reg.score(nd_train_data, nd_train_label)
print('Training score =', train_score)

Training score = 0.37929697168861654


Uh...apparently, this score could hardly be said to be satisfactory. If one training trial doesn't convince you, let's try more. 

In [318]:
# We will shuffle the raw data and sample a training dataset for each trial.
for i in range(50):
    df_train_set_rand, df_test_set_rand = train_test_split(df_red, test_size=0.33, random_state=np.random.randint(low=1, high=100), shuffle=True)
    nd_train_data_rand = df_train_set_rand[df_train_set_rand.columns.drop('quality')].to_numpy()
    nd_train_label_rand = df_train_set_rand['quality'].to_numpy()
    lin_reg_rand = linear_model.LinearRegression()
    lin_reg_rand = lin_reg_rand.fit(nd_train_data_rand, nd_train_label_rand)
    train_score = lin_reg_rand.score(nd_train_data_rand, nd_train_label_rand)
    print('Trial %s: Trainin score = %s' % (i, train_score))

Trial 0: Trainin score = 0.361670247040604
Trial 1: Trainin score = 0.35119184141788384
Trial 2: Trainin score = 0.3739379378814498
Trial 3: Trainin score = 0.3405121545545843
Trial 4: Trainin score = 0.3871966126815539
Trial 5: Trainin score = 0.36897466917433863
Trial 6: Trainin score = 0.3856845263811164
Trial 7: Trainin score = 0.33925043429827717
Trial 8: Trainin score = 0.3863755610469831
Trial 9: Trainin score = 0.3522979064720929
Trial 10: Trainin score = 0.34929195458368256
Trial 11: Trainin score = 0.37860527614346995
Trial 12: Trainin score = 0.3663788259117796
Trial 13: Trainin score = 0.358138525644185
Trial 14: Trainin score = 0.36993992564004086
Trial 15: Trainin score = 0.3504680332659327
Trial 16: Trainin score = 0.36993992564004086
Trial 17: Trainin score = 0.3539495641858609
Trial 18: Trainin score = 0.35839608052906347
Trial 19: Trainin score = 0.36255811694292706
Trial 20: Trainin score = 0.3562524107497428
Trial 21: Trainin score = 0.361670247040604
Trial 22: Trai

After another 50 trials, it's sort of convining that the Linear Regression model, w.r.t. our current training strategy, may not work well. Though, we still want to see how the model would perform on the test set.

# Model Testing

In testing, we will use the other dataset split from the raw data. We cannot use the same dataset for both training and testing. It's cheating. This is because the trained model may only perform well on the training set but not quite on any other datasets. In other words, the model is useless.

In [319]:
# We simply use the same scoring to evaluate the performance of the trained model on the test set.
test_score = lin_reg.score(nd_test_data, nd_test_label)
print('test_score =', test_score)

test_score = 0.31260962625335964


Without any surprise, the performance score on testing is not appealing. More importantly, it is different from the training score. And, usually, testing scores are lower, more or less, than the training scores.

# Try Other Models

Linear Regression is not our only option. And, in practice, it's very common that some models don't work well no matter how hard you train them. A straightforward strategy in this case is to try some other models. On the other hand, it's important to note that a model failing to perform well on one problem doesn't necessarily imply its performance on others. There is a famous theorem called **No Free Lunch Theorem** roughly stating the fact that no model rules everything, only performs well in some cases while bad in others. 

In [320]:
# A neural network model, multi-layer perceptron. 
mlp = MLPClassifier(hidden_layer_sizes=(300, 500, 300), learning_rate='adaptive', solver='adam', 
                    random_state=np.random.randint(low=1, high=100), max_iter=2000)
mlp = mlp.fit(nd_train_data, nd_train_label)
train_score = mlp.score(nd_train_data, nd_train_label)
print('Training score =', train_score)
test_score = mlp.score(nd_test_data, nd_test_label)
print('Testing score =', test_score)

Training score = 0.9626517273576097
Testing score = 0.6098484848484849


In [321]:
# Gaussian Process
gp = GaussianProcessClassifier(random_state=np.random.randint(low=1, high=100), n_jobs=-1)
gp = gp.fit(nd_train_data, nd_train_label)
train_score = gp.score(nd_train_data, nd_train_label)
print('Training score =', train_score)
test_score = gp.score(nd_test_data, nd_test_label)
print('Testing score =', test_score)

Training score = 0.8300653594771242
Testing score = 0.5965909090909091


In [322]:
# K-Nearest-Neighbor
knn = KNeighborsClassifier(n_neighbors=10, weights='distance', n_jobs=-1)
knn = knn.fit(nd_train_data, nd_train_label)
train_score = knn.score(nd_train_data, nd_train_label)
print('Training score =', train_score)
test_score = knn.score(nd_test_data, nd_test_label)
print('Testing score =', test_score)

Training score = 1.0
Testing score = 0.6079545454545454


In [343]:
# Decision Tree
dt = DecisionTreeClassifier(max_depth=10)
dt = dt.fit(nd_train_data, nd_train_label)
train_score = dt.score(nd_train_data, nd_train_label)
print('Training score =', train_score)
test_score = dt.score(nd_test_data, nd_test_label)
print('Testing score =', test_score)

Training score = 0.876750700280112
Testing score = 0.5662878787878788


In [348]:
# An ensemble model, Adaboost.
ada = AdaBoostClassifier(n_estimators=20, random_state=np.random.randint(low=1, high=100))
ada = ada.fit(nd_train_data, nd_train_label)
train_score = ada.score(nd_train_data, nd_train_label)
print('Training score =', train_score)
test_score = ada.score(nd_test_data, nd_test_label)
print('Testing score =', test_score)

Training score = 0.5602240896358543
Testing score = 0.5284090909090909


In [351]:
# Another ensemble model, Random Forest.
rf = RandomForestClassifier(max_depth=10, n_estimators=10, criterion='entropy', n_jobs=-1)
rf = rf.fit(nd_train_data, nd_train_label)
train_score = rf.score(nd_train_data, nd_train_label)
print('Training score =', train_score)
test_score = rf.score(nd_test_data, nd_test_label)
print('Testing score =', test_score)

Training score = 0.9477124183006536
Testing score = 0.6098484848484849


In [352]:
# Naive Bayes
gnb = GaussianNB()
gnb = gnb.fit(nd_train_data, nd_train_label)
train_score = gnb.score(nd_train_data, nd_train_label)
print('Training score =', train_score)
test_score = gnb.score(nd_test_data, nd_test_label)
print('Testing score =', test_score)

Training score = 0.5639589169000934
Testing score = 0.5416666666666666


In [372]:
# Support Vector Machine
svm = SVC(gamma=2, C=3, random_state=np.random.randint(low=1, high=100))
svm = svm.fit(nd_train_data, nd_train_label)
train_score = svm.score(nd_train_data, nd_train_label)
print('Training score =', train_score)
test_score = svm.score(nd_test_data, nd_test_label)
print('Testing score =', test_score)

Training score = 0.9981325863678805
Testing score = 0.625


For each model used above, there are hyperparameters to tune the behaviors of the models. Nonetheless, we have got a general idea that models may have significantly different behaviors and performance. 