In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 0. Data processing and visualization
load winequality-white.csv dataset and make exploratory data analysis

In [2]:
wine_quality = pd.read_csv("winequality-white.csv")
wine_quality.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...
3,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...
4,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...


In [3]:
wine_quality.shape

(4898, 1)

In [4]:
wine_quality.columns[0]

'fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'

In [5]:
wine_quality.iloc[0,0]

'7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6'

In [6]:
matrix = [] 
for i in list(wine_quality.index):
    matrix.append([float(elem) for elem in wine_quality.iloc[i,0].split(";")])
columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 
           'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
          'pH', 'sulphates', 'alcohol', 'quality']
df = pd.DataFrame(data=matrix, index=list(wine_quality.index), columns=columns)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6.0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6.0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0


In [7]:
df.isnull().values.any()

False

In [8]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [9]:
target_col = 'quality'
df[target_col] = df[target_col].astype(int)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [10]:
df.shape

(4898, 12)

In [11]:
X = df[df.columns.difference(['quality'])].copy()
y = df['quality'].copy()

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## 1. Decision trees classification
In this task we will try to predict wine quality based on its features by fitting a decision tree model. Fit a decision tree classifier by making a grid search over loss functions: 'giny', 'entropy' and over max_leaf_nodes parameter. Choose this parameters via 5-Fold cross-validation. Visualize the best model's tree diagram

In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# creating a KFold object with 5 splits 
folds = KFold(n_splits = 5, shuffle = True)

# specify range of hyperparameters
hyper_params = [{'criterion' :  ['gini', 'entropy'],
                 'max_leaf_nodes' : list(range(2, 100))}]

# specify model
model = DecisionTreeClassifier(random_state=1)

# set up GridSearchCV()
model_cv = GridSearchCV(estimator = model, 
                        param_grid = hyper_params,
                        cv = folds, 
                        refit = True,
                        return_train_score=True)

model_cv.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             estimator=DecisionTreeClassifier(random_state=1),
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                             13, 14, 15, 16, 17, 18, 19, 20, 21,
                                             22, 23, 24, 25, 26, 27, 28, 29, 30,
                                             31, ...]}],
             return_train_score=True)

In [14]:
best_score, best_score_id = model_cv.cv_results_['mean_test_score'].max(), model_cv.cv_results_['mean_test_score'].argmax()
best_hyperparams = model_cv.cv_results_['params'][best_score_id]

print("The best test score is {0} corresponding to hyperparameters {1}".format(best_score, best_hyperparams))

The best test score is 0.5520711158026429 corresponding to hyperparameters {'criterion': 'gini', 'max_leaf_nodes': 74}


In [15]:
model = DecisionTreeClassifier(criterion='gini', max_leaf_nodes=50, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('f1_score: ', f1_score(y_test, y_pred, average='micro'))
print('accuracy_score: ', accuracy_score(y_test, y_pred))

f1_score:  0.5040816326530613
accuracy_score:  0.5040816326530613


In [16]:
# Importing required packages for visualization
from IPython.display import Image  
from six import StringIO  
from sklearn.tree import export_graphviz
import pydotplus, graphviz

# Putting features
features = list(X_train.loc[:, X_train.columns != 'quality'])
features

['alcohol',
 'chlorides',
 'citric acid',
 'density',
 'fixed acidity',
 'free sulfur dioxide',
 'pH',
 'residual sugar',
 'sulphates',
 'total sulfur dioxide',
 'volatile acidity']

In [None]:
# plotting the tree
dot_data = StringIO()  
export_graphviz(model, out_file=dot_data,feature_names=features,filled=True,rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.set_size("25,20!") 
# graph.write_png('tree.png')
Image(graph.create_png())

## 2. Comparison classification
Try to predict wine quality with other classification tools that we studied(Logistic Regression, SVM, LDA) compare the accuracies and f-scores of all models on the test set and choose the best performing algorithm. 

In [16]:
print("labels:", set(df.quality))
print("amount:", len(set(df.quality)))

labels: {3, 4, 5, 6, 7, 8, 9}
amount: 7


In [17]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.svm import SVC

pipeLine = make_pipeline(LinearDiscriminantAnalysis(n_components=len(set(df.quality))-1), SVC(gamma = "auto", random_state=1))
pipeLine.fit(X_train, y_train)
svc = pipeLine.named_steps['svc']
lda = pipeLine.named_steps['lineardiscriminantanalysis']
lda_predict = lda.predict(X_test)
print("lda f1_score: ", f1_score(y_test, lda_predict, average='micro'))
print("lda accuracy_score: ", accuracy_score(y_test, lda_predict))

lda f1_score:  0.5142857142857142
lda accuracy_score:  0.5142857142857142


In [18]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
  
folds = KFold(n_splits = 5, shuffle = True)
    
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['poly', 'rbf', 'sigmoid'],
              'degree': [1, 2, 3, 4]}  
  
grid = RandomizedSearchCV(SVC(random_state = 1), param_grid, cv = folds, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(lda.transform(X_train), y_train) 

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END C=100, degree=2, gamma=0.1, kernel=sigmoid;, score=0.441 total time=   0.1s
[CV 2/5] END C=100, degree=2, gamma=0.1, kernel=sigmoid;, score=0.425 total time=   0.2s
[CV 3/5] END C=100, degree=2, gamma=0.1, kernel=sigmoid;, score=0.415 total time=   0.2s
[CV 4/5] END C=100, degree=2, gamma=0.1, kernel=sigmoid;, score=0.390 total time=   0.1s
[CV 5/5] END C=100, degree=2, gamma=0.1, kernel=sigmoid;, score=0.442 total time=   0.1s
[CV 1/5] END C=0.1, degree=3, gamma=0.1, kernel=sigmoid;, score=0.566 total time=   0.2s
[CV 2/5] END C=0.1, degree=3, gamma=0.1, kernel=sigmoid;, score=0.509 total time=   0.2s
[CV 3/5] END C=0.1, degree=3, gamma=0.1, kernel=sigmoid;, score=0.519 total time=   0.3s
[CV 4/5] END C=0.1, degree=3, gamma=0.1, kernel=sigmoid;, score=0.484 total time=   0.3s
[CV 5/5] END C=0.1, degree=3, gamma=0.1, kernel=sigmoid;, score=0.517 total time=   0.3s
[CV 1/5] END C=1000, degree=1, gamma=0.0001, kern

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
                   estimator=SVC(random_state=1),
                   param_distributions={'C': [0.1, 1, 10, 100, 1000],
                                        'degree': [1, 2, 3, 4],
                                        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                                        'kernel': ['poly', 'rbf', 'sigmoid']},
                   verbose=3)

In [19]:
best_score, best_score_id = grid.cv_results_['mean_test_score'].max(),grid.cv_results_['mean_test_score'].argmax()
best_hyperparams = grid.cv_results_['params'][best_score_id]

print("The best test score is {0} corresponding to hyperparameters {1}".format(best_score, best_hyperparams))

The best test score is 0.5857543592149503 corresponding to hyperparameters {'kernel': 'rbf', 'gamma': 1, 'degree': 2, 'C': 1}


In [20]:
# The degree is being ingnored with the 'rbf' kernel, so no need to specify it.
svc2 = SVC(kernel='rbf', gamma=0.1, C=100, random_state=1)
svc2.fit(lda.transform(X_train), y_train)
svc2_predict = svc2.predict(lda.transform(X_test))
print("svc2 f1_score: ", f1_score(y_test, svc2_predict, average='micro'))
print("svc2 accuracy_score: ", accuracy_score(y_test, svc2_predict))

svc2 f1_score:  0.5459183673469388
svc2 accuracy_score:  0.5459183673469388


In [21]:
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression

# creating a KFold object with 5 splits 
folds = KFold(n_splits = 5, shuffle = True)

# specify range of hyperparameters
# Set the parameters by cross-validation
hyper_params = [{'C' :  [0.01, 0.1, 1., 10, 100],
                 'solver' : ['saga'],
                 'penalty' : ['elasticnet'],
                 'tol' : [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
                'l1_ratio': [0.2, 0.4, 0.6, 0.8],
                'multi_class': ['multinomial']}]

# specify model
model = LogisticRegression(random_state=1, max_iter=10000)

scorer = make_scorer(f1_score, average='micro')

# set up GridSearchCV() 
model_cv = RandomizedSearchCV(estimator = model, 
                              param_distributions = hyper_params,
                              scoring=scorer, 
                              cv = folds, 
                              verbose = 3,
                              refit = True,
                              return_train_score=True)

model_cv.fit(lda.transform(X_train), y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END C=100, l1_ratio=0.4, multi_class=multinomial, penalty=elasticnet, solver=saga, tol=0.1;, score=(train=0.544, test=0.537) total time=   0.0s
[CV 2/5] END C=100, l1_ratio=0.4, multi_class=multinomial, penalty=elasticnet, solver=saga, tol=0.1;, score=(train=0.548, test=0.526) total time=   0.0s
[CV 3/5] END C=100, l1_ratio=0.4, multi_class=multinomial, penalty=elasticnet, solver=saga, tol=0.1;, score=(train=0.543, test=0.542) total time=   0.0s
[CV 4/5] END C=100, l1_ratio=0.4, multi_class=multinomial, penalty=elasticnet, solver=saga, tol=0.1;, score=(train=0.541, test=0.548) total time=   0.0s
[CV 5/5] END C=100, l1_ratio=0.4, multi_class=multinomial, penalty=elasticnet, solver=saga, tol=0.1;, score=(train=0.550, test=0.538) total time=   0.0s
[CV 1/5] END C=10, l1_ratio=0.8, multi_class=multinomial, penalty=elasticnet, solver=saga, tol=0.1;, score=(train=0.544, test=0.537) total time=   0.0s
[CV 2/5] END C=10, l1_

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
                   estimator=LogisticRegression(max_iter=10000, random_state=1),
                   param_distributions=[{'C': [0.01, 0.1, 1.0, 10, 100],
                                         'l1_ratio': [0.2, 0.4, 0.6, 0.8],
                                         'multi_class': ['multinomial'],
                                         'penalty': ['elasticnet'],
                                         'solver': ['saga'],
                                         'tol': [1e-05, 0.0001, 0.001, 0.01,
                                                 0.1]}],
                   return_train_score=True,
                   scoring=make_scorer(f1_score, average=micro), verbose=3)

In [22]:
best_score, best_score_id = model_cv.cv_results_['mean_test_score'].max(),model_cv.cv_results_['mean_test_score'].argmax()
best_hyperparams = model_cv.cv_results_['params'][best_score_id]

print("The best test score is {0} corresponding to hyperparameters {1}".format(best_score, best_hyperparams))

The best test score is 0.5431373315609769 corresponding to hyperparameters {'tol': 0.001, 'solver': 'saga', 'penalty': 'elasticnet', 'multi_class': 'multinomial', 'l1_ratio': 0.2, 'C': 1.0}


In [23]:
# The degree is being ingnored with the 'rbf' kernel, so no need to specify it.
lReg = LogisticRegression(C=0.1, l1_ratio=0.6, multi_class='multinomial', penalty='elasticnet',
                          solver='saga', tol=1e-05, max_iter=1000, random_state=1)
lReg.fit(lda.transform(X_train), y_train)
lReg_predict = lReg.predict(lda.transform(X_test))
print("Logistic Regression f1_score: ", f1_score(y_test, lReg_predict, average='micro'))
print("Logistic Regression accuracy_score: ", accuracy_score(y_test, lReg_predict))

Logistic Regression f1_score:  0.513265306122449
Logistic Regression accuracy_score:  0.513265306122449


## 3. Decision trees regression
In this task we will use all the columns to predict alcohol concentration of a wine. Use the directives in task 1 as a guide to fit a Decision tree regressor.

In [24]:
X = df[df.columns.difference(['alcohol'])].copy()
y = df['alcohol'].copy()

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [26]:
from sklearn.tree import DecisionTreeRegressor

# creating a KFold object with 5 splits 
folds = KFold(n_splits = 5, shuffle = True)

# specify range of hyperparameters
hyper_params = [{'criterion' :  ['squared_error', 'friedman_mse', 'absolute_error'],
                 'max_leaf_nodes' : list(range(2, 100))}]

# specify model
model = DecisionTreeRegressor(random_state=1)

# set up GridSearchCV()
model_cv = GridSearchCV(estimator = model, 
                        param_grid = hyper_params,
                        cv = folds, 
                        refit = True,
                        return_train_score=True)

model_cv.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             estimator=DecisionTreeRegressor(random_state=1),
             param_grid=[{'criterion': ['squared_error', 'friedman_mse',
                                        'absolute_error'],
                          'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                             13, 14, 15, 16, 17, 18, 19, 20, 21,
                                             22, 23, 24, 25, 26, 27, 28, 29, 30,
                                             31, ...]}],
             return_train_score=True)

In [27]:
best_score, best_score_id = model_cv.cv_results_['mean_test_score'].max(),model_cv.cv_results_['mean_test_score'].argmax()
best_hyperparams = model_cv.cv_results_['params'][best_score_id]

print("The best test score is {0} corresponding to hyperparameters {1}".format(best_score, best_hyperparams))

The best test score is 0.8352526232570983 corresponding to hyperparameters {'criterion': 'friedman_mse', 'max_leaf_nodes': 90}


In [28]:
model = DecisionTreeRegressor(criterion='squared_error', max_leaf_nodes=99, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print('MAE: ', mean_absolute_error(y_test, y_pred))
print('MSE: ', mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R^2: ', r2_score(y_test, y_pred))

MAE:  0.3635662666326438
MSE:  0.2406038233162766
RMSE:  0.4905138360090127
R^2:  0.847212724398044


## 4. Comparison regression
Predict wine alcohol concentration with Linear Regression. Compare mean absolute errors and rooted mean squared errors. What is the best model?

In [30]:
from sklearn.linear_model import LinearRegression

lReg2 = LinearRegression()
lReg2.fit(X_train, y_train)
lReg2_pred = lReg2.predict(X_test)

In [31]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print('MAE: ', mean_absolute_error(y_test, lReg2_pred))
print('MSE: ', mean_squared_error(y_test, lReg2_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, lReg2_pred)))
print('R^2: ', r2_score(y_test, lReg2_pred))

MAE:  0.3003292527809054
MSE:  0.1549733377565343
RMSE:  0.39366653116125366
R^2:  0.9015894521524802


###### The best model from DecisionTreeRegressor and Linear Regression is the last one.