In [None]:
import warnings
warnings.filterwarnings('ignore')

import math
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

from matplotlib import pyplot
from sklearn.tree import plot_tree
from matplotlib.pyplot import figure

from sklearn.model_selection import train_test_split, GridSearchCV
# seed passed to train_test_split and to the cross-validated decision trees below
random_state = 1
# fraction of the samples that train_test_split assigns to the training set
train_size = 0.75
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import f_regression

from sklearn.tree import DecisionTreeClassifier  
from sklearn.svm import SVC


In [None]:
# Load the red-wine quality table into a dataframe; the csv is ';'-separated.
# (use `names=[...]` when a file has no header row, or pd.read_excel for spreadsheets)
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
df = pd.read_csv(url, sep=';')
print(f"Shape of the input data {df.shape}")

# name of the column to predict; every other column is used as a feature
target = 'quality'
y = df[target]
X = df.drop(columns=[target])
print(f"Shape of X: {X.shape}\nShape of y: {y.shape}")

# hold out part of the samples for the final evaluation
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    train_size=train_size,
    random_state=random_state,
)
print("There are {} samples in the training dataset".format(len(Xtrain)))
print("There are {} samples in the testing dataset".format(len(Xtest)))
print("Each sample has {} features".format(Xtrain.shape[1]))

# Create the model

In [None]:
# f_regression returns (F-statistics, p-values); only the p-values are tabulated,
# one row per feature column of X
p_values = f_regression(X, y)[1]
p_values_show = pd.DataFrame({'Variable': X.columns, 'p-value': p_values})
p_values_show

In [None]:
# Baseline: an entropy-based decision tree with no depth limit.
# random_state is fixed so that re-running the notebook rebuilds the same tree:
# scikit-learn permutes the candidate features at every split, so without a seed
# ties between equally good splits can be broken differently on each run.
model = DecisionTreeClassifier(criterion='entropy', random_state=random_state)
model.fit(Xtrain, ytrain)

# accuracy measured on the same samples the tree was fitted on
ytrain_model = model.predict(Xtrain)
accuracy_train = accuracy_score(ytrain, ytrain_model)
print(f"The accuracy on training set is {(accuracy_train * 100):.2f}%")

In [None]:
# score the fitted tree on the held-out samples it has never seen
ytest_model = model.predict(Xtest)
accuracy_test = accuracy_score(ytest, ytest_model)
print(
    "The accuracy on test set is {0:.2f}%".format(accuracy_test * 100)
)

In [None]:
# Showing the decision tree.
# The labels are read from the training data and the fitted model instead of being
# typed by hand, so they always match the features and classes the tree was
# actually trained on.
figure(figsize=(15, 15))
plot_tree(
    model,
    # fontsize=6,
    filled=True,
    feature_names=list(Xtrain.columns),
    # plot_tree expects string labels listed in ascending class order,
    # which is the order of model.classes_
    class_names=[str(label) for label in model.classes_],
    rounded=True,
    proportion=True,
);

# Tuning of the model

In [None]:
# Evaluate the untuned tree on the held-out samples and record how deep it grew
y_predicted_test = model.predict(Xtest)
accuracy_test = accuracy_score(ytest, y_predicted_test) * 100  # stored as a percentage here
fitted_max_depth = model.get_depth()
# tree_.impurity holds the impurity of every node; entry 0 is kept for inspection
initial_impurity = model.tree_.impurity[0]
print(f"The accuracy on test set is {accuracy_test:.1f}%")
print(f"The maximum depth of the tree fitted on X_train is {fitted_max_depth}")


In [None]:
# Tuning with cross validation:
# try every depth from 1 up to the depth reached by the unconstrained tree and
# keep the mean 5-fold accuracy obtained with each one
parameter_values = range(1, fitted_max_depth + 1)
avg_scores = []
for par in parameter_values:
    estimator = DecisionTreeClassifier(
        criterion="entropy",
        max_depth=par,
        random_state=random_state,
    )
    # one accuracy value per fold
    scores = cross_val_score(estimator, Xtrain, ytrain, cv=5, scoring='accuracy')
    avg_scores.append(scores.mean())
print(avg_scores)

# mean accuracy as a function of the depth
fig, ax = plt.subplots(figsize=(32, 20))
ax.plot(parameter_values, avg_scores, '-o', linewidth=5, markersize=24)
ax.set_xlabel('max_depth')
ax.set_ylabel('accuracy')
ax.set_title("Score with Cross Validation varying max_depth of tree", fontsize=24)
plt.show();

In [None]:
# depth whose mean cross-validated accuracy was the highest
top_par_cv = parameter_values[np.argmax(avg_scores)]
# Refit on the whole training split with the winning depth.
# The estimator uses the same criterion AND the same random_state as the candidates
# scored during cross validation, so the model evaluated here is the configuration
# that was actually selected and the result is reproducible across runs.
estimator = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=top_par_cv,
    random_state=random_state,
)
estimator.fit(Xtrain, ytrain);
y_predicted = estimator.predict(Xtest)
accuracy_cv = accuracy_score(ytest, y_predicted) * 100
print(f"The accuracy on test set tuned with cross_validation is {accuracy_cv:.1f}% with depth {top_par_cv}")

# showing more information on the classifier
print(classification_report(ytest, y_predicted))
# printing the confusion matrix
print(confusion_matrix(ytest, y_predicted))

# Using several classifiers

In [None]:
# 8. The model used is the support vector machine
# grid to explore: for each kernel, the values of C to try
tuned_param_svc = [{'kernel': 'rbf',
                    'C': [1, 10, 100, 1000],
                    },
                   {'kernel': 'linear',
                    'C': [1, 10, 100, 1000],
                    },
                   ]

# Flatten the grid into explicit (kernel, C) pairs. The position of a score in
# avg_scores_2 is then the position of its pair in this list, so the best pair is
# found by direct indexing and stays correct even if the kernels are given
# different numbers of C values.
svc_candidates = [(grid['kernel'], c_value)
                  for grid in tuned_param_svc
                  for c_value in grid['C']]

avg_scores_2 = []
for kernel, c_value in svc_candidates:
    estimator = SVC(kernel=kernel, C=c_value)
    # cross_val_score produces an array with one score for each fold
    scores = cross_val_score(estimator, Xtrain, ytrain,
                             scoring='accuracy', cv=5)
    avg_scores_2.append(np.mean(scores))
print(avg_scores_2)

# index of the pair with the highest mean cross-validated accuracy
best = np.argmax(avg_scores_2)
best_kernel, best_c = svc_candidates[best]
best_param = {'kernel': best_kernel, 'C': best_c}
estimator_2 = SVC(**best_param)

# refit on the whole training split and evaluate on the held-out samples
estimator_2.fit(Xtrain, ytrain);
y_predicted = estimator_2.predict(Xtest)
accuracy_cv = accuracy_score(ytest, y_predicted) * 100
print(f"The accuracy on test set tuned with cross_validation is {accuracy_cv:.1f}% using the kernel {estimator_2.kernel} and C={estimator_2.C}")
