In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer
import math

In [3]:
def confusion_matrix_report(y_true, y_pred):
    """Render the confusion matrix of ``y_true`` vs ``y_pred`` as a text table.

    Rows are the true classes and columns the predicted classes, both in the
    order returned by ``unique_labels(y_true, y_pred)``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.

    Returns
    -------
    str
        A newline-terminated, right-aligned table with a "Prediction" banner,
        a header row of labels and one row of counts per true class.
    """
    cm = confusion_matrix(y_true, y_pred)
    labels = unique_labels(y_true, y_pred)

    # Every column must fit the widest label AND the widest count; the floor
    # of 5 keeps the layout used for small matrices. (Previously only labels
    # were measured, so counts with more than `column_width` digits overflowed
    # their column and broke the alignment.)
    widths = [len(str(label)) for label in labels]
    widths += [len(str(count)) for count in cm.ravel()]
    column_width = max(widths + [5])

    indent = " " * column_width
    lines = [
        indent + " " + "{:_^{}}".format("Prediction", column_width * len(labels)),
        indent + " ".join("{:>{}}".format(label, column_width) for label in labels),
    ]
    for label, row in zip(labels, cm):
        cells = " ".join("{:{}d}".format(count, column_width) for count in row)
        lines.append("{:>{}}".format(label, column_width) + cells)
    return "\n".join(lines) + "\n"

In [4]:
# Raw wine-quality tables, one per wine colour. Paths are relative to the
# directory the notebook is started from.
red = pd.read_csv("data/winequality-red1.csv")
white = pd.read_csv("data/winequality-white1.csv")

In [5]:
def quality_to_class(quality, low_max=5, high_min=7):
    """Bin numeric quality scores into three ordered class labels.

    Parameters
    ----------
    quality : pandas.Series or array-like
        Numeric quality scores.
    low_max : int, default 5
        Scores ``<= low_max`` are labelled ``'1_low'``.
    high_min : int, default 7
        Scores ``>= high_min`` are labelled ``'3_high'``.

    Returns
    -------
    numpy.ndarray
        One label per score; anything matching neither condition is
        ``'2_middle'``. The numeric prefixes make the labels sort in
        quality order.
    """
    conditions = [quality <= low_max, quality >= high_min]
    choices = ['1_low', '3_high']
    return np.select(conditions, choices, default='2_middle')


# Same binning for both colours; the helper replaces two copy-pasted
# np.select blocks that shared (and reassigned) a module-level `conditions`.
white['quality_class'] = quality_to_class(white['quality'])
red['quality_class'] = quality_to_class(red['quality'])

In [6]:
# Feature matrices: everything except the raw score and the class derived
# from it (keeping either would leak the target into the predictors).
white_data = white.drop(['quality', 'quality_class'], axis='columns')
red_data = red.drop(['quality', 'quality_class'], axis='columns')

In [7]:
# Classification targets: the three-level quality class of each wine.
white_class, red_class = white['quality_class'], red['quality_class']

In [8]:
def z_standardize(frame):
    """Return ``frame`` with every column rescaled to zero mean and unit std.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric feature columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns, where each column is
        ``(column - column.mean()) / column.std()`` (pandas' sample standard
        deviation, as in the per-column loop this replaces).
    """
    # mean()/std() reduce column-wise; the arithmetic then aligns on column
    # names, so no explicit loop or column-by-column frame growth is needed.
    return (frame - frame.mean()) / frame.std()


# NOTE(review): the statistics are computed on the full data set, i.e. before
# cross-validation splits it, so held-out rows influence the scaling of each
# training fold. Fitting the scaler inside the CV loop would avoid that.
white_data_z_stand = z_standardize(white_data)
red_data_z_stand = z_standardize(red_data)

In [9]:
# Sanity check of the z-standardisation: every column should show a mean of
# (numerically) zero and a standard deviation of 1.
white_data_z_stand.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,2.653755e-14,-1.053431e-14,5.34461e-14,-2.538326e-15,-1.419036e-15,6.210721e-18,-1.387439e-16,2.148461e-12,1.316599e-14,-1.280696e-14,-2.846868e-14
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-3.619982,-1.966784,-2.761461,-1.141827,-1.683102,-1.958477,-3.043919,-2.312802,-3.101091,-2.364468,-2.043089
25%,-0.657434,-0.6770318,-0.5304215,-0.924953,-0.447289,-0.7237012,-0.7144009,-0.770628,-0.6507699,-0.6996389,-0.8241915
50%,-0.06492444,-0.1809733,-0.117266,-0.2348977,-0.1268931,-0.07691388,-0.1026084,-0.09608339,-0.05474574,-0.1739035,-0.09285319
75%,0.5275851,0.414297,0.4611517,0.6917479,0.1935028,0.6286722,0.6738976,0.6929749,0.6075033,0.5270772,0.719745
max,8.704217,8.152811,10.9553,11.71292,13.74167,14.91679,7.09772,15.02976,4.183648,5.171074,2.99502


In [10]:
def min_max_scale(frame):
    """Return ``frame`` with every column rescaled linearly to the range [0, 1].

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric feature columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns, where each column is
        ``(column - column.min()) / (column.max() - column.min())``.
    """
    lowest = frame.min()
    # Column-wise reductions broadcast over the frame, replacing the
    # per-column loop that grew an empty DataFrame one column at a time.
    return (frame - lowest) / (frame.max() - lowest)


white_data_01_stand = min_max_scale(white_data)
red_data_01_stand = min_max_scale(red_data)

In [11]:
# One splitter shared by every experiment below: 10 shuffled, class-stratified
# folds with a fixed seed, so all configurations are scored on the same splits.
cross_val = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

# White

In [None]:
# TODO: pick a single summary metric that takes the F1 score of every class
# into account (e.g. a macro-averaged F1), or at least something that flags
# which configurations are worth a closer look.

### Trying out other values of C

In [None]:
# SVC with C=0.1, gamma=0.01 (kernel left at SVC's default), balanced class
# weights and one-vs-one decision function.
clf_01_001 = svm.SVC(
    C=0.1,
    gamma=0.01,
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [None]:
# Cross-validated predictions of clf_01_001 on the z-standardised white wines,
# using the shared `cross_val` folds.
predicted = cross_val_predict(
    estimator=clf_01_001,
    X=white_data_z_stand,
    y=white_class,
    cv=cross_val,
)

In [None]:
# Accuracy, confusion matrix and per-class report for whatever `predicted`
# currently holds (expected: the clf_01_001 predictions from the cell above).
# `predicted` is overwritten by every prediction cell, so run this right after it.
print(accuracy_score(white_class, predicted),
      confusion_matrix_report(white_class, predicted),
      classification_report(white_class, predicted),
      sep="\n")

In [None]:
# SVC with C=1, gamma=0.01 (kernel left at SVC's default), balanced class
# weights and one-vs-one decision function.
clf_1_001 = svm.SVC(
    C=1,
    gamma=0.01,
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [None]:
# Cross-validated predictions of clf_1_001 on the z-standardised white wines,
# using the shared `cross_val` folds.
predicted = cross_val_predict(
    estimator=clf_1_001,
    X=white_data_z_stand,
    y=white_class,
    cv=cross_val,
)

In [None]:
# Accuracy, confusion matrix and per-class report for whatever `predicted`
# currently holds (expected: the clf_1_001 predictions from the cell above).
# `predicted` is overwritten by every prediction cell, so run this right after it.
print(accuracy_score(white_class, predicted),
      confusion_matrix_report(white_class, predicted),
      classification_report(white_class, predicted),
      sep="\n")

In [None]:
# SVC with C=10, gamma=0.01 (kernel left at SVC's default), balanced class
# weights and one-vs-one decision function.
clf_10_001 = svm.SVC(
    C=10,
    gamma=0.01,
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [None]:
# Cross-validated predictions of clf_10_001 on the z-standardised white wines,
# using the shared `cross_val` folds.
predicted = cross_val_predict(
    estimator=clf_10_001,
    X=white_data_z_stand,
    y=white_class,
    cv=cross_val,
)

In [None]:
# Accuracy, confusion matrix and per-class report for whatever `predicted`
# currently holds (expected: the clf_10_001 predictions from the cell above).
# `predicted` is overwritten by every prediction cell, so run this right after it.
print(accuracy_score(white_class, predicted),
      confusion_matrix_report(white_class, predicted),
      classification_report(white_class, predicted),
      sep="\n")

In [None]:
# SVC with C=100, gamma=0.01 (kernel left at SVC's default), balanced class
# weights and one-vs-one decision function.
clf_100_001 = svm.SVC(
    C=100,
    gamma=0.01,
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [None]:
# Cross-validated predictions of clf_100_001 on the z-standardised white wines,
# using the shared `cross_val` folds.
predicted = cross_val_predict(
    estimator=clf_100_001,
    X=white_data_z_stand,
    y=white_class,
    cv=cross_val,
)

In [None]:
# Accuracy, confusion matrix and per-class report for whatever `predicted`
# currently holds (expected: the clf_100_001 predictions from the cell above).
# `predicted` is overwritten by every prediction cell, so run this right after it.
print(accuracy_score(white_class, predicted),
      confusion_matrix_report(white_class, predicted),
      classification_report(white_class, predicted),
      sep="\n")

### Trying out other values of gamma

In [24]:
# Gamma sweep at fixed C=100: gamma=0.3 (kernel left at SVC's default),
# balanced class weights and one-vs-one decision function.
clf_100_03 = svm.SVC(
    C=100,
    gamma=0.3,
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [25]:
# Cross-validated predictions of clf_100_03 on the z-standardised white wines,
# using the shared `cross_val` folds.
predicted = cross_val_predict(
    estimator=clf_100_03,
    X=white_data_z_stand,
    y=white_class,
    cv=cross_val,
)

In [26]:
# Accuracy, confusion matrix and per-class report for whatever `predicted`
# currently holds (expected: the clf_100_03 predictions from the cell above).
# `predicted` is overwritten by every prediction cell, so run this right after it.
print(accuracy_score(white_class, predicted),
      confusion_matrix_report(white_class, predicted),
      classification_report(white_class, predicted),
      sep="\n")

0.6939567170273581
         _______Prediction_______
           1_low 2_middle   3_high
   1_low    1194      401       45
2_middle     440     1508      250
  3_high      82      281      697

             precision    recall  f1-score   support

      1_low       0.70      0.73      0.71      1640
   2_middle       0.69      0.69      0.69      2198
     3_high       0.70      0.66      0.68      1060

avg / total       0.69      0.69      0.69      4898



In [21]:
# Gamma sweep at fixed C=100: gamma=0.1 (kernel left at SVC's default),
# balanced class weights and one-vs-one decision function.
clf_100_01 = svm.SVC(
    C=100,
    gamma=0.1,
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [22]:
# Cross-validated predictions of clf_100_01 on the z-standardised white wines,
# using the shared `cross_val` folds.
predicted = cross_val_predict(
    estimator=clf_100_01,
    X=white_data_z_stand,
    y=white_class,
    cv=cross_val,
)

In [23]:
# Accuracy, confusion matrix and per-class report for whatever `predicted`
# currently holds (expected: the clf_100_01 predictions from the cell above).
# `predicted` is overwritten by every prediction cell, so run this right after it.
print(accuracy_score(white_class, predicted),
      confusion_matrix_report(white_class, predicted),
      classification_report(white_class, predicted),
      sep="\n")

0.6606778276847693
         _______Prediction_______
           1_low 2_middle   3_high
   1_low    1187      374       79
2_middle     515     1286      397
  3_high      65      232      763

             precision    recall  f1-score   support

      1_low       0.67      0.72      0.70      1640
   2_middle       0.68      0.59      0.63      2198
     3_high       0.62      0.72      0.66      1060

avg / total       0.66      0.66      0.66      4898



In [12]:
# Gamma sweep at fixed C=100: gamma=0.01. Same settings (and same variable
# name) as the C=100 classifier in the "other Cs" section; it is defined
# again here so the gamma series is complete.
clf_100_001 = svm.SVC(
    C=100,
    gamma=0.01,
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [13]:
# Cross-validated predictions of clf_100_001 on the z-standardised white wines,
# using the shared `cross_val` folds.
predicted = cross_val_predict(
    estimator=clf_100_001,
    X=white_data_z_stand,
    y=white_class,
    cv=cross_val,
)

In [14]:
# Accuracy, confusion matrix and per-class report for whatever `predicted`
# currently holds (expected: the clf_100_001 predictions from the cell above).
# `predicted` is overwritten by every prediction cell, so run this right after it.
print(accuracy_score(white_class, predicted),
      confusion_matrix_report(white_class, predicted),
      classification_report(white_class, predicted),
      sep="\n")

0.5853409554920376
         _______Prediction_______
           1_low 2_middle   3_high
   1_low    1179      352      109
2_middle     609      902      687
  3_high      62      212      786

             precision    recall  f1-score   support

      1_low       0.64      0.72      0.68      1640
   2_middle       0.62      0.41      0.49      2198
     3_high       0.50      0.74      0.60      1060

avg / total       0.60      0.59      0.58      4898



In [15]:
# Gamma sweep at fixed C=100: gamma=0.001 (kernel left at SVC's default),
# balanced class weights and one-vs-one decision function.
clf_100_0001 = svm.SVC(
    C=100,
    gamma=0.001,
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [16]:
# Cross-validated predictions of clf_100_0001 on the z-standardised white
# wines, using the shared `cross_val` folds.
predicted = cross_val_predict(
    estimator=clf_100_0001,
    X=white_data_z_stand,
    y=white_class,
    cv=cross_val,
)

In [17]:
# Accuracy, confusion matrix and per-class report for whatever `predicted`
# currently holds (expected: the clf_100_0001 predictions from the cell above).
# `predicted` is overwritten by every prediction cell, so run this right after it.
print(accuracy_score(white_class, predicted),
      confusion_matrix_report(white_class, predicted),
      classification_report(white_class, predicted),
      sep="\n")

0.5549203756635361
         _______Prediction_______
           1_low 2_middle   3_high
   1_low    1194      349       97
2_middle     701      806      691
  3_high     125      217      718

             precision    recall  f1-score   support

      1_low       0.59      0.73      0.65      1640
   2_middle       0.59      0.37      0.45      2198
     3_high       0.48      0.68      0.56      1060

avg / total       0.56      0.55      0.54      4898



In [18]:
# Gamma sweep at fixed C=100: gamma=0.0001 (kernel left at SVC's default),
# balanced class weights and one-vs-one decision function.
clf_100_00001 = svm.SVC(
    C=100,
    gamma=0.0001,
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [19]:
# Cross-validated predictions of clf_100_00001 on the z-standardised white
# wines, using the shared `cross_val` folds.
predicted = cross_val_predict(
    estimator=clf_100_00001,
    X=white_data_z_stand,
    y=white_class,
    cv=cross_val,
)

In [20]:
# Accuracy, confusion matrix and per-class report for whatever `predicted`
# currently holds (expected: the clf_100_00001 predictions from the cell above).
# `predicted` is overwritten by every prediction cell, so run this right after it.
print(accuracy_score(white_class, predicted),
      confusion_matrix_report(white_class, predicted),
      classification_report(white_class, predicted),
      sep="\n")

0.5289914250714577
         _______Prediction_______
           1_low 2_middle   3_high
   1_low    1186      302      152
2_middle     759      683      756
  3_high     139      199      722

             precision    recall  f1-score   support

      1_low       0.57      0.72      0.64      1640
   2_middle       0.58      0.31      0.40      2198
     3_high       0.44      0.68      0.54      1060

avg / total       0.55      0.53      0.51      4898



### Grid search

In [27]:
# Base estimator for the grid search. C, gamma and kernel are deliberately
# not set here; only the options shared by every candidate are fixed.
clf = svm.SVC(
    class_weight='balanced',
    decision_function_shape='ovo',
    cache_size=5000,
)

In [31]:
# Two sub-grids, searched independently:
#   * linear kernel - only C varies (6 candidates);
#   * RBF kernel    - C and gamma vary jointly (6 x 9 = 54 candidates).
parameters = [
    {
        'C': [0.1, 1, 10, 100, 1000, 10000],
        'kernel': ['linear'],
    },
    {
        'C': [0.1, 1, 10, 100, 1000, 10000],
        'gamma': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
        'kernel': ['rbf'],
    },
]

In [32]:
# Exhaustive search over `parameters`, scored by plain accuracy on the shared
# `cross_val` folds.
# NOTE(review): accuracy ignores the per-class F1 trade-offs discussed at the
# top of the White section; an F1-based scorer may be the better criterion.
grid_search_estimator = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=cross_val,
)

In [None]:
# Runs the whole search on the z-standardised white wines. With the grid
# defined in this notebook (60 candidates) and 10 folds this is at least
# 600 SVC fits, so expect a long-running cell.
grid_search_estimator.fit(X=white_data_z_stand, y=white_class)

# Red