In [1]:
#import all libraries

import numpy as np
import pandas as pd

from IPython.display import display, HTML, SVG, Image, IFrame

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



from sklearn.svm import SVC 
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# import csv for red
# https://stackoverflow.com/questions/24606330/how-to-read-a-file-with-a-semi-colon-separator-in-pandas

red_wines = pd.read_csv("winequality-red.csv", sep=";")
red_wines

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
# Create Target Column based on quality rating of 5 or lower = bad (0) and 6 or higher = good (1)
red_wines['target'] = np.where(red_wines['quality']<=5,0,1)

In [4]:
target = red_wines['target']
target_names = ["bad quality","good quality"]

In [5]:
data = red_wines.drop(['quality', 'target'], axis=1)
feature_names = data.columns

In [6]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [8]:
model=SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

In [9]:
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.728


In [10]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

 bad quality       0.67      0.76      0.71       178
good quality       0.78      0.70      0.74       222

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400



In [13]:
# using 5 parameters we have determined may result with better results:
target2 = red_wines['target']
target_names2 = ["bad quality","good quality"]

In [14]:
data2 = red_wines.drop(["quality",'citric acid','fixed acidity','residual sugar', 'chlorides', 'free sulfur dioxide','pH','target'], axis=1)
feature_names2 = data2.columns

In [15]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(data2, target2, random_state=42)

In [16]:
model2=SVC(kernel='linear')
model2.fit(X_train2, y_train2)

SVC(kernel='linear')

In [17]:
print('Test Acc: %.3f' % model2.score(X_test2, y_test2))

Test Acc: 0.720


In [18]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

 bad quality       0.67      0.76      0.71       178
good quality       0.78      0.70      0.74       222

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400



below code from applied machine learning course on linkedin

In [19]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [20]:
svc = SVC()
parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10]
}

cv = GridSearchCV(svc, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'C': 1, 'kernel': 'linear'}

0.739 (+/-0.091) for {'C': 0.1, 'kernel': 'linear'}
0.613 (+/-0.042) for {'C': 0.1, 'kernel': 'rbf'}
0.748 (+/-0.091) for {'C': 1, 'kernel': 'linear'}
0.625 (+/-0.037) for {'C': 1, 'kernel': 'rbf'}
0.746 (+/-0.093) for {'C': 10, 'kernel': 'linear'}
0.706 (+/-0.076) for {'C': 10, 'kernel': 'rbf'}


In [21]:
svc2 = SVC()
parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10]
}

cv2 = GridSearchCV(svc, parameters, cv=5)
cv2.fit(X_train2, y_train2)

print_results(cv2)

BEST PARAMS: {'C': 1, 'kernel': 'linear'}

0.731 (+/-0.103) for {'C': 0.1, 'kernel': 'linear'}
0.609 (+/-0.029) for {'C': 0.1, 'kernel': 'rbf'}
0.742 (+/-0.088) for {'C': 1, 'kernel': 'linear'}
0.614 (+/-0.028) for {'C': 1, 'kernel': 'rbf'}
0.741 (+/-0.085) for {'C': 10, 'kernel': 'linear'}
0.709 (+/-0.062) for {'C': 10, 'kernel': 'rbf'}


In [23]:
cv2.best_estimator_

SVC(C=1, kernel='linear')