In [1]:
#Import dependencies
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Read data
wine = pd.read_csv('wineQualityReds.csv')
wine.head()

Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
#Column info
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
Unnamed: 0              1599 non-null int64
fixed.acidity           1599 non-null float64
volatile.acidity        1599 non-null float64
citric.acid             1599 non-null float64
residual.sugar          1599 non-null float64
chlorides               1599 non-null float64
free.sulfur.dioxide     1599 non-null float64
total.sulfur.dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(2)
memory usage: 162.5 KB


## Preprocessing Data for performing Machine learning algorithms

In [4]:
#Classification - wine rated 7 or higher is good wine
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
wine['quality'] = pd.cut(wine['quality'], bins = bins, labels = group_names)

In [5]:
#Bad = 0 and good = 1 
from sklearn.preprocessing import LabelEncoder
label_quality = LabelEncoder()
wine['quality'] = label_quality.fit_transform(wine['quality'])

In [6]:
wine['quality'].value_counts()

0    1382
1     217
Name: quality, dtype: int64

In [7]:
X = wine.drop('quality', axis = 1)
y = wine['quality']

In [8]:
#Train Test Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [9]:
#Standard Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [10]:
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Random Forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93       273
           1       0.61      0.36      0.45        47

   micro avg       0.87      0.87      0.87       320
   macro avg       0.75      0.66      0.69       320
weighted avg       0.85      0.87      0.86       320



In [13]:
#Check most important variables
rfc.feature_importances_

array([0.0759194 , 0.06528121, 0.10119049, 0.07979014, 0.05876594,
       0.0681252 , 0.05529454, 0.07441868, 0.08039054, 0.05378462,
       0.12067415, 0.16636508])

In [14]:
feats = ['volatile.acidity', 'sulphates','alcohol']

In [15]:
#Retrain using important variables
x_train, x_test, Y_train, Y_test = train_test_split(wine[feats], wine.quality, test_size = 0.2, random_state = 42)

In [16]:
rfc2 = RandomForestClassifier(n_estimators=200)
rfc2.fit(x_train, Y_train)
pred_rfc2 = rfc2.predict(x_test)

In [17]:
print(classification_report(Y_test, pred_rfc2))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       273
           1       0.65      0.55      0.60        47

   micro avg       0.89      0.89      0.89       320
   macro avg       0.79      0.75      0.77       320
weighted avg       0.88      0.89      0.89       320



#### Random forest has accuracy of 87% using all data, 90% using most important features

In [18]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred_rfc))

[[262  11]
 [ 30  17]]


## Support Vector Classifier

In [19]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
pred_svc = svc.predict(X_test)

In [20]:
print(classification_report(y_test, pred_svc))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93       273
           1       0.67      0.26      0.37        47

   micro avg       0.87      0.87      0.87       320
   macro avg       0.78      0.62      0.65       320
weighted avg       0.85      0.87      0.85       320



#### Support vector classifier has accuracy of 85%

## Grid Search CV

In [21]:
#Test different parameters
from sklearn.model_selection import GridSearchCV
param = {
    'C': [1, 10, 100, 1000],
    'kernel':['linear', 'rbf'],
    'gamma' :[0.0001, 0.0005, 0.001, 0.005]
}
grid_svc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10, verbose=3)

In [22]:
grid_svc.fit(X_train, y_train)

Fitting 10 folds for each of 32 candidates, totalling 320 fits
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8661417322834646, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ... C=1, gamma=0.0001, kernel=rbf, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ... C=1, gamma=0.0001, kernel=rbf, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=rbf .................................

[CV] . C=1, gamma=0.005, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.005, kernel=linear .................................
[CV] . C=1, gamma=0.005, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.005, kernel=linear .................................
[CV] . C=1, gamma=0.005, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.005, kernel=linear .................................
[CV] . C=1, gamma=0.005, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.005, kernel=linear .................................
[CV] . C=1, gamma=0.005, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.005, kernel=linear .................................
[CV]  C=1, gamma=0.005, kernel=linear, score=0.8661417322834646, total=   0.0s
[CV] C=1, gamma=0.005, kernel=rbf ....................................
[CV] .... C=1, gamma=0.005, kernel=rbf, score=0.8671875, total=   0.0s
[CV] C=1, gamma=0.005, kernel=rbf ...................................

[CV]  C=10, gamma=0.001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV]  C=10, gamma=0.001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV]  C=10, gamma=0.001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV]  C=10, gamma=0.001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV]  C=10, gamma=0.001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV]  C=10, gamma=0.001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV]  C=10, gamma=0.001, kernel=linear, score=0.8671875, total=   0.0s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV]  

[CV]  C=100, gamma=0.0005, kernel=linear, score=0.8671875, total=  13.0s
[CV] C=100, gamma=0.0005, kernel=linear ..............................
[CV]  C=100, gamma=0.0005, kernel=linear, score=0.8671875, total=   7.2s
[CV] C=100, gamma=0.0005, kernel=linear ..............................
[CV]  C=100, gamma=0.0005, kernel=linear, score=0.8671875, total=   5.6s
[CV] C=100, gamma=0.0005, kernel=linear ..............................
[CV]  C=100, gamma=0.0005, kernel=linear, score=0.8671875, total=   4.8s
[CV] C=100, gamma=0.0005, kernel=linear ..............................
[CV]  C=100, gamma=0.0005, kernel=linear, score=0.8671875, total=   6.0s
[CV] C=100, gamma=0.0005, kernel=linear ..............................
[CV]  C=100, gamma=0.0005, kernel=linear, score=0.8671875, total=   0.8s
[CV] C=100, gamma=0.0005, kernel=linear ..............................
[CV]  C=100, gamma=0.0005, kernel=linear, score=0.8671875, total=   6.7s
[CV] C=100, gamma=0.0005, kernel=linear .......................

[CV] ... C=100, gamma=0.005, kernel=rbf, score=0.890625, total=   0.0s
[CV] C=100, gamma=0.005, kernel=rbf ..................................
[CV]  C=100, gamma=0.005, kernel=rbf, score=0.8976377952755905, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV]  C=1000, gamma=0.0001, kernel=linear, score=0.8671875, total= 2.2min
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV]  C=1000, gamma=0.0001, kernel=linear, score=0.8671875, total= 2.6min
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV]  C=1000, gamma=0.0001, kernel=linear, score=0.8671875, total= 3.5min
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV]  C=1000, gamma=0.0001, kernel=linear, score=0.8671875, total= 2.2min
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV]  C=1000, gamma=0.0001, kernel=linear, score=0.8671875, total= 1.8min
[CV] C=1000, gamma=0.0001, kernel=linear ..............

[CV] . C=1000, gamma=0.001, kernel=rbf, score=0.8515625, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] .. C=1000, gamma=0.001, kernel=rbf, score=0.890625, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] . C=1000, gamma=0.001, kernel=rbf, score=0.8984375, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] . C=1000, gamma=0.001, kernel=rbf, score=0.8671875, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV]  C=1000, gamma=0.001, kernel=rbf, score=0.8818897637795275, total=   0.0s
[CV] C=1000, gamma=0.005, kernel=linear ..............................
[CV]  C=1000, gamma=0.005, kernel=linear, score=0.8671875, total= 2.3min
[CV] C=1000, gamma=0.005, kernel=linear ..............................
[CV]  C=1000, gamma=0.005, kernel=linear, score=0.8671875, total= 2.3min
[CV] C=1000, gamma=0.005, kernel=linear .........................

[Parallel(n_jobs=1)]: Done 320 out of 320 | elapsed: 110.4min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 10, 100, 1000], 'kernel': ['linear', 'rbf'], 'gamma': [0.0001, 0.0005, 0.001, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [23]:
#Best parameters
grid_svc.best_params_

{'C': 100, 'gamma': 0.005, 'kernel': 'rbf'}

In [24]:
#SVC with best parameters
svc2 = SVC(C = 100, gamma =  0.005, kernel= 'rbf')
svc2.fit(X_train, y_train)
pred_svc2 = svc2.predict(X_test)
print(classification_report(y_test, pred_svc2))

              precision    recall  f1-score   support

           0       0.89      0.99      0.93       273
           1       0.80      0.26      0.39        47

   micro avg       0.88      0.88      0.88       320
   macro avg       0.84      0.62      0.66       320
weighted avg       0.87      0.88      0.85       320



### SVC improves from 85% to 87% using Grid Search CV

## Cross Validation Score

In [26]:
#Cross validation for different models
from sklearn.model_selection import cross_val_score
algs_list = [rfc, rfc2, svc, svc2]
algs_names = ['rfc','rfc(best)', 'svc', 'svc(best)']
i=0
for alg in algs_list:
    cross_score = cross_val_score(alg, X = X_train, y = y_train, cv = 10)
    print(f"{algs_names[i]} has a Cross Validation Score of {cross_score.mean()}")
    i +=1
    


rfc has a Cross Validation Score of 0.9116818405511811
rfc(best) has a Cross Validation Score of 0.910894438976378




svc has a Cross Validation Score of 0.8850762795275591
svc(best) has a Cross Validation Score of 0.8850762795275591


### Using Cross Validation, Random Forest Classifier using all data has the best accuracy at 91.17%