In [16]:
# import dependencies 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

In [20]:
#Read data and show data
wine = pd.read_csv('wineQualityWhites.csv', index_col = 0)
wine.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
1,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
2,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
3,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
5,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [21]:
#Column info
wine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4898 entries, 1 to 4898
Data columns (total 12 columns):
fixed.acidity           4898 non-null float64
volatile.acidity        4898 non-null float64
citric.acid             4898 non-null float64
residual.sugar          4898 non-null float64
chlorides               4898 non-null float64
free.sulfur.dioxide     4898 non-null float64
total.sulfur.dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
quality                 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 497.5 KB


## Preprocessing Data for performing Machine learning algorithms¶

In [22]:
#Classification - wine rated 7 or higher is good wine
bins = (2, 6.5, 9)
group_names = ['bad', 'good']
wine['quality'] = pd.cut(wine['quality'], bins = bins, labels = group_names)

In [23]:
# bad = 0 and good = 1 
from sklearn.preprocessing import LabelEncoder
label_quality = LabelEncoder()
wine['quality'] = label_quality.fit_transform(wine['quality'].astype(str))

In [26]:
# Categorizing 'good' and 'bad' counts
wine['quality'].value_counts()

0    3838
1    1060
Name: quality, dtype: int64

In [27]:
#splitting data to X and y
X = wine.drop(['quality'], axis = 1)
y = wine['quality']

In [28]:
#Train Test Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [29]:
#Standard Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [30]:
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Random Forest Classifier

In [31]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [32]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.89      0.96      0.93       753
           1       0.84      0.60      0.70       227

   micro avg       0.88      0.88      0.88       980
   macro avg       0.86      0.78      0.81       980
weighted avg       0.88      0.88      0.87       980



In [33]:
#Check most important variables
rfc.feature_importances_

array([0.06579699, 0.08635942, 0.06973779, 0.08857465, 0.09030075,
       0.08431897, 0.08107201, 0.12219803, 0.0852878 , 0.07579737,
       0.15055622])

In [34]:
feats = ['density','alcohol']

In [35]:
#Retrain using important variables
x_train, x_test, Y_train, Y_test = train_test_split(wine[feats], wine.quality, test_size = 0.2, random_state = 42)

In [36]:
rfc2 = RandomForestClassifier(n_estimators=200)
rfc2.fit(x_train, Y_train)
pred_rfc2 = rfc2.predict(x_test)

In [37]:
print(classification_report(Y_test, pred_rfc2))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88       753
           1       0.59      0.53      0.56       227

   micro avg       0.81      0.81      0.81       980
   macro avg       0.73      0.71      0.72       980
weighted avg       0.80      0.81      0.80       980



## Random forest has accuracy of % using all data, % using most important features

In [None]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred_rfc))

# # Support Vector Classifier¶

In [None]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
pred_svc = svc.predict(X_test)

In [None]:
#Test different parameters
from sklearn.model_selection import GridSearchCV
param = {
    'C': [1, 10, 100, 1000],
    'kernel':['linear', 'rbf'],
    'gamma' :[0.0001, 0.0005, 0.001, 0.005]
}
grid_svc = GridSearchCV(svc, param_grid=param, scoring='accuracy', cv=10)

In [None]:
grid_svc.fit(X_train, y_train)

In [None]:
#Best parameters
grid_svc.best_params_

In [None]:
#SVC with best parameters
svc2 = SVC(C = 100, gamma =  0.005, kernel= 'rbf')
svc2.fit(X_train, y_train)
pred_svc2 = svc2.predict(X_test)
print(classification_report(y_test, pred_svc2))

In [None]:
#Cross validation for different models
from sklearn.model_selection import cross_val_score
algs_list = [rfc, rfc2, svc, svc2]
algs_names = ['rfc','rfc(best)', 'svc', 'svc(best)']
i=0
for alg in algs_list:
    cross_score = cross_val_score(alg, X = X_train, y = y_train, cv = 10)
    print(f"{algs_names[i]} has a Cross Validation Score of {cross_score.mean()}")
    i +=1