In [22]:

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import plot_confusion_matrix 

red = pd.read_csv('Wine Project\winequality-red.csv', delimiter= ';')
white = pd.read_csv('Wine Project\winequality-white.csv', delimiter= ';')


In [23]:
# concat two data sets
red['type'] = 'red'
white['type'] = 'white'
all_wine = pd.concat([red, white], ignore_index=True)

all_wine['quality_label'] = all_wine['quality'].apply(
    lambda value: 'low'
    if value <= 5 else 'medium'
    if value <= 7 else 'high'
)

In [24]:
display(all_wine.columns)

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'type', 'quality_label'],
      dtype='object')

In [25]:
display(all_wine)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,quality_label
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red,low
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red,low
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red,low
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red,medium
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white,medium
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white,low
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white,medium
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white,medium


In [26]:
x = all_wine.copy(deep=True)

In [27]:
# leave only fields that are not correlated with each other for the training
X = x[[
    'residual sugar',
    'chlorides',
    'total sulfur dioxide',
    'pH',
    'sulphates',
    'alcohol'
]]

In [28]:
y = all_wine['type']

print(X.shape)
print(y.shape)

(6497, 6)
(6497,)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

In [30]:
# Logistic Regression

model = LogisticRegression()                    # 2. instantiate model
model.fit(X_train, y_train)                     # 3. fit model to data
y_model = model.predict(X_test)                 # 4. predict on new data
print(accuracy_score(y_test, y_model))


0.9615384615384616


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
# Kohen Kapa core
kappa = cohen_kappa_score(y_model, y_test)
print("Cohen-Kappa score:", kappa)

Cohen-Kappa score: 0.8903800943743064


In [32]:
pred_lr = model.predict(X_test)
print(pred_lr)

['white' 'white' 'white' ... 'red' 'red' 'white']


In [33]:
# X = df # --> the features we will keep to build our model
# y = target # --> what you're trying to predict

In [34]:
quality_label = LabelEncoder()

In [35]:


#Bad becomes 0 and good becomes 1 
all_wine['quality'] = quality_label.fit_transform(all_wine['quality'])
all_wine['quality'].value_counts()


3    2836
2    2138
4    1079
1     216
5     193
0      30
6       5
Name: quality, dtype: int64

In [36]:
#Scaler
# data normalisation with sklearn
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform data
X_train_norm = norm.transform(X_train)
X_test_norm = norm.transform(X_test)

In [37]:
# Random Forest Classifier

rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)



In [45]:
#Let's see how our model performed
print(classification_report(y_test, pred_rfc))

kappa = cohen_kappa_score(y_model, y_test)
print("Cohen-Kappa score:", kappa)
# random for give 99%

              precision    recall  f1-score   support

         red       0.98      0.98      0.98       297
       white       0.99      0.99      0.99      1003

    accuracy                           0.99      1300
   macro avg       0.99      0.99      0.99      1300
weighted avg       0.99      0.99      0.99      1300

Cohen-Kappa score: 0.8903800943743064


In [39]:
# Support Vector Classifier

svc = SVC()
svc.fit(X_train, y_train)
pred_svc = svc.predict(X_test)



In [40]:
print(classification_report(y_test, pred_svc))

# Support Vector Classifier give 93%

              precision    recall  f1-score   support

         red       0.91      0.78      0.84       297
       white       0.94      0.98      0.96      1003

    accuracy                           0.93      1300
   macro avg       0.93      0.88      0.90      1300
weighted avg       0.93      0.93      0.93      1300



In [41]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
dt_predict = dt.predict(X_test)

In [42]:
#print confusion matrix and accuracy score
dt_conf_matrix = confusion_matrix(y_test, dt_predict)
dt_acc_score = accuracy_score(y_test, dt_predict)
print(dt_conf_matrix)
print(dt_acc_score*100)

# Decision Tree give 98 %

[[288   9]
 [ 15 988]]
98.15384615384616


In [43]:
# NaiveBayes
nb = GaussianNB()
nb.fit(X_train,y_train)
nb_predict=nb.predict(X_test)

In [44]:
#print confusion matrix and accuracy score
nb_conf_matrix = confusion_matrix(y_test, nb_predict)
nb_acc_score = accuracy_score(y_test, nb_predict)
print(nb_conf_matrix)
print(nb_acc_score*100)

[[273  24]
 [ 47 956]]
94.53846153846153
