In [3]:
## Import necessary functions
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
import tensorflow as tf
from tensorflow import keras
%matplotlib inline

In [2]:
wine_data = pd.read_csv('winequality-red.csv', sep = ';')
wine_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'winequality-red.csv'

In [None]:
wine_data.head(10)

In [None]:
wine_data.info()

In [None]:
wine_data.quality.value_counts()
# we have 6 kind of quality scores and most of them are 5 and 6

In [None]:
wine_data.shape

In [None]:
sns.catplot(x ='quality',data = wine_data, kind = 'count')

In [None]:
plt.figure(1, figsize = (10,6))
plt.title("Quality distribution", color = "black", fontsize = 15)
wine_data["quality"].value_counts().plot.pie(autopct = "%1.1f%%");

In [None]:
# Plotting Figure Size
plt.figure(figsize = (15,10))
sns.set(font_scale = 1.1)
# Generate Heat Map, allow annotations and place floats in map
sns.heatmap(wine_data.corr() , cmap = "Blues", annot = True, fmt = ".2f", annot_kws = {"size": 12});

In [None]:
#grouping the wine quality into good or bad based on their quality
wine_data["quality remark"] = ["good" if i>=6 else "bad" for i in wine_data["quality"]]
wine_data.head(20)
#lesser than 7 is bad , 7 and greater than is good

In [None]:
## Using Label encoder, transforming catagorical features to numerical features.
label_quality = LabelEncoder()

In [None]:
wine_data['quality'] = label_quality.fit_transform(wine_data['quality'])

In [None]:
X = wine_data[wine_data.columns[:-1]]
y = wine_data["quality"]

In [None]:
features = wine_data.drop(['quality', 'quality remark'], axis=1)
target = wine_data['quality remark']
 
#Spliting dataset into training set and test set to cross-validate the models and determine their effectiveness.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=40)
X_train.shape, X_test.shape
#split data into 80:20 ratio for model selection

In [None]:
#checking how many good and bad white wine there are
target.value_counts()

In [None]:
#plotting graph to visualize the count of the quality remark
sns.countplot(x=wine_data["quality remark"])
plt.show()

In [None]:
# Normalise the features to use zero mean normalisation
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)
X_st = sc.fit_transform(X)
print(X_st[:5])

In [None]:
## Logistic Regression

logistic_regression = LogisticRegression(solver = "liblinear")
logistic_regression_model = logistic_regression.fit(X_train,y_train)
y_pred_logistic_regression = logistic_regression_model.predict(X_test)

print(classification_report(y_test , y_pred_logistic_regression))
print("Accuracy Score:",accuracy_score(y_pred_logistic_regression,y_test))

In [None]:
score = round(accuracy_score(y_test, y_pred_logistic_regression), 6)
cm = confusion_matrix(y_test, y_pred_logistic_regression)
sns.heatmap(cm, annot = True, fmt = ".0f")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()

n_folds = 5
clf_cv = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
cv_error = (cross_val_score(clf_cv, X, y, cv=n_folds))
print(cv_error)

In [None]:
## Support Vector Machine
 
svc = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
svc_model = svc.fit(X_train, y_train)
y_pred_svc = svc_model.predict(X_test)

print(classification_report(y_test , y_pred_svc))
print("Accuracy Score:",accuracy_score(y_pred_svc,y_test))

In [None]:
score = round(accuracy_score(y_test, y_pred_svc), 6)
cm = confusion_matrix(y_test, y_pred_svc)
sns.heatmap(cm, annot = True, fmt = ".0f")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()

n_folds = 5
clf_cv = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
cv_error = (cross_val_score(clf_cv, X, y, cv=n_folds))
print(cv_error)

In [None]:
## Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree_model = decision_tree.fit(X_train , y_train)
y_pred_decision_tree = decision_tree_model.predict(X_test)

print(classification_report(y_test , y_pred_decision_tree))
print("Accuracy Score:",accuracy_score(y_pred_decision_tree,y_test))

In [None]:
score = round(accuracy_score(y_test, y_pred_decision_tree), 6)
cm = confusion_matrix(y_test, y_pred_decision_tree)
sns.heatmap(cm, annot = True, fmt = ".0f")
plt.xlabel("Predicted Values")
plt.ylabel("Actual Values")
plt.show()

n_folds = 5
clf_cv = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto')
cv_error = (cross_val_score(clf_cv, X, y, cv=n_folds))
print(cv_error)

In [None]:
from sklearn.decomposition import PCA
feature_names = list(X.columns)
pca = PCA(n_components=10)
Xs_pca = pca.fit_transform(X_st)
Xs_pca=Xs_pca[:,0:3] #retain the first 3 PC

In [None]:
## Mulitple layer perceptron neural network

model=tf.keras.models.Sequential(layers = None , name = None)
model.add(tf.keras.layers.Input(shape = 12,))
model.add(tf.keras.layers.Dense(units = 16 , activation = "relu" ))
model.add(tf.keras.layers.Dense(units = 8 , activation = "relu" ))
model.add(tf.keras.layers.Dense(units = 6 , activation = "sigmoid"))
model.summary()

In [None]:
#Transforming quality to categorical data
y_train_cat = tf.keras.utils.to_categorical(y_train, 6)
y_test_cat = tf.keras.utils.to_categorical(y_test, 6)

In [None]:
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' ,metrics= ['accuracy'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_st, y, test_size=0.2, random_state=1, stratify=y)

In [None]:
from numpy import argmax

#history = model.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=50)###
history = model.fit(X_train, y_train_cat,  batch_size= 32, epochs = 50 , validation_data = (X_test,y_test_cat))

pred = model.predict(X_test)
pred = argmax(pred, axis=-1).astype('int')
print(accuracy_score(pred, y_test))

In [None]:
pred