# **Wine Quality Prediction**

### Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests.
## **Dataset link : https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009**

## Dataset description : Input variables
### 1 - fixed acidity

### 2 - volatile acidity

### 3 - citric acid

### 4 - residual sugar

### 5 - chlorides

### 6 - free sulfur dioxide

### 7 - total sulfur dioxide

### 8 - density

### 9 - pH

### 10 - sulphates

### 11 - alcohol

### Output variable (based on sensory data):

### 12 - quality (score between 0 and 10)

### These datasets can be viewed as a classification task. The classes are ordered and not balanced (e.g. there are many more normal wines than excellent or poor ones).

In [None]:
# importing required packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
# Silence every warning category for the rest of the session; note that this
# also hides deprecation warnings raised by the plotting and modeling libraries.
warnings.filterwarnings('ignore')

In [None]:
# Read the red-wine CSV into a DataFrame and preview the first rows.
# The location is kept in one named constant so it is easy to change.
DATA_PATH = "/content/winequality-red.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# count of missing values in every column
df.isnull().sum()

In [None]:
# data type of each column
df.dtypes

In [None]:
# column labels of the frame (input features plus the `quality` target)
print(df.columns)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()

In [None]:
# summary statistics of every column; left as the last expression so the
# notebook renders it as a formatted table instead of plain text
df.describe()

In [None]:
# total number of rows flagged by DataFrame.duplicated() as repeats of another row
df.duplicated().sum()

In [None]:
# pairwise correlation matrix of the columns (default `corr` settings)
df.corr()

## Visualization

### visually displaying the correlation between the feature columns

In [None]:
# Draw the annotated correlation heatmap on an explicitly created 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
corr_matrix = df.corr()

sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
sns.set(style="whitegrid")

# Number of samples per quality score. The column is passed by keyword:
# current seaborn treats the first positional argument as `data`, so a
# positional 'quality' would clash with `data=df`.
sns.countplot(x='quality', data=df)

In [None]:
# keep the column labels in `col`; later plotting cells index into it
col = df.columns
print(col)

In [None]:
# Pairwise scatter plots of all columns, coloured by quality score.
# `hue_order` must list the hue *levels* (the distinct quality values), not
# the column names. pairplot also creates its own figure, so no separate
# plt.figure() call is made (it would only leave an empty figure behind).
quality_levels = sorted(df['quality'].unique())

sns.pairplot(df, hue='quality', hue_order=quality_levels)

In [None]:
# One distribution panel per column on a 4x3 grid, filled in row-major order.
# sns.distplot is deprecated; histplot (density histogram + KDE) combined with
# rugplot is its documented replacement.
fig, ax = plt.subplots(4, 3, figsize=(25, 40))

# zip stops at the shorter of (panels, columns), so no manual counter is needed
for panel, column in zip(ax.flat, col):
    sns.histplot(x=df[column], kde=True, stat="density", kde_kws=dict(cut=3), ax=panel)
    sns.rugplot(x=df[column], ax=panel)
    panel.set_xlabel(column)

### Some of the columns are strongly right-skewed, so we apply a log transform to bring their distributions closer to normal.


In [None]:
# Replace each right-skewed column with its natural logarithm.
# The columns are overwritten in `df`, so running this cell a second time
# would take the log of already-transformed values.
skewed_columns = [
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'sulphates',
    'alcohol',
]
for skewed in skewed_columns:
    df[skewed] = np.log(df[skewed])

### result after applying log transformation

In [None]:
# Same 4x3 distribution grid as before, drawn from the current contents of `df`.
# sns.distplot is deprecated; histplot (density histogram + KDE) combined with
# rugplot is its documented replacement.
fig, ax = plt.subplots(4, 3, figsize=(25, 40))

# zip stops at the shorter of (panels, columns), so no manual counter is needed
for panel, column in zip(ax.flat, col):
    sns.histplot(x=df[column], kde=True, stat="density", kde_kws=dict(cut=3), ax=panel)
    sns.rugplot(x=df[column], ax=panel)
    panel.set_xlabel(column)

In [None]:
# Correlation of every column with `quality`, strongest positive first.
# The first entry after sorting is skipped (quality's correlation with itself).
quality_corr = df.corr()['quality']
quality_corr.sort_values(ascending=False).iloc[1:]

In [None]:
# y: the target column; X: every remaining column as the input features
y = df['quality']
X = df.drop(columns=['quality'])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=21)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Applying Machine Learning models

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with default settings, trained on the training split
lgbm = LGBMClassifier().fit(X_train, y_train)

# show the fitted estimator as the cell output
lgbm

In [None]:
# Score LightGBM on the held-out split, and on the training split for comparison.
prediction = lgbm.predict(X_test)
lgbm_acc = accuracy_score(y_test, prediction)

lgbm_train_acc = accuracy_score(y_train, lgbm.predict(X_train))
print(f"Accuracy Score of Training Data is {lgbm_train_acc}")
print(f"Accuracy Score of Testing  Data is {lgbm_acc}")

In [None]:
# Confusion matrix and per-class report for LightGBM on the training split
# (the data the model was fitted on, not the held-out test split).
lgbm_train_pred = lgbm.predict(X_train)

print("\n Train Data: LGBM_Confusion Matrix:\n ")
print(confusion_matrix(y_train, lgbm_train_pred))

print("\n Train Data: LGBM_Classification Report:\n ")
print(classification_report(y_train, lgbm_train_pred))

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees via XGBoost.
# Fixes over the earlier version of this cell:
#   * `max_dept` was a typo, so the depth limit of 25 was never applied;
#   * `loss='deviance'` is a scikit-learn GradientBoosting option, not an
#     XGBoost parameter, and has been dropped;
#   * the second print mislabelled the test accuracy as "Training Data".
# NOTE(review): recent XGBoost releases expect class labels encoded as
# 0..n_classes-1; the quality scores may not start at 0 - confirm against the
# installed version before upgrading.
xgb = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=25)
xgb.fit(X_train, y_train)

# predicting output over the test set
prediction = xgb.predict(X_test)

xgb_acc = accuracy_score(y_test, prediction)

print(f"Accuracy Score of Training Data is {accuracy_score(y_train, xgb.predict(X_train))}")
print(f"Accuracy Score of Testing Data is {xgb_acc}\n")

In [None]:
# Confusion matrix and per-class report for XGBoost on the training split
# (the data the model was fitted on, not the held-out test split).
xgb_train_pred = xgb.predict(X_train)

print("\n Train Data: xgboost_Confusion Matrix:\n ")
print(confusion_matrix(y_train, xgb_train_pred))

print("\n Train Data: xgboost_Classification Report:\n ")
print(classification_report(y_train, xgb_train_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

# Exhaustive 10-fold cross-validated search over random-forest settings.
# The forest is seeded (same value as the train/test split) so that the
# selected parameters and scores are repeatable from run to run.
rf = RandomForestClassifier(random_state=21)
parameters = {
    'min_samples_split' : [2, 3, 4, 5],
    'max_depth' : [10, 15, 20, 25],
    'n_estimators' : [90, 100, 110, 120]
}

# n_jobs=-1 runs the candidate fits in parallel on all available cores
grid_search = GridSearchCV(rf, parameters, cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)

In [None]:
# best parameters found by the most recent grid search
# (the random-forest search when the cells are run in order)
grid_search.best_params_

In [None]:
# mean cross-validated score of the best parameter combination
grid_search.best_score_

In [None]:
# Random forest with fixed hyper-parameters.
# NOTE(review): these values are presumably copied from an earlier
# grid_search.best_params_ run - verify they still match the search output.
# The forest is seeded (same value as the train/test split) so the reported
# accuracies are repeatable.
rf = RandomForestClassifier(max_depth = 15, min_samples_split = 3, n_estimators = 100,
                            random_state = 21)
rf.fit(X_train, y_train)

prediction = rf.predict(X_test)

rf_acc = accuracy_score(y_test, prediction)

print(f"Accuracy Score of Training Data is {accuracy_score(y_train, rf.predict(X_train))}")
print(f"Accuracy Score of Testing Data is {rf_acc}\n")

In [None]:
# Confusion matrix and per-class report for the random forest on the training
# split (the data the model was fitted on, not the held-out test split).
rf_train_pred = rf.predict(X_train)

print("\n Train Data: RandomForestClassifier_Confusion Matrix:\n ")
print(confusion_matrix(y_train, rf_train_pred))

print("\n Train Data: RandomForestClassifier_Classification Report:\n ")
print(classification_report(y_train, rf_train_pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Exhaustive 10-fold cross-validated search over gradient-boosting settings.
# The estimator is seeded (same value as the train/test split) so that the
# selected parameters and scores are repeatable from run to run.
gb = GradientBoostingClassifier(random_state=21)
parameters = {
    'min_samples_split' : [2, 3, 4, 5, 6],
    'min_samples_leaf' : [1, 2, 3, 4],
    'n_estimators' : [80, 90, 100, 110, 120]
}

# n_jobs=-1 runs the candidate fits in parallel on all available cores
grid_search = GridSearchCV(gb, parameters, cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)

In [None]:
# best parameters found by the most recent grid search
# (the gradient-boosting search when the cells are run in order)
grid_search.best_params_

In [None]:
# mean cross-validated score of the best parameter combination
grid_search.best_score_

In [None]:
# Gradient boosting with fixed hyper-parameters.
# NOTE(review): these values are presumably copied from an earlier
# grid_search.best_params_ run - verify they still match the search output.
# The estimator is seeded (same value as the train/test split) so the
# reported accuracies are repeatable.
gb = GradientBoostingClassifier(min_samples_leaf = 1, min_samples_split = 6, n_estimators = 120,
                                random_state = 21)
gb.fit(X_train, y_train)

prediction = gb.predict(X_test)

gb_acc = accuracy_score(y_test, prediction)

print(f"Accuracy Score of Training Data is {accuracy_score(y_train, gb.predict(X_train))}")
print(f"Accuracy Score of Testing Data is {gb_acc}\n")

In [None]:
# Confusion matrix and per-class report for gradient boosting on the training
# split (the data the model was fitted on, not the held-out test split).
gb_train_pred = gb.predict(X_train)

print("\n Train Data: GradientBoostingClassifier_Confusion Matrix:\n ")
print(confusion_matrix(y_train, gb_train_pred))

print("\n Train Data: GradientBoostingClassifier_Classification Report:\n ")
print(classification_report(y_train, gb_train_pred))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with default settings. It is seeded (same value as the
# train/test split) because the randomized splits otherwise make the reported
# accuracies change from run to run.
etc = ExtraTreesClassifier(random_state = 21)
etc.fit(X_train, y_train)

prediction = etc.predict(X_test)

etc_acc = accuracy_score(y_test, prediction)

print(f"Accuracy Score of Training Data is {accuracy_score(y_train, etc.predict(X_train))}")
print(f"Accuracy Score of Testing Data is {etc_acc} \n")

In [None]:
# Confusion matrix and per-class report for extra-trees on the training split
# (the data the model was fitted on, not the held-out test split).
etc_train_pred = etc.predict(X_train)

print("\n Train Data: ExtraTreesClassifier_Confusion Matrix:\n ")
print(confusion_matrix(y_train, etc_train_pred))

print("\n Train Data: ExtraTreesClassifier_Classification Report:\n ")
print(classification_report(y_train, etc_train_pred))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 10-fold cross-validated search over k-nearest-neighbour settings;
# only distance-weighted voting is considered.
knn = KNeighborsClassifier()
parameters = dict(
    n_neighbors=[5, 10, 15],
    weights=['distance'],
    leaf_size=[20, 25, 30, 35],
)

grid_search = GridSearchCV(estimator=knn, param_grid=parameters, cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# best parameters found by the most recent grid search
# (the k-nearest-neighbour search when the cells are run in order)
grid_search.best_params_

In [None]:
# mean cross-validated score of the best parameter combination
grid_search.best_score_

In [None]:
# Distance-weighted k-NN with fixed settings, scored on both splits.
knn = KNeighborsClassifier(weights='distance', n_neighbors=15, leaf_size=20)
knn.fit(X_train, y_train)

prediction = knn.predict(X_test)
knn_acc = accuracy_score(y_test, prediction)

knn_train_acc = accuracy_score(y_train, knn.predict(X_train))
print(f"Accuracy Score of Training Data is {knn_train_acc}")
print(f"Accuracy Score of Testing Data is {knn_acc}\n")

In [None]:
# Confusion matrix and per-class report for k-NN on the training split
# (the data the model was fitted on, not the held-out test split).
knn_train_pred = knn.predict(X_train)

print("\n Train Data: KNN_Confusion Matrix:\n ")
print(confusion_matrix(y_train, knn_train_pred))

print("\n Train Data: KNN_Classification Report:\n ")
print(classification_report(y_train, knn_train_pred))

### Plotting the accuracy of all the above ML models

In [None]:
# Test-set accuracy of every model, collected in fitting order.
test_accuracy = {
    'LGBM': lgbm_acc,
    'XgBoost': xgb_acc,
    'RandomForestClassifier': rf_acc,
    'Gradient Boosting': gb_acc,
    'Extra Tree': etc_acc,
    'KNN': knn_acc,
}
models = pd.DataFrame(list(test_accuracy.items()), columns=['Model', 'Score'])

# display a ranked view; `models` itself keeps the original order
models.sort_values(by='Score', ascending=False)

In [None]:
# Bar chart of test accuracy per model, drawn on an explicit 20x8 axes.
fig, ax = plt.subplots(figsize=(20, 8))

sns.barplot(data=models, x='Model', y='Score', ax=ax)
plt.show()