Importing The Dependencies

In [47]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


Data collection

In [2]:
dataset = pd.read_csv('data/winequality-red.csv')

In [7]:
# Printing The First 5 Rows
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Printing The Shape Of The Dataset
print(dataset.shape)

(1599, 12)


In [8]:
# Printing The Description Of The Dataset
dataset.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [9]:
# Printing The Statistical Summary Of The Dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [10]:
# Checking For Missing Values
dataset.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

Data Analysis And Visulaization

In [36]:
# Statistical Analysis Of The Dataset
plt.figure(figsize=(10,6))
sns.catplot(x='quality', data=dataset, kind='count')
plt.savefig('./wineQualityFig/count_quality.png')

<img src="./wineQualityFig/count_quality.png" />

In [23]:
plot = plt.figure(figsize=(5,5))
sns.barplot(x='quality', y='volatile acidity', data=dataset)
plot.savefig('./wineQualityFig/demo.png', bbox_inches='tight')

<img src="./wineQualityFig/demo.png" />

In [25]:
plot = plt.figure(figsize=(5,5))
sns.barplot(x='quality', y='citric acid', data=dataset)
plot.savefig('./wineQualityFig/citric.png', bbox_inches='tight')

<img src="./wineQualityFig/citric.png" />

Correlation

In [28]:
# Correlation 
corr = dataset.corr()
corr["quality"].sort_values(ascending=False)

quality                 1.000000
alcohol                 0.476166
sulphates               0.251397
citric acid             0.226373
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64

In [31]:
# Construction the Heatmap To Visualize The Correlation Between The Feature
plt.figure(figsize=(10,10))
sns.heatmap(corr, cbar=True ,square=True,fmt='.1f' ,annot=True,annot_kws={'size':10},cmap='Blues')
plt.savefig('./wineQualityFig/corr.png', bbox_inches='tight')

<img src="./wineQualityFig/corr.png" />

Data PreProcessing

In [38]:
# Seprating The Data and label
X = dataset.drop('quality', axis=1)
Y = dataset['quality'].apply(lambda y_value: 1 if y_value > 6 else 0)

In [35]:

print("X Shape:",X.shape)
print("Y Shape:",Y.shape)

X Shape: (1599, 11)
Y Shape: (1599,)


In [40]:
print(Y.value_counts())

0    1382
1     217
Name: quality, dtype: int64


Train and Test Split

In [74]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [75]:
print("X_train Shape:",X_train.shape)
print("X_test Shape:",X_test.shape)
print("Y_train Shape:",Y_train.shape)
print("Y_test Shape:",Y_test.shape)

X_train Shape: (1279, 11)
X_test Shape: (320, 11)
Y_train Shape: (1279,)
Y_test Shape: (320,)


Training Model

In [76]:
# Random model clissifier 
model = RandomForestClassifier()

In [77]:
model.fit(X_train,Y_train)

In [78]:
X_train_prediction=model.predict(X_train)
X_test_prediction=model.predict(X_test)

# Accuracy Score For Training Data
print("Accuracy Score For Training Data:",accuracy_score(Y_train,X_train_prediction))

Accuracy Score For Training Data: 1.0


In [79]:
# Accuracy Score For Testing Data
print("Accuracy Score For Testing Data:",accuracy_score(Y_test,X_test_prediction))

Accuracy Score For Testing Data: 0.9


Building A Predictive System

In [83]:
input_data = (7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0)
# Changing The Input Data To Array
input_data = np.asarray(input_data)
print("Input Data:",input_data)

Input Data: [ 7.3     0.65    0.      1.2     0.065  15.     21.      0.9946  3.39
  0.47   10.    ]


In [85]:
input_data_reshape = input_data.reshape(1,-1)
print("Input Data Reshape:",input_data_reshape)

Input Data Reshape: [[ 7.3     0.65    0.      1.2     0.065  15.     21.      0.9946  3.39
   0.47   10.    ]]


In [91]:
prediction = model.predict(input_data_reshape)
print("Prediction:",prediction)
if(prediction==0):
    print("The Wine Quality is Bad")
else:
    print("The Wine Quality is Good")

Prediction: [1]
The Wine Quality is Good


