# **Wine Quality Prediction**

In [0]:
import pandas as pd

In [0]:
data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/winequality-white.csv', sep=';')

In [3]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
data.quality.value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


# **Some Important Observations**
* After first experiment, we realize that we have too less of observations to get the right class of data
* Can we minimize/bucketize the target information?
* Let's try to bucketize it to 3 classes


In [0]:
def f(r):
    if r <= 3:
        return 1
    elif r > 3 and r<= 6:
        return 2
    else:
        return 3

data.quality = data.quality.map(f)

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [0]:
lr = LogisticRegression()

In [0]:
trainX, testX, trainY, testY = train_test_split(data.drop(columns=['quality']), data.quality)

In [15]:
lr.fit(trainX, trainY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
lr.score(testX, testY)

0.7746938775510204

In [0]:
y_pred = lr.predict(testX)

In [0]:
from sklearn.metrics import confusion_matrix

In [0]:
conf_mat = confusion_matrix(y_pred=y_pred, y_true=testY)

In [20]:
pd.DataFrame(conf_mat, columns=['Predicted 1', 'Predicted 2', 'Predicted 3'], index=['Actual 1', 'Actual 2', 'Actual 3'])

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3
Actual 1,0,2,0
Actual 2,0,898,65
Actual 3,0,209,51


In [21]:
data.quality.value_counts()

2    3818
3    1060
1      20
Name: quality, dtype: int64

**Understanding the above observation**
* The classes seems to be imbalanced.
* ML models are succiptable to predict data belonging to maximum representative class.
* As we see from above confusion matrix, because 0 class had very few data model is not at all predicting class 0

**Dealing with imbalanced classes**
* Most popular technique is generating under representative class.
* Either we can repeat information or using nearest neighbours, we generate similar data.
* Other no so popular techniques are reducing over representative class of data

In [22]:
from imblearn.over_sampling import RandomOverSampler, SMOTE



In [0]:
sampler = RandomOverSampler()

In [27]:
feature_s, target_s = sampler.fit_sample(data.drop(columns=['quality']), data.quality)



In [30]:
feature_s.shape

(11454, 11)

In [31]:
pd.Series(target_s).value_counts()

3    3818
2    3818
1    3818
dtype: int64

In [0]:
lr = LogisticRegression()

In [0]:
trainX, testX, trainY, testY = train_test_split(feature_s, target_s)

In [34]:
lr.fit(trainX, trainY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
y_pred = lr.predict(testX)

In [36]:
confusion_matrix(y_pred=y_pred, y_true=testY)

array([[516, 206, 250],
       [317, 316, 302],
       [128, 160, 669]])

In [38]:
(516 + 316 + 669 )/(516+206+250 + 317+316+302+128+160+669)

0.5240921787709497

In [37]:
lr.score(testX, testY)

0.5240921787709497

* Decision Tree Classifier

In [0]:
from sklearn.tree import DecisionTreeClassifier

In [0]:
dt = DecisionTreeClassifier()

In [41]:
dt.fit(trainX, trainY)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [42]:
dt.score(testX, testY)

0.9483240223463687

In [0]:
y_pred = dt.predict(testX)

In [0]:
conf_mat = confusion_matrix(y_pred=y_pred, y_true=testY)

In [46]:
pd.DataFrame(conf_mat, columns=['Predicted 1', 'Predicted 2', 'Predicted 3'], index=['Actual 1', 'Actual 2', 'Actual 3'])

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3
Actual 1,972,0,0
Actual 2,6,814,115
Actual 3,0,27,930


In [47]:
(972 + 814 + 930) / (972 + 814 + 930 + 6 + 27 + 115)

0.9483240223463687

* RandomForest Classifier

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2)

In [50]:
rf.fit(trainX, trainY)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [51]:
rf.score(testX, testY)

0.9696229050279329

In [0]:
y_pred = rf.predict(testX)

In [0]:
conf_mat = confusion_matrix(y_pred=y_pred, y_true=testY)

In [57]:
pd.DataFrame(conf_mat, columns=['Predicted 1', 'Predicted 2', 'Predicted 3'], index=['Actual 1', 'Actual 2', 'Actual 3'])

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3
Actual 1,972,0,0
Actual 2,1,872,62
Actual 3,0,24,933


In [58]:
(972 + 872 + 933 ) /( 972 + 872 + 933 + 1 + 24 + 62)

0.9696229050279329