In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
W = pd.read_csv('winequality-red.csv')
W

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
W.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
W.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [5]:
Xf = W.columns[:-1]
Xf

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [6]:
W.corr()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free sulfur dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total sulfur dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
pH,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397


In [7]:
for a in range(len(W.corr().columns)):
    for b in range(a):
        if abs(W.corr().iloc[a,b]) > 0.6:
            name = W.corr().columns[a]
            print(name)

citric acid
total sulfur dioxide
density
pH


In [8]:
W['alcohol'].unique()

array([ 9.4       ,  9.8       , 10.        ,  9.5       , 10.5       ,
        9.2       ,  9.9       ,  9.1       ,  9.3       ,  9.        ,
        9.7       , 10.1       , 10.6       ,  9.6       , 10.8       ,
       10.3       , 13.1       , 10.2       , 10.9       , 10.7       ,
       12.9       , 10.4       , 13.        , 14.        , 11.5       ,
       11.4       , 12.4       , 11.        , 12.2       , 12.8       ,
       12.6       , 12.5       , 11.7       , 11.3       , 12.3       ,
       12.        , 11.9       , 11.8       ,  8.7       , 13.3       ,
       11.2       , 11.6       , 11.1       , 13.4       , 12.1       ,
        8.4       , 12.7       , 14.9       , 13.2       , 13.6       ,
       13.5       , 10.03333333,  9.55      ,  8.5       , 11.06666667,
        9.56666667, 10.55      ,  8.8       , 13.56666667, 11.95      ,
        9.95      ,  9.23333333,  9.25      ,  9.05      , 10.75      ])

In [9]:
W.corr()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free sulfur dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total sulfur dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
pH,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397


In [10]:
Cf = ['citric acid', 'total sulfur dioxide', 'density', 'pH']

In [11]:
W1=pd.get_dummies(W[Xf], columns= Cf, drop_first=True)

In [12]:
W1.columns

Index(['fixed acidity', 'volatile acidity', 'residual sugar', 'chlorides',
       'free sulfur dioxide', 'sulphates', 'alcohol', 'citric acid_0.01',
       'citric acid_0.02', 'citric acid_0.03',
       ...
       'pH_3.69', 'pH_3.7', 'pH_3.71', 'pH_3.72', 'pH_3.74', 'pH_3.75',
       'pH_3.78', 'pH_3.85', 'pH_3.9', 'pH_4.01'],
      dtype='object', length=752)

In [13]:
Xf=W1.columns

In [14]:
X = sm.add_constant(W1)
Y = W['quality']

In [15]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=102)

In [16]:
Imodel1 = sm.OLS(Ytrain, Xtrain).fit()
Imodel1.summary()

0,1,2,3
Dep. Variable:,quality,R-squared:,0.775
Model:,OLS,Adj. R-squared:,0.496
Method:,Least Squares,F-statistic:,2.78
Date:,"Tue, 22 Mar 2022",Prob (F-statistic):,1.89e-35
Time:,11:50:13,Log-Likelihood:,-577.04
No. Observations:,1279,AIC:,2570.0
Df Residuals:,571,BIC:,6219.0
Df Model:,707,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.9134,1.368,0.668,0.505,-1.774,3.601
fixed acidity,-0.0135,0.049,-0.277,0.782,-0.109,0.082
volatile acidity,-1.1009,0.217,-5.077,0.000,-1.527,-0.675
residual sugar,0.0301,0.034,0.886,0.376,-0.037,0.097
chlorides,-1.7645,0.716,-2.466,0.014,-3.170,-0.359
free sulfur dioxide,0.0092,0.005,1.988,0.047,0.000,0.018
sulphates,1.2627,0.222,5.690,0.000,0.827,1.699
alcohol,0.2794,0.048,5.800,0.000,0.185,0.374
citric acid_0.01,0.0578,0.211,0.275,0.784,-0.356,0.471

0,1,2,3
Omnibus:,54.94,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,156.605
Skew:,-0.119,Prob(JB):,9.86e-35
Kurtosis:,4.698,Cond. No.,1.75e+18


In [17]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

In [18]:
W = pd.read_csv('winequality-red.csv')
W.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [19]:
W['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [20]:
W.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [21]:
Xf = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']

In [22]:
X = W[Xf]
Y = W['quality']

In [37]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=63)

In [38]:
scaler = StandardScaler()

In [39]:
pca = PCA(n_components=2)

In [40]:
knn = KNeighborsClassifier()

In [41]:
operations = [('scaler', scaler), ('pca', pca), ('knn', knn)]

In [42]:
pipe = Pipeline(operations)

In [43]:
k_values = list(range(1,30))

In [44]:
params = {'knn__n_neighbors': k_values,'knn__metric': ['euclidean', 'manhattan', 'chebyshev', 
            'canberra']}

In [45]:
model = GridSearchCV(estimator = pipe, param_grid=params, cv = 10, scoring='accuracy')

In [46]:
model.fit(Xtrain,Ytrain)



GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA(n_components=2)),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'knn__metric': ['euclidean', 'manhattan', 'chebyshev',
                                         'canberra'],
                         'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                              12, 13, 14, 15, 16, 17, 18, 19,
                                              20, 21, 22, 23, 24, 25, 26, 27,
                                              28, 29]},
             scoring='accuracy')

In [47]:
model.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=2)),
                ('knn',
                 KNeighborsClassifier(metric='manhattan', n_neighbors=1))])

In [48]:
model.best_params_

{'knn__metric': 'manhattan', 'knn__n_neighbors': 1}

In [49]:
predy = model.predict(Xtest)
print(confusion_matrix(Ytest,predy))

[[ 0  0  2  0  1  0]
 [ 0  1  5  3  0  0]
 [ 0  6 87 38  4  0]
 [ 0  2 41 70  9  1]
 [ 3  2 12 12 16  2]
 [ 0  0  0  2  1  0]]


In [50]:
print(classification_report(Ytest,predy))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.09      0.11      0.10         9
           5       0.59      0.64      0.62       135
           6       0.56      0.57      0.56       123
           7       0.52      0.34      0.41        47
           8       0.00      0.00      0.00         3

    accuracy                           0.54       320
   macro avg       0.29      0.28      0.28       320
weighted avg       0.54      0.54      0.54       320



In [51]:
auc1 = metrics.roc_auc_score(Ytest,model.predict_proba(Xtest)[:,1])
auc1

ValueError: multi_class must be in ('ovo', 'ovr')