In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import MinMaxScaler
# from sklearn.impute import KNNImputer

from sklearn.ensemble import (RandomForestClassifier, 
                              GradientBoostingClassifier)
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

from sklearn.datasets import make_classification

In [2]:
data = pd.read_csv("winequality-white.csv", sep = ";")

In [3]:
data.head() 

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
data.quality.value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [5]:
X = data [[u'fixed acidity', u'volatile acidity', u'citric acid',
           u'residual sugar',u'chlorides',u'free sulfur dioxide',
           u'total sulfur dioxide', u'density',u'pH', u'sulphates',
           u'alcohol']]

In [6]:
y = data.quality

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [8]:
X_train = pd.DataFrame(MinMaxScaler().fit(X_train).transform(X_train),
                       columns = X.columns)
X_test = pd.DataFrame(MinMaxScaler().fit(X_test).transform(X_test),
                       columns = X.columns)

### Модели

In [9]:
forest = RandomForestClassifier(n_estimators=2000, class_weight='balanced',
                                max_depth=20, random_state=0, n_jobs = -1)
forest.fit(X_train, y_train)

In [10]:
pred_forest = forest.predict(X_test)
f1_score(y_test, forest.predict(X_test), average='weighted')

0.4192332675465921

In [11]:
confusion_matrix(forest.predict(X_test),y_test)

array([[  0,   0,   0,   0,   0,   0],
       [  0,   1,   1,   0,   0,   0],
       [  3,  36, 300, 162,  19,   4],
       [  6,  33, 416, 930, 450,  86],
       [  0,   0,   0,   0,   2,   0],
       [  0,   0,   0,   0,   0,   0]], dtype=int64)

In [12]:
from sklearn.metrics import classification_report

print(classification_report(forest.predict(X_test),y_test))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.01      0.50      0.03         2
           5       0.42      0.57      0.48       524
           6       0.85      0.48      0.62      1921
           7       0.00      1.00      0.01         2
           8       0.00      0.00      0.00         0

    accuracy                           0.50      2449
   macro avg       0.21      0.43      0.19      2449
weighted avg       0.76      0.50      0.59      2449



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
bcf = BaggingClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
bcf.fit(X_train, y_train)

In [30]:
confusion_matrix(bcf.predict(X_test),y_test)

array([[  0,   0,   0,   0,   0,   0],
       [  0,   6,  10,   3,   1,   0],
       [  5,  45, 397, 321,  55,  18],
       [  4,  19, 310, 745, 383,  63],
       [  0,   0,   0,  16,  20,   6],
       [  0,   0,   0,   7,  12,   3]], dtype=int64)

In [31]:
print(classification_report(bcf.predict(X_test),y_test))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.09      0.30      0.13        20
           5       0.55      0.47      0.51       841
           6       0.68      0.49      0.57      1524
           7       0.04      0.48      0.08        42
           8       0.03      0.14      0.05        22

    accuracy                           0.48      2449
   macro avg       0.23      0.31      0.22      2449
weighted avg       0.62      0.48      0.53      2449



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
pred_bcf = bcf.predict(X_test)
f1_score(y_test, bcf.predict(X_test), average='weighted')

0.4239507823767768

In [33]:
ada = AdaBoostClassifier(n_estimators=1000, random_state=0)
ada.fit(X_train, y_train)

In [34]:
pred_ada = ada.predict(X_test)
f1_score(y_test, ada.predict(X_test), average='weighted')

0.4028967465999217

In [35]:
confusion_matrix(forest.predict(X_test),y_test)

array([[  0,   0,   0,   0,   0,   0],
       [  0,   1,   1,   0,   0,   0],
       [  3,  36, 300, 162,  19,   4],
       [  6,  33, 416, 930, 450,  86],
       [  0,   0,   0,   0,   2,   0],
       [  0,   0,   0,   0,   0,   0]], dtype=int64)

In [36]:
confusion_matrix(bcf.predict(X_test),y_test)

array([[  0,   0,   0,   0,   0,   0],
       [  0,   6,  10,   3,   1,   0],
       [  5,  45, 397, 321,  55,  18],
       [  4,  19, 310, 745, 383,  63],
       [  0,   0,   0,  16,  20,   6],
       [  0,   0,   0,   7,  12,   3]], dtype=int64)

In [37]:
confusion_matrix(ada.predict(X_test),y_test)

array([[  0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0],
       [  4,  35, 491, 421,  77,  13],
       [  5,  35, 226, 671, 394,  77],
       [  0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0]], dtype=int64)

In [38]:
pred_forest_tr = forest.predict_proba(X_test) 
pred_bcf_tr = bcf.predict_proba(X_test)
pred_ada_tr = ada.predict_proba(X_test)

In [39]:
pred_forest_tr.shape

(2449, 7)

In [40]:
X_all_tr = (pred_forest_tr + pred_bcf_tr + pred_ada_tr)/3

In [41]:
X_all_tr.shape

(2449, 7)

In [42]:
X_all_tr.shape

(2449, 7)

In [43]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_all_tr,y_test, test_size=0.2, random_state=42)

In [44]:
GBC = GradientBoostingClassifier()
GBC.fit(X_train1, y_train1)

In [45]:
pred_GBC = GBC.predict(X_test1)

In [46]:
confusion_matrix(pred_GBC,y_test1)

array([[  0,   0,   3,   0,   0,   0],
       [  0,   1,   0,   0,   0,   0],
       [  1,   6,  89,  45,   3,   0],
       [  0,   4,  55, 153,  41,   5],
       [  0,   0,   3,  28,  39,   5],
       [  0,   0,   1,   3,   2,   3]], dtype=int64)

In [47]:
f1_score(y_test1, pred_GBC, average='weighted')

0.5751372843442627

In [None]:
X_train

In [49]:
pred_forest_m = forest.predict_proba(X_train) 
pred_bcf_m = bcf.predict_proba(X_train)
pred_ada_m = ada.predict_proba(X_train)

In [50]:
X_all_m = (pred_forest_m + pred_bcf_m + pred_ada_m)/3

In [51]:
GBC.fit(X_all_m, y_train)

In [52]:
pred_GBC_m = GBC.predict(X_all_m)

In [53]:
confusion_matrix(pred_GBC_m, y_train)

array([[  11,    0,    0,    0,    0,    0,    0],
       [   0,   93,    0,    0,    0,    0,    0],
       [   0,    0,  740,    0,    0,    0,    0],
       [   0,    0,    0, 1106,    0,    0,    0],
       [   0,    0,    0,    0,  409,    0,    0],
       [   0,    0,    0,    0,    0,   85,    0],
       [   0,    0,    0,    0,    0,    0,    5]], dtype=int64)

In [54]:
f1_score(y_train, pred_GBC_m, average='weighted')

1.0