In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv('winequality-red.csv')
df['quality'] = df['quality'].transform(lambda x: 0 if x <= 5 else 1)

df.pop('citric acid')
df.pop('free sulfur dioxide')
df.pop('fixed acidity')

df = df.drop(df[(df['volatile acidity'] > 1.0)].index)
df = df.drop(df[(df['chlorides'] > 0.15)].index)
df = df.drop(df[(df['total sulfur dioxide'] > 125)].index)

target = df.pop('quality')
features = df

features = MinMaxScaler().fit_transform(features)
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1234)

In [4]:
from sklearn.model_selection import GridSearchCV

In [5]:
params = {
    'n_estimators': [200, 250, 300, 350, 400, 450, 500],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None, 10, 50, 100, 200],
    'min_samples_split': [2, 4, 8, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10, 20]
}

In [6]:
gsc = GridSearchCV(RandomForestClassifier(), param_grid=params, verbose=3, cv=5)

In [7]:
gsc.fit(x_train, y_train)

Fitting 5 folds for each of 2625 candidates, totalling 13125 fits
[CV 1/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.834 total time=   0.2s
[CV 2/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.779 total time=   0.2s
[CV 3/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.799 total time=   0.2s
[CV 4/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.838 total time=   0.2s
[CV 5/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.769 total time=   0.2s
[CV 1/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=250;, score=0.821 total time=   0.3s
[CV 2/5] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=250;, score=0.804 t

In [8]:
gsc.best_score_

0.8148499727223133

In [9]:
gsc.best_params_

{'criterion': 'log_loss',
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 450}

In [10]:
print(classification_report(y_test, gsc.predict(x_test)))

              precision    recall  f1-score   support

           0       0.85      0.81      0.83       130
           1       0.85      0.88      0.87       164

    accuracy                           0.85       294
   macro avg       0.85      0.85      0.85       294
weighted avg       0.85      0.85      0.85       294



In [16]:
params = {
    'n_estimators': [410, 420, 430, 440, 450, 460, 470, 480, 490],
    'criterion': ['log_loss'],
    'max_depth': [None, 300, 400],
    'min_samples_split': [4, 5, 6, 7],
    'min_samples_leaf': [2, 3, 4]
}
gsc = GridSearchCV(RandomForestClassifier(), param_grid=params, verbose=3, cv=5)
gsc.fit(x_train, y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV 1/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=410;, score=0.826 total time=   0.5s
[CV 2/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=410;, score=0.791 total time=   0.4s
[CV 3/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=410;, score=0.808 total time=   0.4s
[CV 4/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=410;, score=0.833 total time=   0.4s
[CV 5/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=410;, score=0.774 total time=   0.5s
[CV 1/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=420;, score=0.834 total time=   0.4s
[CV 2/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_esti

In [17]:
gsc.best_score_

0.8114420803782506

In [18]:
gsc.best_params_

{'criterion': 'log_loss',
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 450}

In [19]:
print(classification_report(y_test, gsc.predict(x_test)))

              precision    recall  f1-score   support

           0       0.84      0.78      0.81       130
           1       0.84      0.88      0.86       164

    accuracy                           0.84       294
   macro avg       0.84      0.83      0.83       294
weighted avg       0.84      0.84      0.84       294



In [20]:
params = {
    'n_estimators': [441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459],
    'criterion': ['log_loss'],
    'max_depth': [None, 500, 600],
    'min_samples_split': [4, 5],
    'min_samples_leaf': [2]
}

gsc = GridSearchCV(RandomForestClassifier(), param_grid=params, verbose=3, cv=5)
gsc.fit(x_train, y_train)

Fitting 5 folds for each of 114 candidates, totalling 570 fits
[CV 1/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=441;, score=0.830 total time=   0.5s
[CV 2/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=441;, score=0.787 total time=   0.5s
[CV 3/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=441;, score=0.812 total time=   0.5s
[CV 4/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=441;, score=0.838 total time=   0.5s
[CV 5/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=441;, score=0.778 total time=   0.5s
[CV 1/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estimators=442;, score=0.838 total time=   0.5s
[CV 2/5] END criterion=log_loss, max_depth=None, min_samples_leaf=2, min_samples_split=4, n_estim

In [21]:
gsc.best_score_

0.8131369339879979

In [22]:
gsc.best_params_

{'criterion': 'log_loss',
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 442}

In [23]:
print(classification_report(y_test, gsc.predict(x_test)))

              precision    recall  f1-score   support

           0       0.85      0.80      0.83       130
           1       0.85      0.89      0.87       164

    accuracy                           0.85       294
   macro avg       0.85      0.85      0.85       294
weighted avg       0.85      0.85      0.85       294



So the best model is Random Forest with 442 estimator, log_loss as split criterion, with 2 minimal samples in leaf and with 4 minimal samples in split.

Now test this model for every class.

In [24]:
df = pd.read_csv('winequality-red.csv')
df['quality'] = df['quality'].transform(lambda x: x-3)

df.pop('citric acid')
df.pop('free sulfur dioxide')
df.pop('fixed acidity')

df = df.drop(df[(df['volatile acidity'] > 1.0)].index)
df = df.drop(df[(df['chlorides'] > 0.15)].index)
df = df.drop(df[(df['total sulfur dioxide'] > 125)].index)

target = df.pop('quality')
features = df

features = MinMaxScaler().fit_transform(features)
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1234)

In [29]:
rfc = RandomForestClassifier(n_estimators=442, criterion='log_loss', min_samples_split=4, min_samples_leaf=2)

In [30]:
rfc.fit(x_train, y_train)

In [32]:
print(classification_report(y_test, rfc.predict(x_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00        14
           2       0.73      0.81      0.77       114
           3       0.68      0.77      0.72       119
           4       0.64      0.50      0.56        42
           5       0.00      0.00      0.00         3

    accuracy                           0.70       294
   macro avg       0.34      0.35      0.34       294
weighted avg       0.65      0.70      0.67       294



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Result is about 15% points smaller than model for 2 classes. The model works the best for 2 (5) and 3 (6), which shows huge bias in dataset.

Problems with dataset:
- Massive bias in target data - 2 classes have colossal difference to another classes. What's more 1, 2, 9 and 10 don't exist in dataset.
- Any outstanding features which can help to predict.