In [33]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [34]:
# Load the red-wine data and binarise the target:
# quality <= 5 becomes class 0, quality > 5 becomes class 1.
df = pd.read_csv('winequality-red.csv')
df['quality'] = (df['quality'] > 5).astype(int)

# Remove the feature columns that are not used by the model.
df = df.drop(columns=['citric acid', 'free sulfur dioxide', 'fixed acidity'])

# Remove outlier rows: a row is dropped if it exceeds any of the three thresholds.
outlier_rows = df.index[
    (df['volatile acidity'] > 1.0)
    | (df['chlorides'] > 0.15)
    | (df['total sulfur dioxide'] > 125)
]
df = df.drop(index=outlier_rows)

target = df.pop('quality')
features = df

# Split first, then fit the scaler on the training rows only. Fitting the
# scaler on the full data set would leak the test rows' min/max into training
# and make the test metrics optimistic.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1234)
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [5]:
# Coarse search space for the first grid-search pass over the random forest.
params = dict(
    n_estimators=list(range(200, 501, 50)),
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=[None, 10, 50, 100, 200],
    min_samples_split=[2, 4, 8, 10, 20],
    min_samples_leaf=[1, 2, 5, 10, 20],
)

In [None]:
# Exhaustive 5-fold grid search over `params`.
# The forest is seeded so that re-running the cell gives the same scores and
# the same best parameters; n_jobs=-1 runs the fits on all available cores,
# because the grid contains thousands of candidate fits.
gsc = GridSearchCV(
    RandomForestClassifier(random_state=1234),
    param_grid=params,
    verbose=3,
    cv=5,
    n_jobs=-1,
)
gsc.fit(x_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination
# (the classifier's default score, since no `scoring` is passed to the search).
gsc.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
gsc.best_params_

In [None]:
# Per-class precision / recall / F1 on the held-out test split, using the
# predictions of the fitted grid search.
print(
    classification_report(
        y_true=y_test,
        y_pred=gsc.predict(x_test),
    )
)

In [None]:
# Narrower search space for the second pass: the criterion is fixed to
# 'log_loss' and the remaining ranges are tightened.
params = dict(
    n_estimators=list(range(410, 491, 10)),
    criterion=['log_loss'],
    max_depth=[None, 300, 400],
    min_samples_split=[4, 5, 6, 7],
    min_samples_leaf=[2, 3, 4],
)

In [None]:
# Second-pass 5-fold grid search over the narrowed `params`.
# The forest is seeded so that the search is reproducible; n_jobs=-1 runs the
# candidate fits on all available cores.
gsc = GridSearchCV(
    RandomForestClassifier(random_state=1234),
    param_grid=params,
    verbose=3,
    cv=5,
    n_jobs=-1,
)
gsc.fit(x_train, y_train)

In [None]:
# Mean cross-validated score of the best combination from the second pass.
gsc.best_score_

In [None]:
# Best parameter combination from the second pass.
gsc.best_params_

In [None]:
# Test-split classification report for the second-pass search.
print(
    classification_report(
        y_true=y_test,
        y_pred=gsc.predict(x_test),
    )
)

In [None]:
# Third, finest pass: every n_estimators value from 441 to 459 inclusive.
# NOTE(review): score differences between adjacent n_estimators values are
# likely within cross-validation noise - confirm before relying on the exact optimum.
params = dict(
    n_estimators=list(range(441, 460)),
    criterion=['log_loss'],
    max_depth=[None, 500, 600],
    min_samples_split=[4, 5],
    min_samples_leaf=[2],
)

In [None]:
# Final 5-fold grid search over the finest `params` grid.
# Seeding the forest matters most here: the candidates differ by a single
# tree, so an unseeded forest would pick a different "best" value on each run.
# n_jobs=-1 runs the candidate fits on all available cores.
gsc = GridSearchCV(
    RandomForestClassifier(random_state=1234),
    param_grid=params,
    verbose=3,
    cv=5,
    n_jobs=-1,
)
gsc.fit(x_train, y_train)

In [None]:
# Mean cross-validated score of the best combination from the final pass.
gsc.best_score_

In [None]:
# Best parameter combination from the final pass.
gsc.best_params_

In [None]:
# Test-split classification report for the final search.
print(
    classification_report(
        y_true=y_test,
        y_pred=gsc.predict(x_test),
    )
)

So the best model is a **Random Forest** with **442 estimators**, **log_loss** as the split criterion, a **minimum of 2 samples per leaf**, and a **minimum of 4 samples required to split a node**.

Now test this model on the multi-class problem, where every quality value is its own class.

In [24]:
# Reload the data and keep every quality value as its own class.
# The labels are shifted down by 3 (so quality 5 becomes class 2, quality 6 becomes class 3).
df = pd.read_csv('winequality-red.csv')
df['quality'] = df['quality'] - 3

# Remove the feature columns that are not used by the model.
df = df.drop(columns=['citric acid', 'free sulfur dioxide', 'fixed acidity'])

# Remove outlier rows: a row is dropped if it exceeds any of the three thresholds.
outlier_rows = df.index[
    (df['volatile acidity'] > 1.0)
    | (df['chlorides'] > 0.15)
    | (df['total sulfur dioxide'] > 125)
]
df = df.drop(index=outlier_rows)

target = df.pop('quality')
features = df

# Split first, then fit the scaler on the training rows only. Fitting the
# scaler on the full data set would leak the test rows' min/max into training
# and make the test metrics optimistic.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1234)
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Train the forest on the multi-class target with the hyper-parameters chosen
# by the grid search. The seed makes the reported metrics reproducible;
# n_jobs=-1 builds the trees on all available cores.
rfc = RandomForestClassifier(
    n_estimators=442,
    criterion='log_loss',
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=1234,
    n_jobs=-1,
)
rfc.fit(x_train, y_train)

In [None]:
# Per-class precision / recall / F1 of the multi-class forest on the test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=rfc.predict(x_test),
    )
)

The result is about **15 percentage points** lower than for the 2-class model. Even if the accuracy isn't much worse, the model performs poorly because it is **biased towards classes 2 (quality 5) and 3 (quality 6)**. This is caused by the overwhelming number of samples with those quality values. Another problem is that there are too few samples in the other quality categories, so I don't think reducing the number of samples in classes 2 and 3 would help.

Problems with dataset:
- Massive class imbalance in the target data - 2 classes have vastly more samples than the other classes. What's more, the quality values 1, 2, 9 and 10 don't appear in the dataset at all.
- There are no outstanding features that clearly help to predict the quality.