In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, max_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the red and white wine tables (both files are ';'-separated).
red, white = (pd.read_csv(path, sep=';') for path in ('winequality-red.csv', 'winequality-white.csv'))

# Stack them into one frame and renumber the rows from 0.
wine = pd.concat([red, white], ignore_index=True)

In [17]:
# Preview the combined red + white table.
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [4]:
# Seed shared by the oversamplers, the train/test split and the model.
RANDOM_STATE = 42

In [5]:
def make_ordinal_scorer(scorer, for_cv=True, greater_is_better=True, min_score=0, max_score=10):
    """Wrap a metric so it scores predictions snapped to an integer scale.

    Before ``scorer`` is called, predictions are rounded and then clipped to
    ``[min_score, max_score]``, so a regressor is judged on the ordinal label
    it would effectively output rather than on the raw continuous value.

    Parameters
    ----------
    scorer : callable
        Metric with the signature ``scorer(y_true, y_pred)``.
    for_cv : bool, default True
        If True, return the metric wrapped with ``make_scorer``;
        if False, return the plain ``(y_true, y_pred)`` function.
    greater_is_better : bool, default True
        Forwarded to ``make_scorer``; unused when ``for_cv`` is False.
    min_score, max_score : number, default 0 and 10
        Inclusive bounds of the label scale.

    Returns
    -------
    callable
        Either the ``make_scorer`` result or the plain metric function.

    Raises
    ------
    ValueError
        If ``min_score`` is greater than ``max_score``.
    """
    if min_score > max_score:
        raise ValueError(f'min_score ({min_score}) must not exceed max_score ({max_score})')

    def my_scorer(y_true, y_pred):
        # round() and clip() both return new objects, so the caller's
        # predictions are left unmodified.
        y_rounded = y_pred.round().clip(min_score, max_score)
        return scorer(y_true, y_rounded)

    if for_cv:
        return make_scorer(my_scorer, greater_is_better=greater_is_better)
    return my_scorer


In [6]:
def score_model(model: RandomForestRegressor, X_train, X_test, y_train, y_test) -> pd.DataFrame:
    """Fit ``model`` on oversampled data and tabulate ordinal error metrics.

    The model is fitted twice, each time once for all metrics:

    1. on the randomly oversampled training split, then scored on the
       original (not resampled) train and test splits;
    2. on the randomly oversampled union of train and test, then scored on
       that same union (the 'All' row is therefore an in-sample error).

    Parameters
    ----------
    model : RandomForestRegressor
        Estimator to fit. It is fitted in place and is left fitted on the
        oversampled full dataset when the function returns.
    X_train, X_test : pandas.DataFrame
        Feature splits.
    y_train, y_test : pandas.Series
        Target splits.

    Returns
    -------
    pandas.DataFrame
        Rows 'Train', 'Test', 'All'; columns 'Mean absolute error',
        'Mean squared error', 'Max error'.
    """
    X = pd.concat([X_train, X_test])
    y = pd.concat([y_train, y_test])

    # Column label -> metric applied to rounded-and-clipped predictions.
    metrics = {
        'Mean absolute error': make_ordinal_scorer(mean_absolute_error, for_cv=False),
        'Mean squared error': make_ordinal_scorer(mean_squared_error, for_cv=False),
        'Max error': make_ordinal_scorer(max_error, for_cv=False),
    }

    # Fit once on the balanced training split and reuse the predictions for
    # every metric instead of refitting inside the metric loop.
    X_train_resampled, y_train_resampled = RandomOverSampler(random_state=RANDOM_STATE).fit_resample(X_train, y_train)
    model.fit(X_train_resampled, y_train_resampled)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Second fit on the balanced full dataset; this is the state the model
    # is left in for the caller.
    X_resampled, y_resampled = RandomOverSampler(random_state=RANDOM_STATE).fit_resample(X, y)
    model.fit(X_resampled, y_resampled)
    all_pred = model.predict(X)

    rows = {
        'Train': [metric(y_train, train_pred) for metric in metrics.values()],
        'Test': [metric(y_test, test_pred) for metric in metrics.values()],
        'All': [metric(y, all_pred) for metric in metrics.values()],
    }
    return pd.DataFrame.from_dict(rows, orient='index', columns=list(metrics))

## Препроцессинг 

### Проверяем наличие пропусков (NaN)

In [7]:
# Total number of missing cells across all columns of `wine`.
nan_total = wine.isna().sum().sum()
print(f'Количество nan: {nan_total}')

Количество nan: 0


### Создаём X и y

In [8]:
# Target column(s); every other column of `wine` is treated as a feature.
y_columns = ['quality']
X_columns = [name for name in wine.columns if name not in y_columns]

X = wine[X_columns]
# y stays a one-column DataFrame; its values are rounded.
y = wine[y_columns].round()

### Проверяем соотношения классов

In [9]:
# Share of each quality score in the dataset, largest first.
quality_shares = y['quality'].value_counts(normalize=True)
quality_shares.sort_values(ascending=False)

### Генерируем X_train, X_test, y_train, y_test 

In [10]:
# Split features and target together, stratifying on the quality score.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y['quality'],
    stratify=y['quality'],
    random_state=RANDOM_STATE,
)

## Поиск параметров

In [11]:
class BalancedStratifiedKFold:
    """Stratified K-fold splitter whose folds are balanced by random oversampling.

    Each fold produced by ``StratifiedKFold`` is passed through
    ``RandomOverSampler``, separately for the train part and the test part,
    and the positional indices of the resampled rows are yielded. Indices
    therefore repeat for oversampled rows.

    NOTE(review): the test part is oversampled as well, so validation scores
    are computed on a class-balanced fold rather than on the natural class
    distribution. This matches the previous behaviour; confirm it is intended.

    Parameters
    ----------
    n_splits : int, default 4
        Number of folds, forwarded to ``StratifiedKFold``.
    random_state : int, RandomState instance or None, default None
        Forwarded to every ``RandomOverSampler``. The underlying
        ``StratifiedKFold`` is created without shuffling and without a seed.
    """

    def __init__(self, n_splits=4, random_state=None):
        self.n_splits = n_splits
        self.random_state = random_state

    def split(self, X, y, groups=None):
        """Yield ``(train_indices, test_indices)`` pairs of positional indices.

        Parameters
        ----------
        X : array-like
            Feature matrix; passed to ``StratifiedKFold.split``.
        y : array-like
            Class labels (Series, one-column DataFrame, ndarray or list).
        groups : ignored
            Accepted for compatibility with the splitter call signature.
        """
        # Positional label array, independent of any pandas index on `y`.
        labels = y.to_numpy() if hasattr(y, 'to_numpy') else pd.Series(y).to_numpy()
        labels = labels.ravel()

        for train_idx, test_idx in StratifiedKFold(n_splits=self.n_splits).split(X, y):
            # Train part first, then test part (same sampler order as before).
            yield self._oversample(train_idx, labels), self._oversample(test_idx, labels)

    def _oversample(self, fold_idx, labels):
        """Return ``fold_idx`` extended with repeated indices so classes are balanced."""
        sampler = RandomOverSampler(random_state=self.random_state)
        # Only the index column is resampled: there is no need to copy the
        # feature matrix (or add a helper column to it) to learn which rows
        # the sampler repeats.
        resampled_idx, _ = sampler.fit_resample(fold_idx.reshape(-1, 1), labels[fold_idx])
        return resampled_idx.ravel()

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; all arguments are ignored."""
        return self.n_splits

In [12]:
# parameters = {'n_estimators': list(range(30, 180, 10)), 'min_samples_leaf': [1, 5, 10]}
# clf = GridSearchCV(RandomForestRegressor(random_state=RANDOM_STATE, max_features='sqrt'), parameters,
#                    cv=BalancedStratifiedKFold(random_state=RANDOM_STATE), verbose=100,
#                    scoring=make_ordinal_scorer(mean_squared_error, greater_is_better=False), n_jobs=6)
# clf.fit(X_train, y_train)

# clf.cv_results_

## Результаты

In [13]:
# Random forest with fixed hyperparameters and a fixed seed.
model = RandomForestRegressor(
    n_estimators=50,
    min_samples_leaf=7,
    max_features='sqrt',
    random_state=RANDOM_STATE,
)

In [14]:
# Ordinal error metrics for the train split, the test split and the full dataset.
score_model(model, X_train, X_test, y_train, y_test)

Unnamed: 0,Mean absolute error,Mean squared error,Max error
Train,0.207307,0.208539,2.0
Test,0.417846,0.475692,3.0
All,0.195783,0.19763,2.0


### Предсказания для тестовых вин с оценкой 9

In [15]:
# Fit on an oversampled copy of the training split. Separate names keep
# X_train / y_train untouched, so this cell (and earlier ones) can be re-run.
X_train_balanced, y_train_balanced = RandomOverSampler(random_state=RANDOM_STATE).fit_resample(X_train, y_train)
model.fit(X_train_balanced, y_train_balanced)

# Pick the test wines rated 9 from the labels instead of a hard-coded row label.
nines = y_test[y_test == 9].index.tolist()
model.predict(X_test[X_test.index.isin(nines)]).round()

### Предсказания для тестовых вин с оценкой 8

In [16]:
# Row labels of the test wines rated 8, derived from y_test so the list
# cannot go stale if the split changes.
eights = y_test[y_test == 8].index.tolist()
# Uses `model` as fitted in the preceding cell.
model.predict(X_test[X_test.index.isin(eights)]).round()