In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight


In [9]:
data = pd.read_csv('../../DATA/scaled_data.csv')
x = data.drop('quality', axis=1)
y = data['quality']
lab = LabelEncoder()
y = lab.fit_transform(y)
i_train, i_test = list(StratifiedShuffleSplit(
    n_splits=1, test_size=0.25, random_state=7).split(x, y))[0]
x_train, y_train, x_test, y_test = x.iloc[i_train], y[i_train], x.iloc[i_test], y[i_test]


In [10]:
cw = class_weight.compute_class_weight(class_weight='balanced',
                                       classes=pd.unique(data['quality']),
                                       y=data['quality'])
cw.shape


(7,)

In [15]:
cw = dict(zip(np.unique(y), cw))

In [17]:
rparams = {'criterion': ['poisson', 'friedman_mse', 'absolute_error', 'squared_error'],
          'max_depth': np.arange(2, 20)}
cparams = {'criterion': ['log_loss', 'entropy', 'gini'],
          'max_depth': np.arange(2, 20)}


In [16]:
rtree = GridSearchCV(DecisionTreeRegressor(),
                                  rparams,
                                  n_jobs=4,
                                  scoring='r2'
                                  ).fit(x_train, y_train)


In [17]:
rp = rtree.best_estimator_.predict(x_test)


In [18]:
print(r2_score(y_test, rp), mean_squared_error(y_test, rp))

0.2593313281660258 0.015691431521002976


In [18]:
ctree = GridSearchCV(DecisionTreeClassifier(class_weight=cw),
                    cparams,
                    n_jobs=4,
                    scoring='f1_micro'
                    ).fit(x_train, y_train)




In [20]:
cp = ctree.best_estimator_.predict(x_test)


In [24]:
print(classification_report(y_test, cp))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.24      0.17      0.20        54
           2       0.61      0.71      0.65       535
           3       0.65      0.57      0.61       709
           4       0.53      0.58      0.55       270
           5       0.41      0.35      0.38        48
           6       0.00      0.00      0.00         1

    accuracy                           0.59      1625
   macro avg       0.35      0.34      0.34      1625
weighted avg       0.59      0.59      0.59      1625

