In [1]:
import pandas as pd
import numpy as np
import sys

from imblearn.under_sampling import NearMiss
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.metrics import f1_score, classification_report

sys.path.append("..")

from models.decisiontrees import MyDecisionTree

# Подготовка данных

In [2]:
pd.set_option('display.max_columns', 30)

In [3]:
data_class = pd.read_csv('../data/rainAUS_prepared.csv')
data_class

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,ord__Location,ord__WindDir9am,ord__WindGustDir,ord__WindDir3pm,ord__RainToday,RainTomorrow
0,12,13.4,22.9,0.6,4.0,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,7.0,16.9,21.8,2.0,13.0,13.0,14.0,0.0,0
1,12,7.4,25.1,0.0,4.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,7.0,7.0,17.2,24.3,2.0,6.0,14.0,15.0,0.0,0
2,12,12.9,25.7,0.0,4.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,7.0,2.0,21.0,23.2,2.0,13.0,15.0,15.0,0.0,0
3,12,9.2,28.0,0.0,4.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,7.0,7.0,18.1,26.5,2.0,9.0,4.0,0.0,0.0,0
4,12,17.5,32.3,1.0,4.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,2.0,1.0,13.0,7.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,6,3.5,21.8,0.0,4.0,31.0,15.0,13.0,59.0,27.0,1024.7,1021.2,7.0,7.0,9.4,20.9,41.0,2.0,0.0,0.0,0.0,0
142189,6,2.8,23.4,0.0,4.0,31.0,13.0,11.0,51.0,24.0,1024.6,1020.3,7.0,7.0,10.1,22.4,41.0,9.0,0.0,1.0,0.0,0
142190,6,3.6,25.3,0.0,4.0,22.0,13.0,9.0,56.0,21.0,1023.5,1019.1,7.0,7.0,10.9,24.5,41.0,9.0,6.0,3.0,0.0,0
142191,6,5.4,26.9,0.0,4.0,37.0,9.0,9.0,53.0,24.0,1021.0,1016.8,7.0,7.0,12.5,26.1,41.0,9.0,3.0,14.0,0.0,0


In [4]:
data_reg_white = pd.read_csv('../data/winequality-white.csv', sep = ';')
data_reg_red = pd.read_csv('../data/winequality-red.csv', sep = ';')

In [5]:
data_reg_white['color'] = 0
data_reg_red['color'] = 1

In [6]:
data_reg = pd.concat([data_reg_white, data_reg_red], axis = 0)
data_reg.reset_index()
data_reg

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,0
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,0
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,0
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,1


# Разделение выборок

In [7]:
y_cl = data_class['RainTomorrow'].to_numpy()
X_cl = data_class.drop(['RainTomorrow'], axis=1).to_numpy()

In [8]:
nm = NearMiss()
X_train_cl, X_test_cl, y_train_cl, y_test_cl = train_test_split(X_cl, y_cl, test_size=0.2)
X_train_cl, y_train_cl = nm.fit_resample(X_train_cl, y_train_cl.ravel())

In [9]:
y_reg = data_reg['quality'].to_numpy()
X_reg = data_reg.drop(['quality'], axis=1).to_numpy()
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, stratify = y_reg)

# Построение деревьев

In [11]:
def show_class(y_test, predict):
    print(classification_report(y_test, predict))    

In [12]:
def show_reg(y_test, y_pred):
    MAE = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    MAPE = mean_absolute_percentage_error(y_test, y_pred)
    R2 = r2_score(y_test, y_pred)
    
    return {'MAE' : MAE, 'MSE' : MSE, 'MAPE': MAPE, 'R2' : R2}

## Тест собственной модели

## Классификация

In [13]:
model = MyDecisionTree(metric = 'gini', max_depth = 4, min_samples_split = 15)

In [14]:
model.fit(X_train_cl, y_train_cl)

{'index': 9, 'value': 76.0, 'metric': 0.44001698750602836, 'left_child': {'index': 9, 'value': 76.0, 'metric': 0.44001698750602836, 'left_child': {'index': 9, 'value': 76.0, 'metric': 0.44001698750602836, 'left_child': {'index': 9, 'value': 76.0, 'metric': 0.44001698750602836, 'left_child': {'predict': 0}, 'right_child': {'predict': 1}}, 'right_child': {'predict': 1}}, 'right_child': {'predict': 1}}, 'right_child': {'predict': 1}}


In [15]:
y_pred = model.predict(X_test_cl)

In [17]:
show_class(y_test_cl, y_pred)

              precision    recall  f1-score   support

           0       0.84      0.95      0.89     22057
           1       0.68      0.38      0.49      6382

    accuracy                           0.82     28439
   macro avg       0.76      0.67      0.69     28439
weighted avg       0.81      0.82      0.80     28439



## Регрессия

In [34]:
model = MyDecisionTree(metric = 'mse', max_depth = 4, min_samples_split = 2)

In [35]:
model.fit(X_train_reg, y_train_reg)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


{'index': 10, 'value': 10.9, 'metric': 0.6337291260509728, 'left_child': {'index': 10, 'value': 10.9, 'metric': 0.6337291260509728, 'left_child': {'index': 10, 'value': 10.9, 'metric': 0.6337291260509728, 'left_child': {'index': 10, 'value': 10.9, 'metric': 0.6337291260509728, 'left_child': {'predict': 5.549443943492636}, 'right_child': {'predict': 6.296256684491978}}, 'right_child': {'predict': 6.296256684491978}}, 'right_child': {'predict': 6.296256684491978}}, 'right_child': {'predict': 6.296256684491978}}


In [36]:
y_pred = model.predict(X_test_reg)

In [37]:
show_reg(y_test_reg, y_pred)

{'MAE': 0.674149903739359,
 'MSE': 0.6645957771585177,
 'MAPE': 0.12016806996200462,
 'R2': 0.12951358167995597}