In [9]:
import time
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor

In [10]:
# Load the preprocessed dataset from a CSV file (path is relative to the
# notebook's working directory).
dataset_path = 'readyDatasets/preprocessed_dataframe.csv'
clear_df = pd.read_csv(dataset_path)

In [11]:
# Separate the regression target ('УСЬОГО', Ukrainian for "total") from the
# remaining columns, which are used as features.
target_column = 'УСЬОГО'
y = clear_df[target_column]
X = clear_df.drop(columns=target_column)

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=13,
)

In [12]:
# Baseline: a decision tree regressor with default hyper-parameters.
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ from run to run when several splits tie. The seed is fixed
# (same value as the train/test split) to make the baseline scores
# reproducible. This changes the fitted tree relative to earlier unseeded
# runs, so re-run the cell to refresh its saved output.
reg_decision_model = DecisionTreeRegressor(random_state=13)
reg_decision_model.fit(X_train, y_train)

# Predict the target values of the held-out test set.
y_pred = reg_decision_model.predict(X_test)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("R2 score:", r2)

MSE: 156.003337259521
R2 score: 0.8994618668929035


In [13]:
# Hyper-parameter grid for an exhaustive, 5-fold cross-validated search over
# DecisionTreeRegressor settings.
#
# min_weight_fraction_leaf: scikit-learn only accepts values in [0.0, 0.5];
# anything larger makes the fit fail. The default 0.0 is included so the
# search also covers trees that are not limited by a per-leaf weight fraction
# (with >= 0.1 every leaf must hold at least 10% of the samples, which caps
# the tree at about ten leaves and makes the other knobs mostly irrelevant).
#
# max_features: the integer options (15, 20, 22) assume the dataset has at
# least 22 feature columns -- TODO confirm against X.shape[1].
#
# NOTE: this is a very large grid (2 * 14 * 19 * 6 * 6 * 7 = 134,064
# candidates, each fitted 5 times), so the search is spread across all CPU
# cores with n_jobs=-1.
parameters = {"splitter": ["best", "random"],
              "max_depth": np.arange(2, 30, 2),
              "min_samples_leaf": np.arange(1, 20),
              "min_weight_fraction_leaf": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
              "max_features": ["log2", "sqrt", 15, 20, 22, None],
              "max_leaf_nodes": [None, 50, 80, 90, 100, 150, 200]}

# The estimator is seeded (same value as the train/test split) because
# splitter="random" and feature sub-sampling are stochastic; without a seed
# the selected best parameters could change between runs.
tuning_model = GridSearchCV(
    DecisionTreeRegressor(random_state=13),
    parameters,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Silence FutureWarning messages for the rest of the session so they do not
# flood the output while the grid search runs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Run the grid search on the training split and time it.
start_time = time.time()
tuning_model.fit(X_train, y_train)
elapsed_seconds = time.time() - start_time

# Report the elapsed time, the winning parameter combination and its mean
# cross-validated score.
print('computation time: %.2f' % elapsed_seconds)
print(tuning_model.best_params_)
print(tuning_model.best_score_)

Testing an alternative grid-search setup taken from another website.

In [None]:
# Second, smaller parameter space: split criterion, tree depth, minimum
# samples required to split a node, and a cap on the number of leaves.
parameters = {'criterion': ['squared_error', 'absolute_error'],
              'max_depth': np.arange(1, 21, 2),
              'min_samples_split': np.arange(2, 11, 2),
              'max_leaf_nodes': np.arange(3, 26, 2)}

# Create the grid search object (5-fold cross-validation).
# - random_state: scikit-learn permutes features at every split even with the
#   default splitter, so the estimator is seeded (same value as the
#   train/test split) to keep best_params_ reproducible.
# - n_jobs=-1: candidates are independent, so fit them on all CPU cores.
g2 = GridSearchCV(
    DecisionTreeRegressor(random_state=13),
    parameters,
    cv=5,
    n_jobs=-1,
)

# Conduct the grid search over the parameter space. perf_counter() is a
# monotonic clock meant for measuring elapsed time, unlike time.time(), which
# can jump if the system clock is adjusted.
start_time = time.perf_counter()
g2.fit(X_train, y_train)
duration = time.perf_counter() - start_time

# Show the best parameter configuration found for the regressor.
g2.best_params_

In [None]:
# Evaluate the best estimator found by the grid search `g2` on the held-out
# test set. mean_absolute_error is imported with the other dependencies in
# the notebook's top import cell.
model = g2.best_estimator_
y_pred = model.predict(X_test)

# Report R2 alongside MSE/MAE so the tuned tree can be compared directly with
# the baseline cell, which prints MSE and R2.
print('mse score: %.2f' % mean_squared_error(y_test, y_pred))
print('mae score: %.2f' % mean_absolute_error(y_test, y_pred))
print('r2 score: %.4f' % r2_score(y_test, y_pred))
print('computation time: %.2f' % duration)