In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor

In [3]:
data = pd.read_csv(load_boston()['filename'], skiprows=1)

In [4]:
all_columns = [
 'CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT'
] #полный список колонок с данными для обучения

In [5]:
X=data[all_columns]
y=data['MEDV']

In [16]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.3, random_state = 40)

In [17]:
def lin_reg (X_train, X_test, y_train, y_test):
    model_lr = LinearRegression()
    model_lr.fit( X_train, y_train )
    #print ("Точность линейной регрессии", model_lr.score(X_test, y_test))
    return model_lr.score(X_test, y_test)    

In [18]:
def des_tree (X_train, X_test, y_train, y_test, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1):
    model_dtr = DecisionTreeRegressor(criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    model_dtr.fit( X_train, y_train )
    #print ("Точность дерева решений критерий 'mse'", model_dtr.score(X_test, y_test))
    return model_dtr.score(X_test, y_test)

In [19]:
lin_reg (X_train, X_test, y_train, y_test) #Точность линейной регрессии

0.7215519718844154

In [20]:
des_tree (X_train, X_test, y_train, y_test) #Точность дерева решений критерий 'mse' с параметрами по умолчанию

0.6825144853751834

In [21]:
des_tree (X_train, X_test, y_train, y_test, criterion='friedman_mse') #Точность дерева решений критерий 'friedman_mse' с параметрами по умолчанию

0.6874180121102464

In [22]:
des_tree (X_train, X_test, y_train, y_test, criterion='mae') #Точность дерева решений критерий 'mae' с параметрами по умолчанию

0.5448237999733976

In [80]:
def search_parameters (X_train, X_test, y_train, y_test):
    dtr_mse_score_t = dtr_mse_score_t1 = dtr_mse_score_t2 = des_tree (X_train, X_test, y_train, y_test)
    dtr_f_mse_score_t = dtr_f_mse_score_t1 = dtr_f_mse_score_t2 = des_tree (X_train, X_test, y_train, y_test, criterion='friedman_mse')
    dtr_mae_score_t = dtr_mae_score_t1 = dtr_mae_score_t2 = des_tree (X_train, X_test, y_train, y_test, criterion='mae')
    max_depth_mse = max_depth_mse1 = max_depth_f_mse = max_depth_f_mse1 = max_depth_mae = max_depth_mae1 = 1
    min_samples_split_mse = min_samples_split_mse1 = min_samples_split_f_mse = min_samples_split_f_mse1 = min_samples_split_mae = min_samples_split_mae1 = 2
    min_samples_leaf_mse = min_samples_leaf_f_mse = min_samples_leaf_mae = 1
    for i in range(1, 500):
        dtr_mse_score = des_tree (X_train, X_test, y_train, y_test, 'mse', i)
        if dtr_mse_score > dtr_mse_score_t:
            dtr_mse_score_t = dtr_mse_score
            max_depth_mse = i
            for j in range (2,500):
                dtr_mse_score = des_tree (X_train, X_test, y_train, y_test, 'mse', i, j)
                if dtr_mse_score > dtr_mse_score_t1:
                    dtr_mse_score_t1 = dtr_mse_score
                    min_samples_split_mse = j
                    max_depth_mse1 = i
                    for z in range (1,500):
                        dtr_mse_score = des_tree (X_train, X_test, y_train, y_test, 'mse', i, j, z)
                        if dtr_mse_score > dtr_mse_score_t2:
                            dtr_mse_score_t2 = dtr_mse_score
                            max_depth_mse1 = i
                            min_samples_split_mse1 = j
                            min_samples_leaf_mse = z
#                             print (dtr_mse_score_t2, i, j, z)
        dtr_f_mse_score = des_tree (X_train, X_test, y_train, y_test, 'friedman_mse', i)
        if dtr_f_mse_score > dtr_f_mse_score_t:
            dtr_f_mse_score_t = dtr_f_mse_score
            max_depth_f_mse = i
            for j in range (2,500):
                dtr_f_mse_score = des_tree (X_train, X_test, y_train, y_test, 'friedman_mse', i, j)
                if dtr_f_mse_score > dtr_f_mse_score_t1:
                    dtr_f_mse_score_t1 = dtr_f_mse_score
                    min_samples_split_f_mse = j
                    max_depth_f_mse1 = i
                    for z in range (1,500):
                        dtr_f_mse_score = des_tree (X_train, X_test, y_train, y_test, 'friedman_mse', i, j, z)
                        if dtr_f_mse_score > dtr_f_mse_score_t2:
                            dtr_f_mse_score_t2 = dtr_f_mse_score
                            max_depth_f_mse1 = i
                            min_samples_split_f_mse1 = j
                            min_samples_leaf_f_mse = z
#                             print ('f', dtr_f_mse_score_t2, i, j, z)
        dtr_mae_score = des_tree (X_train, X_test, y_train, y_test, 'mae', i)
        if dtr_mae_score > dtr_mae_score_t:
            dtr_mae_score_t = dtr_mae_score
            max_depth_mae = i
            for j in range (2,500):
                dtr_mae_score = des_tree (X_train, X_test, y_train, y_test, 'mae', i, j)
                if dtr_mae_score > dtr_mae_score_t1:
                    dtr_mae_score_t1 = dtr_mae_score
                    min_samples_split_mae = j
                    max_depth_mae1 = i
                    for z in range (1,500):
                        dtr_mae_score = des_tree (X_train, X_test, y_train, y_test, 'mae', i, j, z)
                        if dtr_mae_score > dtr_mae_score_t2:
                            dtr_mae_score_t2 = dtr_mae_score
                            max_depth_mae1 = i
                            min_samples_split_mae1 = j
                            min_samples_leaf_mae = z
#                             print ('f', dtr_mae_score_t2, i, j, z)
    if dtr_mse_score_t2 > dtr_f_mse_score_t2 and dtr_mse_score_t2 > dtr_mae_score_t2:
        return "mse", dtr_mse_score_t2, max_depth_mse1, min_samples_split_mse1, min_samples_leaf_mse
    elif dtr_f_mse_score_t2 > dtr_mse_score_t2 and dtr_f_mse_score_t2 > dtr_mae_score_t2:
        print (dtr_mse_score_t2, dtr_f_mse_score_t2, dtr_mae_score_t2)
        return "friedman_mse", dtr_f_mse_score_t2, max_depth_f_mse1, min_samples_split_f_mse1, min_samples_leaf_f_mse
    else:
        return "mae", dtr_mae_score_t2, max_depth_mae1, min_samples_split_mae1, min_samples_leaf_mae

In [81]:
criterion, result, depth, sample_split, sample_leaf = search_parameters (X_train, X_test, y_train, y_test)

0.7676984124330649 0.7767601499325 0.6679686510066037


In [82]:
print ("Наилучший результат", result, "достигается при использовании метода", criterion, ", максимальной глубине дерева равной", depth, ", минимальном количестве образцов необходимых для разделения узла равных", sample_split, "и минимальном количестве образцов необходимых для нахождения в узле равном", sample_leaf)

Наилучший результат 0.7767601499325 достигается при использовании метода friedman_mse , максимальной глубине дерева равной 13 , минимальном количестве образцов необходимых для разделения узла равных 2 и минимальном количестве образцов необходимых для нахождения в узле равном 3


In [83]:
des_tree (X_train, X_test, y_train, y_test, criterion, depth, sample_split, sample_leaf)

0.7660541717437479

In [84]:
des_tree (X_train, X_test, y_train, y_test, 'mse', 6, 5, 3)

0.7767601499325