In [51]:
%clear

[H[2J

# Sekcja Importowania
Importujemy potrzebne biblioteki do załadowania danych i przeprowadzenia predykcji na modelu. Dodatkowo importowane są funkcje straty zaimportowane w pliku losses.py

In [52]:
import pandas as pd
from losses import rmse_loss, mse_loss
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# Ładowanie danych

In [53]:
car_dataset = pd.read_csv("../data/Carseats.csv")
car_dataset = pd.get_dummies(car_dataset, columns=["ShelveLoc", "Urban", "US"])
car_dataset

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Bad,ShelveLoc_Good,ShelveLoc_Medium,Urban_No,Urban_Yes,US_No,US_Yes
0,9.50,138,73,11,276,120,42,17,True,False,False,False,True,False,True
1,11.22,111,48,16,260,83,65,10,False,True,False,False,True,False,True
2,10.06,113,35,10,269,80,59,12,False,False,True,False,True,False,True
3,7.40,117,100,4,466,97,55,14,False,False,True,False,True,False,True
4,4.15,141,64,3,340,128,38,13,True,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,33,14,False,True,False,False,True,False,True
396,6.14,139,23,3,37,120,55,11,False,False,True,True,False,False,True
397,7.41,162,26,12,368,159,40,18,False,False,True,False,True,False,True
398,5.94,100,79,7,284,95,50,12,True,False,False,False,True,False,True


# Podział na zbiór testowy i treningowy

In [54]:
from sklearn.model_selection import train_test_split
X = car_dataset.copy()
X = X.drop(columns=["Sales"])
y = car_dataset["Sales"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Inicjalizacja modelu

In [55]:
regressor = DecisionTreeRegressor(
    criterion="squared_error",
    splitter="best",
    max_depth=100,
    min_samples_split=10,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.02,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    ccp_alpha=0.1,
    monotonic_cst=None,
)
regressor = regressor.fit(X_train, y_train)

# Wyniki dla modelu przed dokładnym doborem parametrów

In [56]:
score = regressor.score(X_test, y_test)
print(f"R^2 score: {score:.9f}")

R^2 score: 0.406122260


In [57]:
y_pred = regressor.predict(X_test)
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})

In [58]:
print(f"MSE loss: {mse_loss(y_test, y_pred)}")

MSE loss: 4.832593963218457


In [59]:
print(f"RMSE loss: {rmse_loss(y_test, y_pred)}")

RMSE loss: 2.1983161654362773


In [60]:
results

Unnamed: 0,Actual,Predicted
361,8.68,10.038889
55,6.85,6.530588
332,5.74,10.038889
160,4.67,5.873000
44,4.16,9.289375
...,...,...
267,5.83,4.619556
24,10.14,4.619556
343,5.99,4.619556
295,4.21,6.530588


# Funkcja służąca do stworzenia wykresów pokazujących zmiany metryk w trakcie trenowania

In [61]:
def create_plots(min_samples_split_list, r2_scores, mse_losses, rmse_losses):

    fig, axs = plt.subplots(3, figsize=(10, 15))
    axs[0].plot(min_samples_split_list, r2_scores, marker='o')
    axs[0].set_title('R^2 score')
    axs[0].set_xlabel('min_samples_split')
    axs[0].set_ylabel('R^2 score')
    axs[0].grid(True)
    axs[1].plot(min_samples_split_list, mse_losses, marker='o')
    axs[1].set_title('MSE loss')
    axs[1].set_xlabel('min_samples_split')
    axs[1].set_ylabel('MSE loss')
    axs[1].grid(True)
    axs[2].plot(min_samples_split_list, rmse_losses, marker='o')
    axs[2].set_title('RMSE loss')
    axs[2].set_xlabel('min_samples_split')
    axs[2].set_ylabel('RMSE loss')
    axs[2].grid(True)
    plt.tight_layout()
    plt.show()

# Funkcja obliczająca metryki

In [62]:
def calculate_scores(regressor, X, y):
    score = regressor.score(X, y)
    y_pred = regressor.predict(X)
    mse = mse_loss(y, y_pred)
    rmse = rmse_loss(y, y_pred)
    return score, mse, rmse

# Badanie wpływu maksymalnej głębokości drzewa

In [63]:
max_depth_list = [1, 5, 10, 15, 20, 25,  50, 100]

In [64]:
for max_depth in max_depth_list:
    print(f"=======MAX DEPTH = {max_depth}======")
    regressor.max_depth = max_depth
    regressor = regressor.fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_test, y_test)
    print(f"R^2 score: {score:.9f}")
    print(f"MSE loss: {mse}")
    print(f"RMSE loss: {rmse}")

R^2 score: 0.271251413
MSE loss: 5.930085913034385
RMSE loss: 2.4351767724406344
R^2 score: 0.404172343
MSE loss: 4.84846112883198
RMSE loss: 2.2019221441349783
R^2 score: 0.406122260
MSE loss: 4.832593963218457
RMSE loss: 2.1983161654362773
R^2 score: 0.406122260
MSE loss: 4.832593963218457
RMSE loss: 2.1983161654362773
R^2 score: 0.406122260
MSE loss: 4.832593963218456
RMSE loss: 2.198316165436277
R^2 score: 0.406122260
MSE loss: 4.832593963218457
RMSE loss: 2.1983161654362773
R^2 score: 0.406122260
MSE loss: 4.832593963218455
RMSE loss: 2.198316165436277
R^2 score: 0.406122260
MSE loss: 4.832593963218457
RMSE loss: 2.1983161654362773


In [65]:
for max_depth in max_depth_list:
    print(f"=======MAX DEPTH = {max_depth}======")
    regressor.max_depth = max_depth
    regressor = regressor.fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_train, y_train)
    print(f"R^2 score: {score:.9f}")
    print(f"MSE loss: {mse}")
    print(f"RMSE loss: {rmse}")

R^2 score: 0.239510738
MSE loss: 5.980709344207817
RMSE loss: 2.44554888403561
R^2 score: 0.651389950
MSE loss: 2.741571101162247
RMSE loss: 1.6557690361769202
R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.6282654361527773
RMSE loss: 1.621192596872061


In [66]:
max_depth = 10
regressor.max_depth = max_depth

# Badanie wyników dla różnych minimalnych ilości próbek do podziału

In [67]:
min_samples_split_list = [2, 5, 10, 15, 25, 50, 100]

In [68]:
for min_samples_split in min_samples_split_list:
    print(f"=======MIN SAMPLES  SPLIT= {min_samples_split}======")
    regressor.min_samples_split = min_samples_split
    regressor = regressor.fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_test, y_test)
    print(f"R^2 score: {score:.9f}")
    print(f"MSE loss: {mse}")
    print(f"RMSE loss: {rmse}")

R^2 score: 0.406122260
MSE loss: 4.832593963218457
RMSE loss: 2.1983161654362773
R^2 score: 0.406122260
MSE loss: 4.832593963218457
RMSE loss: 2.1983161654362773
R^2 score: 0.406122260
MSE loss: 4.832593963218457
RMSE loss: 2.1983161654362773
R^2 score: 0.406122260
MSE loss: 4.832593963218455
RMSE loss: 2.198316165436277
R^2 score: 0.406122260
MSE loss: 4.832593963218457
RMSE loss: 2.1983161654362773
R^2 score: 0.455776194
MSE loss: 4.42854227305358
RMSE loss: 2.1044101960058974
R^2 score: 0.364483047
MSE loss: 5.171427016189623
RMSE loss: 2.274077179031007


In [69]:
for min_samples_split in min_samples_split_list:
    print(f"=======MIN SAMPLES  SPLIT= {min_samples_split}======")
    regressor.min_samples_split = min_samples_split
    regressor = regressor.fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_train, y_train)
    print(f"R^2 score: {score:.9f}")
    print(f"MSE loss: {mse}")
    print(f"RMSE loss: {rmse}")

R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.6282654361527773
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.577801538
MSE loss: 3.3202918350201274
RMSE loss: 1.8221667967066373
R^2 score: 0.431604387
MSE loss: 4.470028869250494
RMSE loss: 2.1142442785190396


In [70]:
min_samples_split = 10
regressor.min_samples_split = min_samples_split

# Badanie wpływu wartości minimalnej ilości próbek w liściu

In [71]:
min_samples_leaf_list = [2, 5, 10, 15, 25, 50, 100]

In [72]:
for min_samples_leaf in min_samples_leaf_list:
    print(f"=======MIN SAMPLES  LEAF= {min_samples_leaf}======")
    regressor.min_samples_leaf = min_samples_leaf
    regressor = regressor.fit(X_train, y_train)
    score = regressor.score(X_test, y_test)
    y_pred = regressor.predict(X_test)
    print(f"R^2 score: {score:.9f}")
    print(f"MSE loss: {mse_loss(y_test, y_pred)}")
    print(f"RMSE loss: {rmse_loss(y_test, y_pred)}")

R^2 score: 0.406122260
MSE loss: 4.832593963218456
RMSE loss: 2.198316165436277
R^2 score: 0.406122260
MSE loss: 4.832593963218456
RMSE loss: 2.198316165436277
R^2 score: 0.409390922
MSE loss: 4.8059957002007705
RMSE loss: 2.1922581280954967
R^2 score: 0.480799311
MSE loss: 4.224920289622524
RMSE loss: 2.055461089299071
R^2 score: 0.416951206
MSE loss: 4.744474982514646
RMSE loss: 2.178181577030401
R^2 score: 0.361462808
MSE loss: 5.196003774135618
RMSE loss: 2.279474451301356
R^2 score: 0.150883490
MSE loss: 6.909562425247833
RMSE loss: 2.6286046536609176


In [73]:
for min_samples_leaf in min_samples_leaf_list:
    print(f"=======MIN SAMPLES  LEAF= {min_samples_leaf}======")
    regressor.min_samples_leaf = min_samples_leaf
    regressor = regressor.fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_train, y_train)
    print(f"R^2 score: {score:.9f}")
    print(f"MSE loss: {mse}")
    print(f"RMSE loss: {rmse}")

R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.665797562
MSE loss: 2.628265436152777
RMSE loss: 1.621192596872061
R^2 score: 0.686392270
MSE loss: 2.4663026495667575
RMSE loss: 1.5704466401526533
R^2 score: 0.624025447
MSE loss: 2.9567735325726403
RMSE loss: 1.7195271246981363
R^2 score: 0.565621546
MSE loss: 3.4160788456903743
RMSE loss: 1.8482637381311073
R^2 score: 0.405578659
MSE loss: 4.674702783149486
RMSE loss: 2.162106098957562
R^2 score: 0.184200126
MSE loss: 6.415688125277816
RMSE loss: 2.5329208683410966


In [74]:
min_samples_leaf = 15
regressor.min_samples_leaf = min_samples_leaf

# Badanie wpływu strategi

In [75]:
strategies = ["random", "best"]

In [76]:
for strategy in strategies:
    print(f"=======Strategy= {strategy}======")
    regressor.splitter = strategy
    regressor = regressor.fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_test, y_test)
    print(f"R^2 score: {score:.9f}")
    print(f"MSE loss: {mse}")
    print(f"RMSE loss: {rmse}")

R^2 score: 0.346567270
MSE loss: 5.3172140543871675
RMSE loss: 2.3059085095439427
R^2 score: 0.480799311
MSE loss: 4.224920289622522
RMSE loss: 2.0554610892990706


In [77]:
for strategy in strategies:
    print(f"=======Strategy= {strategy}======")
    regressor.splitter = strategy
    regressor = regressor.fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_train, y_train)
    print(f"R^2 score: {score:.9f}")
    print(f"MSE loss: {mse}")
    print(f"RMSE loss: {rmse}")

R^2 score: 0.503552617
MSE loss: 3.904207002770846
RMSE loss: 1.975906628049728
R^2 score: 0.624025447
MSE loss: 2.9567735325726403
RMSE loss: 1.7195271246981363


# Porównanie z modelem Liniowym

In [78]:
from sklearn.linear_model import LinearRegression

In [79]:
regressor = LinearRegression()
regressor = regressor.fit(X_train, y_train)

# Zbiór walidacyjny

In [80]:
score = regressor.score(X_test, y_test)

In [81]:
y_pred = regressor.predict(X_test)
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})

In [82]:
print(f"R^2 score: {score:.9f}")
print(f"MSE loss: {mse_loss(y_test, y_pred)}")
print(f"RMSE loss: {rmse_loss(y_test, y_pred)}")

R^2 score: 0.880849923
MSE loss: 0.9695664689265164
RMSE loss: 0.9846656635257047


# Zbiór treningowy

In [83]:
score = regressor.score(X_train, y_train)
y_pred = regressor.predict(X_train)

In [84]:
print(f"R^2 score: {score:.9f}")
print(f"MSE loss: {mse_loss(y_train, y_pred)}")
print(f"RMSE loss: {rmse_loss(y_train, y_pred)}")

R^2 score: 0.870075614
MSE loss: 1.0217632632060045
RMSE loss: 1.0108230622646104
