In [193]:
# IPython magic that clears the terminal; in this notebook it only emits a
# raw escape sequence into the cell output.
%clear

[H[2J

# Sekcja Importowania
 Importujemy biblioteki potrzebne do załadowania danych i przeprowadzenia predykcji na modelu. Dodatkowo importujemy funkcje straty zdefiniowane w pliku losses.py.

In [194]:
import pandas as pd
from losses import rmse_loss, mse_loss
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

# Ładowanie danych

In [195]:
# Read the Paris housing data from the CSV file stored next to the notebooks
# directory and show the resulting frame as the cell output.
dataset_path = "../data/ParisHousing.csv"
house_dataset = pd.read_csv(dataset_path)
house_dataset

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1726,89,0,1,5,73133,7,6,2009,0,1,9311,1698,218,0,4,176425.9
9996,44403,29,1,1,12,34606,9,4,1990,0,1,9061,1742,230,0,0,4448474.0
9997,83841,3,0,0,69,80933,10,10,2005,1,1,8304,7730,345,1,9,8390030.5
9998,59036,70,0,0,96,55856,1,3,2010,0,1,2590,6174,339,1,4,5905107.0


# Podział na zbiór testowy i treningowy

In [196]:
from sklearn.model_selection import train_test_split

# Separate the features from the regression target ("price").
# DataFrame.drop returns a new frame, so house_dataset is left untouched and
# no explicit copy is required.
X = house_dataset.drop(columns=["price"])
y = house_dataset["price"]

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every metric computed later in the notebook, reproducible
# after "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Inicjalizacja modelu

In [197]:
# Baseline decision tree used as the starting point for the hyperparameter
# sweeps below. The same estimator object is re-configured and re-fitted in
# the later cells.
regressor = DecisionTreeRegressor(
    criterion="squared_error",
    splitter="best",
    max_depth=100,
    min_samples_split=10,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    # Fixed seed: scikit-learn permutes the candidate features at every split
    # even with splitter="best", and splitter="random" (explored later) draws
    # random thresholds. Without a seed, repeated fits with identical
    # hyperparameters can produce different trees and different metrics.
    random_state=42,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    # NOTE(review): the target is on the order of millions, so a pruning
    # strength of 0.01 is presumably negligible here -- confirm it is intended.
    ccp_alpha=0.01,
    monotonic_cst=None,
)
regressor = regressor.fit(X_train, y_train)

# Wyniki dla modelu przed dokładnym doborem parametrów

In [198]:
# R^2 of the initial (not yet tuned) tree on the held-out test split.
score = regressor.score(X_test, y_test)
print(f"R^2 score: {score:.9f}")

R^2 score: 0.999997018


In [199]:
# Predict prices for the test rows and line them up with the true prices;
# the frame keeps the index of y_test so rows can be traced back to the data.
y_pred = regressor.predict(X_test)
comparison_columns = {"Actual": y_test, "Predicted": y_pred}
results = pd.DataFrame(comparison_columns)

In [200]:
# Mean squared error of the baseline tree on the test split.
test_mse = mse_loss(y_test, y_pred)
print(f"MSE loss: {test_mse}")

MSE loss: 25191642.961707342


In [201]:
# Root mean squared error of the baseline tree on the test split.
test_rmse = rmse_loss(y_test, y_pred)
print(f"RMSE loss: {test_rmse}")

RMSE loss: 5019.12770924464


In [202]:
# Show the actual vs. predicted prices; as the last expression of the cell
# the frame is rendered as the cell output.
results

Unnamed: 0,Actual,Predicted
3882,1935699.6,1.935062e+06
3584,9914114.3,9.907731e+06
2251,8287927.8,8.279828e+06
3379,7623937.0,7.627994e+06
169,521650.5,5.302363e+05
...,...,...
2668,1684521.7,1.682010e+06
9786,168970.0,1.745384e+05
2415,5819120.4,5.812607e+06
6048,8331662.9,8.344137e+06


# Funkcja służąca do stworzenia wykresów pokazujących zmiany metryk w zależności od wartości badanego hiperparametru

In [203]:
def create_plots(
    min_samples_split_list,
    r2_scores,
    mse_losses,
    rmse_losses,
    param_name="min_samples_split",
):
    """Plot R^2, MSE and RMSE against the values of a swept hyperparameter.

    Three vertically stacked line plots are drawn, one per metric, and the
    figure is shown with ``plt.show()``.

    Parameters
    ----------
    min_samples_split_list : sequence
        Hyperparameter values used on the x axis. The name is historical;
        values of any swept hyperparameter can be passed.
    r2_scores, mse_losses, rmse_losses : sequence
        Metric values, one entry per hyperparameter value.
    param_name : str, optional
        Label of the x axis. Defaults to ``"min_samples_split"`` so existing
        calls keep their previous appearance.

    Returns
    -------
    None
    """
    # (title / y-label, values) for each subplot, in display order.
    metric_series = (
        ("R^2 score", r2_scores),
        ("MSE loss", mse_losses),
        ("RMSE loss", rmse_losses),
    )

    fig, axs = plt.subplots(len(metric_series), figsize=(10, 15))
    for ax, (metric_label, metric_values) in zip(axs, metric_series):
        ax.plot(min_samples_split_list, metric_values, marker="o")
        ax.set_title(metric_label)
        ax.set_xlabel(param_name)
        ax.set_ylabel(metric_label)
        ax.grid(True)
    fig.tight_layout()
    plt.show()

# Funkcja obliczająca metryki

In [204]:
def calculate_scores(regressor, X, y):
    """Evaluate a fitted regressor on the given features and targets.

    Returns a tuple ``(score, mse, rmse)``: the value of
    ``regressor.score(X, y)`` followed by ``mse_loss`` and ``rmse_loss``
    computed between ``y`` and ``regressor.predict(X)``.
    """
    r2 = regressor.score(X, y)
    predictions = regressor.predict(X)
    return r2, mse_loss(y, predictions), rmse_loss(y, predictions)

# Badanie wpływu maksymalnej głębokości drzewa

In [205]:
# Candidate tree depths for the sweep; the largest value equals the depth
# used when the baseline model was created.
max_depth_list = [1, 5, 10, 15, 20, 25, 50, 100]

In [206]:
# Refit the tree for every candidate depth and report test-set metrics.
# The estimator is left configured with the last depth from the list.
for max_depth in max_depth_list:
    print(f"=======MAX DEPTH = {max_depth}======")
    regressor = regressor.set_params(max_depth=max_depth).fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_test, y_test)
    print(
        f"R^2 score: {score:.9f}",
        f"MSE loss: {mse}",
        f"RMSE loss: {rmse}",
        sep="\n",
    )

R^2 score: 0.745603822
MSE loss: 2149199391202.1704
RMSE loss: 1466014.7991074887
R^2 score: 0.999046410
MSE loss: 8056154199.726986
RMSE loss: 89756.08168657422
R^2 score: 0.999996919
MSE loss: 26029537.208752457
RMSE loss: 5101.915053070999
R^2 score: 0.999997008
MSE loss: 25279839.233937684
RMSE loss: 5027.906048638706
R^2 score: 0.999997008
MSE loss: 25273748.62664767
RMSE loss: 5027.30033185284
R^2 score: 0.999997010
MSE loss: 25256206.03604152
RMSE loss: 5025.555296287319
R^2 score: 0.999997016
MSE loss: 25212274.19591512
RMSE loss: 5021.182549550964
R^2 score: 0.999997017
MSE loss: 25198039.266758222
RMSE loss: 5019.76486170002


In [207]:
# Same depth sweep, but the metrics are computed on the training split so
# they can be compared with the test-set numbers to spot overfitting.
for max_depth in max_depth_list:
    print(f"=======MAX DEPTH = {max_depth}======")
    regressor = regressor.set_params(max_depth=max_depth).fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_train, y_train)
    print(
        f"R^2 score: {score:.9f}",
        f"MSE loss: {mse}",
        f"RMSE loss: {rmse}",
        sep="\n",
    )

R^2 score: 0.747017049
MSE loss: 2083540278086.7585
RMSE loss: 1443447.3589593624
R^2 score: 0.999028366
MSE loss: 8002273636.342003
RMSE loss: 89455.42821060108
R^2 score: 0.999998711
MSE loss: 10616007.202105293
RMSE loss: 3258.221478369034
R^2 score: 0.999999005
MSE loss: 8195984.575668451
RMSE loss: 2862.8630033007958
R^2 score: 0.999999005
MSE loss: 8195984.575668451
RMSE loss: 2862.8630033007958
R^2 score: 0.999999005
MSE loss: 8195984.575668451
RMSE loss: 2862.8630033007958
R^2 score: 0.999999005
MSE loss: 8195984.575668452
RMSE loss: 2862.8630033007958
R^2 score: 0.999999005
MSE loss: 8195984.575668451
RMSE loss: 2862.8630033007958


In [208]:
# Depth kept for the remaining experiments. In the recorded sweep the
# test-set metrics change only marginally beyond this depth -- presumably the
# reason for the choice. The model is not refitted here; later cells refit it.
max_depth = 10
regressor = regressor.set_params(max_depth=max_depth)

# Badanie wyników dla różnych minimalnych ilości próbek do podziału

In [209]:
# Candidate values of min_samples_split for the sweep below.
min_samples_split_list = [2, 5, 10, 15, 25, 50, 100]

In [210]:
# Refit the tree for every candidate min_samples_split and report test-set
# metrics. The estimator keeps the last value from the list afterwards.
for min_samples_split in min_samples_split_list:
    print(f"=======MIN SAMPLES  SPLIT= {min_samples_split}======")
    regressor = regressor.set_params(min_samples_split=min_samples_split).fit(
        X_train, y_train
    )
    score, mse, rmse = calculate_scores(regressor, X_test, y_test)
    print(
        f"R^2 score: {score:.9f}",
        f"MSE loss: {mse}",
        f"RMSE loss: {rmse}",
        sep="\n",
    )

R^2 score: 0.999996919
MSE loss: 26029537.208752275
RMSE loss: 5101.915053070981
R^2 score: 0.999996931
MSE loss: 25931440.026330683
RMSE loss: 5092.292217295732
R^2 score: 0.999996919
MSE loss: 26029537.208752643
RMSE loss: 5101.915053071018
R^2 score: 0.999996363
MSE loss: 30722506.08970426
RMSE loss: 5542.788656416936
R^2 score: 0.999993503
MSE loss: 54890834.66022049
RMSE loss: 7408.834905720365
R^2 score: 0.999980929
MSE loss: 161116212.17014238
RMSE loss: 12693.156115408901
R^2 score: 0.999936399
MSE loss: 537313115.4266671
RMSE loss: 23180.015431976466


In [211]:
# Same min_samples_split sweep, evaluated on the training split for
# comparison with the test-set numbers.
for min_samples_split in min_samples_split_list:
    print(f"=======MIN SAMPLES  SPLIT= {min_samples_split}======")
    regressor = regressor.set_params(min_samples_split=min_samples_split).fit(
        X_train, y_train
    )
    score, mse, rmse = calculate_scores(regressor, X_train, y_train)
    print(
        f"R^2 score: {score:.9f}",
        f"MSE loss: {mse}",
        f"RMSE loss: {rmse}",
        sep="\n",
    )

R^2 score: 0.999998711
MSE loss: 10616007.202105293
RMSE loss: 3258.221478369034
R^2 score: 0.999998711
MSE loss: 10616007.202105293
RMSE loss: 3258.221478369034
R^2 score: 0.999998711
MSE loss: 10616007.202105293
RMSE loss: 3258.221478369034
R^2 score: 0.999998230
MSE loss: 14578939.96418611
RMSE loss: 3818.237808752371
R^2 score: 0.999995680
MSE loss: 35582609.543959446
RMSE loss: 5965.1160545256325
R^2 score: 0.999984097
MSE loss: 130974988.08609723
RMSE loss: 11444.430439567415
R^2 score: 0.999939404
MSE loss: 499063334.17776746
RMSE loss: 22339.725472300852


In [212]:
# Value kept for the remaining experiments; in the recorded sweep the metrics
# get worse for larger values. The model is refitted by the later cells.
min_samples_split = 10
regressor = regressor.set_params(min_samples_split=min_samples_split)

# Badanie wpływu wartości minimalnej ilości próbek w liściu

In [213]:
# Candidate values of min_samples_leaf for the sweep below.
min_samples_leaf_list = [2, 5, 10, 15, 25, 50, 100]

In [214]:
# Refit the tree for every candidate min_samples_leaf and report test-set
# metrics. The estimator keeps the last value from the list afterwards.
for min_samples_leaf in min_samples_leaf_list:
    print(f"=======MIN SAMPLES  LEAF= {min_samples_leaf}======")
    regressor = regressor.set_params(min_samples_leaf=min_samples_leaf).fit(
        X_train, y_train
    )
    score, mse, rmse = calculate_scores(regressor, X_test, y_test)
    print(
        f"R^2 score: {score:.9f}",
        f"MSE loss: {mse}",
        f"RMSE loss: {rmse}",
        sep="\n",
    )

R^2 score: 0.999996968
MSE loss: 25615209.345885575
RMSE loss: 5061.147038556139
R^2 score: 0.999996927
MSE loss: 25960532.548076555
RMSE loss: 5095.147941726183
R^2 score: 0.999994716
MSE loss: 44638282.02641104
RMSE loss: 6681.18866867349
R^2 score: 0.999991141
MSE loss: 74839917.07678172
RMSE loss: 8651.00670886237
R^2 score: 0.999980902
MSE loss: 161343481.54028413
RMSE loss: 12702.105397936364
R^2 score: 0.999936515
MSE loss: 536334614.27614117
RMSE loss: 23158.899245778957
R^2 score: 0.999762720
MSE loss: 2004594981.028098
RMSE loss: 44772.703526011224


In [215]:
# Same min_samples_leaf sweep, evaluated on the training split for
# comparison with the test-set numbers.
for min_samples_leaf in min_samples_leaf_list:
    print(f"=======MIN SAMPLES  LEAF= {min_samples_leaf}======")
    regressor = regressor.set_params(min_samples_leaf=min_samples_leaf).fit(
        X_train, y_train
    )
    score, mse, rmse = calculate_scores(regressor, X_train, y_train)
    print(
        f"R^2 score: {score:.9f}",
        f"MSE loss: {mse}",
        f"RMSE loss: {rmse}",
        sep="\n",
    )

R^2 score: 0.999998759
MSE loss: 10223473.853017583
RMSE loss: 3197.416746846989
R^2 score: 0.999998711
MSE loss: 10616007.202105293
RMSE loss: 3258.221478369034
R^2 score: 0.999996778
MSE loss: 26539919.960270625
RMSE loss: 5151.690980665535
R^2 score: 0.999993408
MSE loss: 54293892.24837299
RMSE loss: 7368.438928862272
R^2 score: 0.999983852
MSE loss: 132993967.67576046
RMSE loss: 11532.301057280827
R^2 score: 0.999939257
MSE loss: 500271390.0732694
RMSE loss: 22366.747418283005
R^2 score: 0.999755776
MSE loss: 2011400278.7771716
RMSE loss: 44848.63742386352


In [216]:
# Value kept for the remaining experiments; in the recorded sweep the metrics
# get worse for larger leaf sizes. The model is refitted by the later cells.
min_samples_leaf = 5
regressor = regressor.set_params(min_samples_leaf=min_samples_leaf)

# Badanie wpływu strategii podziału (parametr splitter)

In [217]:
# Split strategies to compare. "best" comes last, so the estimator is left
# configured with splitter="best" once a sweep over this list finishes.
strategies = ["random", "best"]

In [218]:
# Refit the tree with each split strategy and report test-set metrics.
for strategy in strategies:
    print(f"=======Strategy= {strategy}======")
    regressor = regressor.set_params(splitter=strategy).fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_test, y_test)
    print(
        f"R^2 score: {score:.9f}",
        f"MSE loss: {mse}",
        f"RMSE loss: {rmse}",
        sep="\n",
    )

R^2 score: 0.999538421
MSE loss: 3899531328.4264646
RMSE loss: 62446.22749555384
R^2 score: 0.999996920
MSE loss: 26019636.29856142
RMSE loss: 5100.944647666883


In [219]:
# Same split-strategy comparison, evaluated on the training split.
for strategy in strategies:
    print(f"=======Strategy= {strategy}======")
    regressor = regressor.set_params(splitter=strategy).fit(X_train, y_train)
    score, mse, rmse = calculate_scores(regressor, X_train, y_train)
    print(
        f"R^2 score: {score:.9f}",
        f"MSE loss: {mse}",
        f"RMSE loss: {rmse}",
        sep="\n",
    )

R^2 score: 0.999075543
MSE loss: 7613729773.15279
RMSE loss: 87256.68898802424
R^2 score: 0.999998711
MSE loss: 10616007.202105293
RMSE loss: 3258.221478369034


# Porównanie z modelem liniowym

In [220]:
from sklearn.linear_model import LinearRegression

In [221]:
# Fit an ordinary linear regression on the same training split as a
# reference model. Note that this rebinds `regressor`, so the decision tree
# from the previous sections is no longer reachable under that name.
regressor = LinearRegression().fit(X_train, y_train)

# Zbiór testowy

In [222]:
# R^2 of the linear model on the held-out test split.
score = regressor.score(X_test, y_test)

In [223]:
# Predict prices for the test rows with the linear model and pair them with
# the true prices (this replaces the earlier `results` frame of the tree).
y_pred = regressor.predict(X_test)
comparison_columns = {"Actual": y_test, "Predicted": y_pred}
results = pd.DataFrame(comparison_columns)

In [224]:
# Report the linear model's metrics on the test split.
test_mse = mse_loss(y_test, y_pred)
test_rmse = rmse_loss(y_test, y_pred)
print(
    f"R^2 score: {score:.9f}",
    f"MSE loss: {test_mse}",
    f"RMSE loss: {test_rmse}",
    sep="\n",
)

R^2 score: 0.999999586
MSE loss: 3497662.1023542513
RMSE loss: 1870.2037595818942


# Zbiór treningowy

In [225]:
# Evaluate the linear model on the training split. `y_pred` now holds the
# training predictions, while `results` still refers to the test rows.
y_pred = regressor.predict(X_train)
score = regressor.score(X_train, y_train)

In [226]:
# Report the linear model's metrics on the training split.
train_mse = mse_loss(y_train, y_pred)
train_rmse = rmse_loss(y_train, y_pred)
print(
    f"R^2 score: {score:.9f}",
    f"MSE loss: {train_mse}",
    f"RMSE loss: {train_rmse}",
    sep="\n",
)

R^2 score: 0.999999560
MSE loss: 3626633.7287649205
RMSE loss: 1904.3722663294907
