In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

# Relative location of the input CSV.
# A forward slash is used as the separator because it is accepted on Windows,
# Linux and macOS alike, whereas a literal backslash only resolves on Windows.
DATASET_PATH = "Dane/CaliforniaHousing.csv"

In [None]:
# Load the California Housing dataset from the CSV file at DATASET_PATH
dataset = pd.read_csv(DATASET_PATH)

In [None]:
# Print summary information about the dataset
dataset.info()

### Brakujące dane

In [None]:
# Find missing data: one boolean per column, True when the column holds at least one null
dataset.isnull().any()

In [None]:
# Flag every row that contains at least one missing value
isnull = dataset.isnull().any(axis=1)
print(np.count_nonzero(isnull),"brakujących wartości.")
# Remove the flagged rows with a boolean mask.
# drop() matches index *labels*, so feeding it positional indices is only
# correct while the frame still carries its default RangeIndex; the mask has
# no such dependency. copy() detaches the result from the original frame so
# later column assignments do not trigger chained-assignment warnings.
dataset = dataset.loc[~isnull].copy()

In [None]:
# Preview the first 10 rows
dataset.head(10)

In [None]:
# Locate the categorical (non-numeric) columns, i.e. those stored with the "object" dtype
dataset_cat = dataset.select_dtypes(include=['object'])
dataset_cat.columns

### Wydzielenie zmiennej zależnej

In [None]:
# Separate the dependent variable (Y) from the independent variables (X).
# The target is divided by 1000 on a derived Series instead of in place on
# `dataset`, so re-running this cell cannot scale the target a second time.
x = dataset.drop(columns=["median_house_value"])
y = dataset["median_house_value"] / 1000

### EDA

In [None]:
# Basic statistical analysis of the independent variables
x.describe()

In [None]:
# Histograms of the independent variables (20 bins each)
x.hist(figsize=(15,10), bins=20)

In [None]:
# Correlation analysis between the independent variables

import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 6))

# Object-typed columns are excluded before computing the correlation matrix;
# every cell of the heatmap is annotated with its value.
sns.heatmap(
    data=x.select_dtypes(exclude='object').corr(),
    annot=True,
    ax=ax,
)

### Train, Test, Val split

In [None]:
# Split the data into TRAIN, VAL and TEST subsets

from sklearn.model_selection import train_test_split

# Step 1: 60% goes to training; the remaining 40% is held temporarily in x_test / y_test
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.4,
    shuffle=True,
    random_state=5,
)
# Step 2: the held-out 40% is halved into the validation and the final test subset
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test,
    test_size=0.5,
    shuffle=True,
    random_state=5,
)

In [None]:
# Descriptive statistics of the training subset
x_train.describe()

In [None]:
# Descriptive statistics of the validation subset
x_val.describe()

In [None]:
# Descriptive statistics of the test subset
x_test.describe()

### Skalowanie wartości niezależnych

In [None]:
# Scale the numeric independent variables with StandardScaler
# Encode the categorical independent variables with OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import make_column_transformer


# Columns are routed by dtype: "object" columns are treated as categorical,
# everything else as numeric.
col_categorical = x_train.select_dtypes(include='object').columns
col_numerical = x_train.select_dtypes(exclude='object').columns

col_transformer = make_column_transformer(
    (StandardScaler(), col_numerical),
    # A category that was absent from the data the encoder was fitted on
    # (possible for rare categories in a CV fold or in the val/test subsets)
    # is encoded as -1 instead of raising an error at transform time.
    (OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), col_categorical)
)

### Regresja - model regresji liniowej, drzewo decyzyjne

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Linear regression

from sklearn.linear_model import LinearRegression

linear_regressor = Pipeline([
    ('col_transformer', col_transformer),
    # NOTE(review): positive=True forces every coefficient to be non-negative —
    # confirm this constraint is intended for this dataset.
    ('linear_regressor', LinearRegression(positive=True))
])
linear_regressor.fit(x_train, y_train)

# Metrics below are computed on the training subset itself (goodness of fit,
# not generalisation).
y_pred = linear_regressor.predict(x_train)
print(f"RMSE: {mean_squared_error(y_train, y_pred)**.5:.2f}")
print(f"R^2: {r2_score(y_train, y_pred):.3f}")

fig, ax = plt.subplots(1,1,figsize=(5,5))

ax.plot(y_train, y_pred, '.')
# Identity line sized from the targets that are actually plotted (training
# subset), so the reference always spans the data shown in this figure.
ax.plot([0, np.max(y_train)], [0, np.max(y_train)], color='red', linestyle='--', linewidth=1)
ax.set_xlabel("Oczekiwane wartości")
ax.set_ylabel("Predykcje")
ax.set_title("Zestaw treningowy")
ax.grid(True)
plt.show()


In [None]:
# Decision tree

from sklearn.tree import DecisionTreeRegressor

decision_tree = Pipeline([
    ('col_transformer', col_transformer),
    # Fixed seed (same value as the data splits) so that repeated runs of the
    # notebook build the same tree and report the same metrics.
    ('decision_tree', DecisionTreeRegressor(random_state=5))
])
decision_tree.fit(x_train, y_train)

# Metrics below are computed on the training subset itself.
y_pred = decision_tree.predict(x_train)
print(f"RMSE: {mean_squared_error(y_train, y_pred)**.5:.2f}")
print(f"R^2: {r2_score(y_train, y_pred):.3f}")

fig, ax = plt.subplots(1,1,figsize=(5,5))

ax.plot(y_train, y_pred, '.')
# Identity line sized from the targets that are actually plotted (training subset).
ax.plot([0, np.max(y_train)], [0, np.max(y_train)], color='red', linestyle='--', linewidth=1)
ax.set_xlabel("Oczekiwane wartości")
ax.set_ylabel("Predykcje")
ax.set_title("Zestaw treningowy")
ax.grid(True)
plt.show()

### Ewaluacja

In [None]:
# Evaluate the linear regression on the validation subset

y_pred = linear_regressor.predict(x_val)
print(f"RMSE: {mean_squared_error(y_val, y_pred)**.5:.2f}")
print(f"R^2: {r2_score(y_val, y_pred):.3f}")

fig, ax = plt.subplots(figsize=(5, 5))

# Predictions against expected values, drawn as points only
ax.plot(y_val, y_pred, '.')
# Dashed identity line from 0 up to the largest validation target
ax.plot(
    [0, y_val.max()],
    [0, y_val.max()],
    linestyle='--',
    linewidth=1,
    color='red',
)
ax.set(
    xlabel="Oczekiwane wartości",
    ylabel="Predykcje",
    title="Zestaw walidacyjny",
)
ax.grid(True)
plt.show()

In [None]:
# Evaluate the decision tree on the validation subset

y_pred = decision_tree.predict(x_val)
print(f"RMSE: {mean_squared_error(y_val, y_pred)**.5:.2f}")
print(f"R^2: {r2_score(y_val, y_pred):.3f}")

fig, ax = plt.subplots(figsize=(5, 5))

# Predictions against expected values, drawn as points only
ax.plot(y_val, y_pred, '.')
# Dashed identity line from 0 up to the largest validation target
ax.plot(
    [0, y_val.max()],
    [0, y_val.max()],
    linestyle='--',
    linewidth=1,
    color='red',
)
ax.set(
    xlabel="Oczekiwane wartości",
    ylabel="Predykcje",
    title="Zestaw walidacyjny",
)
ax.grid(True)
plt.show()

### Strojenie hiperparametrów

In [None]:
# List the parameters of the decision-tree step of the pipeline
decision_tree['decision_tree'].get_params()

In [None]:
# Hyper-parameter tuning of the decision tree with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Keys carry the "decision_tree__" prefix so the values are routed to the
# pipeline step of that name.
parameters = dict(
    decision_tree__max_depth=[5, 10, 15, 20, 25],
    decision_tree__min_samples_split=[2, 5, 10, 15, 20],
    decision_tree__min_samples_leaf=[1, 2, 5, 10, 15],
)
# NOTE(review): the search is cross-validated on the validation subset only
# (x_val / y_val), not on the training subset — confirm this is intended.
grid_search = GridSearchCV(estimator=decision_tree, param_grid=parameters)
grid_search.fit(x_val, y_val)

grid_search.best_params_

In [None]:
# Apply the tuned hyper-parameters to the pipeline in a single call,
# then retrain it on the training subset
decision_tree.set_params(**grid_search.best_params_)

decision_tree.fit(x_train, y_train)

In [None]:
# Evaluate the tuned decision tree on the validation subset
y_pred = decision_tree.predict(x_val)
print(f"RMSE: {mean_squared_error(y_val, y_pred)**.5:.2f}")
print(f"R^2: {r2_score(y_val, y_pred):.3f}")

fig, ax = plt.subplots(figsize=(5, 5))

# Predictions against expected values, drawn as points only
ax.plot(y_val, y_pred, '.')
# Dashed identity line from 0 up to the largest validation target
ax.plot(
    [0, y_val.max()],
    [0, y_val.max()],
    linestyle='--',
    linewidth=1,
    color='red',
)
ax.set(
    xlabel="Oczekiwane wartości",
    ylabel="Predykcje",
    title="Zestaw walidacyjny",
)
ax.grid(True)
plt.show()

### Selekcja cech metodą LASSO

In [None]:
# Lasso regression, used to assess the importance of the independent variables

from sklearn.linear_model import Lasso

# Same preprocessing as the other models, followed by the Lasso estimator
lasso = Pipeline(steps=[
    ('col_transformer', col_transformer),
    ('lasso', Lasso(max_iter=4000, alpha=1e-05)),
])

lasso.fit(x_train, y_train)

In [None]:
# Compute the importance of the independent variables and visualise it

# The column transformer emits the numeric columns first and the categorical
# columns second, so coefficients and importances follow that order. Labels
# are built in the same order instead of assuming it matches x_train.columns.
feature_names = pd.Index(col_numerical).append(pd.Index(col_categorical))

# Absolute Lasso coefficients, normalised so that they sum to 1
lasso_coef = np.abs(lasso['lasso'].coef_)
lasso_coef /= np.sum(lasso_coef)

# Minimum normalised share a feature needs in order to be kept
THRESH = 0.05

# Decision-tree feature importances, normalised the same way
decision_tree_coef = np.abs(decision_tree['decision_tree'].feature_importances_)
decision_tree_coef /= np.sum(decision_tree_coef)

# Plot feature names against their importance, one panel per model
fig,axes = plt.subplots(1,2,figsize=(10,3))

axes[0].bar(feature_names.values, lasso_coef)
axes[0].axhline(y=THRESH, color='r', linestyle='-')
axes[0].grid()
axes[0].tick_params(axis='x', labelrotation=90)
axes[0].set_title("Ważność cech wyznaczona metodą LASSO")
axes[0].set_xlabel("Nazwa cechy")
axes[0].set_ylabel("Wpływ")

axes[1].bar(feature_names.values, decision_tree_coef)
axes[1].grid()
axes[1].tick_params(axis='x', labelrotation=90)
axes[1].set_title("Wpływ cech na model - Decision Tree")
axes[1].set_xlabel("Nazwa cechy")
axes[1].set_ylabel("Wpływ")

plt.show()

# Features strictly above the threshold are kept; the rest are marked for removal
features_selected = feature_names[lasso_coef > THRESH]
features_ignored = feature_names[lasso_coef <= THRESH]
print(features_selected)

In [None]:
# Remove the variables judged insignificant from all three subsets.
# errors="ignore" keeps the cell re-runnable: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
x_train = x_train.drop(columns=features_ignored, errors="ignore")
x_val = x_val.drop(columns=features_ignored, errors="ignore")
x_test = x_test.drop(columns=features_ignored, errors="ignore")

In [None]:
# Rebuild the pipeline and retrain the decision tree on the reduced feature set

# Re-derive the column groups, since the feature set has changed
col_categorical = x_train.select_dtypes(include='object').columns
col_numerical = x_train.select_dtypes(exclude='object').columns

col_transformer = make_column_transformer(
    (StandardScaler(), col_numerical),
    # Categories absent from the fitting data are encoded as -1 instead of
    # raising an error at transform time (e.g. inside a CV fold).
    (OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), col_categorical)
)

decision_tree = Pipeline([
    ('col_transformer', col_transformer),
    # Fixed seed (same value as the data splits) for reproducible trees
    ('decision_tree', DecisionTreeRegressor(random_state=5))
])

parameters = {
    'decision_tree__max_depth': [5, 10, 15, 20, 25],
    'decision_tree__min_samples_split': [2, 5, 10, 15, 20],
    'decision_tree__min_samples_leaf': [1, 2, 5, 10, 15]
}
# NOTE(review): the search is cross-validated on the validation subset only —
# confirm this is intended.
grid_search = GridSearchCV(decision_tree, parameters)
grid_search.fit(x_val, y_val)

# A bare expression in the middle of a cell is not displayed, so the winning
# combination is printed explicitly.
print(grid_search.best_params_)

# Apply all tuned hyper-parameters at once, then retrain on the training subset
decision_tree.set_params(**grid_search.best_params_)

decision_tree.fit(x_train, y_train)

In [None]:
# Determine the influence of the independent features on the model

# Importances follow the column transformer's output order (numeric columns
# first, categorical second); labels are built in that same order instead of
# assuming it matches x_train.columns.
feature_names = pd.Index(col_numerical).append(pd.Index(col_categorical))

# Decision-tree feature importances, normalised so that they sum to 1
decision_tree_coef = np.abs(decision_tree['decision_tree'].feature_importances_)
decision_tree_coef /= np.sum(decision_tree_coef)

# Plot feature names against their importance
fig,ax = plt.subplots(1,1,figsize=(5,3))

ax.bar(feature_names.values, decision_tree_coef)
ax.grid()
ax.tick_params(axis='x', labelrotation=90)
ax.set_title("Wpływ cech na model - Decision Tree")
ax.set_xlabel("Nazwa cechy")
ax.set_ylabel("Wpływ")

plt.show()

In [None]:
# Evaluate the retrained decision tree on the validation subset
y_pred = decision_tree.predict(x_val)
print(f"RMSE: {mean_squared_error(y_val, y_pred)**.5:.2f}")
print(f"R^2: {r2_score(y_val, y_pred):.3f}")

fig, ax = plt.subplots(figsize=(5, 5))

# Predictions against expected values, drawn as points only
ax.plot(y_val, y_pred, '.')
# Dashed identity line from 0 up to the largest validation target
ax.plot(
    [0, y_val.max()],
    [0, y_val.max()],
    linestyle='--',
    linewidth=1,
    color='red',
)
ax.set(
    xlabel="Oczekiwane wartości",
    ylabel="Predykcje",
    title="Zestaw walidacyjny",
)
ax.grid(True)
plt.show()

### Walidacja na zbiorze testowym (finalna)

In [None]:
# Final evaluation of the decision tree on the held-out test subset
y_pred = decision_tree.predict(x_test)
print(f"RMSE: {mean_squared_error(y_test, y_pred)**.5:.2f}")
print(f"R^2: {r2_score(y_test, y_pred):.3f}")

fig, ax = plt.subplots(1,1,figsize=(5,5))

ax.plot(y_test, y_pred, '.')
# Identity line sized from the targets that are actually plotted (test
# subset) rather than from the validation targets.
ax.plot([0, np.max(y_test)], [0, np.max(y_test)], color='red', linestyle='--', linewidth=1)
ax.set_xlabel("Oczekiwane wartości")
ax.set_ylabel("Predykcje")
ax.set_title("Zestaw testowy") 
ax.grid(True)
plt.show()