In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
import numpy as np
import joblib
from xgboost import XGBRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from sklearn.metrics import make_scorer
import geopandas as gpd
from shapely.geometry import Point
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Wczytaj dane
df = pd.read_csv("data/housing.csv")

#geometry data
gdf_points = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.longitude, df.latitude),
    crs="EPSG:4326"
)

gdf_county = gpd.read_file("data/geodata/CA_Counties.shp")  # shapefile z opisem hrabstw
gdf_county = gdf_county.to_crs("EPSG:4326")  # ujednolicenie CRS :contentReference[oaicite:6]{index=6}

print(gdf_county.columns)

gdf_joined = gpd.sjoin(
    gdf_points,
    gdf_county[['geometry', 'NAME']],  # zmień 'NAME' jeśli inna kolumna zawiera nazwę hrabstwa
    how="left",
    predicate="within"
)


gdf_joined["county"] = gdf_joined["NAME"]

print(gdf_joined["county"].value_counts().head())

df_model = gdf_joined.drop(columns=["geometry", "index_right", "NAME"])  # zależnie od wersji geopandas

df_model = pd.get_dummies(df_model, columns=["county"], prefix="county")

print(df_model.columns)

Index(['STATEFP', 'COUNTYFP', 'COUNTYNS', 'GEOID', 'NAME', 'NAMELSAD', 'LSAD',
       'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP', 'FUNCSTAT', 'ALAND',
       'AWATER', 'INTPTLAT', 'INTPTLON', 'Shape_Leng', 'Shape_Area',
       'geometry'],
      dtype='object')
county
Los Angeles    5824
Orange         1618
San Diego      1609
Alameda        1017
Santa Clara    1003
Name: count, dtype: int64
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'county_Alameda',
       'county_Alpine', 'county_Amador', 'county_Butte', 'county_Calaveras',
       'county_Colusa', 'county_Contra Costa', 'county_Del Norte',
       'county_El Dorado', 'county_Fresno', 'county_Glenn', 'county_Humboldt',
       'county_Imperial', 'county_Inyo', 'county_Kern', 'county_Kings',
       'county_Lake', 'county_Lassen', 'county_Los Angeles', 'county_Madera',
       'county_Marin

In [None]:
class CustomFeaturesAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.median_total_bedrooms = X["total_bedrooms"].median()
        return self

    def transform(self, X):
        X = X.copy()
        X["total_bedrooms"] = X["total_bedrooms"].fillna(self.median_total_bedrooms)
        X["rooms_per_household"] = X["total_rooms"] / X["households"]
        X["bedrooms_per_room"] = X["total_bedrooms"] / X["total_rooms"]
        X["population_per_household"] = X["population"] / X["households"]
        return X


In [None]:
df = df_model.copy()

# Wypełnij braki medianą
df["total_bedrooms"].fillna(df["total_bedrooms"].median(), inplace=True)
df["rooms_per_household"] = df["total_rooms"] / df["households"]
df["bedrooms_per_room"] = df["total_bedrooms"] / df["total_rooms"]
df["population_per_household"] = df["population"] / df["households"]


log_cols = ["median_income", "population", "households", "median_house_value"]
for col in log_cols:
    df[col] = np.log1p(df[col])

# Teraz usuwamy surowe latitude i longitude
df.drop(["latitude", "longitude"], axis=1, inplace=True)

In [None]:


# Dane wejściowe i wyjściowe
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]

# 5. Identyfikacja cech numerycznych i kategorycznych
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()


# 5. Identyfikacja cech numerycznych i kategorycznych
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# 6. Pipeline preprocessingu
numerical_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False))
])

preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])

X_processed = preprocessor.fit_transform(X)


# --- Modele ---
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor(random_state=42, verbosity=0)

# --- Pipeline Random Forest ---
pipeline_rf = Pipeline([
    ("preprocessing", preprocessor),
    ("model", rf)
])

# --- Pipeline XGBoost ---
pipeline_xgb = Pipeline([
    ("preprocessing", preprocessor),
    ("model", xgb)
])

# --- Podział na train/test ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Trening i ewaluacja z odlogarytmowaniem ---
def evaluate_model(pipeline, name="Model"):
    pipeline.fit(X_train, y_train)
    y_pred_log = pipeline.predict(X_test)
    
    # Odlogarytmowanie
    y_test_exp = np.expm1(y_test)
    y_pred_exp = np.expm1(y_pred_log)
    
    mae = mean_absolute_error(y_test_exp, y_pred_exp)
    rmse = mean_squared_error(y_test_exp, y_pred_exp, squared=False)
    r2 = r2_score(y_test_exp, y_pred_exp)
    
    print(f"\n{name} Results (on original scale):")
    print(f"MAE : {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R²  : {r2:.4f}")
    
    return pipeline


# Trening i ocena
model_rf = evaluate_model(pipeline_rf, "Random Forest")
model_xgb = evaluate_model(pipeline_xgb, "XGBoost")




Random Forest Results (on original scale):
MAE : 40,532.65
RMSE: 60,154.24
R²  : 0.7239





XGBoost Results (on original scale):
MAE : 41,863.95
RMSE: 61,547.72
R²  : 0.7109


In [3]:
import joblib

# Zapisz pipeline do pliku
joblib.dump(model_xgb, "xgb_pipeline.pkl")
joblib.dump(model_rf, "rf_pipeline.pkl")


['rf_pipeline.pkl']

In [9]:
print(df["median_house_value"])

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64
