In [12]:
# 1. Importy
import pandas as pd
from pycaret.regression import setup, compare_models, predict_model, save_model, load_model

In [13]:
# 2. Wczytanie danych
df = pd.read_csv('Ceny mieszkań.csv', sep=';', encoding='iso-8859-2')

In [14]:
# 3. Podgląd danych
print(df.head())
print(df.columns)

  Unnamed: 0 2020 I kwarta¸ 2020 II kwarta¸ 2020 III kwarta¸ 2020 IV kwarta¸  \
0     GdaÄsk          8 383           8 480            8 882           8 874   
1     Gdynia          7 667           7 727            7 996           7 906   
2      Sopot          11760           11700            11780           12050   
3  Wejherowo           4500            4600             4700            4800   
4       Reda           5500            5600             5700            5800   

  2021 I kwarta¸ 2021 II kwarta¸ 2021 III kwarta¸ 2021 IV kwarta¸  \
0          9 031           9 071            9 566          10 208   
1          8 025           8 465            8 775           8 943   
2          12200           12800            13400           14100   
3           5000            5200             5400            5600   
4           5900            6000             6100            6200   

  2022 I kwarta¸ 2022 II kwarta¸ 2022 III kwarta¸ 2022 IV kwarta¸  \
0         10 239          10 813   

In [15]:
# 4. Sprawdzenie typu danych
print(df.dtypes)

Unnamed: 0          object
2020 I kwarta¸      object
2020 II kwarta¸     object
2020 III kwarta¸    object
2020 IV kwarta¸     object
2021 I kwarta¸      object
2021 II kwarta¸     object
2021 III kwarta¸    object
2021 IV kwarta¸     object
2022 I kwarta¸      object
2022 II kwarta¸     object
2022 III kwarta¸    object
2022 IV kwarta¸     object
2023 I kwarta¸      object
2023 II kwarta¸     object
2023 III kwarta¸    object
2023 IV kwarta¸     object
2024 I kwarta¸      object
2024 II kwarta¸     object
2024 III kwarta¸    object
dtype: object


In [16]:
# Zamiana z szerokiego formatu (kolumny jako kwartały) na długi
df_long = df.melt(id_vars=df.columns[0], var_name='Okres', value_name='Cena')
df_long.columns = ['Miasto', 'Okres', 'Cena']

# Rozdzielenie kolumny "Okres" na rok i kwartał
df_long[['Rok', 'Kwartał']] = df_long['Okres'].str.extract(r'(\d{4})\s+([IVX]+)\s+kwarta')

# Usunięcie kolumny "Okres"
df_long.drop(columns=['Okres'], inplace=True)

# Zamiana kwartałów na liczby
map_kwartał = {'I': 1, 'II': 2, 'III': 3, 'IV': 4}
df_long['Kwartał'] = df_long['Kwartał'].map(map_kwartał)

# Czyszczenie kolumny Cena z dziwnych znaków i zamiana na float
df_long['Cena'] = (
    df_long['Cena'].astype(str)
    .str.replace(r'[^\d]', '', regex=True)
    .replace('', pd.NA)
    .astype(float)
)

# Usunięcie braków
df_long.dropna(inplace=True)

# Upewnij się, że Rok to liczba
df_long['Rok'] = df_long['Rok'].astype(int)

df_long.head()


Unnamed: 0,Miasto,Cena,Rok,Kwartał
0,GdaÄsk,8383.0,2020,1
1,Gdynia,7667.0,2020,1
2,Sopot,11760.0,2020,1
3,Wejherowo,4500.0,2020,1
4,Reda,5500.0,2020,1


In [17]:
# 5. Uruchomienie PyCaret - regresja
exp = setup(
    data=df_long,
    target='Cena',  # <-- upewnij się, że to dokładna nazwa kolumny z ceną za m²
    categorical_features=['Miasto', 'Rok', 'Kwartał'],
    session_id=123,
    verbose=True
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Cena
2,Target type,Regression
3,Original data shape,"(114, 4)"
4,Transformed data shape,"(114, 16)"
5,Transformed train set shape,"(79, 16)"
6,Transformed test set shape,"(35, 16)"
7,Categorical features,3
8,Preprocess,True
9,Imputation type,simple


In [18]:
# 6. Porównanie modeli
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,280.7405,163709.7015,363.2195,0.9741,0.0372,0.0303,0.027
lr,Linear Regression,319.8729,162994.5257,382.4463,0.966,0.0423,0.0364,0.798
br,Bayesian Ridge,322.9989,163611.5928,383.8827,0.966,0.0429,0.0368,0.015
lasso,Lasso Regression,323.2158,163936.9477,384.3523,0.9659,0.0429,0.0368,0.4
llar,Lasso Least Angle Regression,324.6178,166405.4764,388.8814,0.9657,0.0439,0.0373,0.017
lar,Least Angle Regression,342.7125,179554.5369,405.012,0.9638,0.0476,0.0401,0.018
ridge,Ridge Regression,379.7876,256535.8356,462.9669,0.961,0.0464,0.0404,0.016
par,Passive Aggressive Regressor,348.008,239988.2499,453.0485,0.9607,0.0434,0.0354,0.016
huber,Huber Regressor,351.047,205489.0572,432.5831,0.9593,0.0454,0.0384,0.017
et,Extra Trees Regressor,370.4757,261525.0574,478.1262,0.9586,0.0491,0.0392,0.04


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

In [19]:
# 7. Predykcja (na tym samym zbiorze – tylko testowo)
preds = predict_model(best_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,251.5697,160333.3176,400.4164,0.987,0.0314,0.0236


In [20]:
# 8. Zapisanie najlepszego modelu
save_model(best_model, 'model_ceny_mieszkan')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=[], transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['Miasto', 'Rok', 'Kwartał'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('onehot_encoding',
                  TransformerWrapper(include=['Miasto', 'Rok', 'Kwartał'],
                                     transformer=OneHotEncoder(cols=['Miasto',
                                                                     'Rok',
                                                                     'Kwartał'],
                                                               handle_missing='return_nan',
                                                               use_cat_names=True))),
                 ('clean_column_names',
                  TransformerWrapper(transformer=CleanColumnNames()

In [None]:
# 9. Wczytanie modelu (opcjonalnie później)
# model = load_model('model_ceny_mieszkan')