### Import packages

In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor

### Read data

In [2]:
# CSV file holding the housing listings, resolved relative to the working directory.
housing_data_path = "all_platforms_data.csv"

# Comma-separated, UTF-8 encoded, column names taken from the first row.
read_options = {"delimiter": ",", "header": 0, "encoding": "utf-8"}
df_housing = pd.read_csv(housing_data_path, **read_options)

# Quick look at the size of the frame and the available columns.
print(df_housing.shape)
print(df_housing.columns)

(18883, 28)
Index(['is_primary_nan', 'total_area', 'basement', 'elevator', 'balcony',
       'property_level', 'heating_type_nan', 'balcony_nan', 'row_price_m2',
       'total_property_level', 'parking_nan', 'url', 'building_type_nan',
       'building_year', 'heating_type', 'price', 'is_primary',
       'total_property_level_nan', 'elevator_nan', 'id', 'is_parking',
       'no_rooms', 'district_norm', 'building_type', 'property_level_nan',
       'building_year_nan', 'portal_name', 'basement_nan'],
      dtype='object')


## Prepare data

### Drop unnecessary columns

In [3]:
# Columns excluded from modelling.
# NOTE(review): "row_price_m2" is presumably the price per square metre, which
# would leak the target if kept; confirm against the data source.
columns_to_drop = ["id", "url", "portal_name", "row_price_m2"]
df_housing = df_housing.drop(labels=columns_to_drop, axis="columns")

print(df_housing.shape)

(18883, 24)


### Drop duplicates

In [4]:
# Flag every row that is repeated later in the frame, so that only the final
# occurrence of each fully identical row is kept.
is_repeated_row = df_housing.duplicated(keep='last')
df_housing = df_housing[~is_repeated_row]

print(df_housing.shape)

(17800, 24)


### Prepare 3 datasets

In [5]:
# Min-max scaler used by the dataset cells that scale numerical columns.
# The (0, 1) range is the scikit-learn default, spelled out here for clarity.
scaler = MinMaxScaler(feature_range=(0, 1))

Dataset 1: Minimal transformation

In [6]:
print("Preparing first dataset: Dummies only")

# Dataset 1: one-hot encode a private copy of the cleaned frame, dropping the
# first level of each encoded column. No other transformation is applied.
df_dummies_only = pd.get_dummies(df_housing.copy(), drop_first=True)

Preparing first dataset: Dummies only


Dataset 2: Minimal transformation plus scaling of numerical attributes

In [7]:
print("Preparing second dataset: Dummies and scaling")

# Dataset 2: one-hot encode a private copy of the cleaned frame
# (first level of each encoded column dropped).
df_dummies_scaling = pd.get_dummies(df_housing.copy(), drop_first=True)

# Min-max scale every numeric-dtype column in place.
# Note: the numeric target "price" is part of this selection, so it is scaled too,
# and the scaler is fit on the full frame before any train/test split.
numerical_features = df_dummies_scaling.select_dtypes(include=[np.number]).columns
scaled_values = scaler.fit_transform(df_dummies_scaling[numerical_features])
df_dummies_scaling[numerical_features] = scaled_values

Preparing second dataset: Dummies and scaling


Dataset 3: Full transformation with reduced number of features

In [8]:
print("Preparing third dataset: Dummies, scaling, low-correlation features removal")

# Dataset 3: one-hot encode into a NEW frame. The cleaned `df_housing` is left
# untouched so that it keeps meaning "cleaned, not yet encoded" for every cell
# that reads it, regardless of the order in which cells are (re-)run.
df_encoded = pd.get_dummies(df_housing, drop_first=True)

# Identify features whose correlation with the target is weaker than 0.05 in
# absolute value. Columns with a NaN correlation are not selected for removal.
correlation_matrix = df_encoded.corr()
correlation_with_price = correlation_matrix["price"]
columns_to_remove = correlation_with_price[correlation_with_price.abs() < 0.05].index
# Columns whose name contains "_norm" are always kept, whatever their correlation.
columns_to_remove = [col for col in columns_to_remove if "_norm" not in col]

# `drop` returns a new frame that is already fully encoded, so no second
# get_dummies pass is needed.
df_dummies_scaling_filtered = df_encoded.drop(columns=columns_to_remove)

# Min-max scale the remaining numeric-dtype columns. This re-fits the shared
# `scaler`, and the numeric target "price" is part of the selection.
numerical_features_filtered = df_dummies_scaling_filtered.select_dtypes(include=[np.number]).columns
df_dummies_scaling_filtered[numerical_features_filtered] = scaler.fit_transform(
    df_dummies_scaling_filtered[numerical_features_filtered]
)

Preparing third dataset: Dummies, scaling, low-correlation features removal


In [9]:
# Registry of the prepared frames, keyed by the label used in the results table.
datasets = {
    'Dummies Only': df_dummies_only,
    'Dummies and Scaling': df_dummies_scaling,
    'Dummies, Scaling, and Filtering': df_dummies_scaling_filtered,
}

# Columns present in the scaled dataset but missing from the filtered one,
# i.e. the features removed by the correlation filter.
deleted_columns = set(df_dummies_scaling.columns) - set(df_dummies_scaling_filtered.columns)

print(f'Prepared datasets: \n1st: {df_dummies_only.shape}, \n2nd: {df_dummies_scaling.shape}, \n3rd: {df_dummies_scaling_filtered.shape}')
print(f"Deleted: {deleted_columns}")


Prepared datasets: 
1st: (17800, 51), 
2nd: (17800, 51), 
3rd: (17800, 33)
Deleted: {'building_type_house', 'heating_type_central', 'property_level_nan', 'total_property_level', 'building_year_nan', 'balcony', 'basement_nan', 'heating_type_individual', 'building_type_ribbon', 'balcony_nan', 'is_primary_nan', 'property_level', 'heating_type_other', 'heating_type_electrical', 'building_type_infill', 'total_property_level_nan', 'heating_type_gas', 'heating_type_nan'}


## Perform ML on all datasets

### Define hyperparameter grids

In [10]:
# Tree-based ensembles: all seeded identically and tuned over the same
# n_estimators candidates.
tree_ensembles = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
}

# Maps a model label to its estimator ('model') and its search grid ('params').
param_grids = {
    label: {'model': estimator, 'params': {'n_estimators': [100, 200, 300]}}
    for label, estimator in tree_ensembles.items()
}

# Linear regression has nothing to tune; an empty grid evaluates it once as-is.
param_grids['Linear Regression'] = {'model': LinearRegression(), 'params': {}}

### Run ML

In [11]:
# One record per (dataset, model) combination.
results = []

for dataset_name, df in datasets.items():
    y = df['price']
    X = df.drop(columns=['price'])

    # 60 % training data; the remaining 40 % is halved into validation and test.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    holdout_splits = {'Validation R2': (X_val, y_val), 'Test R2': (X_test, y_test)}

    for model_name, config in param_grids.items():
        # 3-fold cross-validated search on the training split only; the winning
        # configuration is exposed as best_estimator_.
        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=3,
            n_jobs=-1,
            verbose=0,
        )
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        # Score the tuned model on both held-out splits (validation first, then test).
        record = {
            'Model': model_name,
            'Dataset': dataset_name,
            'Params': grid_search.best_params_,
        }
        for metric_name, (X_split, y_split) in holdout_splits.items():
            record[metric_name] = round(r2_score(y_split, best_model.predict(X_split)), 5)
        results.append(record)

# Show floats with four decimals and rank the runs by test R2, best first.
pd.options.display.float_format = '{:.4f}'.format
results_df = pd.DataFrame(results).sort_values("Test R2", ascending=False)

print("\nResults Summary:\n")
results_df


Results Summary:



Unnamed: 0,Model,Dataset,Params,Validation R2,Test R2
2,XGBoost,Dummies Only,{'n_estimators': 300},0.874,0.8647
6,XGBoost,Dummies and Scaling,{'n_estimators': 300},0.8742,0.8642
0,Random Forest,Dummies Only,{'n_estimators': 300},0.8765,0.8634
4,Random Forest,Dummies and Scaling,{'n_estimators': 300},0.8765,0.8632
8,Random Forest,"Dummies, Scaling, and Filtering",{'n_estimators': 300},0.8702,0.8552
10,XGBoost,"Dummies, Scaling, and Filtering",{'n_estimators': 200},0.8634,0.8497
1,Gradient Boosting,Dummies Only,{'n_estimators': 300},0.8312,0.8232
5,Gradient Boosting,Dummies and Scaling,{'n_estimators': 300},0.8312,0.8231
9,Gradient Boosting,"Dummies, Scaling, and Filtering",{'n_estimators': 300},0.8237,0.8174
3,Linear Regression,Dummies Only,{},0.7593,0.7505


## Choose and save best model

In [12]:
# Result row with the highest test R2 (the first such row wins on ties).
# NOTE(review): picking the winner by test R2 means the reported test score is
# no longer an unbiased estimate; consider selecting on 'Validation R2' instead.
best_model = results_df[results_df['Test R2'] == results_df['Test R2'].max()].iloc[0]

print(f"\nBest Model: {best_model['Model']}")
print(f"Best Model Parameters: {best_model['Params']}")
print(f"Best Model Validation R2: {best_model['Validation R2']}")
print(f"Best Model Test R2: {best_model['Test R2']}")
print(f"Used Dataset: {best_model['Dataset']}")

# Rebuild the winning estimator explicitly. `grid_search` still refers to the
# LAST search executed by the training loop (the final model on the final
# dataset), which is not necessarily the winner reported above, so pickling
# its best_estimator_ would save the wrong model.
best_dataset = datasets[best_model['Dataset']]
X_best = best_dataset.drop(columns=['price'])
y_best = best_dataset['price']

# Same split arguments as the training loop, so the model is fit on exactly
# the training rows that were used during the search.
X_best_train, _, y_best_train, _ = train_test_split(X_best, y_best, test_size=0.4, random_state=42)

# Fresh, unfitted copy of the configured estimator with the winning parameters.
best_model_object = clone(param_grids[best_model['Model']]['model'])
best_model_object.set_params(**best_model['Params'])
best_model_object.fit(X_best_train, y_best_train)

with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model_object, f)


Best Model: XGBoost
Best Model Parameters: {'n_estimators': 300}
Best Model Validation R2: 0.87405
Best Model Test R2: 0.86465
Used Dataset: Dummies Only
