# Real esate advertisement

In [84]:
import hashlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
import itertools

### Random seed based on my Neptun code

In [85]:
neptun = "JPWF8N"
seed = int(hashlib.sha256(neptun.encode("utf-8")).hexdigest(), 16) % 10**8
print(f"Random seed based on NEPTUN code: {seed}")

Random seed based on NEPTUN code: 75628879


## Data preparation

In [86]:
df = pd.read_csv("../data/ingatlan.csv")
cols_shuffled = np.random.RandomState(seed).permutation(df.columns)
df = df[cols_shuffled]
df.describe()

Unnamed: 0,ad_view_cnt,nr,balcony_area,price_created_at,postcode,active_days,district,property_area,small_room_cnt,room_cnt
count,183564.0,183565.0,175966.0,183565.0,115475.0,183565.0,176009.0,183565.0,182981.0,183565.0
mean,262.264082,196659.747942,7.748792,20.564001,1103.395895,44.071593,9.730434,48.443984,0.547177,1.476393
std,556.838684,113179.241269,2360.665258,171.496565,50.789818,47.969011,4.782807,12.776044,0.74881,0.972892
min,0.0,7.0,0.0,0.0,1011.0,1.0,1.0,0.0,-1.0,0.0
25%,41.0,98117.0,0.0,13.2,1063.0,11.0,6.0,40.0,0.0,1.0
50%,102.0,196774.0,0.0,16.9,1101.0,28.0,10.0,50.0,0.0,1.0
75%,265.0,294824.0,3.0,23.9,1141.0,60.0,13.0,60.0,1.0,2.0
max,40248.0,394178.0,990257.0,41796.0,1239.0,544.0,20.0,70.0,56.0,215.0


In [87]:
df_raw = df.copy()  # keep a copy of the raw data

In [88]:
df["price_per_m2"] = df["price_created_at"] / df["property_area"]
df = df.drop(columns=["price_created_at"])

### Functions for data processing

prompt: Create function to remove outliers

In [89]:
def remove_outliers_iqr(df, column, factor=1.5):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    return df[(df[column] >= lower) & (df[column] <= upper)]

prompt: Create parser for the building floors, that can have the following values: 1-10, more than 10, missing

In [90]:
def building_floor_count_parser(value):
    # distinct values are: 1-10, more than 10, nan
    if pd.isna(value):
        return np.nan

    value = str(value).strip().lower()

    if "more" in value and "10" in value:
        return 11
    else:
        try:
            return int(value)
        except ValueError:
            return np.nan

prompt: Create parser for the floor umber that can be the following 1-10, 10 plus, ground floor, mezzanine floor, 10 plus

In [91]:
def floor_parser(val):
    if pd.isna(val):
        return np.nan

    val = str(val).strip().lower()

    if "basement" in val:
        return -1
    elif "ground" in val:
        return 0
    elif "mezzanine" in val:
        return 0.5
    elif "plus" in val:
        nums = [int(s) for s in val.split() if s.isdigit()]
        return nums[0] + 1 if nums else 11
    else:
        try:
            return int(val)
        except ValueError:
            return np.nan

### Basic data preparation (deleting unnecessary data, type conversions, treating missing values etc., no experimenting with variable encoding and escoling yet)

Convert property_floor and building_floor_count

In [92]:
df['property_floor'] = df['property_floor'].apply(floor_parser)
df['building_floor_count'] = df['building_floor_count'].apply(building_floor_count_parser)

Convert date time

In [93]:
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df['month'] = df['created_at'].dt.month
df['year'] = df['created_at'].dt.year
df['day'] = df['created_at'].dt.day
df = df.drop(columns=['created_at'])

In [94]:
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
# print("Numerical columns:", num_cols)
# print("Categorical columns:", cat_cols)
# print(df[num_cols].isna().sum())
# print(df[cat_cols].isna().sum())

In [95]:
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
df[cat_cols] = df[cat_cols].fillna("missing")
# print(df[num_cols].isna().sum())
# print(df[cat_cols].isna().sum())

Remove outliers

In [96]:
df = remove_outliers_iqr(df, 'property_area', factor=3)
df = remove_outliers_iqr(df, 'balcony_area', factor=3)
df = remove_outliers_iqr(df, 'price_per_m2', factor=3)

Drop unnecessary columns

In [97]:
df.drop(columns=['ad_view_cnt', 'active_days', 'nr', 'county','property_type','city'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 178459 entries, 0 to 183564
Data columns (total 19 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   orientation              178459 non-null  object 
 1   view_type                178459 non-null  object 
 2   heating_type             178459 non-null  object 
 3   property_condition_type  178459 non-null  object 
 4   balcony_area             178459 non-null  float64
 5   property_subtype         178459 non-null  object 
 6   elevator_type            178459 non-null  object 
 7   postcode                 178459 non-null  float64
 8   building_floor_count     178459 non-null  float64
 9   garden_access            178459 non-null  object 
 10  district                 178459 non-null  float64
 11  property_area            178459 non-null  float64
 12  property_floor           178459 non-null  float64
 13  small_room_cnt           178459 non-null  float64
 14  room_cnt 

### Variable encoding and scaling configs

In [98]:
df_orientation = df['orientation'].copy()
df_heating_type = df['heating_type'].copy()
df_property_condition_type = df['property_condition_type'].copy()
df_property_floor = df['property_floor'].copy()

In [99]:
df_other = df.drop(columns=['orientation', 'heating_type', 'property_condition_type', 'property_floor'])

Encoding data that do not have to be examined

In [100]:
print("\nView type unique values:")
print(df_other['view_type'].unique())
print("\nProperty subtype unique values:")
print(df_other['property_subtype'].unique())
print("\nElevator type unique values:")
print(df_other['elevator_type'].unique())
print("\nGarden access unique values:")
print(df_other['garden_access'].unique())


View type unique values:
['garden view' 'missing' 'street view' 'courtyard view' 'panoramic']

Property subtype unique values:
['prefabricated panel flat (for sale)' 'brick flat (for sale)' 'missing'
 'prefabricated panel flat (for rent)' 'terraced house']

Elevator type unique values:
['yes' 'missing' 'none']

Garden access unique values:
['missing' 'none' 'yes']


Functions for encoding in df_others

In [101]:
def encode_view_type(df):
    df = df.fillna('missing')
    return pd.get_dummies(df, prefix='view')

def encode_property_subtype(df):
    df = df.fillna('missing')
    return pd.get_dummies(df, prefix='subtype')

def encode_elevator_type(df):
    mapping = {
        'yes': 1,
        'none': 0,
        'missing': 0
    }
    return df.map(mapping).fillna(0).astype(int)

def encode_garden_access(df):
    mapping = {
        'yes': 1,
        'none': 0,
        'missing': 0
    }
    return df.map(mapping).fillna(0).astype(int)


In [102]:
def preprocess_fixed_features(df):
    df_processed = df.copy()
    
    # 1. Encode view_type (one-hot)
    df_view = encode_view_type(df_processed['view_type'])
    
    # 2. Encode property_subtype (one-hot)
    df_subtype = encode_property_subtype(df_processed['property_subtype'])
    
    # 3. Encode elevator_type (binary)
    df_processed['elevator_type'] = encode_elevator_type(df_processed['elevator_type'])
    
    # 4. Encode garden_access (binary)
    df_processed['garden_access'] = encode_garden_access(df_processed['garden_access'])
    
    # Drop original categorical columns now replaced
    df_processed = df_processed.drop(columns=['view_type', 'property_subtype'])
    
    # Join the new one-hot features
    df_processed = pd.concat([df_processed, df_view, df_subtype], axis=1)
    
    return df_processed

In [103]:
df_other = preprocess_fixed_features(df_other)
df_other.info()

<class 'pandas.core.frame.DataFrame'>
Index: 178459 entries, 0 to 183564
Data columns (total 23 columns):
 #   Column                                       Non-Null Count   Dtype  
---  ------                                       --------------   -----  
 0   balcony_area                                 178459 non-null  float64
 1   elevator_type                                178459 non-null  int64  
 2   postcode                                     178459 non-null  float64
 3   building_floor_count                         178459 non-null  float64
 4   garden_access                                178459 non-null  int64  
 5   district                                     178459 non-null  float64
 6   property_area                                178459 non-null  float64
 7   small_room_cnt                               178459 non-null  float64
 8   room_cnt                                     178459 non-null  float64
 9   price_per_m2                                 178459 non-null  fl

Getting unique values for the prompt 

In [104]:
print("Orientation unique values:")
print(df_orientation.unique())
print("Heating type unique values:")
print(df_heating_type.unique())
print("Property condition type unique values:")
print(df_property_condition_type.unique())
print("Property floor unique values:")
print(df_property_floor.unique())

Orientation unique values:
['east' 'missing' 'west' 'south-east' 'south-west' 'north-east'
 'north-west' 'south' 'north']
Heating type unique values:
['missing' 'gas furnace, circulating hot water' 'konvection gas burner'
 'district heating' 'central heating with own meter' 'tile stove (gas)'
 'central heating' 'electric' 'other' 'fan-coil' 'gas furnace']
Property condition type unique values:
['good' 'novel' 'medium' 'renewed' 'new_construction' 'to_be_renovated'
 'can_move_in' 'missing_info' 'under_construction']
Property floor unique values:
[ 3.   0.   4.  -1.   7.   2.   1.   0.5  8.   6.  10.   5.   9.  11. ]


Encoding methods

prompt: Generate encoding methods based on the following available unique values: ...

In [105]:
def encode_orientation_method1(df):
    # ordinal encoding based on sunlight exposure levels
    # south = best, north = worst
    mapping = {
        'missing': 0,
        'north': 1,
        'north-east': 2,
        'east': 3,
        'south-east': 4,
        'south': 5,
        'south-west': 4,
        'west': 3,
        'north-west': 2
    }
    return df.map(mapping).fillna(0).astype(int)

def encode_orientation_method2(df):
    # onehot encoding
    return pd.get_dummies(df.fillna('missing'), prefix='orientation')


In [106]:
def encode_heating_method1(df):
    # ordinal encoding based on heating efficiency and cost
    mapping = {
        'missing': 0,
        'tile stove (gas)': 1,
        'konvection gas burner': 2,
        'gas furnace': 3,
        'gas furnace, circulating hot water': 3,
        'central heating': 4,
        'central heating with own meter': 5,
        'district heating': 6,
        'fan-coil': 7,
        'electric': 8,
        'other': 0      
    }
    return df.map(mapping).fillna(0).astype(int)

def encode_heating_method2(df):
    # onehot encoding
    return pd.get_dummies(df.fillna('missing'), prefix='heating')


In [107]:
def encode_condition_method1(df):
    # ordinal encoding ‚Äî based on condition quality progression.
    mapping = {
        'missing_info': 0,
        'to_be_renovated': 1,
        'medium': 2,
        'good': 3,
        'can_move_in': 4,
        'renewed': 5,
        'novel': 6,
        'under_construction': 7,
        'new_construction': 8
    }
    return df.map(mapping).fillna(0).astype(int)

def encode_condition_method2(df):
    # onehot encoding
    return pd.get_dummies(df.fillna('missing_info'), prefix='condition')

In [108]:
def encode_floor_method1(df):
    # Direct numeric usage with missing values filled.
    return df.fillna(0).astype(float)

def encode_floor_method2(df):
    # Feature-engineered floor info.
    return pd.DataFrame({
        'is_basement': (df < 0).astype(int),
        'is_ground': (df == 0).astype(int),
        'is_high_floor': (df >= 8).astype(int),
        'normalized_floor': df.fillna(0) / df.max()  # relative position
    })


In [109]:
configs = list(itertools.product(
    ['method1', 'method2'],  # orientation
    ['method1', 'method2'],  # heating
    ['method1', 'method2'],  # condition
    ['method1', 'method2']   # floor
))

# print configurations
for conf in configs:
    print(conf)

print(f"{len(configs)} configuration will run.")

('method1', 'method1', 'method1', 'method1')
('method1', 'method1', 'method1', 'method2')
('method1', 'method1', 'method2', 'method1')
('method1', 'method1', 'method2', 'method2')
('method1', 'method2', 'method1', 'method1')
('method1', 'method2', 'method1', 'method2')
('method1', 'method2', 'method2', 'method1')
('method1', 'method2', 'method2', 'method2')
('method2', 'method1', 'method1', 'method1')
('method2', 'method1', 'method1', 'method2')
('method2', 'method1', 'method2', 'method1')
('method2', 'method1', 'method2', 'method2')
('method2', 'method2', 'method1', 'method1')
('method2', 'method2', 'method1', 'method2')
('method2', 'method2', 'method2', 'method1')
('method2', 'method2', 'method2', 'method2')
16 configuration will run.


In [110]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold

In [111]:
def evaluate_config(config):
    orientation_method, heating_method, cond_method, floor_method = config

    # C√©lv√°ltoz√≥
    y_original = df_other['price_per_m2']
    df_other.drop(columns='price_per_m2')
    y = y_original * 10e6
    #print(y_original.head())
    #y = np.log1p(y_original)  # log1p transzform√°ci√≥, ha szeretn√©nk stabiliz√°lni
    #print(y.head())

    # X el≈ë√°ll√≠t√°sa
    X_parts = [
        df_other.drop(columns='price_per_m2').copy().reset_index(drop=True),
        eval(f"encode_orientation_{orientation_method}(df_orientation)").reset_index(drop=True),
        eval(f"encode_heating_{heating_method}(df_heating_type)").reset_index(drop=True),
        eval(f"encode_condition_{cond_method}(df_property_condition_type)").reset_index(drop=True),
        eval(f"encode_floor_{floor_method}(df_property_floor)").reset_index(drop=True)
    ]
    X_encoded = pd.concat(X_parts, axis=1)
    X_encoded = X_encoded.fillna(0)  # NaN-ok kezel√©se
    
    # print(X_encoded.head())

    # 3-as KFold cross-validation
    kf = KFold(n_splits=3, shuffle=True, random_state=seed)
    mape_scores = {'LinearRegression': [], 'GBM': []}

    for train_idx, test_idx in kf.split(X_encoded):
        X_train, X_test = X_encoded.iloc[train_idx], X_encoded.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Linear Regression
        lr = LinearRegression()
        lr.fit(X_train, y_train)
        preds_lr = lr.predict(X_test)
        mape_scores['LinearRegression'].append(mean_absolute_percentage_error(y_test, preds_lr))

        # Gradient Boosting
        gbm = GradientBoostingRegressor(random_state=seed, n_estimators=100)
        gbm.fit(X_train, y_train)
        preds_gbm = gbm.predict(X_test)
        mape_scores['GBM'].append(mean_absolute_percentage_error(y_test, preds_gbm))

    return {
        'config': config,
        'lr_mape': np.mean(mape_scores['LinearRegression']),
        'gbm_mape': np.mean(mape_scores['GBM'])
    }


In [112]:
results = []
for cfg in configs:
    res = evaluate_config(cfg)
    results.append(res)
    print(f"‚úÖ {cfg} ‚Äî LR MAPE: {res['lr_mape']:.10f}, GBM MAPE: {res['gbm_mape']:.10f}")

results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values('gbm_mape')
display(results_df_sorted.head())

KeyboardInterrupt: 

A MAPE √©rt√©kekkkel probl√©m√°k ad√≥dtak.

In [120]:
# --- M√°sold be ide a df_raw defini√°l√°s√°t ---
# Felt√©telezz√ºk, hogy df_raw m√°r be van olvasva

df = df_raw.copy()

### Tuning egyesevel
df['price_per_m2'] = (df['price_created_at'] / df['property_area'])*1e6
df = df[df['price_per_m2'] > 100_000]
df['price_per_m2'] = df['price_per_m2'].replace([np.inf, -np.inf], np.nan)
df['price_per_m2'] = df['price_per_m2'].fillna(df['price_per_m2'].median())
df['small_room_cnt'] = df['small_room_cnt'].fillna(0)
df['property_floor'] = df['property_floor'].apply(floor_parser)
df['building_floor_count'] = df['building_floor_count'].apply(building_floor_count_parser)
df = df[df['property_area'] > 0].copy()
df.isna().sum().sort_values(ascending=False).head(20)

# egyszer≈± kit√∂lt√©s: sz√°mokn√°l medi√°n, kateg√≥ri√°kn√°l 'missing'
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna('missing')
    else:
        df[col] = df[col].fillna(df[col].median())

In [121]:
df.describe()

Unnamed: 0,ad_view_cnt,nr,balcony_area,price_created_at,postcode,active_days,building_floor_count,district,property_area,property_floor,small_room_cnt,room_cnt,price_per_m2
count,183367.0,183367.0,183367.0,183367.0,183367.0,183367.0,183367.0,183367.0,183367.0,183367.0,183367.0,183367.0,183367.0
mean,261.74496,196688.825547,7.434806,20.567581,1102.526354,44.079387,4.645176,9.744283,48.475489,2.605559,0.544994,1.476531,434849.3
std,555.53599,113174.613273,2312.534627,171.582229,40.273472,47.969758,2.160359,4.681962,12.720939,2.519168,0.73909,0.971875,3932486.0
min,0.0,7.0,0.0,1.0,1011.0,1.0,1.0,1.0,5.0,-1.0,-1.0,0.0,102857.1
25%,41.0,98177.5,0.0,13.2,1084.0,11.0,4.0,6.0,40.0,1.0,0.0,1.0,288000.0
50%,102.0,196812.0,0.0,16.9,1101.0,28.0,4.0,10.0,50.0,2.0,0.0,1.0,376000.0
75%,264.0,294857.0,3.0,23.9,1118.0,60.0,4.0,13.0,60.0,4.0,1.0,2.0,491666.7
max,40248.0,394178.0,990257.0,41796.0,1239.0,544.0,11.0,20.0,70.0,11.0,56.0,215.0,795411400.0


In [122]:
# 2Ô∏è‚É£ C√©lv√°ltoz√≥ lev√°laszt√°sa
y = df['price_per_m2'].astype(float)
X = df.drop(columns=['price_per_m2'])

# 3Ô∏è‚É£ Kategorikus v√°ltoz√≥k enk√≥dol√°sa
X_encoded = pd.get_dummies(X, drop_first=True)

print(X_encoded.shape)

print(f"üî¢ Bemeneti m√°trix m√©rete: {X_encoded.shape}")
print(f"üéØ y √©rt√©ktartom√°ny: min={y.min()}, max={y.max()}, mean={y.mean()}")

# 4Ô∏è‚É£ Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# 5Ô∏è‚É£ Sk√°l√°z√°s (csak line√°ris modellhez √©rdemes)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6Ô∏è‚É£ Modellek tan√≠t√°sa
lr = LinearRegression()
gbm = GradientBoostingRegressor(random_state=42, n_estimators=5)

lr.fit(X_train_scaled, y_train)
gbm.fit(X_train, y_train)

# 7Ô∏è‚É£ El≈ërejelz√©s √©s MAPE
preds_lr = lr.predict(X_test_scaled)
preds_gbm = gbm.predict(X_test)

mape_lr = mean_absolute_percentage_error(y_test, preds_lr)
mape_gbm = mean_absolute_percentage_error(y_test, preds_gbm)

print(f"‚úÖ LinearRegression MAPE: {mape_lr:.4f}")
print(f"‚úÖ GradientBoostingRegressor MAPE: {mape_gbm:.4f}")

(183367, 641)
üî¢ Bemeneti m√°trix m√©rete: (183367, 641)
üéØ y √©rt√©ktartom√°ny: min=102857.14285714286, max=795411428.5714285, mean=434849.27831673704
‚úÖ LinearRegression MAPE: 0.1416
‚úÖ GradientBoostingRegressor MAPE: 0.3633


In [123]:
print("y_test stats:")
print(y_test.describe())

print("\nPreds (GBM) stats:")
print(pd.Series(preds_gbm).describe())

print("\nFirst 10 pairs:")
for yt, yp in zip(y_test[:10], preds_gbm[:10]):
    print(f"actual={yt:.2f}, pred={yp:.2f}, rel_error={(abs(yt-yp)/yt if yt != 0 else np.nan):.3f}")


y_test stats:
count    3.667400e+04
mean     4.367826e+05
std      3.778160e+06
min      1.028571e+05
25%      2.890909e+05
50%      3.771429e+05
75%      4.923077e+05
max      6.850000e+08
Name: price_per_m2, dtype: float64

Preds (GBM) stats:
count    3.667400e+04
mean     4.341085e+05
std      1.495182e+06
min      4.238942e+05
25%      4.238942e+05
50%      4.238942e+05
75%      4.238942e+05
max      2.646219e+08
dtype: float64

First 10 pairs:
actual=316666.67, pred=423894.22, rel_error=0.339
actual=378000.00, pred=423894.22, rel_error=0.121
actual=270909.09, pred=423894.22, rel_error=0.565
actual=331111.11, pred=423894.22, rel_error=0.280
actual=440000.00, pred=423894.22, rel_error=0.037
actual=522222.22, pred=423894.22, rel_error=0.188
actual=316666.67, pred=423894.22, rel_error=0.339
actual=500000.00, pred=423894.22, rel_error=0.152
actual=411428.57, pred=423894.22, rel_error=0.030
actual=200000.00, pred=423894.22, rel_error=1.119
