# Real esate advertisement analysis 

In [66]:
import hashlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

### Random seed based on NEPTUN code

In [32]:
neptun = "JPWF8N"
seed = int(hashlib.sha256(neptun.encode("utf-8")).hexdigest(), 16) % 10**8
print(f"Random seed based on NEPTUN code: {seed}")

Random seed based on NEPTUN code: 75628879


## Data prep

In [33]:
df = pd.read_csv("../data/ingatlan.csv")

In [34]:
df.describe()

Unnamed: 0,postcode,room_cnt,small_room_cnt,property_area,balcony_area,price_created_at,ad_view_cnt,active_days,nr,district
count,115475.0,183565.0,182981.0,183565.0,175966.0,183565.0,183564.0,183565.0,183565.0,176009.0
mean,1103.395895,1.476393,0.547177,48.443984,7.748792,20.564001,262.264082,44.071593,196659.747942,9.730434
std,50.789818,0.972892,0.74881,12.776044,2360.665258,171.496565,556.838684,47.969011,113179.241269,4.782807
min,1011.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,7.0,1.0
25%,1063.0,1.0,0.0,40.0,0.0,13.2,41.0,11.0,98117.0,6.0
50%,1101.0,1.0,0.0,50.0,0.0,16.9,102.0,28.0,196774.0,10.0
75%,1141.0,2.0,1.0,60.0,3.0,23.9,265.0,60.0,294824.0,13.0
max,1239.0,215.0,56.0,70.0,990257.0,41796.0,40248.0,544.0,394178.0,20.0


In [35]:
df.isna().sum() # Show number of missing values

county                          0
city                         1304
postcode                    68090
property_type                   0
property_subtype             3640
property_condition_type         0
property_floor               9066
building_floor_count        97789
view_type                   83103
orientation                 72142
garden_access              142886
heating_type                26686
elevator_type               33502
room_cnt                        0
small_room_cnt                584
created_at                      0
property_area                   0
balcony_area                 7599
price_created_at                0
ad_view_cnt                     1
active_days                     0
nr                              0
district                     7556
dtype: int64

In [36]:
df = df.drop(columns=["ad_view_cnt", "active_days", "nr"])

### Target variable

In [37]:
df["price_per_m2"] = df["price_created_at"] / df["property_area"]
df["price_per_m2"].describe()

  sqr = _ensure_numeric((avg - values) ** 2)


count    1.835650e+05
mean              inf
std               NaN
min      0.000000e+00
25%      2.880000e-01
50%      3.760000e-01
75%      4.923077e-01
max               inf
Name: price_per_m2, dtype: float64

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183565 entries, 0 to 183564
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   county                   183565 non-null  object 
 1   city                     182261 non-null  object 
 2   postcode                 115475 non-null  float64
 3   property_type            183565 non-null  object 
 4   property_subtype         179925 non-null  object 
 5   property_condition_type  183565 non-null  object 
 6   property_floor           174499 non-null  object 
 7   building_floor_count     85776 non-null   object 
 8   view_type                100462 non-null  object 
 9   orientation              111423 non-null  object 
 10  garden_access            40679 non-null   object 
 11  heating_type             156879 non-null  object 
 12  elevator_type            150063 non-null  object 
 13  room_cnt                 183565 non-null  float64
 14  smal

### Data modification

In [39]:
def remove_outliers_iqr(df, column, factor=1.5):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    return df[(df[column] >= lower) & (df[column] <= upper)]

In [40]:
def floor_parser(val):
    if pd.isna(val):
        return np.nan

    val = str(val).strip().lower()

    if "basement" in val:
        return -1
    elif "ground" in val:
        return 0
    elif "mezzanine" in val:
        return 0.5
    elif "plus" in val:
        nums = [int(s) for s in val.split() if s.isdigit()]
        return nums[0] + 1 if nums else 11
    else:
        try:
            return int(val)
        except ValueError:
            return np.nan

In [41]:
def building_floor_count_parser(value):
    # distinct values are: 1-10, more than 10, nan
    if pd.isna(value):
        return np.nan

    value = str(value).strip().lower()

    if "more" in value and "10" in value:
        return 11
    else:
        try:
            return int(value)
        except ValueError:
            return np.nan

In [42]:
# df['building_floor_count'].unique()
# values = ['10', np.nan, '4', '3', '2', '5', '7', '9', '6', '1', 'more than 10', '8']
# print([parse_building_floor_count(v) for v in values])


In [43]:
#df['property_floor'].unique()

# floors = ['3', 'ground floor', '4', 'basement', '7', '2', '1', np.nan, 'mezzanine floor', '8', '6', '10', '5', '9', '10 plus']
#print([floor_parser(x) for x in floors])


In [44]:
df['property_floor'] = df['property_floor'].apply(floor_parser)
df['building_floor_count'] = df['building_floor_count'].apply(building_floor_count_parser)


In [45]:
df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,postcode,property_floor,building_floor_count,room_cnt,small_room_cnt,property_area,balcony_area,price_created_at,district,price_per_m2
count,115475.0,174499.0,85776.0,183565.0,182981.0,183565.0,175966.0,183565.0,176009.0,183565.0
mean,1103.395895,2.636161,5.378544,1.476393,0.547177,48.443984,7.748792,20.564001,9.730434,inf
std,50.789818,2.579034,2.995146,0.972892,0.74881,12.776044,2360.665258,171.496565,4.782807,
min,1011.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0
25%,1063.0,1.0,3.0,1.0,0.0,40.0,0.0,13.2,6.0,0.288
50%,1101.0,2.0,4.0,1.0,0.0,50.0,0.0,16.9,10.0,0.376
75%,1141.0,4.0,9.0,2.0,1.0,60.0,3.0,23.9,13.0,0.4923077
max,1239.0,11.0,11.0,215.0,56.0,70.0,990257.0,41796.0,20.0,inf


In [46]:
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df['month'] = df['created_at'].dt.month
df['year'] = df['created_at'].dt.year
df['day'] = df['created_at'].dt.day
df = df.drop(columns=['created_at'])

In [47]:
df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,postcode,property_floor,building_floor_count,room_cnt,small_room_cnt,property_area,balcony_area,price_created_at,district,price_per_m2,month,year,day
count,115475.0,174499.0,85776.0,183565.0,182981.0,183565.0,175966.0,183565.0,176009.0,183565.0,183565.0,183565.0,183565.0
mean,1103.395895,2.636161,5.378544,1.476393,0.547177,48.443984,7.748792,20.564001,9.730434,inf,5.875515,2015.385662,15.328532
std,50.789818,2.579034,2.995146,0.972892,0.74881,12.776044,2360.665258,171.496565,4.782807,,3.119566,0.486753,8.708227
min,1011.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,1.0,2015.0,1.0
25%,1063.0,1.0,3.0,1.0,0.0,40.0,0.0,13.2,6.0,0.288,3.0,2015.0,8.0
50%,1101.0,2.0,4.0,1.0,0.0,50.0,0.0,16.9,10.0,0.376,6.0,2015.0,15.0
75%,1141.0,4.0,9.0,2.0,1.0,60.0,3.0,23.9,13.0,0.4923077,8.0,2016.0,23.0
max,1239.0,11.0,11.0,215.0,56.0,70.0,990257.0,41796.0,20.0,inf,12.0,2016.0,31.0


In [48]:
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)
print(df[num_cols].isna().sum())
print(df[cat_cols].isna().sum())

Numerical columns: ['postcode', 'property_floor', 'building_floor_count', 'room_cnt', 'small_room_cnt', 'property_area', 'balcony_area', 'price_created_at', 'district', 'price_per_m2', 'month', 'year', 'day']
Categorical columns: ['county', 'city', 'property_type', 'property_subtype', 'property_condition_type', 'view_type', 'orientation', 'garden_access', 'heating_type', 'elevator_type']
postcode                68090
property_floor           9066
building_floor_count    97789
room_cnt                    0
small_room_cnt            584
property_area               0
balcony_area             7599
price_created_at            0
district                 7556
price_per_m2                0
month                       0
year                        0
day                         0
dtype: int64
county                          0
city                         1304
property_type                   0
property_subtype             3640
property_condition_type         0
view_type                   83103
or

In [49]:
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
df[cat_cols] = df[cat_cols].fillna("missing")
print(df.isna().sum())
df.describe()

county                     0
city                       0
postcode                   0
property_type              0
property_subtype           0
property_condition_type    0
property_floor             0
building_floor_count       0
view_type                  0
orientation                0
garden_access              0
heating_type               0
elevator_type              0
room_cnt                   0
small_room_cnt             0
property_area              0
balcony_area               0
price_created_at           0
district                   0
price_per_m2               0
month                      0
year                       0
day                        0
dtype: int64


  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,postcode,property_floor,building_floor_count,room_cnt,small_room_cnt,property_area,balcony_area,price_created_at,district,price_per_m2,month,year,day
count,183565.0,183565.0,183565.0,183565.0,183565.0,183565.0,183565.0,183565.0,183565.0,183565.0,183565.0,183565.0,183565.0
mean,1102.507183,2.604742,4.644164,1.476393,0.545436,48.443984,7.428017,20.564001,9.74153,inf,5.875515,2015.385662,15.328532
std,40.299919,2.518315,2.159849,0.972892,0.748253,12.776044,2311.287103,171.496565,4.683642,,3.119566,0.486753,8.708227
min,1011.0,-1.0,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,1.0,2015.0,1.0
25%,1084.0,1.0,4.0,1.0,0.0,40.0,0.0,13.2,6.0,0.288,3.0,2015.0,8.0
50%,1101.0,2.0,4.0,1.0,0.0,50.0,0.0,16.9,10.0,0.376,6.0,2015.0,15.0
75%,1118.0,4.0,4.0,2.0,1.0,60.0,3.0,23.9,13.0,0.4923077,8.0,2016.0,23.0
max,1239.0,11.0,11.0,215.0,56.0,70.0,990257.0,41796.0,20.0,inf,12.0,2016.0,31.0


In [50]:
# export dataframe for verification
# df.to_csv("../data/ingatlan_cleaned.csv", index=False)

In [51]:
# print outliers in property_area and balcony_area

print(f"number of property area: {len(df['property_area'])}")

property_area_outliers = df[~df['property_area'].between(df['property_area'].quantile(0.01), df['property_area'].quantile(0.99))]
balcony_area_outliers = df[~df['balcony_area'].between(df['balcony_area'].quantile(0.01), df['balcony_area'].quantile(0.99))]
price_per_m2_outliers = df[~df['price_per_m2'].between(df['price_per_m2'].quantile(0.01), df['price_per_m2'].quantile(0.99))]

print(f"number of property area outliers: {len(property_area_outliers)}")
print(f"number of balcony area outliers: {len(balcony_area_outliers)}")
print(f"number of price per m2 outliers: {len(price_per_m2_outliers)}")


number of property area: 183565
number of property area outliers: 708
number of balcony area outliers: 1597
number of price per m2 outliers: 3669


### Removing outliers

In [52]:
df = remove_outliers_iqr(df, 'property_area', factor=3)
df = remove_outliers_iqr(df, 'balcony_area', factor=3)
df = remove_outliers_iqr(df, 'price_per_m2', factor=3)

print(f"number of property area after outlier removal: {len(df)}")


number of property area after outlier removal: 178459


### Drop columns deemed unnecessary

In [53]:
# show unique values for each categorical column
for col in cat_cols:
    print(f"Unique values in column '{col}': {df[col].unique()}")


Unique values in column 'county': ['Budapest']
Unique values in column 'city': ['Budapest IV.' 'Budapest XIV.' 'Budapest II.' 'Budapest XII.'
 'Budapest XVII.' 'Budapest XIII.' 'Budapest X.' 'Budapest VI.'
 'Budapest VII.' 'Budapest VIII.' 'Budapest III.' 'Budapest XX.'
 'Budapest XVIII.' 'Budapest XV.' 'missing' 'Budapest V.' 'Budapest IX.'
 'Budapest XI.' 'Budapest XIX.' 'Budapest I.' 'Budapest XVI.'
 'Budapest XXI.' 'Budapest XXII.' 'Budapest XXIII.']
Unique values in column 'property_type': ['flat']
Unique values in column 'property_subtype': ['prefabricated panel flat (for sale)' 'brick flat (for sale)' 'missing'
 'prefabricated panel flat (for rent)' 'terraced house']
Unique values in column 'property_condition_type': ['good' 'novel' 'medium' 'renewed' 'new_construction' 'to_be_renovated'
 'can_move_in' 'missing_info' 'under_construction']
Unique values in column 'view_type': ['garden view' 'missing' 'street view' 'courtyard view' 'panoramic']
Unique values in column 'orientation

In [54]:
df.drop(columns=['county','property_type','city'], inplace=True)

In [55]:
df.describe()

Unnamed: 0,postcode,property_floor,building_floor_count,room_cnt,small_room_cnt,property_area,balcony_area,price_created_at,district,price_per_m2,month,year,day
count,178459.0,178459.0,178459.0,178459.0,178459.0,178459.0,178459.0,178459.0,178459.0,178459.0,178459.0,178459.0,178459.0
mean,1102.499302,2.609501,4.653125,1.470517,0.542063,48.352591,1.457113,18.995415,9.741033,0.40018,5.882163,2015.384828,15.324607
std,40.298179,2.522448,2.170813,0.895132,0.725174,12.703993,2.503852,8.392615,4.680053,0.151379,3.117738,0.486556,8.710277
min,1011.0,-1.0,1.0,0.0,-1.0,5.0,0.0,0.0,1.0,0.0,1.0,2015.0,1.0
25%,1083.0,1.0,4.0,1.0,0.0,40.0,0.0,13.0,6.0,0.285714,3.0,2015.0,8.0
50%,1101.0,2.0,4.0,1.0,0.0,50.0,0.0,16.8,10.0,0.371429,6.0,2015.0,15.0
75%,1118.0,4.0,4.0,2.0,1.0,60.0,3.0,23.4,13.0,0.483333,8.0,2016.0,23.0
max,1239.0,11.0,11.0,215.0,47.0,70.0,12.0,74.4,20.0,1.085714,12.0,2016.0,31.0


In [56]:
# save df in csv format
df.to_csv("../data/ingatlan_cleaned.csv", index=False)

### Get target

In [57]:
y = df['price_per_m2'].copy()
df = df.drop(columns=['price_per_m2'])

### Encoding

In [58]:
y_log = np.log1p(y)

In [59]:
def apply_target_encoding(df, target, cols):
    df_encoded = df.copy()
    for col in cols:
        means = df_encoded.groupby(col)[target].mean()
        df_encoded[col + '_te'] = df_encoded[col].map(means)
        df_encoded = df_encoded.drop(columns=[col])
    return df_encoded

#### Target encoding

In [62]:
target_cols = ['postcode', 'district']

In [63]:
df_encoded = df.copy()
df_encoded['price_per_m2'] = y  # ideiglenesen visszatessz√ºk a targetet a target encodinghoz
df_encoded = apply_target_encoding(df_encoded, 'price_per_m2', target_cols)
df_encoded = df_encoded.drop(columns=['price_per_m2'])


#### Ordinal encoding

In [72]:
ordinal_cols = [['to_be_renovated',
    'missing_info',
    'medium',
    'renewed',
    'good',
    'can_move_in',
    'under_construction',
    'novel',
    'new_construction']]

In [73]:
ord_enc = OrdinalEncoder(categories=ordinal_cols)
df_encoded['property_condition_type_encoded'] = ord_enc.fit_transform(
    df_encoded[['property_condition_type']]
)
df_encoded = df_encoded.drop(columns=['property_condition_type'])

#### One-Hot encoding  

In [74]:
# save encoded dataframe
df_encoded.to_csv("../data/ingatlan_encoded.csv", index=False)