# EDA

## 1. Load cleaned data

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import re

# Local imports
from _csv_utils import data_timeplace, DataPathCleaningManager


In [None]:

# Build the path manager from the imported `data_timeplace` value.
data_path_manager = DataPathCleaningManager(data_timeplace)

# Load the already-cleaned frame for each listing domain.
df_olx = data_path_manager.load_df(domain="olx", is_cleaned=True)
df_otodom = data_path_manager.load_df(domain="otodom", is_cleaned=True)

### 1.1 OLX

In [2]:
# Preview only the first rows instead of rendering the whole frame.
df_olx.head()

Unnamed: 0,link,title,price,summary_description,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent,voivodeship,city,street
0,https://www.olx.pl/d/oferta/kawalerka-tychy-os...,Kawalerka Tychy oś M,1500.0,OpisWynajmę kawalerkę na ul Moniuszki 1/16 .na...,private,3,False,apartment_building,26,1,400.0,Śląskie,Zawada,Moniuszki 1/16


In [3]:
df_olx.columns

Index(['link', 'title', 'price', 'summary_description', 'ownership',
       'floor_level', 'is_furnished', 'building_type', 'square_meters',
       'number_of_rooms', 'rent', 'voivodeship', 'city', 'street'],
      dtype='object')

### 1.2 otodom

In [4]:
# Fixed seed so the preview is identical after Restart Kernel -> Run All.
df_otodom.sample(5, random_state=42)

Unnamed: 0_level_0,listing,listing,listing,listing,pricing,pricing,pricing,pricing,location,location,...,building_material,building_material,building_material,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information
Unnamed: 0_level_1,link,title,summary_description,remote_service,price,rent,total_rent,deposit,complete_address,street,...,silicate,large_panel,reinforced_concrete,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
1062,https://www.otodom.pl/pl/oferta/mieszkanie-53-...,"Mieszkanie, 53 m², Myszków",Mieszkanie apartament do wynajęcia 2 pokoje.WG...,,2200.0,,2200.0,,"Myszków, myszkowski, śląskie",Myszków,...,False,False,False,True,False,False,False,False,False,False
679,https://www.otodom.pl/pl/oferta/blisko-dtswspa...,Blisko Dtś*Wspaniały widok*Umeblowane,Tylko Najem okazjonalny!❒ 3 Pokoje | 63 m2❒ Id...,,1500.0,,1500.0,,"ul. Beskidzka, Świętochłowice, śląskie",ul. Beskidzka,...,False,False,False,False,False,False,False,True,False,False
996,https://www.otodom.pl/pl/oferta/piekny-apartam...,Piękny apartament Bytom Centrum,Witam! Przedstawiam Państwu wspaniałą i w pe...,,1100.0,600.0,1700.0,4000.0,"ul. Rzeźnicza, Śródmieście, Bytom, śląskie",ul. Rzeźnicza,...,False,False,False,True,False,False,False,False,False,False
718,https://www.otodom.pl/pl/oferta/bezczynszowe-m...,Bezczynszowe mieszkanie w centrum Radzionkowa !,Do wynajęcia bezczynszowe mieszkanie dwupokojo...,video,1500.0,,1500.0,3000.0,"ul. Jana Kużaja, Radzionków, tarnogórski, śląskie",ul. Jana Kużaja,...,False,False,False,True,False,False,False,False,False,False
462,https://www.otodom.pl/pl/oferta/ul-kolejowa-ce...,"ul. Kolejowa, Centrum, 35 m2, 2 niezależne pokoje","Zapraszam do WYNAJĘCIA mieszkania, w ścisłym c...",,1400.0,450.0,1850.0,2000.0,"ul. Kolejowa, Śródmieście, Bytom, śląskie",ul. Kolejowa,...,False,False,False,False,False,False,True,False,False,False


In [5]:
df_otodom.columns

MultiIndex([(               'listing',                           'link'),
            (               'listing',                          'title'),
            (               'listing',            'summary_description'),
            (               'listing',                 'remote_service'),
            (               'pricing',                          'price'),
            (               'pricing',                           'rent'),
            (               'pricing',                     'total_rent'),
            (               'pricing',                        'deposit'),
            (              'location',               'complete_address'),
            (              'location',                         'street'),
            (              'location',                           'city'),
            (              'location',                    'voivodeship'),
            (                  'size',                  'square_meters'),
            (                  'size',

### 1.3 Combined

In [6]:
# Maps each flat OLX column name to its (group, field) column in the otodom frame.
_OLX_TO_OTODOM_COLUMNS = {
    'link': ('listing', 'link'),
    'title': ('listing', 'title'),
    'price': ('pricing', 'price'),
    'rent': ('pricing', 'rent'),
    'summary_description': ('listing', 'summary_description'),
    'ownership': ('legal_and_availability', 'ownership'),
    'floor_level': ('size', 'floor'),
    'is_furnished': ('equipment', 'furniture'),
    'building_type': ('type_and_year', 'building_type'),
    'square_meters': ('size', 'square_meters'),
    'number_of_rooms': ('size', 'number_of_rooms'),
    'voivodeship': ('location', 'voivodeship'),
    'city': ('location', 'city'),
    'street': ('location', 'street'),
}

# Columns whose missing OLX values are replaced with False.
_FILL_FALSE_COLUMNS = [
    ('size', 'attic'),
    ('amenities', 'elevator'),
    ('amenities', 'parking_space'),
    ('equipment', 'no_information'),
    ('equipment', 'stove'),
    ('equipment', 'fridge'),
    ('equipment', 'oven'),
    ('equipment', 'washing_machine'),
    ('equipment', 'TV'),
    ('equipment', 'dishwasher'),
    ('media_types', 'internet'),
    ('media_types', 'telephone'),
    ('media_types', 'cable_TV'),
    ('heating', 'electric'),
    ('heating', 'gas'),
    ('heating', 'other'),
    ('heating', 'boiler_room'),
    ('heating', 'district'),
    ('heating', 'tile_stove'),
    ('security', 'intercom_or_video_intercom'),
    ('security', 'anti_burglary_doors_or_windows'),
    ('security', 'monitoring_or_security'),
    ('security', 'anti_burglary_roller_blinds'),
    ('security', 'alarm_system'),
    ('security', 'enclosed_area'),
    ('windows', 'aluminum'),
    ('windows', 'wooden'),
    ('windows', 'plastic'),
    ('building_material', 'concrete'),
    ('building_material', 'aerated_concrete'),
    ('building_material', 'brick'),
    ('building_material', 'wood'),
    ('building_material', 'other'),
    ('building_material', 'lightweight_aggregate'),
    ('building_material', 'hollow_brick'),
    ('building_material', 'silicate'),
    ('building_material', 'large_panel'),
    ('building_material', 'reinforced_concrete'),
    ('additional_information', 'duplex'),
    ('additional_information', 'air_conditioning'),
    ('additional_information', 'separate_kitchen'),
    ('additional_information', 'basement'),
    ('additional_information', 'utility_room'),
    ('additional_information', 'non_smokers_only'),
]

# "no_information" columns whose missing OLX values are replaced with True.
_FILL_TRUE_COLUMNS = [
    ('media_types', 'no_information'),
    ('heating', 'no_information'),
    ('security', 'no_information'),
    ('windows', 'no_information'),
    ('building_material', 'no_information'),
    ('additional_information', 'no_information'),
]


def combine_olx_otodom(df_olx: pd.DataFrame = df_olx, df_otodom: pd.DataFrame = df_otodom) -> pd.DataFrame:
    """Stack the OLX listings under the otodom listings in the otodom column layout.

    The OLX frame is converted to the two-level otodom columns, extended with
    the columns it lacks, cast to the otodom dtypes and appended to the otodom
    rows. A ``('pricing', 'deposit_ratio')`` column (deposit / total_rent,
    rounded to 2 decimals) is then added right after ``('pricing', 'deposit')``.

    Neither input frame is modified, so the cell can be re-run safely.

    Args:
        df_olx: OLX listings with flat column names (see ``_OLX_TO_OTODOM_COLUMNS``).
        df_otodom: otodom listings with two-level ``(group, field)`` columns.

    Returns:
        A new frame with the otodom rows first, the OLX rows after them and a
        fresh 0..n-1 index.

    Raises:
        KeyError: if a column this function relies on (e.g. a pricing or
            location column, or one of the fill lists' columns) is absent from
            ``df_otodom``.
    """
    furniture_col = ('equipment', 'furniture')
    deposit_col = ('pricing', 'deposit')
    total_rent_col = ('pricing', 'total_rent')
    ratio_col = ('pricing', 'deposit_ratio')

    # Work on a copy: relabelling the columns of the caller's frame would make
    # a second call see tuple labels and produce an all-NaN OLX block.
    olx = df_olx.copy()
    olx.columns = pd.MultiIndex.from_tuples(
        [_OLX_TO_OTODOM_COLUMNS.get(col, (col, '')) for col in olx.columns]
    )

    # reindex both adds the otodom-only columns (as NaN) and fixes the order;
    # OLX columns that have no otodom counterpart are dropped here.
    furniture_missing = furniture_col not in olx.columns
    olx = olx.reindex(columns=df_otodom.columns)
    if furniture_missing and furniture_col in olx.columns:
        olx[furniture_col] = False

    # total_rent = price + rent, treating a missing rent (or price) as 0.
    olx[total_rent_col] = olx[('pricing', 'price')].add(olx[('pricing', 'rent')], fill_value=0)

    # "street, city, voivodeship", skipping parts that are missing or empty.
    # pd.notna handles None, NaN and pd.NA alike; the list form also works for
    # a frame with zero rows.
    address_parts = zip(
        olx[('location', 'street')],
        olx[('location', 'city')],
        olx[('location', 'voivodeship')],
    )
    olx[('location', 'complete_address')] = [
        ', '.join(str(part) for part in parts if pd.notna(part) and part != '')
        for parts in address_parts
    ]

    for col in _FILL_FALSE_COLUMNS:
        olx[col] = olx[col].fillna(False)
    for col in _FILL_TRUE_COLUMNS:
        olx[col] = olx[col].fillna(True)

    # Align dtypes with otodom so the concatenation does not upcast columns.
    # NOTE(review): casting NaN to a NumPy bool yields True, so any bool column
    # of df_otodom that is not in the two fill lists would turn missing OLX
    # values into True - verify against the full otodom schema.
    for column in df_otodom.columns:
        olx[column] = olx[column].astype(df_otodom[column].dtype)

    combined_df = pd.concat([df_otodom, olx], ignore_index=True)

    # Deposit expressed in multiples of the total rent; NaN where total_rent is 0.
    total_rent = combined_df[total_rent_col]
    combined_df[ratio_col] = np.where(
        total_rent != 0,
        (combined_df[deposit_col] / total_rent).round(2),
        np.nan,
    )

    # Move the new column (appended last) directly behind the deposit column.
    columns = combined_df.columns.tolist()
    columns.remove(ratio_col)
    columns.insert(columns.index(deposit_col) + 1, ratio_col)

    return combined_df[columns]

# Pass both frames explicitly rather than relying on the function's defaults.
combined_df = combine_olx_otodom(df_olx, df_otodom)

# The last rows show where the otodom block ends and the OLX rows begin.
combined_df.tail()

Unnamed: 0_level_0,listing,listing,listing,listing,pricing,pricing,pricing,pricing,pricing,location,...,building_material,building_material,building_material,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information
Unnamed: 0_level_1,link,title,summary_description,remote_service,price,rent,total_rent,deposit,deposit_ratio,complete_address,...,silicate,large_panel,reinforced_concrete,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
1101,https://www.otodom.pl/pl/oferta/katowice-ligot...,Katowice Ligota Koszalińska pokój blisko Śum,Wynajem pokoju w bardzo dobrej lokalizacji. Ni...,,800.0,,800.0,1200.0,1.5,"Ligota-Panewniki, Katowice, śląskie",...,False,False,False,False,False,False,True,False,False,False
1102,https://www.otodom.pl/pl/oferta/katowice-centr...,Katowice centrum 2 pokoje dla studentów,OFERTA POLECANA! Oferujemy do wynajęcia przytu...,virtual_tour,1499.0,,1499.0,3000.0,2.0,"Śródmieście, Katowice, śląskie",...,False,False,False,False,False,False,True,True,False,False
1103,https://www.otodom.pl/pl/oferta/kawalerka-bryn...,Kawalerka Brynów,Kawalerka Katowice Brynów bardzo spokojne miej...,,1300.0,,1300.0,2600.0,2.0,"Załęska Hałda-Brynów, Katowice, śląskie",...,False,False,False,False,False,False,True,True,False,False
1104,https://www.otodom.pl/pl/oferta/wynajme-mieszk...,Wynajmę Mieszkanie / Pogoń / Sosnowiec,"Wynajmę mieszkanie 56 m2 , dwa pokoje , kuchni...",,700.0,,700.0,2000.0,2.86,"ul. Mazowiecka, Pogoń, Sosnowiec, śląskie",...,False,False,False,False,False,False,False,True,False,False
1105,https://www.olx.pl/d/oferta/kawalerka-tychy-os...,Kawalerka Tychy oś M,OpisWynajmę kawalerkę na ul Moniuszki 1/16 .na...,,1500.0,400.0,1900.0,,,"Moniuszki 1/16, Zawada, Śląskie",...,False,False,False,True,False,False,False,False,False,False


In [7]:
pd.reset_option('display.max_rows')

In [8]:
combined_df.dtypes.to_dict()

{('listing', 'link'): string[python],
 ('listing', 'title'): string[python],
 ('listing', 'summary_description'): string[python],
 ('listing', 'remote_service'): string[python],
 ('pricing', 'price'): dtype('float64'),
 ('pricing', 'rent'): dtype('float64'),
 ('pricing', 'total_rent'): dtype('float64'),
 ('pricing', 'deposit'): dtype('float64'),
 ('pricing', 'deposit_ratio'): dtype('float64'),
 ('location', 'complete_address'): string[python],
 ('location', 'street'): string[python],
 ('location', 'city'): string[python],
 ('location', 'voivodeship'): string[python],
 ('size', 'square_meters'): Int64Dtype(),
 ('size', 'number_of_rooms'): Int64Dtype(),
 ('size', 'floor'): Int64Dtype(),
 ('size', 'attic'): dtype('bool'),
 ('size', 'building_floors'): Int64Dtype(),
 ('legal_and_availability', 'available_from'): string[python],
 ('legal_and_availability', 'completion'): string[python],
 ('legal_and_availability', 'ownership'): string[python],
 ('legal_and_availability', 'rent_to_students')

In [9]:
data_path_manager.save_df(combined_df, domain="combined")

Saving schema to ..\data\cleaned\2023_11_27_19_41_45_Mierzęcice__Będziński__Śląskie\combined.json
Saving CSV to ..\data\cleaned\2023_11_27_19_41_45_Mierzęcice__Będziński__Śląskie\combined.csv


In [10]:
# Round-trip check: reload what was just saved and eyeball a few rows.
# NOTE(review): this calls a private loader, while the other cells use
# `load_df(..., is_cleaned=True)` - confirm whether the public method accepts
# domain="combined" and switch to it if so.
saved_combined = data_path_manager._load_cleaned_df(domain="combined")
# Fixed seed so the preview is identical after Restart Kernel -> Run All.
saved_combined.sample(5, random_state=42)

Unnamed: 0_level_0,listing,listing,listing,listing,pricing,pricing,pricing,pricing,pricing,location,...,building_material,building_material,building_material,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information
Unnamed: 0_level_1,link,title,summary_description,remote_service,price,rent,total_rent,deposit,deposit_ratio,complete_address,...,silicate,large_panel,reinforced_concrete,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
1023,https://www.otodom.pl/pl/oferta/70m2-3-pokoje-...,"70m2, 3 pokoje, dwie łazienki, Nowy Brynów",Mamy przyjemność zaprezentować elegancki apart...,,4500.0,,4500.0,,,"Piotrowice-Ochojec, Katowice, śląskie",...,False,False,False,True,False,False,False,False,False,False
158,https://www.otodom.pl/pl/oferta/wynajem-mieszk...,Wynajem mieszkania,"Wynajmę komfortowe, w pełni wyposażone mieszka...",,2500.0,520.0,3020.0,3000.0,0.99,"ul. Tysiąclecia, Osiedle Tysiąclecia, Katowice...",...,False,True,False,True,False,False,False,False,False,False
646,https://www.otodom.pl/pl/oferta/mieszkanie-2-p...,Mieszkanie 2 pokojowe do wynajęcia,TYLKO W NASZYM BIURZE!!! Umowa najmu okazjona...,,1400.0,630.0,2030.0,2500.0,1.23,"ul. Wielka Skotnica, Mysłowice, śląskie",...,False,False,False,False,False,False,True,True,False,False
530,https://www.otodom.pl/pl/oferta/nowy-apartamen...,Nowy Apartament Z Tarasem Z Widokiem Na Katowice,APARTAMENT Z 2 TARASAMI Z WIDOKIEM NA PANORAME...,,6000.0,,6000.0,,,"Śródmieście, Katowice, śląskie",...,False,False,False,False,False,True,False,False,False,False
630,https://www.otodom.pl/pl/oferta/katowice-slone...,"Katowice, Słoneczna 73 - 3 pokoje z balkonem","Oferujemy do wynajęcia piękne, wyjątkowo jasne...",,2600.0,,2600.0,,,"ul. Słoneczna, Wełnowiec-Józefowiec, Katowice,...",...,False,True,False,False,False,False,True,False,False,False


In [11]:
# Drop the intermediate names; the cells below only use `combined_df`.
# Note: `combine_olx_otodom` still holds the two source frames through its
# default arguments, so this removes the names but may not free their memory.
del df_olx, df_otodom, saved_combined, data_path_manager, data_timeplace

In [12]:
combined_df[('listing', 'link')].duplicated().sum()

0

## 2. EDA

### 2.1 Numerical data

In [13]:
combined_df.describe()

Unnamed: 0_level_0,pricing,pricing,pricing,pricing,pricing,size,size,size,size,type_and_year
Unnamed: 0_level_1,price,rent,total_rent,deposit,deposit_ratio,square_meters,number_of_rooms,floor,building_floors,build_year
count,1106.0,593.0,1106.0,742.0,742.0,1106.0,1106.0,1083.0,979.0,476.0
mean,2180.465642,463.910624,2429.198915,3477.051213,1.457399,49.740506,2.119349,2.757156,5.48621,1993.170168
std,1429.766366,252.601285,1442.413882,3698.52419,0.84811,26.233006,0.858414,2.57227,3.828449,33.635859
min,500.0,1.0,500.0,2.0,0.0,8.0,1.0,-1.0,1.0,1893.0
25%,1500.0,300.0,1687.0,2000.0,1.0,36.0,2.0,1.0,3.0,1975.0
50%,1900.0,500.0,2200.0,3000.0,1.295,45.0,2.0,2.0,4.0,2011.5
75%,2500.0,650.0,2800.0,3800.0,1.76,56.0,3.0,4.0,7.0,2021.0
max,19000.0,990.0,19001.0,57000.0,12.0,413.0,6.0,11.0,25.0,2025.0


Note: in the `floor` column, the value 11 stands for the "10+" category rather than a literal eleventh floor.

### 2.2 Total rent

In [19]:
def advanced_statistical_analysis(df: pd.DataFrame = combined_df) -> dict:
    """Summarise the distribution of ``df['pricing']['total_rent']``.

    Missing values are ignored by every statistic. Outliers are the values
    outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.

    Args:
        df: frame with a two-level ``('pricing', 'total_rent')`` column.

    Returns:
        A dict with the mode, variance, range, IQR, skewness, kurtosis, a
        Series of selected quantiles, the outlier bounds, the outlier and
        listing counts, and the outlier share as a percentage string. For a
        column without any values the numeric entries are NaN and the share
        is ``'nan%'``.

    Side effects:
        Prints the heading ``Total Price:``.
    """
    # Chained access keeps the Series name as 'total_rent' in displayed output.
    price_data = df['pricing']['total_rent']

    # mode() returns every most-frequent value; report the first, or NaN when
    # the column holds no values at all.
    modes = price_data.mode()
    mode_price = modes.iloc[0] if not modes.empty else np.nan

    # Variability measures
    variance_price = price_data.var()
    range_price = price_data.max() - price_data.min()

    # Series.quantile skips NaN (np.percentile would return NaN instead),
    # which keeps the IQR consistent with the other pandas statistics.
    first_quartile = price_data.quantile(0.25)
    third_quartile = price_data.quantile(0.75)
    iqr_price = third_quartile - first_quartile

    # Shape of the distribution
    skewness_price = price_data.skew()
    kurtosis_price = price_data.kurtosis()

    quantiles_price = price_data.quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

    # Outlier detection: 1.5 * IQR rule
    lower_bound = first_quartile - 1.5 * iqr_price
    upper_bound = third_quartile + 1.5 * iqr_price
    outliers = price_data[(price_data < lower_bound) | (price_data > upper_bound)]

    outlier_count = outliers.count()
    listing_count = price_data.count()
    outlier_ratio = outlier_count / listing_count if listing_count else np.nan

    advanced_statistics = {
        "Mode": mode_price,
        "Variance": variance_price,
        "Range": range_price,
        "Interquartile Range (IQR)": iqr_price,
        "Skewness": skewness_price,
        "Kurtosis": kurtosis_price,
        "Quantiles": quantiles_price,
        "Outlier Lower Bound": lower_bound,
        "Outlier Upper Bound": upper_bound,
        "Number of Outliers": outlier_count,
        "Total Number of Listings": listing_count,
        # Whole-percent share; the format spec hides float artefacts such as
        # 0.07 * 100 == 7.000000000000001.
        "Ratio of Outliers to Total Listings": f"{round(outlier_ratio, 2) * 100:.1f}%",
    }

    print("Total Price:")
    return advanced_statistics

advanced_statistical_analysis(combined_df)

Total Price:


{'Mode': 1500.0,
 'Variance': 2080557.8083608127,
 'Range': 18501.0,
 'Interquartile Range (IQR)': 1113.0,
 'Skewness': 4.891523886874032,
 'Kurtosis': 41.7108313320624,
 'Quantiles': 0.25    1687.0
 0.50    2200.0
 0.75    2800.0
 0.90    3557.5
 0.95    4569.5
 0.99    7270.0
 Name: total_rent, dtype: float64,
 'Outlier Lower Bound': 17.5,
 'Outlier Upper Bound': 4469.5,
 'Number of Outliers': 67,
 'Total Number of Listings': 1106,
 'Ratio of Outliers to Total Listings': '6.0%'}

In [None]:
# TODO - visualize the data

### 2.3 Location

In [14]:
combined_df["location"]["city"].value_counts()

Katowice                514
Sosnowiec               145
Bytom                   122
Dąbrowa Górnicza         68
Chorzów                  61
Jaworzno                 36
tarnogórski              33
będziński                32
Siemianowice Śląskie     23
Mysłowice                21
Piekary Śląskie          20
Świętochłowice           19
Ruda Śląska               6
myszkowski                3
częstochowski             1
zawierciański             1
Zawada                    1
Name: city, dtype: Int64

In [None]:
# TODO - visualize the data