# Data Preview

## 1. Set up

In [3]:
# Standard imports
from pathlib import Path
import os
import re
import sys

def print_project_root():
    notebooks_dir = Path.cwd()

    # Calculate the root directory of the project (go up three levels)
    project_root = notebooks_dir.parent.parent.parent
    print(f"The root directory of the project is: {project_root}")

    if str(project_root) not in sys.path:
        sys.path.append(str(project_root))

print_project_root()

The root directory of the project is: d:\UserData karol\Documents\Programming\Data Science\Data Engineering\Rent comparisions\Home Market Harvester


### 1.3 Importing Data

In [1]:
# Third-party imports
import pandas as pd

# Local imports
from pipeline.src._csv_utils import DataPathCleaningManager

data_timeplace = os.getenv("MARKET_OFFERS_TIMEPLACE")
if data_timeplace is None:
    raise ValueError("The environment variable MARKET_OFFERS_TIMEPLACE is not set.")

data_path_manager = DataPathCleaningManager(data_timeplace)

df_olx = data_path_manager.load_df(domain="olx", is_cleaned=False)

### 1.2 Functions

In [2]:
def count_and_percentage(df, column_name):
    """
    Function to calculate the count and percentage of unique values in a given column of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame to analyze.
    column_name (str): The name of the column in the DataFrame.

    Returns:
    pandas.DataFrame: A DataFrame with the count and percentage of each unique value in the specified column.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

    # Calculate count and normalized values
    count = df[column_name].value_counts(dropna=False)
    normalized = df[column_name].value_counts(dropna=False, normalize=True) * 100

    # Concatenate count and normalized values side by side
    result = pd.concat([count, normalized], axis=1)
    result.columns = ['Count', 'Percentage']

    return result

In [3]:
def count_comma_separated_values(df, column_name):
    """
    Counts the occurrences of individual elements in a comma-separated string column of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to analyze.

    Returns:
    pandas.DataFrame: A DataFrame with the count and percentage of each unique element found in the comma-separated values.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

    # Split the column values, explode to individual elements, and count
    exploded_items = df[column_name].dropna().str.split(', ').explode()
    exploded_df = pd.DataFrame({column_name: exploded_items})
    counts_and_percent = count_and_percentage(exploded_df, column_name)

    return counts_and_percent

In [4]:
def remove_non_numeric_characters(df, column_name):
    """
    Removes all non-numeric characters from a column of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to analyze.

    Returns:
    pandas.DataFrame: A DataFrame with all non-numeric characters removed from the specified column.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """

    return df[column_name].str.replace('[^a-zA-Z]', '', regex=True).unique()

In [5]:
def count_words(text):
    if pd.isna(text):
        return 0
    return len(str(text).split())


## 2. Data preview

### OLX

In [6]:
def clean_olx_data(df):

    df[['voivodeship', 'city']] = df['location'].str.split(', ', expand=True)

    pattern = r'ul\s+(\w+\s+\d+/\d+)'
    df['street'] = df['summary_description'].apply(lambda x: re.search(pattern, x).group(1) if re.search(pattern, x) else None)
    
    del df['location']
    
    df['price'] = df['price'].str.extract('(\d+ \d+)')[0].str.replace(' ', '').astype(float)
    df['rent'] = df['rent'].str.extract('(\d+)')[0].astype(float)


    # Extract and convert 'square_meters' into integers
    df['square_meters'] = df['square_meters'].str.extract('(\d+)')[0].astype('Int64')

    # Convert 'number_of_rooms' into an integer, special handling for "Kawalerka"
    df['number_of_rooms'] = df['number_of_rooms'].replace('Liczba pokoi: Kawalerka', '1').astype('Int64')

    # Extract and clean 'floor_level', 'is_furnished', 'building_type'
    df['floor_level'] = df['floor_level'].str.extract('Poziom: (\d+)')[0]
    df['is_furnished'] = df['is_furnished'].map({'Umeblowane: Tak': True, 'Umeblowane: Nie': False})
    df['building_type'] = df['building_type'].str.extract('Rodzaj zabudowy: (.+)')[0]

    return df

In [7]:
df_olx_cleaned = clean_olx_data(df_olx)
df_olx_cleaned.head()

Unnamed: 0,link,title,price,summary_description,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent,voivodeship,city,street
0,https://www.olx.pl/d/oferta/kawalerka-tychy-os...,Kawalerka Tychy oś M,1500.0,OpisWynajmę kawalerkę na ul Moniuszki 1/16 .na...,Prywatne,3,False,Apartamentowiec,26,1,400.0,Śląskie,Zawada,Moniuszki 1/16


In [8]:
df_olx_cleaned.dtypes

link                    object
title                   object
price                  float64
summary_description     object
ownership               object
floor_level             object
is_furnished              bool
building_type           object
square_meters            Int64
number_of_rooms          Int64
rent                   float64
voivodeship             object
city                    object
street                  object
dtype: object

In [9]:
df_olx_cleaned['link'] = df_olx_cleaned['link'].astype('string')
df_olx_cleaned['title'] = df_olx_cleaned['title'].astype('string')
df_olx_cleaned['summary_description'] = df_olx_cleaned['summary_description'].astype('string')
df_olx_cleaned['ownership'] = df_olx_cleaned['ownership'].astype('string')
df_olx_cleaned['floor_level'] = df_olx_cleaned['floor_level'].astype('Int64')
df_olx_cleaned['building_type'] = df_olx_cleaned['building_type'].astype('string')
df_olx_cleaned['voivodeship'] = df_olx_cleaned['voivodeship'].astype('string')
df_olx_cleaned['city'] = df_olx_cleaned['city'].astype('string')
df_olx_cleaned['street'] = df_olx_cleaned['street'].astype('string')

df = df_olx_cleaned.rename(columns={'floor_level': 'floor'})

df_olx_cleaned.dtypes


link                    string
title                   string
price                  float64
summary_description     string
ownership               string
floor_level              Int64
is_furnished              bool
building_type           string
square_meters            Int64
number_of_rooms          Int64
rent                   float64
voivodeship             string
city                    string
street                  string
dtype: object

In [10]:
df_olx_cleaned['ownership'] = df_olx_cleaned['ownership'].map({'Prywatne': 'private'})

df_olx_cleaned['building_type'] = df_olx_cleaned['building_type'].map({'Apartamentowiec': 'apartment_building'})

In [11]:
df_olx_cleaned.head()

Unnamed: 0,link,title,price,summary_description,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent,voivodeship,city,street
0,https://www.olx.pl/d/oferta/kawalerka-tychy-os...,Kawalerka Tychy oś M,1500.0,OpisWynajmę kawalerkę na ul Moniuszki 1/16 .na...,private,3,False,apartment_building,26,1,400.0,Śląskie,Zawada,Moniuszki 1/16


#### 2.2.1 Cleaning data

In [12]:
def clean_otodom_data(df: pd.DataFrame):

    # 1. Split 'location' into street, city, and voivodeship
    df['location_split'] = df['location'].str.split(', ')
    df['street'] = df['location_split'].apply(lambda x: x[0] if len(x) > 2 else None)
    df['city'] = df['location_split'].apply(lambda x: x[-2] if len(x) > 1 else None)
    df['voivodeship'] = df['location_split'].apply(lambda x: x[-1] if x else None)

    # Drop the temporary 'location_split' column
    df.drop(columns=['location_split'], inplace=True)

    # 2. Convert 'price' into float
    df['price'] = df['price'].str.replace(' ', '').str.extract('(\d+)')[0]
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df['price'] = df['price'].astype('float64')

    # Extract and convert 'square_meters' into integers
    df['square_meters'] = df['square_meters'].str.extract('(\d+)')[0].astype('Int64')

    # Extract and convert 'rent' into integers
    df['rent'] = df['rent'].str.extract('(\d+)')[0]
    df['rent'] = pd.to_numeric(df['rent'], errors='coerce')
    df['total_rent'] = df['rent'].add(df['price'], fill_value=0)

    # Extract and convert 'deposit' into integers
    df['deposit'] = df['deposit'].str.replace(' ', '').str.extract('(\d+)')[0]
    df['deposit'] = pd.to_numeric(df['deposit'], errors='coerce')

    # Convert 'number_of_rooms' into an integer, special handling for "Kawalerka"
    df['number_of_rooms'] = df['number_of_rooms'].astype('Int64')

    # Extract and clean 'floor_level'
    df_split = df['floor_level'].str.split('/', expand=True)
    df_split[0] = df_split[0].replace({'parter': 0, 'suterena': -1, '> 10': 11})

    poddasze_rows = df_split[0] == 'poddasze'
    df_split.loc[poddasze_rows, 0] = (df_split.loc[poddasze_rows, 1].fillna(0).astype(int) + 1).astype(str)

    df['attic'] = df_split[0] == 'poddasze'
    df['floor'] = pd.to_numeric(df_split[0], errors='coerce')
    df['floor'] = df['floor'].astype('Int64')
    df['building_floors'] = pd.to_numeric(df_split[1], errors='coerce')
    df['building_floors'] = df['building_floors'].astype('Int64')
    
    del df['floor_level']

    # Convert 'elevator' and 'parking_space' into boolean values
    df['elevator'] = df['elevator'].map({'tak': True, 'nie': False})

    df['parking_space'] = df['parking_space'].map({'garaż/miejsce parkingowe': True, 'brak informacji': False})
    
    # Convert 'build_year' into integers
    df['build_year'] = pd.to_numeric(df['build_year'], errors='coerce')

    # todo create master columns for subcolumns
    # 3. Explode 'equipment', 'media_types', 'heating', 'security', 'windows', 'building_materials', 'additional_information' into boolean categories
    def explode_and_get_dummies(column_name):
        return df[column_name].str.get_dummies(sep=', ')
    
    to_explode = ['equipment', 'media_types', 'heating', 'security', 'windows', 'balcony_garden_terrace', 'building_material', 'additional_information']

    for column in to_explode:
        df = df.join(explode_and_get_dummies(column).add_prefix(f"{column}_"))

    for column in to_explode:
        del df[column]

    return df


In [13]:
df_otodom_cleaned = clean_otodom_data(df_otodom)
df_otodom_cleaned.head()

Unnamed: 0,link,title,location,price,summary_description,square_meters,rent,number_of_rooms,deposit,building_type,...,building_material_silikat,building_material_wielka płyta,building_material_żelbet,additional_information_brak informacji,additional_information_dwupoziomowe,additional_information_klimatyzacja,additional_information_oddzielna kuchnia,additional_information_piwnica,additional_information_pom. użytkowe,additional_information_tylko dla niepalących
0,https://www.otodom.pl/pl/oferta/mieszkania-2-p...,Mieszkania 2 pokojowe cena z ogrzewaniem,"ul. Karola Szymanowskiego 44, Zagórze Północ, ...",2000.0,Do wynajęcia 6 mieszkań o powierzchni od 42-58...,42,,2,4000.0,dom wolnostojący,...,0,0,0,0,0,0,1,1,1,0
1,https://www.otodom.pl/pl/oferta/kawalerka-w-ka...,Kawalerka w Katowicach do wynajęcia od zaraz,"ul. Haliny Krahelskiej, Osiedle Paderewskiego-...",1500.0,"Nowe, (2018), komfortowe, jasne, w pełni wypos...",19,250.0,1,1500.0,apartamentowiec,...,0,0,0,0,0,0,0,0,0,1
2,https://www.otodom.pl/pl/oferta/mam-do-wynajec...,Mam do wynajęcia mieszkanie Bezpośrednio !,"ul. Beskidzka, Chorzów II, Chorzów, śląskie",1499.0,"Witam.Mam do wynajęcia mieszkanie w Chorzowie,...",55,799.0,2,4500.0,blok,...,0,0,0,0,0,0,0,1,1,0
3,https://www.otodom.pl/pl/oferta/kawalerka-os-p...,Kawalerka os. Paderewskiego,"ul. Graniczna, Osiedle Paderewskiego-Muchowiec...",1400.0,Oferujemy do wynajęcia przytulną kawalerkę zlo...,31,1.0,1,1400.0,blok,...,0,0,0,1,0,0,0,0,0,0
4,https://www.otodom.pl/pl/oferta/ul-1000-lecia-...,ul.1000-lecia Dąbrowa Górnicza Gołonóg 50m2 2 pok,"1000 lecia, Brodway, Gołonóg Północny, Dąbrowa...",1500.0,Do wynajęcia mieszkanie 50m2 DąbrowaGórnicza G...,50,700.0,2,2000.0,blok,...,0,1,0,0,0,0,0,1,0,0


In [14]:
df_otodom_cleaned.columns.to_list()

['link',
 'title',
 'location',
 'price',
 'summary_description',
 'square_meters',
 'rent',
 'number_of_rooms',
 'deposit',
 'building_type',
 'available_from',
 'remote service',
 'completion',
 'ownership',
 'rent_to_students',
 'elevator',
 'parking_space',
 'build_year',
 'street',
 'city',
 'voivodeship',
 'total_rent',
 'attic',
 'floor',
 'building_floors',
 'equipment_brak informacji',
 'equipment_kuchenka',
 'equipment_lodówka',
 'equipment_meble',
 'equipment_piekarnik',
 'equipment_pralka',
 'equipment_telewizor',
 'equipment_zmywarka',
 'media_types_brak informacji',
 'media_types_internet',
 'media_types_telefon',
 'media_types_telewizja kablowa',
 'heating_brak informacji',
 'heating_elektryczne',
 'heating_gazowe',
 'heating_inne',
 'heating_kotłownia',
 'heating_miejskie',
 'heating_piece kaflowe',
 'security_brak informacji',
 'security_domofon / wideofon',
 'security_drzwi / okna antywłamaniowe',
 'security_monitoring / ochrona',
 'security_rolety antywłamaniowe',
 '

In [15]:
columns_order = [
    'link', 'title', 'summary_description', 'remote service', 
    'price', 'rent', 'total_rent', 'deposit', 
    'location', 'street', 'city', 'voivodeship', 
    'square_meters', 'number_of_rooms', 'floor', 'attic', 'building_floors', 
    'available_from', 'completion', 'ownership', 'rent_to_students', 
    'building_type', 'build_year', 
    'elevator', 'parking_space', 
    'equipment_brak informacji', 'equipment_kuchenka', 'equipment_lodówka', 'equipment_meble', 'equipment_piekarnik', 'equipment_pralka', 'equipment_telewizor', 'equipment_zmywarka', 
    'media_types_brak informacji', 'media_types_internet', 'media_types_telefon', 'media_types_telewizja kablowa', 
    'heating_brak informacji', 'heating_elektryczne', 'heating_gazowe', 'heating_inne', 'heating_kotłownia', 'heating_miejskie', 'heating_piece kaflowe', 
    'security_brak informacji', 'security_domofon / wideofon', 'security_drzwi / okna antywłamaniowe', 'security_monitoring / ochrona', 'security_rolety antywłamaniowe', 'security_system alarmowy', 'security_teren zamknięty', 
    'windows_aluminiowe', 'windows_brak informacji', 'windows_drewniane', 'windows_plastikowe', 
    'building_material_beton', 'building_material_beton komórkowy', 'building_material_brak informacji', 'building_material_cegła', 'building_material_drewno', 'building_material_inne', 'building_material_keramzyt', 'building_material_pustak', 'building_material_silikat', 'building_material_wielka płyta', 'building_material_żelbet', 
    'additional_information_brak informacji', 'additional_information_dwupoziomowe', 'additional_information_klimatyzacja', 'additional_information_oddzielna kuchnia', 'additional_information_piwnica', 'additional_information_pom. użytkowe', 'additional_information_tylko dla niepalących'
]

df_otodom_cleaned = df_otodom_cleaned[columns_order]

In [16]:
columns_multiindex = [
    ('listing', 'link'),
    ('listing', 'title'),
    ('listing', 'summary_description'),
    ('listing', 'remote_service'),
    ('pricing', 'price'),
    ('pricing', 'rent'),
    ('pricing', 'total_rent'),
    ('pricing', 'deposit'),
    ('location', 'complete_address'),
    ('location', 'street'),
    ('location', 'city'),
    ('location', 'voivodeship'),
    ('size', 'square_meters'),
    ('size', 'number_of_rooms'),
    ('size', 'floor'),
    ('size', 'attic'),
    ('size', 'building_floors'),
    ('legal_and_availability', 'available_from'),
    ('legal_and_availability', 'completion'),
    ('legal_and_availability', 'ownership'),
    ('legal_and_availability', 'rent_to_students'),
    ('type_and_year', 'building_type'),
    ('type_and_year', 'build_year'),
    ('amenities', 'elevator'),
    ('amenities', 'parking_space'),
    ('equipment', 'no_information'),
    ('equipment', 'stove'),
    ('equipment', 'fridge'),
    ('equipment', 'furniture'),
    ('equipment', 'oven'),
    ('equipment', 'washing_machine'),
    ('equipment', 'TV'),
    ('equipment', 'dishwasher'),
    ('media_types', 'no_information'),
    ('media_types', 'internet'),
    ('media_types', 'telephone'),
    ('media_types', 'cable_TV'),
    ('heating', 'no_information'),
    ('heating', 'electric'),
    ('heating', 'gas'),
    ('heating', 'other'),
    ('heating', 'boiler_room'),
    ('heating', 'district'),
    ('heating', 'tile_stove'),
    ('security', 'no_information'),
    ('security', 'intercom_or_video_intercom'),
    ('security', 'anti_burglary_doors_or_windows'),
    ('security', 'monitoring_or_security'),
    ('security', 'anti_burglary_roller_blinds'),
    ('security', 'alarm_system'),
    ('security', 'enclosed_area'),
    ('windows', 'aluminum'),
    ('windows', 'no_information'),
    ('windows', 'wooden'),
    ('windows', 'plastic'),
    ('building_material', 'concrete'),
    ('building_material', 'aerated_concrete'),
    ('building_material', 'no_information'),
    ('building_material', 'brick'),
    ('building_material', 'wood'),
    ('building_material', 'other'),
    ('building_material', 'lightweight_aggregate'),
    ('building_material', 'hollow_brick'),
    ('building_material', 'silicate'),
    ('building_material', 'large_panel'),
    ('building_material', 'reinforced_concrete'),
    ('additional_information', 'no_information'),
    ('additional_information', 'duplex'),
    ('additional_information', 'air_conditioning'),
    ('additional_information', 'separate_kitchen'),
    ('additional_information', 'basement'),
    ('additional_information', 'utility_room'),
    ('additional_information', 'non_smokers_only')
]

multiindex = pd.MultiIndex.from_tuples(columns_multiindex, names=['Category', 'Subcategory'])
df_otodom_cleaned.columns = multiindex

In [17]:
df_otodom_cleaned.head()

Category,listing,listing,listing,listing,pricing,pricing,pricing,pricing,location,location,...,building_material,building_material,building_material,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information
Subcategory,link,title,summary_description,remote_service,price,rent,total_rent,deposit,complete_address,street,...,silicate,large_panel,reinforced_concrete,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
0,https://www.otodom.pl/pl/oferta/mieszkania-2-p...,Mieszkania 2 pokojowe cena z ogrzewaniem,Do wynajęcia 6 mieszkań o powierzchni od 42-58...,Obsługa zdalnatak,2000.0,,2000.0,4000.0,"ul. Karola Szymanowskiego 44, Zagórze Północ, ...",ul. Karola Szymanowskiego 44,...,0,0,0,0,0,0,1,1,1,0
1,https://www.otodom.pl/pl/oferta/kawalerka-w-ka...,Kawalerka w Katowicach do wynajęcia od zaraz,"Nowe, (2018), komfortowe, jasne, w pełni wypos...",Obsługa zdalnatak,1500.0,250.0,1750.0,1500.0,"ul. Haliny Krahelskiej, Osiedle Paderewskiego-...",ul. Haliny Krahelskiej,...,0,0,0,0,0,0,0,0,0,1
2,https://www.otodom.pl/pl/oferta/mam-do-wynajec...,Mam do wynajęcia mieszkanie Bezpośrednio !,"Witam.Mam do wynajęcia mieszkanie w Chorzowie,...",Obsługa zdalnaZapytaj,1499.0,799.0,2298.0,4500.0,"ul. Beskidzka, Chorzów II, Chorzów, śląskie",ul. Beskidzka,...,0,0,0,0,0,0,0,1,1,0
3,https://www.otodom.pl/pl/oferta/kawalerka-os-p...,Kawalerka os. Paderewskiego,Oferujemy do wynajęcia przytulną kawalerkę zlo...,Obsługa zdalnatak,1400.0,1.0,1401.0,1400.0,"ul. Graniczna, Osiedle Paderewskiego-Muchowiec...",ul. Graniczna,...,0,0,0,1,0,0,0,0,0,0
4,https://www.otodom.pl/pl/oferta/ul-1000-lecia-...,ul.1000-lecia Dąbrowa Górnicza Gołonóg 50m2 2 pok,Do wynajęcia mieszkanie 50m2 DąbrowaGórnicza G...,Obsługa zdalnaZapytaj,1500.0,700.0,2200.0,2000.0,"1000 lecia, Brodway, Gołonóg Północny, Dąbrowa...",1000 lecia,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1100,https://www.otodom.pl/pl/oferta/mieszkanie-bli...,Mieszkanie blisko Śum - dostępne od 4 grudnia !,Proponujemy do wynajęcia mieszkanie zlokalizow...,Obsługa zdalnaZapytaj,1700.0,,1700.0,,"ul. Koszalińska, Ligota-Panewniki, Katowice, ś...",ul. Koszalińska,...,0,0,0,1,0,0,0,0,0,0
1101,https://www.otodom.pl/pl/oferta/katowice-ligot...,Katowice Ligota Koszalińska pokój blisko Śum,Wynajem pokoju w bardzo dobrej lokalizacji. Ni...,Obsługa zdalnaZapytaj,800.0,,800.0,1200.0,"Ligota-Panewniki, Katowice, śląskie",Ligota-Panewniki,...,0,0,0,0,0,0,1,0,0,0
1102,https://www.otodom.pl/pl/oferta/katowice-centr...,Katowice centrum 2 pokoje dla studentów,OFERTA POLECANA! Oferujemy do wynajęcia przytu...,Obsługa zdalnaWirtualny spacer,1499.0,,1499.0,3000.0,"Śródmieście, Katowice, śląskie",Śródmieście,...,0,0,0,0,0,0,1,1,0,0
1103,https://www.otodom.pl/pl/oferta/kawalerka-bryn...,Kawalerka Brynów,Kawalerka Katowice Brynów bardzo spokojne miej...,Obsługa zdalnaZapytaj,1300.0,,1300.0,2600.0,"Załęska Hałda-Brynów, Katowice, śląskie",Załęska Hałda-Brynów,...,0,0,0,0,0,0,1,1,0,0


In [18]:
df_otodom_cleaned.dtypes.to_dict()

{('listing', 'link'): dtype('O'),
 ('listing', 'title'): dtype('O'),
 ('listing', 'summary_description'): dtype('O'),
 ('listing', 'remote_service'): dtype('O'),
 ('pricing', 'price'): dtype('float64'),
 ('pricing', 'rent'): dtype('float64'),
 ('pricing', 'total_rent'): dtype('float64'),
 ('pricing', 'deposit'): dtype('float64'),
 ('location', 'complete_address'): dtype('O'),
 ('location', 'street'): dtype('O'),
 ('location', 'city'): dtype('O'),
 ('location', 'voivodeship'): dtype('O'),
 ('size', 'square_meters'): Int64Dtype(),
 ('size', 'number_of_rooms'): Int64Dtype(),
 ('size', 'floor'): Int64Dtype(),
 ('size', 'attic'): dtype('bool'),
 ('size', 'building_floors'): Int64Dtype(),
 ('legal_and_availability', 'available_from'): dtype('O'),
 ('legal_and_availability', 'completion'): dtype('O'),
 ('legal_and_availability', 'ownership'): dtype('O'),
 ('legal_and_availability', 'rent_to_students'): dtype('O'),
 ('type_and_year', 'building_type'): dtype('O'),
 ('type_and_year', 'build_year

## 3. Save cleaned data

### 3.1. Save data

In [54]:
data_path_manager.save_df(df_olx_cleaned, domain="olx")

Saving schema to ..\data\cleaned\2023_11_27_19_41_45_Mierzęcice__Będziński__Śląskie\olx_pl_schema.json


### 3.2 Check saved data

#### OLX

In [56]:
df_olx_saved = data_path_manager.load_df(domain="olx", is_cleaned=True)
df_olx_saved.head()

Unnamed: 0,link,title,price,summary_description,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent,voivodeship,city,street
0,https://www.olx.pl/d/oferta/kawalerka-tychy-os...,Kawalerka Tychy oś M,1500.0,OpisWynajmę kawalerkę na ul Moniuszki 1/16 .na...,private,3,False,apartment_building,26,1,400.0,Śląskie,Zawada,Moniuszki 1/16


In [57]:
are_identical = df_olx_saved.equals(df_olx_cleaned)
are_identical.head()

True