# Data Preview

## 1. Set up

In [1]:
# Standard imports
from pathlib import Path
import sys

def set_project_root():
    notebooks_dir = Path.cwd()

    project_root = notebooks_dir.parents[2]

    if str(project_root) not in sys.path:
        sys.path.append(str(project_root))

    return project_root

project_root = set_project_root()

### 1.3 Importing Data

In [2]:
# Third-party imports
import numpy as np
import pandas as pd

# Local imports
from pipeline.config._conf_file_manager import ConfigManager
from pipeline.stages._csv_utils import DataPathCleaningManager


config_file = ConfigManager("run_pipeline.conf")
TIMEPLACE = "MARKET_OFFERS_TIMEPLACE"
data_timeplace = config_file.read_value(TIMEPLACE)
if data_timeplace is None:

    message = F"The configuration variable {TIMEPLACE} is not set."
    raise ValueError(message)

data_path_manager = DataPathCleaningManager(data_timeplace, project_root)

df_otodom = data_path_manager.load_df(domain="otodom", is_cleaned=False)

### 1.2 Functions

In [3]:
def count_and_percentage(df, column_name):
    """
    Function to calculate the count and percentage of unique values in a given column of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame to analyze.
    column_name (str): The name of the column in the DataFrame.

    Returns:
    pandas.DataFrame: A DataFrame with the count and percentage of each unique value in the specified column.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

    # Calculate count and normalized values
    count = df[column_name].value_counts(dropna=False)
    normalized = df[column_name].value_counts(dropna=False, normalize=True) * 100

    # Concatenate count and normalized values side by side
    result = pd.concat([count, normalized], axis=1)
    result.columns = ['Count', 'Percentage']

    return result

In [4]:
def count_comma_separated_values(df, column_name):
    """
    Counts the occurrences of individual elements in a comma-separated string column of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to analyze.

    Returns:
    pandas.DataFrame: A DataFrame with the count and percentage of each unique element found in the comma-separated values.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:

        message = f"Column '{column_name}' not found in DataFrame."
        raise ValueError(message)

    # Split the column values, explode to individual elements, and count
    exploded_items = df[column_name].dropna().str.split(', ').explode()
    exploded_df = pd.DataFrame({column_name: exploded_items})
    counts_and_percent = count_and_percentage(exploded_df, column_name)

    return counts_and_percent

In [5]:
def remove_non_numeric_characters(df, column_name):
    """
    Removes all non-numeric characters from a column of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to analyze.

    Returns:
    pandas.DataFrame: A DataFrame with all non-numeric characters removed from the specified column.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """

    return df[column_name].str.replace('[^a-zA-Z]', '', regex=True).unique()

In [6]:
def count_words(text):
    if pd.isna(text):
        return 0
    return len(str(text).split())


## 2. Data preview

### Otodom

#### 2.2.1 Cleaning data

In [7]:
def clean_otodom_data(df: pd.DataFrame):

    # 1. Split 'location' into street, city, and voivodeship
    df['location_split'] = df['location'].str.split(', ')
    df['street'] = df['location_split'].apply(lambda x: x[0] if len(x) > 2 else None)
    df['city'] = df['location_split'].apply(lambda x: x[-2] if len(x) > 1 else None)
    df['voivodeship'] = df['location_split'].apply(lambda x: x[-1] if x else None)

    # Drop the temporary 'location_split' column
    df.drop(columns=['location_split'], inplace=True)

    # 2. Convert 'price' into float
    df['price'] = df['price'].str.replace(' ', '').str.extract('(\d+)')[0]
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df['price'] = df['price'].astype('float64')

    # Extract and convert 'square_meters' into integers
    df['square_meters'] = df['square_meters'].str.extract('(\d+)')[0].astype('float64')

    # Extract and convert 'rent' into float
    df['rent'] = df['rent'].str.extract('(\d+)')[0]
    df['rent'] = pd.to_numeric(df['rent'], errors='coerce').astype('float64')
    df['total_rent'] = df['rent'].add(df['price'], fill_value=0).astype('float64')

    # Extract and convert 'deposit' into float
    df['deposit'] = df['deposit'].str.replace(' ', '').str.extract('(\d+)')[0]
    df['deposit'] = pd.to_numeric(df['deposit'], errors='coerce').astype('float64')

    # Convert 'number_of_rooms' into an integer, special handling for "Kawalerka"
    # https://regex101.com/r/L4a3bp/1
    if df['number_of_rooms'].dtype in [np.object, np.str]:
        df['number_of_rooms'] = df['number_of_rooms'].replace('Kawalerka', '1')
        df['number_of_rooms'] = df['number_of_rooms'].str.replace('\D', '', regex=True)

    df['number_of_rooms'] = df['number_of_rooms'].astype('Int64')

    # Extract and clean 'floor_level'
    df_split = df['floor_level'].str.split('/', expand=True)
    df_split[0] = df_split[0].replace({'parter': 0, 'suterena': -1, '> 10': 11})

    poddasze_rows = df_split[0] == 'poddasze'
    df_split.loc[poddasze_rows, 0] = (df_split.loc[poddasze_rows, 1].fillna(0).astype(int) + 1).astype(str)

    df['attic'] = df_split[0] == 'poddasze'
    df['floor'] = pd.to_numeric(df_split[0], errors='coerce')
    df['floor'] = df['floor'].astype('Int64')
    df['building_floors'] = pd.to_numeric(df_split[1], errors='coerce')
    df['building_floors'] = df['building_floors'].astype('Int64')
    
    del df['floor_level']

    # Convert 'elevator' and 'parking_space' into boolean values
    df['elevator'] = df['elevator'].map({'tak': True, 'nie': False}).astype('boolean')

    df['parking_space'] = df['parking_space'].map({'garaż/miejsce parkingowe': True, 'brak informacji': False}).astype('boolean')
    
    # Convert 'build_year' into integers
    df['build_year'] = pd.to_numeric(df['build_year'], errors='coerce').astype('Int64')

    # todo create master columns for subcolumns
    # 3. Explode 'equipment', 'media_types', 'heating', 'security', 'windows', 'building_materials', 'additional_information' into boolean categories
    def explode_and_get_dummies(column_name):
        return df[column_name].str.get_dummies(sep=', ')
    
    to_explode = ['equipment', 'media_types', 'heating', 'security', 'windows', 'balcony_garden_terrace', 'building_material', 'additional_information']

    for column in to_explode:
        df = df.join(explode_and_get_dummies(column).add_prefix(f"{column}_"))

    for column in to_explode:
        del df[column]

    return df


In [8]:
df_otodom_cleaned = clean_otodom_data(df_otodom)
df_otodom_cleaned.head()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df['number_of_rooms'].dtype in [np.object, np.str]:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df['number_of_rooms'].dtype in [np.object, np.str]:


Unnamed: 0,link,title,location,price,summary_description,square_meters,rent,number_of_rooms,deposit,building_type,...,building_material_pustak,building_material_wielka płyta,building_material_żelbet,additional_information_brak informacji,additional_information_dwupoziomowe,additional_information_klimatyzacja,additional_information_oddzielna kuchnia,additional_information_piwnica,additional_information_pom. użytkowe,additional_information_tylko dla niepalących
0,https://www.otodom.pl/pl/oferta/loft-debowe-ta...,Loft Dębowe Tarasy Klimatyzacja 3 pokoje LOFT NEW,"Dębowe Tarasy, Dąb, Katowice, śląskie",3250.0,Nowoczesny apartament z doskonałą lokalizacją ...,45.0,550.0,3,5000.0,blok,...,0,0,0,0,0,1,0,0,0,1
1,https://www.otodom.pl/pl/oferta/eleganckie-mie...,Eleganckie mieszkanie nieopodal Parku Śląskiego,"ul. Świerkowa, Chorzów Stary, Chorzów, śląskie",2300.0,Wynajmę mieszkanie znajdujące się w Chorzowie ...,48.0,800.0,2,5000.0,apartamentowiec,...,0,0,0,0,0,0,1,1,0,1
2,https://www.otodom.pl/pl/oferta/mieszkanie-2-p...,"Mieszkanie 2 pokoje, widna kuchnia. Poręba. NOWE.","ul. Zakładowa 5, Poręba, zawierciański, śląskie",1500.0,Dzień dobry.\nDo wynajęcia mieszkanie 2 pokoje...,34.0,600.0,2,2000.0,blok,...,0,0,0,0,0,0,0,1,0,0
3,https://www.otodom.pl/pl/oferta/mieszkanie-2-p...,"Mieszkanie 2 pokojowe do wynajęcia,","ul. ks. bp. Józefa Gawliny, Śródmieście, Katow...",1600.0,Do wynajęcia mieszkanie 2 pokojowe.\nLOKALIZAC...,40.0,750.0,2,2350.0,blok,...,0,0,0,1,0,0,0,0,0,0
4,https://www.otodom.pl/pl/oferta/2-pok-mieszkan...,"2 pok. mieszkanie na wynajem, Sosnowiec, Niwka","ul. Zagłębiowska, Niwka, Sosnowiec, śląskie",1600.0,Przedstawiamy Państwu ofertę wynajmu 2 - pokoj...,35.0,600.0,2,2000.0,blok,...,0,0,0,0,0,0,1,1,0,0


In [9]:
df_otodom_cleaned.columns.to_list()

['link',
 'title',
 'location',
 'price',
 'summary_description',
 'square_meters',
 'rent',
 'number_of_rooms',
 'deposit',
 'building_type',
 'available_from',
 'remote service',
 'completion',
 'ownership',
 'rent_to_students',
 'elevator',
 'parking_space',
 'build_year',
 'street',
 'city',
 'voivodeship',
 'total_rent',
 'attic',
 'floor',
 'building_floors',
 'equipment_brak informacji',
 'equipment_kuchenka',
 'equipment_lodówka',
 'equipment_meble',
 'equipment_piekarnik',
 'equipment_pralka',
 'equipment_telewizor',
 'equipment_zmywarka',
 'media_types_brak informacji',
 'media_types_internet',
 'media_types_telefon',
 'media_types_telewizja kablowa',
 'heating_brak informacji',
 'heating_elektryczne',
 'heating_gazowe',
 'heating_miejskie',
 'security_brak informacji',
 'security_domofon / wideofon',
 'security_drzwi / okna antywłamaniowe',
 'security_monitoring / ochrona',
 'security_rolety antywłamaniowe',
 'security_system alarmowy',
 'security_teren zamknięty',
 'windows

In [10]:
columns_order = [
    'link', 'title', 'summary_description', 'remote service', 
    'price', 'rent', 'total_rent', 'deposit', 
    'location', 'street', 'city', 'voivodeship', 
    'square_meters', 'number_of_rooms', 'floor', 'attic', 'building_floors', 
    'available_from', 'completion', 'ownership', 'rent_to_students', 
    'building_type', 'build_year', 
    'elevator', 'parking_space', 
    'equipment_brak informacji', 'equipment_kuchenka', 'equipment_lodówka', 'equipment_meble', 'equipment_piekarnik', 'equipment_pralka', 'equipment_telewizor', 'equipment_zmywarka', 
    'media_types_brak informacji', 'media_types_internet', 'media_types_telefon', 'media_types_telewizja kablowa', 
    'heating_brak informacji', 'heating_elektryczne', 'heating_gazowe', 'heating_inne', 'heating_kotłownia', 'heating_miejskie', 'heating_piece kaflowe', 
    'security_brak informacji', 'security_domofon / wideofon', 'security_drzwi / okna antywłamaniowe', 'security_monitoring / ochrona', 'security_rolety antywłamaniowe', 'security_system alarmowy', 'security_teren zamknięty', 
    'windows_aluminiowe', 'windows_brak informacji', 'windows_drewniane', 'windows_plastikowe', 
    'building_material_beton', 'building_material_beton komórkowy', 'building_material_brak informacji', 'building_material_cegła', 'building_material_drewno', 'building_material_inne', 'building_material_keramzyt', 'building_material_pustak', 'building_material_silikat', 'building_material_wielka płyta', 'building_material_żelbet', 
    'additional_information_brak informacji', 'additional_information_dwupoziomowe', 'additional_information_klimatyzacja', 'additional_information_oddzielna kuchnia', 'additional_information_piwnica', 'additional_information_pom. użytkowe', 'additional_information_tylko dla niepalących'
]

# Add missing columns from columns_order with NaN values
for column in columns_order:
    if column not in df_otodom_cleaned.columns:
        df_otodom_cleaned[column] = np.nan
        
df_otodom_cleaned = df_otodom_cleaned[columns_order]

In [11]:
columns_multiindex = [
    ('listing', 'link'),
    ('listing', 'title'),
    ('listing', 'summary_description'),
    ('listing', 'remote_service'),
    ('pricing', 'price'),
    ('pricing', 'rent'),
    ('pricing', 'total_rent'),
    ('pricing', 'deposit'),
    ('location', 'complete_address'),
    ('location', 'street'),
    ('location', 'city'),
    ('location', 'voivodeship'),
    ('size', 'square_meters'),
    ('size', 'number_of_rooms'),
    ('size', 'floor'),
    ('size', 'attic'),
    ('size', 'building_floors'),
    ('legal_and_availability', 'available_from'),
    ('legal_and_availability', 'completion'),
    ('legal_and_availability', 'ownership'),
    ('legal_and_availability', 'rent_to_students'),
    ('type_and_year', 'building_type'),
    ('type_and_year', 'build_year'),
    ('amenities', 'elevator'),
    ('amenities', 'parking_space'),
    ('equipment', 'no_information'),
    ('equipment', 'stove'),
    ('equipment', 'fridge'),
    ('equipment', 'furniture'),
    ('equipment', 'oven'),
    ('equipment', 'washing_machine'),
    ('equipment', 'TV'),
    ('equipment', 'dishwasher'),
    ('media_types', 'no_information'),
    ('media_types', 'internet'),
    ('media_types', 'telephone'),
    ('media_types', 'cable_TV'),
    ('heating', 'no_information'),
    ('heating', 'electric'),
    ('heating', 'gas'),
    ('heating', 'other'),
    ('heating', 'boiler_room'),
    ('heating', 'district'),
    ('heating', 'tile_stove'),
    ('security', 'no_information'),
    ('security', 'intercom_or_video_intercom'),
    ('security', 'anti_burglary_doors_or_windows'),
    ('security', 'monitoring_or_security'),
    ('security', 'anti_burglary_roller_blinds'),
    ('security', 'alarm_system'),
    ('security', 'enclosed_area'),
    ('windows', 'aluminum'),
    ('windows', 'no_information'),
    ('windows', 'wooden'),
    ('windows', 'plastic'),
    ('building_material', 'concrete'),
    ('building_material', 'aerated_concrete'),
    ('building_material', 'no_information'),
    ('building_material', 'brick'),
    ('building_material', 'wood'),
    ('building_material', 'other'),
    ('building_material', 'lightweight_aggregate'),
    ('building_material', 'hollow_brick'),
    ('building_material', 'silicate'),
    ('building_material', 'large_panel'),
    ('building_material', 'reinforced_concrete'),
    ('additional_information', 'no_information'),
    ('additional_information', 'duplex'),
    ('additional_information', 'air_conditioning'),
    ('additional_information', 'separate_kitchen'),
    ('additional_information', 'basement'),
    ('additional_information', 'utility_room'),
    ('additional_information', 'non_smokers_only')
]

multiindex = pd.MultiIndex.from_tuples(columns_multiindex, names=['Category', 'Subcategory'])
df_otodom_cleaned.columns = multiindex

In [12]:
df_otodom_cleaned.head()

Category,listing,listing,listing,listing,pricing,pricing,pricing,pricing,location,location,...,building_material,building_material,building_material,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information
Subcategory,link,title,summary_description,remote_service,price,rent,total_rent,deposit,complete_address,street,...,silicate,large_panel,reinforced_concrete,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
0,https://www.otodom.pl/pl/oferta/loft-debowe-ta...,Loft Dębowe Tarasy Klimatyzacja 3 pokoje LOFT NEW,Nowoczesny apartament z doskonałą lokalizacją ...,Obsługa zdalnaZapytaj,3250.0,550.0,3800.0,5000.0,"Dębowe Tarasy, Dąb, Katowice, śląskie",Dębowe Tarasy,...,,0,0,0,0,1,0,0,0,1
1,https://www.otodom.pl/pl/oferta/eleganckie-mie...,Eleganckie mieszkanie nieopodal Parku Śląskiego,Wynajmę mieszkanie znajdujące się w Chorzowie ...,Obsługa zdalnaZapytaj,2300.0,800.0,3100.0,5000.0,"ul. Świerkowa, Chorzów Stary, Chorzów, śląskie",ul. Świerkowa,...,,0,0,0,0,0,1,1,0,1
2,https://www.otodom.pl/pl/oferta/mieszkanie-2-p...,"Mieszkanie 2 pokoje, widna kuchnia. Poręba. NOWE.",Dzień dobry.\nDo wynajęcia mieszkanie 2 pokoje...,Obsługa zdalnaZapytaj,1500.0,600.0,2100.0,2000.0,"ul. Zakładowa 5, Poręba, zawierciański, śląskie",ul. Zakładowa 5,...,,0,0,0,0,0,0,1,0,0
3,https://www.otodom.pl/pl/oferta/mieszkanie-2-p...,"Mieszkanie 2 pokojowe do wynajęcia,",Do wynajęcia mieszkanie 2 pokojowe.\nLOKALIZAC...,Obsługa zdalnatak,1600.0,750.0,2350.0,2350.0,"ul. ks. bp. Józefa Gawliny, Śródmieście, Katow...",ul. ks. bp. Józefa Gawliny,...,,0,0,1,0,0,0,0,0,0
4,https://www.otodom.pl/pl/oferta/2-pok-mieszkan...,"2 pok. mieszkanie na wynajem, Sosnowiec, Niwka",Przedstawiamy Państwu ofertę wynajmu 2 - pokoj...,Obsługa zdalnatak,1600.0,600.0,2200.0,2000.0,"ul. Zagłębiowska, Niwka, Sosnowiec, śląskie",ul. Zagłębiowska,...,,0,0,0,0,0,1,1,0,0


In [13]:
df_otodom_cleaned.dtypes.to_dict()

{('listing', 'link'): dtype('O'),
 ('listing', 'title'): dtype('O'),
 ('listing', 'summary_description'): dtype('O'),
 ('listing', 'remote_service'): dtype('O'),
 ('pricing', 'price'): dtype('float64'),
 ('pricing', 'rent'): dtype('float64'),
 ('pricing', 'total_rent'): dtype('float64'),
 ('pricing', 'deposit'): dtype('float64'),
 ('location', 'complete_address'): dtype('O'),
 ('location', 'street'): dtype('O'),
 ('location', 'city'): dtype('O'),
 ('location', 'voivodeship'): dtype('O'),
 ('size', 'square_meters'): dtype('float64'),
 ('size', 'number_of_rooms'): Int64Dtype(),
 ('size', 'floor'): Int64Dtype(),
 ('size', 'attic'): dtype('bool'),
 ('size', 'building_floors'): Int64Dtype(),
 ('legal_and_availability', 'available_from'): dtype('O'),
 ('legal_and_availability', 'completion'): dtype('O'),
 ('legal_and_availability', 'ownership'): dtype('O'),
 ('legal_and_availability', 'rent_to_students'): dtype('O'),
 ('type_and_year', 'building_type'): dtype('O'),
 ('type_and_year', 'build_

#### 2.2.2 Checking data

##### Prices

In [14]:

assert df_otodom_cleaned[[('pricing', 'price'), ('pricing', 'rent'), ('pricing', 'deposit')]].min().min() >= 0, "Price, rent, or deposit contains negative values"

In [15]:
df_otodom_cleaned[[('pricing', 'price'), ('pricing', 'rent'), ('pricing', 'deposit')]].max()

Category  Subcategory
pricing   price          5800.0
          rent            900.0
          deposit        8000.0
dtype: float64

In [16]:
def last_and_first_percentile(column_name, df):
    """
    Returns the first and last percentile of a column in a DataFrame.

    Parameters:
    column_name (str): The name of the column to analyze.
    df (pandas.DataFrame): The DataFrame containing the column.

    Returns:
    tuple: A tuple containing the first and last percentile of the column.
    """
    return df[column_name].quantile([0.01, 0.99])

In [17]:
last_and_first_percentile(('pricing', 'price'), df_otodom_cleaned)

0.01    1177.5
0.99    5800.0
Name: (pricing, price), dtype: float64

Quick look

In [18]:
pd.set_option('display.max_colwidth', None)
df_otodom_cleaned.sort_values(by=[('pricing', 'price')], ascending=False).head()[[('listing', 'link'), ('listing', 'title'), ('listing', 'summary_description'), ('pricing', 'total_rent'), ('location', 'city')]]


Category,listing,listing,listing,pricing,location
Subcategory,link,title,summary_description,total_rent,city
31,https://www.otodom.pl/pl/oferta/luksusowe-mieszkanie-z-2-syp-tarasem-i-parkingiem-ID4pg4L,"Luksusowe mieszkanie z 2 syp, tarasem i parkingiem","Wynajmie mieszkanie premium klasy o wysokim standardzie na zamknietym osiedlu Debowe tarasy. Mieszkanie sklada sie z dwoch sypialni, salonu z kuchnia, lazienki z wc, duzy taras z meblami tarasowymi. Do mieszkania przynalezy miejsce na podziemnym parkingu i komorka lokatorska. Czynsz z zaliczkami na wode i ogrzewanie to 1060 zl przy 2 osobach. W cenie internet i telewizja. W mieszkaniu smart TV, pralka, lodowka, plyta, piekarnik, zmywarka, mikrofala, czajnik, ekspres do kawy. Drewniane podlogi, nowe meble, duzo szaf do przechowywania. Bezposrednio od wlasciciela",5801.0,Katowice
70,https://www.otodom.pl/pl/oferta/luksusowe-mieszkanie-z-2-syp-tarasem-i-parkingiem-ID4pg4L,"Luksusowe mieszkanie z 2 syp, tarasem i parkingiem","Wynajmie mieszkanie premium klasy o wysokim standardzie na zamknietym osiedlu Debowe tarasy. Mieszkanie sklada sie z dwoch sypialni, salonu z kuchnia, lazienki z wc, duzy taras z meblami tarasowymi. Do mieszkania przynalezy miejsce na podziemnym parkingu i komorka lokatorska. Czynsz z zaliczkami na wode i ogrzewanie to 1060 zl przy 2 osobach. W cenie internet i telewizja. W mieszkaniu smart TV, pralka, lodowka, plyta, piekarnik, zmywarka, mikrofala, czajnik, ekspres do kawy. Drewniane podlogi, nowe meble, duzo szaf do przechowywania. Bezposrednio od wlasciciela",5801.0,Katowice
24,https://www.otodom.pl/pl/oferta/debowe-tarasy-garaz-komorka-ID4pdLd,Dębowe Tarasy Garaż Komórka,"Osiedle Dębowe Tarasy .Zapraszamy do zapoznania się z bardzo atrakcyjną propozycją wynajmu apartamentu zlokalizowanego w Katowicach na zamkniętym oraz strzeżonym osiedlu .Miesięczny czynsz najmu 3950zł media płatne dodatkowo . Wymagana kaucja zwrotna . Miejsce postojowe w hali garażowej za dodatkową opłatą 350 zł . Prowizja biura jednomiesięczny czynsz najmu netto + VAT.Niniejsze ogłoszenie nie stanowi oferty w rozumieniu przepisów art. 66 1 Kodeksu Cywilnego, lecz ma charakter informacyjny.Celem uzyskania szczegółowych informacji oraz umówieniem się na prezentację prosimy o kontakt telefoniczny z agentem prowadzącym: 535-978-880.",3950.0,Katowice
63,https://www.otodom.pl/pl/oferta/debowe-tarasy-garaz-komorka-ID4pdLd,Dębowe Tarasy Garaż Komórka,"Osiedle Dębowe Tarasy .Zapraszamy do zapoznania się z bardzo atrakcyjną propozycją wynajmu apartamentu zlokalizowanego w Katowicach na zamkniętym oraz strzeżonym osiedlu .Miesięczny czynsz najmu 3950zł media płatne dodatkowo . Wymagana kaucja zwrotna . Miejsce postojowe w hali garażowej za dodatkową opłatą 350 zł . Prowizja biura jednomiesięczny czynsz najmu netto + VAT.Niniejsze ogłoszenie nie stanowi oferty w rozumieniu przepisów art. 66 1 Kodeksu Cywilnego, lecz ma charakter informacyjny.Celem uzyskania szczegółowych informacji oraz umówieniem się na prezentację prosimy o kontakt telefoniczny z agentem prowadzącym: 535-978-880.",3950.0,Katowice
64,https://www.otodom.pl/pl/oferta/3-pokoje-salon-ul-zabrska-18-parking-ID4plSA,3 pokoje + salon/ul.Zabrska 18/parking,"Wynajmę lokal mieszkalny o powierzchni 90 m2 na 3 piętrze składające się z:\n- 3 niezależnych pokoi,\n- salonu z aneksem kuchennym z wyjściem na balkon,\n- dużego przedpokoju,\n- łazienki.\n\nMieszkanie wyposażone w szafy, biurka, krzesła biurowe jest idealną opcją na wynajęcie pod firmę. \nIstnieje także możliwość doposażenia bądź zabrania tych mebli, które są obecnie. \nKuchnia i łazienka są wyposażone (lodówka, piekarnik, kuchenka, pralka).\n\nW kamienicy znajduje się 6 lokali z przeznaczeniem na cele biurowe (kancelarie notarialne itd.) oraz mieszkalne. Kamienica z zamykanym bramą z podwórzem, na którym bez problemu można parkować.\n\nW podanym czynszu zaliczka na wodę i śmieci, dodatkowo płatne prąd i gaz.\n\nBEZ PROWIZJI",4100.0,Katowice


In [19]:
pd.set_option('display.max_colwidth', 50)

##### locations

In [20]:
set(df_otodom_cleaned[('location', 'city')])

{'Bytom',
 'Chorzów',
 'Dąbrowa Górnicza',
 'Katowice',
 'Siemianowice Śląskie',
 'Sosnowiec',
 'będziński',
 'myszkowski',
 'zawierciański'}

In [21]:
set(df_otodom_cleaned[('location', 'voivodeship')])

{'śląskie'}

Textual Data Analysis

In [22]:
df_otodom_cleaned[('listing', 'summary_description')].str.len().max()

3367

In [23]:
df_otodom_cleaned[('listing', 'summary_description')].apply(count_words).max()

473

Max values of the selected columns

In [24]:
df_otodom_cleaned[('size', 'square_meters')].max()

148.0

In [25]:
df_otodom_cleaned[('size', 'square_meters')].min()

17.0

In [26]:
df_otodom_cleaned[('size', 'number_of_rooms')].max()

4

In [27]:
df_otodom_cleaned[('size', 'number_of_rooms')].min()

1

In [28]:
df_otodom_cleaned[('size', 'floor')].value_counts().index.to_list()

[3, 2, 1, 0, 4, 8, 10, 9, 7]

In [29]:
df_otodom_cleaned[('size', 'building_floors')].value_counts()

4     23
3     16
10    10
2      5
5      5
11     4
1      4
9      3
24     3
8      2
14     2
7      2
6      1
Name: (size, building_floors), dtype: Int64

Check if date column is the date format

In [30]:
date_format_regex = r'^\d{4}-\d{2}-\d{2}$'

# Check if each date in the column matches the format
# Perform the assertion directly
assert (df_otodom_cleaned[('legal_and_availability', 'available_from')].dropna().str.match(date_format_regex)).all(), "Not all dates match the required format"


#####  2.2.3 Translate Polish to English
`Listing | title`, `Listing | summary_description` are not translated due to losing context by using a translation

listing

In [31]:
df_otodom_cleaned[('listing', 'remote_service')] = df_otodom_cleaned[('listing', 'remote_service')].map(
    {'Obsługa zdalnaZapytaj': np.NaN, 
     'Obsługa zdalnatak': 'unspecified', 
     'Obsługa zdalnaFilm': 'video',
     'Obsługa zdalnaWirtualny spacer': 'virtual_tour',
     'Obsługa zdalnaFilmWirtualny spacer': 'video_virtual_tour',
     }
    )
df_otodom_cleaned[('listing', 'remote_service')].value_counts(dropna=False)

NaN             75
unspecified     17
virtual_tour     2
video            2
Name: (listing, remote_service), dtype: int64

legal_and_availability

In [32]:
df_otodom_cleaned[('legal_and_availability', 'completion')] = df_otodom_cleaned[('legal_and_availability', 'completion')].map(
    {'do zamieszkania': 'ready_to_move_in', 
     'do remontu': 'in_need_of_renovation', 
     'do wykończenia': 'unfinished'}
    )
df_otodom_cleaned[('legal_and_availability', 'completion')].value_counts()

ready_to_move_in    89
Name: (legal_and_availability, completion), dtype: int64

In [33]:
df_otodom_cleaned[('legal_and_availability', 'ownership')]= df_otodom_cleaned[('legal_and_availability', 'ownership')].map(
    {'biuro nieruchomości': 'real_estate_agency', 
     'prywatny': 'private', 
     'deweloper': 'developer'}
     )
df_otodom_cleaned[('legal_and_availability', 'ownership')].value_counts()

real_estate_agency    53
private               43
Name: (legal_and_availability, ownership), dtype: int64

In [34]:
df_otodom_cleaned[('legal_and_availability', 'rent_to_students')] = df_otodom_cleaned[('legal_and_availability', 'rent_to_students')].map({'brak informacji': np.NaN, 'tak': True, 'nie': False})
df_otodom_cleaned[('legal_and_availability', 'rent_to_students')].value_counts(dropna=False)

NaN     62
True    34
Name: (legal_and_availability, rent_to_students), dtype: int64

type_and_year

In [35]:
df_otodom_cleaned['type_and_year'].head()

Subcategory,building_type,build_year
0,blok,2022.0
1,apartamentowiec,2014.0
2,blok,
3,blok,1965.0
4,blok,


In [36]:
df_otodom_cleaned[('type_and_year', 'building_type')].value_counts()

blok                55
kamienica           16
apartamentowiec     15
dom wolnostojący     4
Name: (type_and_year, building_type), dtype: int64

In [37]:
df_otodom_cleaned[('type_and_year', 'building_type')] = df_otodom_cleaned[('type_and_year', 'building_type')].map({
    'blok': 'block_of_flats', 
    'apartamentowiec': 'apartment_building', 
    'kamienica': 'historic_apartment_building',
    'dom wolnostojący': 'detached_house',
    'szeregowiec': 'terraced_house',
    })
df_otodom_cleaned[('type_and_year', 'building_type')].value_counts(dropna=False)

block_of_flats                 55
historic_apartment_building    16
apartment_building             15
NaN                             6
detached_house                  4
Name: (type_and_year, building_type), dtype: int64

##### Change data types

bool

In [38]:
df_otodom_cleaned[('legal_and_availability', 'rent_to_students')] = df_otodom_cleaned[('legal_and_availability', 'rent_to_students')].fillna(False).astype('boolean')
df_otodom_cleaned[('legal_and_availability', 'rent_to_students')].head()

0     True
1    False
2    False
3     True
4    False
Name: (legal_and_availability, rent_to_students), dtype: boolean

In [39]:
df_otodom_cleaned[('legal_and_availability', 'rent_to_students')].value_counts(dropna=False)

False    62
True     34
<NA>      0
Name: (legal_and_availability, rent_to_students), dtype: Int64

In [40]:
df_otodom_cleaned['equipment'].head()

Subcategory,no_information,stove,fridge,furniture,oven,washing_machine,TV,dishwasher
0,0,1,1,1,1,1,1,1
1,0,1,1,1,1,1,1,1
2,0,1,1,1,1,1,1,0
3,0,0,1,1,0,1,0,1
4,1,0,0,0,0,0,0,0


In [41]:
for col in df_otodom_cleaned['equipment'].columns:
    df_otodom_cleaned[('equipment', col)] = df_otodom_cleaned[('equipment', col)].fillna(0).astype(bool)
df_otodom_cleaned['equipment'].head()

Subcategory,no_information,stove,fridge,furniture,oven,washing_machine,TV,dishwasher
0,False,True,True,True,True,True,True,True
1,False,True,True,True,True,True,True,True
2,False,True,True,True,True,True,True,False
3,False,False,True,True,False,True,False,True
4,True,False,False,False,False,False,False,False


In [42]:
for col in df_otodom_cleaned['media_types'].columns:
    df_otodom_cleaned[('media_types', col)] = df_otodom_cleaned[('media_types', col)].fillna(0).astype(bool)
df_otodom_cleaned['media_types'].head()

Subcategory,no_information,internet,telephone,cable_TV
0,False,True,False,True
1,True,False,False,False
2,True,False,False,False
3,False,True,False,False
4,False,True,True,True


In [43]:
for col in df_otodom_cleaned['heating'].columns:
    df_otodom_cleaned[('heating', col)] = df_otodom_cleaned[('heating', col)].fillna(0).astype(bool)
df_otodom_cleaned['heating'].head()

Subcategory,no_information,electric,gas,other,boiler_room,district,tile_stove
0,False,False,False,False,False,True,False
1,False,False,False,False,False,True,False
2,False,False,False,False,False,True,False
3,False,False,False,False,False,True,False
4,False,False,False,False,False,True,False


In [44]:
for col in df_otodom_cleaned['security'].columns:
    df_otodom_cleaned[('security', col)] = df_otodom_cleaned[('security', col)].fillna(0).astype(bool)
df_otodom_cleaned['security'].head()

Subcategory,no_information,intercom_or_video_intercom,anti_burglary_doors_or_windows,monitoring_or_security,anti_burglary_roller_blinds,alarm_system,enclosed_area
0,False,True,True,True,False,False,True
1,False,True,False,True,False,False,False
2,False,False,True,False,False,False,False
3,False,True,True,False,False,False,False
4,True,False,False,False,False,False,False


In [45]:
for col in df_otodom_cleaned['windows'].columns:
    df_otodom_cleaned[('windows', col)] = df_otodom_cleaned[('windows', col)].fillna(0).astype(bool)
df_otodom_cleaned['windows'].head()

Subcategory,aluminum,no_information,wooden,plastic
0,False,False,False,True
1,False,True,False,False
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True


In [46]:
for col in df_otodom_cleaned['building_material'].columns:
    df_otodom_cleaned[('building_material', col)] = df_otodom_cleaned[('building_material', col)].fillna(0).astype(bool)
df_otodom_cleaned['building_material'].head()

Subcategory,concrete,aerated_concrete,no_information,brick,wood,other,lightweight_aggregate,hollow_brick,silicate,large_panel,reinforced_concrete
0,False,True,False,False,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,False,False,False


In [47]:
for col in df_otodom_cleaned['additional_information'].columns:
    df_otodom_cleaned[('additional_information', col)] = df_otodom_cleaned[('additional_information', col)].fillna(0).astype(bool)
df_otodom_cleaned['additional_information'].head()

Subcategory,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
0,False,False,True,False,False,False,True
1,False,False,False,True,True,False,True
2,False,False,False,False,True,False,False
3,True,False,False,False,False,False,False
4,False,False,False,True,True,False,False


Converting selected columns to the strings<br>
*We do not care about backward compatibly, and a `string` is much more readable than a `object`*

In [48]:
columns_to_convert = [
    ('listing', 'link'),
    ('listing', 'title'),
    ('listing', 'summary_description'),
    ('listing', 'remote_service'),
    ('location', 'complete_address'),
    ('location', 'street'),
    ('location', 'city'),
    ('location', 'voivodeship'),
    ('legal_and_availability', 'available_from'),
    ('legal_and_availability', 'completion'),
    ('legal_and_availability', 'ownership'),
    ('type_and_year', 'building_type'),
]

# Convert each column to the pandas string type
for col in columns_to_convert:
    df_otodom_cleaned[col] = df_otodom_cleaned[col].astype('string')

In [49]:
df_otodom_cleaned.dtypes.to_dict()

{('listing', 'link'): string[python],
 ('listing', 'title'): string[python],
 ('listing', 'summary_description'): string[python],
 ('listing', 'remote_service'): string[python],
 ('pricing', 'price'): dtype('float64'),
 ('pricing', 'rent'): dtype('float64'),
 ('pricing', 'total_rent'): dtype('float64'),
 ('pricing', 'deposit'): dtype('float64'),
 ('location', 'complete_address'): string[python],
 ('location', 'street'): string[python],
 ('location', 'city'): string[python],
 ('location', 'voivodeship'): string[python],
 ('size', 'square_meters'): dtype('float64'),
 ('size', 'number_of_rooms'): Int64Dtype(),
 ('size', 'floor'): Int64Dtype(),
 ('size', 'attic'): dtype('bool'),
 ('size', 'building_floors'): Int64Dtype(),
 ('legal_and_availability', 'available_from'): string[python],
 ('legal_and_availability', 'completion'): string[python],
 ('legal_and_availability', 'ownership'): string[python],
 ('legal_and_availability', 'rent_to_students'): BooleanDtype,
 ('type_and_year', 'building_

## 3. Save cleaned data

### 3.1. Save data

In [50]:
data_path_manager.save_df(df_otodom_cleaned, domain="otodom")

2024-02-26 17:56:02: Saving schema to d:\UserData karol\Documents\Programming\Data Science\Data Engineering\Rent comparisions\Home Market Harvester\data\cleaned\2024_02_26_17_43_23_Mierzęcice__Będziński__Śląskie\otodom_pl_schema.json
2024-02-26 17:56:02: Saving CSV to d:\UserData karol\Documents\Programming\Data Science\Data Engineering\Rent comparisions\Home Market Harvester\data\cleaned\2024_02_26_17_43_23_Mierzęcice__Będziński__Śląskie\otodom.pl.csv


### 3.2 Check saved data

### Otodom

In [51]:
df_otodom_saved = data_path_manager.load_df(domain="otodom", is_cleaned=True)
df_otodom_saved.head()


Unnamed: 0_level_0,listing,listing,listing,listing,pricing,pricing,pricing,pricing,location,location,...,building_material,building_material,building_material,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information
Unnamed: 0_level_1,link,title,summary_description,remote_service,price,rent,total_rent,deposit,complete_address,street,...,silicate,large_panel,reinforced_concrete,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
0,https://www.otodom.pl/pl/oferta/loft-debowe-ta...,Loft Dębowe Tarasy Klimatyzacja 3 pokoje LOFT NEW,Nowoczesny apartament z doskonałą lokalizacją ...,,3250.0,550.0,3800.0,5000.0,"Dębowe Tarasy, Dąb, Katowice, śląskie",Dębowe Tarasy,...,False,False,False,False,False,True,False,False,False,True
1,https://www.otodom.pl/pl/oferta/eleganckie-mie...,Eleganckie mieszkanie nieopodal Parku Śląskiego,Wynajmę mieszkanie znajdujące się w Chorzowie ...,,2300.0,800.0,3100.0,5000.0,"ul. Świerkowa, Chorzów Stary, Chorzów, śląskie",ul. Świerkowa,...,False,False,False,False,False,False,True,True,False,True
2,https://www.otodom.pl/pl/oferta/mieszkanie-2-p...,"Mieszkanie 2 pokoje, widna kuchnia. Poręba. NOWE.",Dzień dobry. Do wynajęcia mieszkanie 2 pokoje ...,,1500.0,600.0,2100.0,2000.0,"ul. Zakładowa 5, Poręba, zawierciański, śląskie",ul. Zakładowa 5,...,False,False,False,False,False,False,False,True,False,False
3,https://www.otodom.pl/pl/oferta/mieszkanie-2-p...,"Mieszkanie 2 pokojowe do wynajęcia,",Do wynajęcia mieszkanie 2 pokojowe. LOKALIZACJ...,unspecified,1600.0,750.0,2350.0,2350.0,"ul. ks. bp. Józefa Gawliny, Śródmieście, Katow...",ul. ks. bp. Józefa Gawliny,...,False,False,False,True,False,False,False,False,False,False
4,https://www.otodom.pl/pl/oferta/2-pok-mieszkan...,"2 pok. mieszkanie na wynajem, Sosnowiec, Niwka",Przedstawiamy Państwu ofertę wynajmu 2 - pokoj...,unspecified,1600.0,600.0,2200.0,2000.0,"ul. Zagłębiowska, Niwka, Sosnowiec, śląskie",ul. Zagłębiowska,...,False,False,False,False,False,False,True,True,False,False


In [52]:
are_identical = df_otodom_saved.equals(df_otodom_cleaned)

if not are_identical:
    message = "The saved DataFrame is not identical to the original one."
    raise ValueError(message)

else:
    print("The saved DataFrame is identical to the original one.")

The saved DataFrame is identical to the original one.
