# EDA

## 1. Load cleaned data

In [23]:
# Standard imports
from pathlib import Path
import os
import sys

def print_project_root():
    """
    Prints the directory three levels above the current working directory
    and appends it to sys.path when it is not listed there yet.
    """
    root_candidate = Path.cwd()

    # Climb three directory levels, one parent at a time
    for _ in range(3):
        root_candidate = root_candidate.parent

    print(f"The root directory of the project is: {root_candidate}")

    root_entry = str(root_candidate)
    if root_entry in sys.path:
        return
    sys.path.append(root_entry)

print_project_root()

# Suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Third party imports
import numpy as np
import pandas as pd

# Local imports
from pipeline.src._csv_utils import DataPathCleaningManager

In [24]:
# The timeplace identifier is read from the environment and is mandatory
data_timeplace = os.getenv("MARKET_OFFERS_TIMEPLACE")
if data_timeplace is None:
    raise ValueError("The environment variable MARKET_OFFERS_TIMEPLACE is not set.")

data_path_manager = DataPathCleaningManager(data_timeplace)

# Load each source in turn; a missing file is reported and stored as None
loaded_frames = {}
for domain_name in ("olx", "otodom"):
    try:
        loaded_frames[domain_name] = data_path_manager.load_df(domain=domain_name, is_cleaned=True)
    except FileNotFoundError as missing_file_error:
        print(missing_file_error)
        loaded_frames[domain_name] = None

df_olx = loaded_frames["olx"]
df_otodom = loaded_frames["otodom"]

In [None]:
# Stop here when neither source produced a dataframe
if all(frame is None for frame in (df_olx, df_otodom)):
    raise ValueError("No dataframes were loaded.")

### 1.1 OLX

In [25]:
# Preview the OLX frame; df_olx is None (and nothing is rendered) when the cleaned OLX file was not found
df_olx

Unnamed: 0,link,title,price,summary_description,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent,voivodeship,city,street
0,https://www.olx.pl/d/oferta/kawalerka-tychy-os...,Kawalerka Tychy oś M,1500.0,OpisWynajmę kawalerkę na ul Moniuszki 1/16 .na...,private,3,False,apartment_building,26,1,400.0,Śląskie,Zawada,Moniuszki 1/16


In [26]:
# Show the OLX column labels. Written as a single expression because Jupyter
# only renders a cell's last top-level expression: a value nested inside an
# `if` statement is not displayed. Evaluates to None (no output) when the
# OLX file was not loaded.
df_olx.columns if df_olx is not None else None

Index(['link', 'title', 'price', 'summary_description', 'ownership',
       'floor_level', 'is_furnished', 'building_type', 'square_meters',
       'number_of_rooms', 'rent', 'voivodeship', 'city', 'street'],
      dtype='object')

### 1.2 otodom

In [27]:
# Show five random Otodom rows. Written as a single expression because Jupyter
# only renders a cell's last top-level expression: a value nested inside an
# `if` statement is not displayed. Evaluates to None (no output) when the
# Otodom file was not loaded.
df_otodom.sample(5) if df_otodom is not None else None

Unnamed: 0_level_0,listing,listing,listing,listing,pricing,pricing,pricing,pricing,location,location,...,building_material,building_material,building_material,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information
Unnamed: 0_level_1,link,title,summary_description,remote_service,price,rent,total_rent,deposit,complete_address,street,...,silicate,large_panel,reinforced_concrete,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
259,https://www.otodom.pl/pl/oferta/wynajme-mieszk...,Wynajmę mieszkanie,Mieszkanie znajduje się na ul.Pułaskiego koło ...,,1100.0,750.0,1850.0,2000.0,"ul. Kazimierza Pułaskiego, Śródmieście, Bytom,...",ul. Kazimierza Pułaskiego,...,False,False,False,False,False,False,True,True,False,False
934,https://www.otodom.pl/pl/oferta/luksusowy-4-po...,Luksusowy 4 pok. apartament w Katowice-Brynów,Luksusowy 4 pok. apartament w Katowice-Brynów ...,,6000.0,,6000.0,13000.0,"Brynów-Osiedle Zgrzebnioka, Katowice, śląskie",Brynów-Osiedle Zgrzebnioka,...,False,False,False,True,False,False,False,False,False,False
755,https://www.otodom.pl/pl/oferta/apartament-na-...,Apartament na nowym Osiedlu,Na wynajem apartament TYPU STUDIO w Nowym Apar...,unspecified,1600.0,,1600.0,2000.0,"Śródmieście, Śródmieście, Dąbrowa Górnicza, śl...",Śródmieście,...,False,False,False,True,False,False,False,False,False,False
915,https://www.otodom.pl/pl/oferta/mieszkanie-w-s...,Mieszkanie w śródmieściu ul. PCK 19,Wolne od 25 września 2023 roku. Mieszkanie...,,1450.0,,1450.0,2900.0,"Śródmieście, Katowice, śląskie",Śródmieście,...,False,False,False,False,False,True,True,False,False,False
1058,https://www.otodom.pl/pl/oferta/nowe-mieszkani...,Nowe Mieszkanie 4 pok. Świętochłowice Centrum ...,Dostępne.Oferujemy Państwu do wynajęcia nowe m...,,1650.0,,1650.0,,"Świętochłowice, śląskie",,...,False,False,False,False,False,False,True,False,False,False


In [28]:
# Show the Otodom column labels. Written as a single expression because Jupyter
# only renders a cell's last top-level expression: a value nested inside an
# `if` statement is not displayed. Evaluates to None (no output) when the
# Otodom file was not loaded.
df_otodom.columns if df_otodom is not None else None

MultiIndex([(               'listing',                           'link'),
            (               'listing',                          'title'),
            (               'listing',            'summary_description'),
            (               'listing',                 'remote_service'),
            (               'pricing',                          'price'),
            (               'pricing',                           'rent'),
            (               'pricing',                     'total_rent'),
            (               'pricing',                        'deposit'),
            (              'location',               'complete_address'),
            (              'location',                         'street'),
            (              'location',                           'city'),
            (              'location',                    'voivodeship'),
            (                  'size',                  'square_meters'),
            (                  'size',

### 1.3 Combined

In [29]:
def safely_convert_dtypes(df, dtype_specs) -> pd.DataFrame:
    """
    Converts column data types in a DataFrame according to the specified data types.
    A column whose conversion fails keeps its current dtype and a warning is printed.

    Args:
    - df: pandas DataFrame to convert. Its columns are reassigned in place.
    - dtype_specs: Dictionary specifying the target data type for each column.
                   Entries for columns that df does not have are skipped.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in.
    """
    for column, target_dtype in dtype_specs.items():
        # Convert only columns that exist to prevent KeyErrors
        if column not in df.columns:
            continue
        try:
            df[column] = df[column].astype(target_dtype)
        except (ValueError, TypeError) as e:
            # astype reports an unsafe cast (for example 1.5 -> Int64) with
            # TypeError rather than ValueError, so both are treated as a
            # failed conversion instead of aborting the whole loop.
            print(f"Warning: Could not convert column {column} to {target_dtype}. Error: {e}")
    return df

def ensure_multiindex(df, combined_schema) -> pd.DataFrame:
    """
    Ensures the DataFrame's columns are a two-level MultiIndex.

    A frame that already has MultiIndex columns is returned untouched.
    Otherwise each of the frame's own labels is converted: a string label of
    the form "group, field" is split on ", ", any other label becomes
    (label, '').

    Args:
    - df: DataFrame to adjust. Its columns are replaced in place.
    - combined_schema: Kept for backward compatibility with existing callers;
                       it is not consulted. The previous implementation built
                       the labels from this mapping's keys instead of from
                       df's columns, which relabelled columns by position and
                       failed whenever the key count differed from the column
                       count.

    Returns:
    - pd.DataFrame: The same DataFrame object, with MultiIndex columns.
    """
    if isinstance(df.columns, pd.MultiIndex):
        return df

    def _as_pair(label):
        # Only strings can carry the "group, field" encoding
        if isinstance(label, str) and ", " in label:
            return tuple(label.split(", "))
        return (label, '')

    df.columns = pd.MultiIndex.from_tuples([_as_pair(label) for label in df.columns])
    return df

def align_columns_to_schema(df, combined_schema) -> pd.DataFrame:
    """
    Aligns DataFrame columns to the combined schema.

    The result contains exactly the schema's columns, in schema order.
    Columns the frame lacks are created and filled with NaN; columns that are
    not part of the schema are dropped.

    Args:
    - df: DataFrame to align. It is not modified.
    - combined_schema: Mapping whose keys are the target column labels.

    Returns:
    - pd.DataFrame: A new DataFrame aligned to the schema.
    """
    target_columns = list(combined_schema.keys())

    # reindex both inserts the missing columns (as NaN) and orders the result,
    # without first adding columns to the caller's frame one at a time.
    return df.reindex(columns=target_columns)

# Flat OLX column name -> (group, field) column of the combined frame
_OLX_COLUMN_MAPPING = {
    'link': ('listing', 'link'),
    'title': ('listing', 'title'),
    'price': ('pricing', 'price'),
    'rent': ('pricing', 'rent'),
    'summary_description': ('listing', 'summary_description'),
    'ownership': ('legal_and_availability', 'ownership'),
    'floor_level': ('size', 'floor'),
    'is_furnished': ('equipment', 'furniture'),
    'building_type': ('type_and_year', 'building_type'),
    'square_meters': ('size', 'square_meters'),
    'number_of_rooms': ('size', 'number_of_rooms'),
    'voivodeship': ('location', 'voivodeship'),
    'city': ('location', 'city'),
    'street': ('location', 'street')
}

# Columns whose missing values are replaced with False
_OLX_FILL_FALSE_COLUMNS = [
    ('size', 'attic'),
    ('amenities', 'elevator'),
    ('amenities', 'parking_space'),
    ('equipment', 'no_information'),
    ('equipment', 'stove'),
    ('equipment', 'fridge'),
    ('equipment', 'oven'),
    ('equipment', 'washing_machine'),
    ('equipment', 'TV'),
    ('equipment', 'dishwasher'),
    ('media_types', 'internet'),
    ('media_types', 'telephone'),
    ('media_types', 'cable_TV'),
    ('heating', 'electric'),
    ('heating', 'gas'),
    ('heating', 'other'),
    ('heating', 'boiler_room'),
    ('heating', 'district'),
    ('heating', 'tile_stove'),
    ('security', 'intercom_or_video_intercom'),
    ('security', 'anti_burglary_doors_or_windows'),
    ('security', 'monitoring_or_security'),
    ('security', 'anti_burglary_roller_blinds'),
    ('security', 'alarm_system'),
    ('security', 'enclosed_area'),
    ('windows', 'aluminum'),
    ('windows', 'wooden'),
    ('windows', 'plastic'),
    ('building_material', 'concrete'),
    ('building_material', 'aerated_concrete'),
    ('building_material', 'brick'),
    ('building_material', 'wood'),
    ('building_material', 'other'),
    ('building_material', 'lightweight_aggregate'),
    ('building_material', 'hollow_brick'),
    ('building_material', 'silicate'),
    ('building_material', 'large_panel'),
    ('building_material', 'reinforced_concrete'),
    ('additional_information', 'duplex'),
    ('additional_information', 'air_conditioning'),
    ('additional_information', 'separate_kitchen'),
    ('additional_information', 'basement'),
    ('additional_information', 'utility_room'),
    ('additional_information', 'non_smokers_only'),
]

# Columns whose missing values are replaced with True
_OLX_FILL_TRUE_COLUMNS = [
    ('media_types', 'no_information'),
    ('heating', 'no_information'),
    ('security', 'no_information'),
    ('windows', 'no_information'),
    ('building_material', 'no_information'),
    ('additional_information', 'no_information'),
]


def _join_address_parts(*parts) -> str:
    """Joins the non-empty string parts with ', ' and ignores everything else (NaN, pd.NA, None)."""
    return ', '.join(part for part in parts if isinstance(part, str) and part)


def transform_olx(df_olx: pd.DataFrame, combined_df_schema_json: dict) -> pd.DataFrame:
    """
    Transforms the df_olx DataFrame to align with the combined DataFrame schema,
    including converting column names to a MultiIndex format, filling missing columns,
    adding calculated columns, and ensuring data types match the combined schema.

    The input frame is not modified. Relabelling it in place (as an earlier
    version did) made a second run of the calling notebook cell operate on
    already-converted columns.

    Args:
    - df_olx (pd.DataFrame): DataFrame containing data from OLX, with flat column names.
    - combined_df_schema_json (dict): Schema definition for the combined DataFrame.
                                      Its "dtypes" entry maps each (group, field)
                                      column to a target data type.

    Returns:
    - pd.DataFrame: Transformed copy of df_olx aligned with the combined DataFrame schema.

    Raises:
    - KeyError: If the schema lacks a column this function fills or computes from.
    """
    schema_dtypes = combined_df_schema_json["dtypes"]
    schema_columns = list(schema_dtypes.keys())

    olx = df_olx.copy()

    # Step 1: Relabel the flat columns as (group, field) pairs; unmapped
    # columns become (name, '')
    olx.columns = pd.MultiIndex.from_tuples(
        [_OLX_COLUMN_MAPPING.get(col, (col, '')) for col in olx.columns]
    )

    # Step 2: Align to the schema. reindex creates every missing column as NaN
    # and drops columns outside the schema. The furniture flag is the one
    # column that defaults to False rather than NaN when OLX did not supply it.
    furniture_col = ('equipment', 'furniture')
    furniture_absent = furniture_col in schema_columns and furniture_col not in olx.columns
    # ensure_multiindex is not called here: the columns are a MultiIndex from Step 1 on
    olx = olx.reindex(columns=schema_columns)
    if furniture_absent:
        olx[furniture_col] = False

    # Step 3: Add calculated columns
    # fill_value=0 lets a missing rent (or price) count as 0 in the sum
    olx[('pricing', 'total_rent')] = olx[('pricing', 'price')].add(olx[('pricing', 'rent')], fill_value=0)
    # Missing address components are skipped; a row with none yields ''
    olx[('location', 'complete_address')] = [
        _join_address_parts(street, city, voivodeship)
        for street, city, voivodeship in zip(
            olx[('location', 'street')],
            olx[('location', 'city')],
            olx[('location', 'voivodeship')],
        )
    ]

    # Step 4: Replace NaNs with the default for each flag column
    for col in _OLX_FILL_FALSE_COLUMNS:
        olx[col] = olx[col].fillna(False)

    for col in _OLX_FILL_TRUE_COLUMNS:
        olx[col] = olx[col].fillna(True)

    # Step 5: Safely convert data types according to the combined schema
    olx = safely_convert_dtypes(olx, schema_dtypes)

    return olx

def transform_otodom(df_otodom: pd.DataFrame, combined_df_schema_json: dict) -> pd.DataFrame:
    """
    Transforms the df_otodom DataFrame to align with the combined DataFrame schema.
    This includes ensuring columns match the MultiIndex format of the combined schema,
    filling missing columns, and converting data types according to the schema.

    The input frame is not modified; the work is done on a copy.

    Args:
    - df_otodom (pd.DataFrame): DataFrame containing data from Otodom.
    - combined_df_schema_json (dict): Schema definition for the combined DataFrame.
                                      Its "dtypes" entry maps each column to a
                                      target data type.

    Returns:
    - pd.DataFrame: Transformed copy of df_otodom aligned with the combined DataFrame schema.
    """
    schema_dtypes = combined_df_schema_json["dtypes"]

    # Step 1: Ensure schema compatibility with MultiIndex
    otodom = ensure_multiindex(df_otodom.copy(), combined_df_schema_json)

    # Step 2: Identify and fill missing columns based on the combined schema.
    # The comparison uses the full column keys. Comparing against level-0
    # labels only (as before) marks every (group, field) key as missing, so
    # the existing data gets overwritten with the defaults below.
    existing_columns = set(otodom.columns)
    boolean_columns = [('equipment', 'furniture'),]  # Add other boolean columns if any

    for col in list(schema_dtypes.keys()):
        if col in existing_columns:
            continue
        otodom[col] = False if col in boolean_columns else np.nan

    # Step 3: Reorder columns to match the combined DataFrame schema
    otodom = align_columns_to_schema(otodom, schema_dtypes)

    # Step 4: Safely convert data types according to the combined schema
    otodom = safely_convert_dtypes(otodom, schema_dtypes)

    return otodom

def transform_combined_df(combined_df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds calculated pricing columns to the combined DataFrame.

    Computed columns:
    - ("pricing", "deposit_ratio"): deposit / total_rent rounded to 2 decimals,
      NaN where total_rent equals 0.
    - ("pricing", "total_rent_sqm"): total_rent / square_meters, with infinite
      results replaced by NaN.

    Columns that already exist keep their position; new ones are appended.

    Args:
    - combined_df: DataFrame to transform. It is not modified.

    Returns:
    - pd.DataFrame: A copy of combined_df with the calculated columns set.
    """
    # Copy first so the caller's frame is left as it was
    result = combined_df.copy()

    total_rent = result[("pricing", "total_rent")]
    deposit = result[("pricing", "deposit")]
    square_meters = result[("size", "square_meters")]

    # Step 1: Add deposit_ratio column
    result.loc[:, ("pricing", "deposit_ratio")] = np.where(
        total_rent != 0,
        (deposit / total_rent).round(2),
        np.nan  # or 0, depending on your preference for default value
    )

    # Step 2: Add total price per square meter column
    result.loc[:, ("pricing", "total_rent_sqm")] = (total_rent / square_meters).replace(
        [np.inf, -np.inf], np.nan
    )

    return result

def combine_olx_otodom(df_olx: pd.DataFrame, df_otodom: pd.DataFrame) -> pd.DataFrame:
    """
    Stacks the two DataFrames into one, Otodom rows first and OLX rows after,
    and renumbers the index from 0.
    """
    frames_in_order = (df_otodom, df_olx)
    return pd.concat(frames_in_order, ignore_index=True)

def create_combined_df(df_olx: pd.DataFrame, df_otodom: pd.DataFrame, combined_df_schema_json: dict) -> pd.DataFrame:
    """
    Builds the combined DataFrame: each available source is transformed to the
    combined schema, the results are stacked when both exist, and the
    calculated columns are added at the end.

    Args:
    - df_olx: DataFrame containing OLX data, or None.
    - df_otodom: DataFrame containing Otodom data, or None.
    - combined_df_schema_json: Schema definition passed to both transforms.

    Returns:
    - pd.DataFrame: The combined DataFrame.

    Raises:
    - ValueError: If both input DataFrames are None.
    """
    if df_olx is None and df_otodom is None:
        raise ValueError("Both dataframes are None.")

    # Transform each source that is present (OLX first, then Otodom)
    olx_ready = None if df_olx is None else transform_olx(df_olx, combined_df_schema_json)
    otodom_ready = None if df_otodom is None else transform_otodom(df_otodom, combined_df_schema_json)

    # Use the single available frame, or stack the two
    if olx_ready is None:
        merged = otodom_ready
    elif otodom_ready is None:
        merged = olx_ready
    else:
        merged = combine_olx_otodom(olx_ready, otodom_ready)

    return transform_combined_df(merged)

# Load the schema for the combined frame; the transform functions read its "dtypes" entry
combined_df_schema_json = data_path_manager.load_schema("combined")

# Transform whichever source frames were loaded and merge them into one frame
combined_df = create_combined_df(df_olx, df_otodom, combined_df_schema_json)

# Show the last five rows of the result
combined_df.tail()

Unnamed: 0_level_0,listing,listing,listing,listing,pricing,pricing,pricing,pricing,pricing,pricing,...,building_material,building_material,building_material,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information
Unnamed: 0_level_1,link,title,summary_description,remote_service,price,rent,total_rent,total_rent_sqm,deposit,deposit_ratio,...,silicate,large_panel,reinforced_concrete,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
1101,https://www.otodom.pl/pl/oferta/katowice-ligot...,Katowice Ligota Koszalińska pokój blisko Śum,Wynajem pokoju w bardzo dobrej lokalizacji. Ni...,,800.0,,800.0,61.538462,1200.0,1.5,...,False,False,False,False,False,False,True,False,False,False
1102,https://www.otodom.pl/pl/oferta/katowice-centr...,Katowice centrum 2 pokoje dla studentów,OFERTA POLECANA! Oferujemy do wynajęcia przytu...,virtual_tour,1499.0,,1499.0,39.447368,3000.0,2.0,...,False,False,False,False,False,False,True,True,False,False
1103,https://www.otodom.pl/pl/oferta/kawalerka-bryn...,Kawalerka Brynów,Kawalerka Katowice Brynów bardzo spokojne miej...,,1300.0,,1300.0,38.235294,2600.0,2.0,...,False,False,False,False,False,False,True,True,False,False
1104,https://www.otodom.pl/pl/oferta/wynajme-mieszk...,Wynajmę Mieszkanie / Pogoń / Sosnowiec,"Wynajmę mieszkanie 56 m2 , dwa pokoje , kuchni...",,700.0,,700.0,12.5,2000.0,2.86,...,False,False,False,False,False,False,False,True,False,False
1105,https://www.olx.pl/d/oferta/kawalerka-tychy-os...,Kawalerka Tychy oś M,OpisWynajmę kawalerkę na ul Moniuszki 1/16 .na...,,1500.0,400.0,1900.0,73.076923,,,...,False,False,False,True,False,False,False,False,False,False


In [30]:
# Put pandas' display.max_rows option back to its default value
pd.reset_option('display.max_rows')

In [31]:
# List the dtype of every column of the combined frame as a {column: dtype} mapping
combined_df.dtypes.to_dict()

{('listing', 'link'): string[python],
 ('listing', 'title'): string[python],
 ('listing', 'summary_description'): string[python],
 ('listing', 'remote_service'): string[python],
 ('pricing', 'price'): dtype('float64'),
 ('pricing', 'rent'): dtype('float64'),
 ('pricing', 'total_rent'): dtype('float64'),
 ('pricing', 'total_rent_sqm'): Float64Dtype(),
 ('pricing', 'deposit'): dtype('float64'),
 ('pricing', 'deposit_ratio'): dtype('float64'),
 ('location', 'complete_address'): string[python],
 ('location', 'street'): string[python],
 ('location', 'city'): string[python],
 ('location', 'voivodeship'): string[python],
 ('size', 'square_meters'): Int64Dtype(),
 ('size', 'number_of_rooms'): Int64Dtype(),
 ('size', 'floor'): Int64Dtype(),
 ('size', 'attic'): dtype('bool'),
 ('size', 'building_floors'): Int64Dtype(),
 ('legal_and_availability', 'available_from'): string[python],
 ('legal_and_availability', 'completion'): string[python],
 ('legal_and_availability', 'ownership'): string[python],

Save the combined DataFrame, then reload it and check the result.

In [32]:
# Hand the combined frame to the path manager for saving under the "combined" domain
data_path_manager.save_df(combined_df, domain="combined")

In [33]:
# Reload the "combined" frame through the path manager and rebind combined_df to it,
# so the cells below inspect the reloaded data rather than the in-memory result.
# NOTE(review): this calls the underscore-prefixed _load_cleaned_df, while the load
# cell at the top uses load_df(domain=..., is_cleaned=True) - confirm whether the
# public method accepts domain="combined" and prefer it if so.
combined_df = data_path_manager._load_cleaned_df(domain="combined")
# Preview five randomly chosen rows
combined_df.sample(5)

Unnamed: 0_level_0,listing,listing,listing,listing,pricing,pricing,pricing,pricing,pricing,pricing,...,building_material,building_material,building_material,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information,additional_information
Unnamed: 0_level_1,link,title,summary_description,remote_service,price,rent,total_rent,total_rent_sqm,deposit,deposit_ratio,...,silicate,large_panel,reinforced_concrete,no_information,duplex,air_conditioning,separate_kitchen,basement,utility_room,non_smokers_only
863,https://www.otodom.pl/pl/oferta/dwa-pokoje-po-...,Dwa pokoje po remoncie w Piekarach Śl .Brzozowice,Oferuję do wynajęcia lokal mieszkalny usytuowa...,,1550.0,,1550.0,31.632653,3000.0,1.94,...,False,False,False,False,False,False,False,True,False,False
672,https://www.otodom.pl/pl/oferta/piekne-2-pok-u...,Piękne 2 Pok! Umeblowane I Wyposażone! Balkon!...,PROMINENTNA LOKALIZACJA! ATRAKCYJNY UKŁAD POMI...,,2400.0,,2400.0,58.536585,,,...,False,False,False,True,False,False,False,False,False,False
291,https://www.otodom.pl/pl/oferta/debowe-tarasy-...,Dębowe Tarasy 2 pokoje wynajmę bezpośrednio,Wynajmę mieszkanie dwupokojowe na prestiżowym ...,unspecified,2500.0,800.0,3300.0,66.0,6000.0,1.82,...,False,False,False,False,False,False,False,False,True,False
920,https://www.otodom.pl/pl/oferta/duze-mieszkani...,"Duże Mieszkanie Z Balkonem, 4 Oddzielne Pokoje!",WYNAJEM MYSŁOWICE DUŻE MIESZKANIE - 4 ODDZIELN...,,2000.0,,2000.0,27.39726,,,...,False,False,False,False,False,False,False,True,False,False
229,https://www.otodom.pl/pl/oferta/mieszkanie-4-p...,"Mieszkanie 4 pokojowe na granicy Katowic, BYTKÓW",Do wynajęcia piękny przestronny apartament -mi...,,3800.0,770.0,4570.0,43.942308,5000.0,1.09,...,False,False,False,True,False,False,False,False,False,False


In [34]:
# Count the rows whose listing link repeats the link of an earlier row
combined_df[('listing', 'link')].duplicated().sum()

0

In [36]:
# Number of rows in the reloaded combined frame
len(combined_df)

1106