# EDA

## 1. Load combined data

In [1]:
# Standard imports
from pathlib import Path
import os
import sys
import time

def set_project_root():
    """Put the project root on ``sys.path`` and return it.

    The root is taken to be the directory three levels above the current
    working directory. It is appended to ``sys.path`` (and announced with a
    printed message) only when it is not already listed there.

    Returns:
        pathlib.Path: the computed project root directory.
    """
    project_root = Path.cwd()

    # Walk up three levels from the working directory
    for _ in range(3):
        project_root = project_root.parent

    root_entry = str(project_root)
    if root_entry in sys.path:
        # Already importable; nothing to register
        return project_root

    print(f"The root directory of the project is: {project_root}")
    sys.path.append(root_entry)
    return project_root

project_root = set_project_root()

# Suppress future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Third party imports
from geopy.exc import  GeocoderTimedOut, GeocoderUnavailable
from geopy.geocoders import Nominatim
import pandas as pd

# Local imports
from pipeline.config._config_manager import ConfigManager
from pipeline.src._csv_utils import DataPathCleaningManager

The root directory of the project is: d:\UserData karol\Documents\Programming\Data Science\Data Engineering\Rent comparisions\Home Market Harvester


In [2]:
# Read the MARKET_OFFERS_TIMEPLACE value from run_pipeline.conf
TIMEPLACE = "MARKET_OFFERS_TIMEPLACE"
config_file = ConfigManager("run_pipeline.conf")
data_timeplace = config_file.read_value(TIMEPLACE)

# Fail fast when the configuration value is missing
if data_timeplace is None:
    raise ValueError(F"The configuration variable {TIMEPLACE} is not set.")

# Load the cleaned "combined" DataFrame through the data path manager
data_path_manager = DataPathCleaningManager(data_timeplace, project_root)
combined_df = data_path_manager.load_df(domain="combined", is_cleaned=True)


In [3]:
# Frames with fewer than five rows are shown in full.
# Jupyter only auto-renders the last top-level expression of a cell, so a
# frame produced inside an `if` body must be passed to display() explicitly.
if len(combined_df) < 5:
    print(f"The DataFrame has {len(combined_df)} rows.")
    display(combined_df.head())

In [4]:
# Frames with five or more rows: show a random sample of five.
# display() is required because expressions nested inside an `if` body are not
# auto-rendered; random_state keeps the sample stable across Restart & Run All.
if len(combined_df) >= 5:
    display(combined_df.sample(5, random_state=42))

In [5]:
# Number of rows whose listing link repeats an earlier row
combined_df.loc[:, ('listing', 'link')].duplicated().sum()

0

In [6]:
# Total number of rows in the combined DataFrame
combined_df.shape[0]

10

## 2. Create map data and save it

In [7]:
# Nominatim's usage policy requires a User-Agent that identifies the
# application; a generic placeholder value risks requests being rejected.
geolocator = Nominatim(user_agent="home_market_harvester")

def get_coordinates(address, attempt=1, max_attempts=3):
    """Geocode ``address`` with the module-level ``geolocator``.

    Args:
        address: Query passed to ``geolocator.geocode``. ``None``, NaN and
            blank strings are not sent to the service.
        attempt: Number of the first attempt (1-based). Callers normally
            leave the default.
        max_attempts: Highest attempt number that will be tried, so with the
            defaults the service is called at most three times.

    Returns:
        tuple: ``(latitude, longitude)`` on success, otherwise
        ``(None, None)`` (no match, missing address, timeouts exhausted or
        service unavailable).
    """
    # Missing or blank addresses cannot be geocoded; skip the request
    is_nan = isinstance(address, float) and address != address
    is_blank = isinstance(address, str) and not address.strip()
    if address is None or is_nan or is_blank:
        return (None, None)

    while True:
        try:
            location = geolocator.geocode(address, timeout=10)  # 10 second timeout
        except GeocoderTimedOut:
            if attempt >= max_attempts:
                return (None, None)
            # Linear backoff: wait 1s, 2s, ... before the next attempt
            time.sleep(1 * attempt)
            attempt += 1
            continue
        except GeocoderUnavailable:
            # Service unavailable: give up immediately rather than retry
            return (None, None)

        if location:
            return (location.latitude, location.longitude)
        return (None, None)

def add_geo_data_to_offers(df: pd.DataFrame, request_delay: float = 1.0) -> pd.DataFrame:
    """Build a flat map DataFrame with a ``coords`` column.

    Selected location, pricing, size and equipment columns are copied from
    ``df`` under flat names. Each row is then geocoded by its complete
    address; when that yields ``(None, None)`` (or the address is missing)
    the ``"city, voivodeship"`` string is geocoded instead.

    Args:
        df: DataFrame with two-level columns; the ``('location', ...)``,
            ``('pricing', ...)``, ``('size', 'square_meters')`` and
            ``('equipment', 'furniture')`` columns read below must exist.
        request_delay: Seconds to pause after every geocoding request.
            The default of one second follows the Nominatim usage policy
            limit of one request per second. Pass 0 to disable.

    Returns:
        pd.DataFrame: the flat frame; ``coords`` holds
        ``(latitude, longitude)`` tuples, or ``(None, None)`` when nothing
        was found.
    """
    df_temp = pd.DataFrame()
    df_temp['complete_address'] = df[('location', 'complete_address')]
    df_temp['city'] = df[('location', 'city')] + ", " + df[('location', 'voivodeship')]
    df_temp['price_total'] = df[('pricing', 'total_rent')]
    df_temp['price'] = df[('pricing', 'price')]
    df_temp['rent'] = df[('pricing', 'rent')]
    df_temp['rent_sqm'] = df[('pricing', 'total_rent_sqm')]
    df_temp['sqm'] = df[('size', 'square_meters')]
    df_temp['is_furnished'] = df[('equipment', 'furniture')]

    # One request per distinct query: repeated addresses and shared city
    # fallbacks are answered from this cache instead of the server.
    query_cache = {}

    def _geocode_once(query):
        """Return cached coordinates for ``query``, geocoding it on first use."""
        if pd.isna(query):
            return (None, None)
        if query not in query_cache:
            query_cache[query] = get_coordinates(query)
            if request_delay > 0:
                time.sleep(request_delay)
        return query_cache[query]

    def _resolve(address, city):
        """Geocode ``address``, falling back to ``city`` when nothing is found."""
        coords = _geocode_once(address)
        if coords == (None, None):
            coords = _geocode_once(city)
        return coords

    row_coords = [
        _resolve(address, city)
        for address, city in zip(df_temp['complete_address'], df_temp['city'])
    ]

    # An explicit object Series keeps each tuple as a single cell value
    df_temp['coords'] = pd.Series(row_coords, index=df_temp.index, dtype=object)

    return df_temp

In [8]:
# Takes a while to run due to server requests latency
map_df = add_geo_data_to_offers(combined_df)
map_df.head(5)

Unnamed: 0,complete_address,city,price_total,price,rent,rent_sqm,sqm,is_furnished,coords
0,"ul. Władysława Przanowskiego 83, Ulrychów, Wol...","Warszawa, mazowieckie",3000.0,3000.0,,83.33,36.0,True,"(52.2337172, 21.071432235636493)"
1,"ul. Żeglugi Wiślanej, Kobiałka, Białołęka, War...","Warszawa, mazowieckie",2340.0,2340.0,,45.88,51.0,True,"(52.2337172, 21.071432235636493)"
2,"ul. Ostrobramska, Gocław, Praga-Południe, Wars...","Warszawa, mazowieckie",3201.0,3200.0,1.0,74.44,43.0,True,"(52.2337172, 21.071432235636493)"
3,"ul. Erazma z Zakroczymia, Tarchomin, Białołęka...","Warszawa, mazowieckie",3000.0,3000.0,,47.62,63.0,True,"(52.2337172, 21.071432235636493)"
4,"Warszawa, Ursynów, Mazowieckie","Warszawa, Ursynów, Mazowieckie",2800.0,2100.0,700.0,93.33,30.0,True,"(52.1103103, 20.9931266)"


In [9]:
# Persist the map DataFrame under the "map" name via the data path manager
data_path_manager.save_df(map_df, "map")

Saving schema to d:\UserData karol\Documents\Programming\Data Science\Data Engineering\Rent comparisions\Home Market Harvester\data\cleaned\2024_02_09_11_45_45_Warszawa\map_df_schema.json
Saving CSV to d:\UserData karol\Documents\Programming\Data Science\Data Engineering\Rent comparisions\Home Market Harvester\data\cleaned\2024_02_09_11_45_45_Warszawa\map_df.csv


In [10]:
# Reload the saved "map" data from disk
map_df = data_path_manager.load_df(domain="map", is_cleaned=True)

In [11]:
# Count the distinct values in the reloaded coords column
print("Unique geo coordinates:")
len(pd.unique(map_df['coords']))

Unique geo coordinates:


6