# Data Preview

## 1. Set up

In [1]:
# Standard imports
from pathlib import Path
import re
import sys

def set_project_root():
    """
    Resolve the project root from the current working directory and make it importable.

    The root is taken to be the third ancestor of the current working directory.
    Its string form is appended to ``sys.path`` unless it is already listed there.

    Returns:
    pathlib.Path: The directory resolved as the project root.

    Raises:
    IndexError: If the current working directory has fewer than three ancestors.
    """
    root_dir = Path.cwd().parents[2]
    root_entry = str(root_dir)

    already_registered = root_entry in sys.path
    if not already_registered:
        sys.path.append(root_entry)

    return root_dir

# Resolve the project root once; a later cell passes it to the data path manager.
project_root = set_project_root()

### 1.1 Importing Data

In [2]:
# Third-party imports
import pandas as pd

# Local imports
from pipeline.stages._csv_utils import DataPathCleaningManager
from pipeline.config._conf_file_manager import ConfigManager

# Look up the MARKET_OFFERS_TIMEPLACE entry through the config manager for "run_pipeline.conf".
config_file = ConfigManager("run_pipeline.conf")
TIMEPLACE = "MARKET_OFFERS_TIMEPLACE"
data_timeplace = config_file.read_value(TIMEPLACE)

# Stop early when the entry is missing: the path manager below needs it.
# NOTE(review): the message speaks of an environment variable, but the value is obtained via
# config_file.read_value — confirm where ConfigManager actually reads it from.
if data_timeplace is None:

    message = f"The environment variable {TIMEPLACE} is not set."
    raise ValueError(message)

# The manager is built from the looked-up value and the project root; it is reused later
# for saving and re-loading the cleaned frame.
data_path_manager = DataPathCleaningManager(data_timeplace, project_root)

# is_cleaned=False: presumably selects the raw (not yet cleaned) OLX data — verify against load_df.
df_olx = data_path_manager.load_df(domain="olx", is_cleaned=False)

### 1.2 Functions

In [3]:
def count_and_percentage(df, column_name):
    """
    Tabulate how often each distinct value occurs in one column, as a raw count and as a share of all rows.

    Missing values are tallied as a category of their own rather than dropped.

    Parameters:
    df (pandas.DataFrame): Frame holding the column of interest.
    column_name (str): Label of the column to summarise.

    Returns:
    pandas.DataFrame: One row per distinct value, with a 'Count' column and a 'Percentage' column (0-100 scale).

    Raises:
    ValueError: If `column_name` is not one of the frame's columns.
    """
    # Guard clause: refuse unknown columns up front
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

    values = df[column_name]

    # Absolute frequencies and relative frequencies scaled to percent
    tally = values.value_counts(dropna=False)
    share = values.value_counts(dropna=False, normalize=True).mul(100)

    # Place both series side by side; the keys become the column labels
    return pd.concat([tally, share], axis=1, keys=['Count', 'Percentage'])

In [4]:
def count_comma_separated_values(df, column_name):
    """
    Counts the occurrences of individual elements in a comma-separated string column of a DataFrame.

    Each non-missing cell is split on the exact separator ", " (a comma followed by one space);
    the resulting elements are pooled and summarised with `count_and_percentage`.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to analyze.

    Returns:
    pandas.DataFrame: A DataFrame with the count and percentage of each unique element found in the comma-separated values.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        message = f"Column '{column_name}' not found in DataFrame."
        raise ValueError(message)

    # Drop missing cells first so they do not appear as an element of their own,
    # then split, explode to individual elements, and count
    exploded_items = df[column_name].dropna().str.split(', ').explode()
    exploded_df = pd.DataFrame({column_name: exploded_items})
    counts_and_percent = count_and_percentage(exploded_df, column_name)

    return counts_and_percent

In [5]:
def remove_non_numeric_characters(df, column_name):
    """
    Strips every character that is not an ASCII letter from a string column and lists the distinct leftovers.

    NOTE(review): despite the function name, the pattern below keeps the letters a-z / A-Z and
    discards everything else: digits, whitespace, punctuation and accented letters such as "ł".
    The pattern is left as it was; confirm whether a digits-only result was intended instead.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the string column to analyze.

    Returns:
    The array produced by `Series.unique()` on the stripped column: each distinct stripped value once,
    in order of first appearance.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        message = f"Column '{column_name}' not found in DataFrame."
        raise ValueError(message)

    return df[column_name].str.replace('[^a-zA-Z]', '', regex=True).unique()

In [6]:
def count_words(text):
    """
    Count the whitespace-separated words in a value.

    Parameters:
    text: The value to measure; it is converted with `str()` before splitting.

    Returns:
    int: The number of whitespace-delimited tokens, or 0 when `pandas.isna` reports the value as missing.
    """
    return 0 if pd.isna(text) else len(str(text).split())


## 2. Data preview

In [7]:
# Show the first three rows of the loaded OLX frame before any cleaning.
df_olx.head(3)

Unnamed: 0,link,title,price,summary_description,location,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent
0,https://www.olx.pl/d/oferta/kawalerka-mierzeci...,Kawalerka Mierzęcice Osiedle,1 400 złdo negocjacji,OpisDo wynajęcia kawalerka o powierzchni 34m2 ...,"Śląskie, Mierzęcice",Prywatne,Poziom: 2,Umeblowane: Tak,Rodzaj zabudowy: Blok,Powierzchnia: 34 m²,Liczba pokoi: Kawalerka,Czynsz (dodatkowo): 350 zł
1,https://www.olx.pl/d/oferta/mieszkanie-37m2-w-...,Mieszkanie 37m2 w Mierzęcicach. Polecam,1 000 zł,"OpisWitam, mam do wynajęcia piękne mieszkanie ...","Śląskie, Mierzęcice",Prywatne,Poziom: 3,Umeblowane: Tak,Rodzaj zabudowy: Blok,Powierzchnia: 37 m²,Liczba pokoi: 2 pokoje,Czynsz (dodatkowo): 550 zł
2,https://www.olx.pl/d/oferta/wynajme-mieszkanie...,Wynajmę mieszkanie w Mierzęcicach,2 600 złdo negocjacji,"OpisWynajmę mieszkanie 2 pokojowe, na 1 piętrz...","Śląskie, Mierzęcice",Prywatne,Poziom: 1,Umeblowane: Tak,Rodzaj zabudowy: Blok,Powierzchnia: 49 m²,Liczba pokoi: 2 pokoje,Czynsz (dodatkowo): 1 zł


### OLX

In [8]:
def _extract_amount(series):
    """Parse the first number in each string, allowing whitespace between digit groups, as a float."""
    # "1 400 zł" -> "1 400 " -> "1400"; a plain "800 zł" is matched as well.
    digits = series.str.extract(r'(\d[\d\s]*)', expand=False)
    return digits.str.replace(r'\s+', '', regex=True).astype(float)


def clean_olx_data(df):
    """
    Turn the raw OLX listing columns into typed, analysis-ready columns.

    The input frame is not modified; a cleaned copy is returned.

    Parameters:
    df (pandas.DataFrame): Raw listings. The columns read here are 'location',
        'summary_description', 'price', 'rent', 'square_meters', 'number_of_rooms',
        'floor_level', 'is_furnished' and 'building_type', all accessed as strings.

    Returns:
    pandas.DataFrame: A copy of `df` in which
        - 'location' is replaced by 'voivodeship' (the text before the first ", ") and
          'city' (the remaining parts re-joined with ", "; '' when there are none),
        - 'street' holds the "<word> <number>/<number>" fragment following "ul" in the
          description, or a missing value when there is no such fragment,
        - 'price' and 'rent' are floats built from the first number in the text,
        - 'square_meters' and 'number_of_rooms' are nullable integers
          ("Liczba pokoi: Kawalerka" counts as 1 room),
        - 'floor_level' is the digit string after "Poziom: " (missing when there is none),
        - 'is_furnished' is True / False for "Umeblowane: Tak" / "Umeblowane: Nie",
        - 'building_type' is the text after "Rodzaj zabudowy: ".
    """
    # Work on a copy so the caller's frame stays intact and re-running the cleaning cell is safe.
    cleaned = df.copy()

    location_parts = cleaned['location'].str.split(', ')
    cleaned['voivodeship'] = location_parts.str[0]
    # A missing location is not a list after the split; leave its city missing instead of failing.
    cleaned['city'] = location_parts.apply(
        lambda parts: ', '.join(parts[1:]) if isinstance(parts, list) else None
    )

    # Vectorised extraction: a missing description simply yields a missing street.
    street_pattern = r'ul\s+(\w+\s+\d+/\d+)'
    cleaned['street'] = cleaned['summary_description'].str.extract(street_pattern, expand=False)

    cleaned = cleaned.drop(columns='location')

    # Amounts may or may not contain a thousands separator ("800 zł", "1 400 zł", "1 200 zł").
    cleaned['price'] = _extract_amount(cleaned['price'])
    cleaned['rent'] = _extract_amount(cleaned['rent'])

    # Extract and convert 'square_meters' into integers (only the first run of digits is used)
    area = cleaned['square_meters'].str.extract(r'(\d+)', expand=False)
    cleaned['square_meters'] = pd.to_numeric(area).astype('Int64')

    # Convert 'number_of_rooms' into an integer, special handling for "Kawalerka"
    rooms = cleaned['number_of_rooms'].replace('Liczba pokoi: Kawalerka', '1')
    rooms = rooms.str.extract(r'(\d+)', expand=False)
    cleaned['number_of_rooms'] = pd.to_numeric(rooms).astype('Int64')

    # Extract and clean 'floor_level', 'is_furnished', 'building_type'
    cleaned['floor_level'] = cleaned['floor_level'].str.extract(r'Poziom: (\d+)', expand=False)
    cleaned['is_furnished'] = cleaned['is_furnished'].map({'Umeblowane: Tak': True, 'Umeblowane: Nie': False})
    cleaned['building_type'] = cleaned['building_type'].str.extract(r'Rodzaj zabudowy: (.+)', expand=False)

    return cleaned

In [9]:
# Run the OLX cleaning step on the loaded frame and preview the first rows of the result.
df_olx_cleaned = clean_olx_data(df_olx)
df_olx_cleaned.head()

Unnamed: 0,link,title,price,summary_description,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent,voivodeship,city,street
0,https://www.olx.pl/d/oferta/kawalerka-mierzeci...,Kawalerka Mierzęcice Osiedle,1400.0,OpisDo wynajęcia kawalerka o powierzchni 34m2 ...,Prywatne,2.0,True,Blok,34,1,350.0,Śląskie,Mierzęcice,
1,https://www.olx.pl/d/oferta/mieszkanie-37m2-w-...,Mieszkanie 37m2 w Mierzęcicach. Polecam,1000.0,"OpisWitam, mam do wynajęcia piękne mieszkanie ...",Prywatne,3.0,True,Blok,37,2,550.0,Śląskie,Mierzęcice,
2,https://www.olx.pl/d/oferta/wynajme-mieszkanie...,Wynajmę mieszkanie w Mierzęcicach,2600.0,"OpisWynajmę mieszkanie 2 pokojowe, na 1 piętrz...",Prywatne,1.0,True,Blok,49,2,1.0,Śląskie,Mierzęcice,
3,https://www.olx.pl/d/oferta/pokoj-z-kuchnia-i-...,Pokój z kuchnią i łazienką blisko lotniska,1500.0,OpisWitam. Mam do zaoferowania pokój z kuchnią...,Prywatne,,True,Dom wolnostojący,28,1,1.0,Śląskie,Najdziszów,


In [10]:
# Inspect the column dtypes produced by the cleaning step.
df_olx_cleaned.dtypes

link                    object
title                   object
price                  float64
summary_description     object
ownership               object
floor_level             object
is_furnished              bool
building_type           object
square_meters            Int64
number_of_rooms          Int64
rent                   float64
voivodeship             object
city                    object
street                  object
dtype: object

In [11]:
# Cast the text columns to pandas' dedicated 'string' dtype and the floor to a nullable integer.
df_olx_cleaned['link'] = df_olx_cleaned['link'].astype('string')
df_olx_cleaned['title'] = df_olx_cleaned['title'].astype('string')
df_olx_cleaned['summary_description'] = df_olx_cleaned['summary_description'].astype('string')
df_olx_cleaned['ownership'] = df_olx_cleaned['ownership'].astype('string')
df_olx_cleaned['floor_level'] = df_olx_cleaned['floor_level'].astype('Int64')
df_olx_cleaned['building_type'] = df_olx_cleaned['building_type'].astype('string')
df_olx_cleaned['voivodeship'] = df_olx_cleaned['voivodeship'].astype('string')
df_olx_cleaned['city'] = df_olx_cleaned['city'].astype('string')
df_olx_cleaned['street'] = df_olx_cleaned['street'].astype('string')

# NOTE(review): rename() returns a new frame, which is bound to `df` here; `df_olx_cleaned`
# itself keeps the 'floor_level' column name (see the dtypes output below). Confirm whether
# the rename was meant to apply to `df_olx_cleaned` or can be dropped.
df = df_olx_cleaned.rename(columns={'floor_level': 'floor'})

# Display the dtypes after the casts.
df_olx_cleaned.dtypes


link                    string
title                   string
price                  float64
summary_description     string
ownership               string
floor_level              Int64
is_furnished              bool
building_type           string
square_meters            Int64
number_of_rooms          Int64
rent                   float64
voivodeship             string
city                    string
street                  string
dtype: object

In [12]:
def safe_lower(value):
    """
    Lower-case a value unless it is missing.

    Parameters:
    value: A string-like object exposing `.lower()`, or a missing value.

    Returns:
    The lower-cased value, or `value` itself, unchanged, when `pandas.isna` reports it as missing.
    """
    # Missing markers (None, NaN, pd.NA) are handed back untouched
    return value if pd.isna(value) else value.lower()

# Apply safe_lower before mapping to ensure case insensitivity without affecting NaN values.
# Series.map with a dict turns every value that is not a key into NaN, so labels missing from
# the dictionaries below are silently lost. For the same reason this cell is not re-runnable:
# the English labels it writes are not keys, so a second run would blank both columns.
df_olx_cleaned['ownership'] = df_olx_cleaned['ownership'].apply(safe_lower).map({
    'biuro nieruchomości': 'real_estate_agency',
    'prywatne': 'private', 
    'prywatny': 'private', 
    'deweloper': 'developer'
})

# Same lower-case-then-translate step for the building type labels.
df_olx_cleaned['building_type'] = df_olx_cleaned['building_type'].apply(safe_lower).map({
    'blok': 'block_of_flats', 
    'apartamentowiec': 'apartment_building', 
    'kamienica': 'historic_apartment_building',
    'dom wolnostojący': 'detached_house',
    'szeregowiec': 'terraced_house',
})

In [13]:
# Preview the frame after the ownership and building type labels were translated.
df_olx_cleaned.head()

Unnamed: 0,link,title,price,summary_description,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent,voivodeship,city,street
0,https://www.olx.pl/d/oferta/kawalerka-mierzeci...,Kawalerka Mierzęcice Osiedle,1400.0,OpisDo wynajęcia kawalerka o powierzchni 34m2 ...,private,2.0,True,block_of_flats,34,1,350.0,Śląskie,Mierzęcice,
1,https://www.olx.pl/d/oferta/mieszkanie-37m2-w-...,Mieszkanie 37m2 w Mierzęcicach. Polecam,1000.0,"OpisWitam, mam do wynajęcia piękne mieszkanie ...",private,3.0,True,block_of_flats,37,2,550.0,Śląskie,Mierzęcice,
2,https://www.olx.pl/d/oferta/wynajme-mieszkanie...,Wynajmę mieszkanie w Mierzęcicach,2600.0,"OpisWynajmę mieszkanie 2 pokojowe, na 1 piętrz...",private,1.0,True,block_of_flats,49,2,1.0,Śląskie,Mierzęcice,
3,https://www.olx.pl/d/oferta/pokoj-z-kuchnia-i-...,Pokój z kuchnią i łazienką blisko lotniska,1500.0,OpisWitam. Mam do zaoferowania pokój z kuchnią...,private,,True,detached_house,28,1,1.0,Śląskie,Najdziszów,


## 3. Save cleaned data

### 3.1 Save data

In [14]:
# Persist the cleaned OLX frame through the path manager; the log output below shows a schema
# JSON and a CSV being written.
data_path_manager.save_df(df_olx_cleaned, domain="olx")

2024-02-26 17:55:48: Saving schema to d:\UserData karol\Documents\Programming\Data Science\Data Engineering\Rent comparisions\Home Market Harvester\data\cleaned\2024_02_26_17_43_23_Mierzęcice__Będziński__Śląskie\olx_pl_schema.json
2024-02-26 17:55:48: Saving CSV to d:\UserData karol\Documents\Programming\Data Science\Data Engineering\Rent comparisions\Home Market Harvester\data\cleaned\2024_02_26_17_43_23_Mierzęcice__Będziński__Śląskie\olx.pl.csv


### 3.2 Check saved data

#### OLX

In [15]:
# Load the OLX data again, this time with is_cleaned=True, and preview what comes back.
df_olx_saved = data_path_manager.load_df(domain="olx", is_cleaned=True)
df_olx_saved.head()

Unnamed: 0,link,title,price,summary_description,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent,voivodeship,city,street
0,https://www.olx.pl/d/oferta/kawalerka-mierzeci...,Kawalerka Mierzęcice Osiedle,1400.0,OpisDo wynajęcia kawalerka o powierzchni 34m2 ...,private,2.0,True,block_of_flats,34,1,350.0,Śląskie,Mierzęcice,
1,https://www.olx.pl/d/oferta/mieszkanie-37m2-w-...,Mieszkanie 37m2 w Mierzęcicach. Polecam,1000.0,"OpisWitam, mam do wynajęcia piękne mieszkanie ...",private,3.0,True,block_of_flats,37,2,550.0,Śląskie,Mierzęcice,
2,https://www.olx.pl/d/oferta/wynajme-mieszkanie...,Wynajmę mieszkanie w Mierzęcicach,2600.0,"OpisWynajmę mieszkanie 2 pokojowe, na 1 piętrz...",private,1.0,True,block_of_flats,49,2,1.0,Śląskie,Mierzęcice,
3,https://www.olx.pl/d/oferta/pokoj-z-kuchnia-i-...,Pokój z kuchnią i łazienką blisko lotniska,1500.0,OpisWitam. Mam do zaoferowania pokój z kuchnią...,private,,True,detached_house,28,1,1.0,Śląskie,Najdziszów,


In [16]:
# Inspect the dtypes of the re-loaded frame.
df_olx_saved.dtypes

link                    string
title                   string
price                  float64
summary_description     string
ownership               object
floor_level              Int64
is_furnished              bool
building_type           object
square_meters            Int64
number_of_rooms          Int64
rent                   float64
voivodeship             string
city                    string
street                  string
dtype: object

In [17]:
# Round-trip check: the frame read back from disk must equal the in-memory cleaned frame.
are_identical = df_olx_saved.equals(df_olx_cleaned)
if are_identical:
    print("The saved DataFrame is identical to the original one.")
else:
    message = "The saved DataFrame is not identical to the original one."
    raise ValueError(message)

The saved DataFrame is identical to the original one.
