In [155]:
import pandas as pd
import numpy as np
import re

In [156]:
# pd.set_option('display.max_colwidth', None)

In [157]:
pd.reset_option('display.max_colwidth')

In [158]:
collected_08_23 = {
    "otodom_krk" : "../original_data/collected_08_23/master_data/otodom_krk.json",
    "otodom_gda" : "../original_data/collected_08_23/master_data/otodom_gda.json",
    "olx_gda" : "../original_data/collected_08_23/master_data/olx_gda.json",
    "olx_krk" : "../original_data/collected_08_23/master_data/olx_krk.json",
    "gratka_gda" : "../original_data/collected_08_23/master_data/gratka_gda.json",
    "gratak_krk" : "../original_data/collected_08_23/master_data/gratka_krk.json",
    "trojmiasto_gda" : "../original_data/collected_08_23/master_data/trojmiasto_gda.json"
}

# Create a dictionary to store DataFrames
dataframes_08_23 = {}

# Iterate through the JSON data and create DataFrames
for key, value in collected_08_23.items():
    dataframes_08_23[key] = pd.read_json(value)  # Assuming the data is in JSON format

# Now, you have a dictionary of DataFrames where each key is a dataset
# You can access them like this:
otodom_krk_df = dataframes_08_23["otodom_krk"]
otodom_gda_df = dataframes_08_23["otodom_gda"]
olx_gda_df = dataframes_08_23["olx_gda"]
olx_krk_df = dataframes_08_23["olx_krk"]
gratka_gda_df = dataframes_08_23["gratka_gda"]
gratak_krk_df = dataframes_08_23["gratak_krk"]
trojmiasto_gda_df = dataframes_08_23["trojmiasto_gda"]

In [159]:
df = olx_gda_df

In [160]:
olx_gda_districts_mapping = {
    "Śródmieście": "Srodmiescie - Stare Miasto - Dolne Miasto - Długie Ogrody",
    "Przymorze Wielkie": "Przymorze",
    "Brzeźno": "Brzezno",
    "Przymorze Małe": "Przymorze",
    "Wrzeszcz": "Wrzeszcz",
    "Żabianka - Wejhera - Jelitkowo - Tysiąclecia": "Zabianka - Wejhera - Jelitkowo - Tysiaclecia",
    "Chełm z dzielnicą Gdańsk Południe": "Chelm - Lostowice - Poludnie",
    "Jasień": "Jasien - Matarnia - Kokoszki - Osowa",
    "Ujeścisko - Łostowice": "Chelm - Lostowice - Poludnie",
    "Letnica": "Letnica",
    "Piecki-Migowo": "Piecki Migowo - Suchanino - Morena",
    "Zaspa Młyniec": "Zaspa",
    "Orunia - Św. Wojciech - Lipce": "Orunia",
    "Zaspa Rozstaje": "Zaspa",
    "Siedlce": "Siedlce - Aniolki - Mlyniska - Mickiewicza",
    "Stogi z Przeróbką": "Przerobka - Stogi - Nowy Port",
    "Oliwa": "Oliwa - Strzyza",
    "Aniołki": "Siedlce - Aniolki - Mlyniska - Mickiewicza",
    "Wzgórze Mickiewicza": "Siedlce - Aniolki - Mlyniska - Mickiewicza",
    "Matarnia": "Jasien - Matarnia - Kokoszki - Osowa",
    "Suchanino": "Piecki Migowo - Suchanino - Morena",
    "Brętowo": "Piecki Migowo - Suchanino - Morena",
    "Kokoszki": "Jasien - Matarnia - Kokoszki - Osowa",
    "Strzyża": "Oliwa - Strzyza",
    "Młyniska": "Siedlce - Aniolki - Mlyniska - Mickiewicza"
}

olx_rooms_mapping = {
    "2pokoje": 2,
    "kawalerka": 1,
    "3pokoje": 3, 
    "4iwięcej": 4,

}

olx_private_seller_mapping = {
    1.0: True,
    np.nan: False
}

olx_furnished_mapping = {
    "Tak": True,
}

olx_level_mapping = {
    "powyżej10": 11,
    "suterena": 0,
    "parter": 0,
    "1":1, "2": 2, "3":3, "4":4, "5":5, "6":6, "7":7, "8":8, "9":9, "10":10
}

olx_building_type_mapping = {
    "blok": "unit",
    "apartamentowiec": "apartment_building",
    "kamienica": "tenement",
    "dom wolnostojący": "house",
    "szeregowiec": "house",
    "loft": "loft"
}

In [161]:
def convert_numeric_to_float(convertable_string):
    return float(convertable_string.replace("zł", "").replace(" ", "").replace(",",".").replace("\ndonegocjacji", ""))

def clear_olx(df: pd.DataFrame, city):
    df.drop(
        labels=[
            'link', 
            'username', 
            'on_olx_since', 
            'last_activity',
            'title',
            'description',
            'Firmowe',
            'add_date',
        ],
        inplace= True,
        axis= 1
    )
    
    is_negotiable_list = []
    is_furnished_list = []
    
    for index, row in df.iterrows():

        # negotiable
        if 'do negocjacji' in row['rent']:
            is_negotiable_list.append(True)
        else:
            is_negotiable_list.append(False)

        # furnished
        if 'Tak' in row['Umeblowane']:
            is_furnished_list.append(True)
        else:
            is_furnished_list.append(False)

    # negotiable apply
    df['negotiable'] = is_negotiable_list

    # building_type
    df = df.rename(mapper={"Rodzaj zabudowy": "building_type"}, axis=1)
    df["building_type"] = df["building_type"].str.replace(" ", "").str.lower()
    df["building_type"] = df["building_type"].map(olx_building_type_mapping)
    df.drop(df["building_type"] == "loft", axis=0)
    
    # furnished apply
    df = df.rename(mapper={"Umeblowane": "furnished"}, axis=1)
    df['furnished'] = is_furnished_list
    
    # private_seller
    df = df.rename(mapper={"Prywatne": "private_seller"}, axis=1)
    df['private_seller'] = df['private_seller'].map(olx_private_seller_mapping)
    
    # negotiable apply
    df['negotiable'] = is_negotiable_list
    
    # surface
    df = df.rename(mapper={"Powierzchnia": "surface"}, axis=1)
    df['surface'] = df['surface'].str.replace("m²", "").str.replace(" ", "").str.replace(",", ".").astype(float)

    # level 
    df = df.rename(mapper={"Poziom": "level"}, axis=1)
    df['level'] = df['level'].str.replace(" ", "").str.lower()
    df['level'] = df['level'].map(olx_level_mapping)
    df['level'].fillna(1001, inplace=True)
    
    # rooms 
    df = df.rename(mapper={"Liczba pokoi": "rooms"}, axis=1)
    df['rooms'] = df['rooms'].str.replace(" ", "").str.lower()
    df['rooms'] = df['rooms'].map(olx_rooms_mapping)
    df['rooms'] = df['rooms'].astype(int)
    
    # rent
    df['rent'] = df['rent'].apply(convert_numeric_to_float)

    # rent_extra
    df = df.rename(mapper={"Czynsz (dodatkowo)": "rent_extra"}, axis=1)
    df['rent_extra'] = df['rent_extra'].apply(convert_numeric_to_float)

    # location
    if city.lower() == "gda":
        df['location'] = df['location'].map(olx_gda_districts_mapping)
    elif city.lower() == "krk":
        pass
        # df['location'] = df['location'].map(olx_krk_districts_mapping)

    return df

In [162]:
df = clear_olx(df, city="gda")

KeyError: '[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False] not found in axis'

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.isna().sum()

In [None]:
df["building_type"]

In [None]:
df["building_type"].value_counts()