In [51]:
import pandas as pd
import numpy as np
import re

In [52]:
# pd.set_option('display.max_colwidth', None)

In [53]:
# Put the column-width display option back to its pandas default, undoing any
# earlier max_colwidth override made in this kernel session.
pd.reset_option('display.max_colwidth')

In [54]:
# Listing dumps collected in 08/2023: dataset key -> path of its JSON file.
# NOTE(review): the key "gratak_krk" looks like a misspelling of "gratka_krk";
# it is kept as-is because the alias gratak_krk_df below (and possibly later
# cells) is named after it.
collected_08_23 = {
    "otodom_krk": "../original_data/collected_08_23/master_data/otodom_krk.json",
    "otodom_gda": "../original_data/collected_08_23/master_data/otodom_gda.json",
    "olx_gda": "../original_data/collected_08_23/master_data/olx_gda.json",
    "olx_krk": "../original_data/collected_08_23/master_data/olx_krk.json",
    "gratka_gda": "../original_data/collected_08_23/master_data/gratka_gda.json",
    "gratak_krk": "../original_data/collected_08_23/master_data/gratka_krk.json",
    "trojmiasto_gda": "../original_data/collected_08_23/master_data/trojmiasto_gda.json",
}

# Read every JSON file into its own DataFrame, stored under the same key.
dataframes_08_23 = {
    dataset_name: pd.read_json(json_path)
    for dataset_name, json_path in collected_08_23.items()
}

# Per-dataset shortcuts; each name refers to the frame held in dataframes_08_23.
otodom_krk_df = dataframes_08_23["otodom_krk"]
otodom_gda_df = dataframes_08_23["otodom_gda"]
olx_gda_df = dataframes_08_23["olx_gda"]
olx_krk_df = dataframes_08_23["olx_krk"]
gratka_gda_df = dataframes_08_23["gratka_gda"]
gratak_krk_df = dataframes_08_23["gratak_krk"]
trojmiasto_gda_df = dataframes_08_23["trojmiasto_gda"]

In [55]:
# Short alias for the OLX Gdańsk frame: this binds the same object, not a copy.
df = olx_gda_df

In [56]:
# Number of missing values in each column of the OLX Gdańsk frame.
olx_gda_df.isna().sum()

rent                    0
location                0
username                0
on_olx_since            0
last_activity           0
add_date                0
title                   0
link                    0
description             0
Prywatne               46
Poziom                 12
Umeblowane              0
Rodzaj zabudowy         0
Powierzchnia            0
Liczba pokoi            0
Czynsz (dodatkowo)      0
Firmowe               328
dtype: int64

In [57]:
# Unified district label -> the OLX Gdańsk district names that belong to it.
_olx_gda_district_groups = {
    "Srodmiescie - Stare Miasto - Dolne Miasto - Długie Ogrody": [
        "Śródmieście",
    ],
    "Przymorze": [
        "Przymorze Wielkie",
        "Przymorze Małe",
    ],
    "Brzezno": [
        "Brzeźno",
    ],
    "Wrzeszcz": [
        "Wrzeszcz",
    ],
    "Zabianka - Wejhera - Jelitkowo - Tysiaclecia": [
        "Żabianka - Wejhera - Jelitkowo - Tysiąclecia",
    ],
    "Chelm - Lostowice - Poludnie": [
        "Chełm z dzielnicą Gdańsk Południe",
        "Ujeścisko - Łostowice",
    ],
    "Jasien - Matarnia - Kokoszki - Osowa": [
        "Jasień",
        "Matarnia",
        "Kokoszki",
    ],
    "Letnica": [
        "Letnica",
    ],
    "Piecki Migowo - Suchanino - Morena": [
        "Piecki-Migowo",
        "Suchanino",
        "Brętowo",
    ],
    "Zaspa": [
        "Zaspa Młyniec",
        "Zaspa Rozstaje",
    ],
    "Orunia": [
        "Orunia - Św. Wojciech - Lipce",
    ],
    "Siedlce - Aniolki - Mlyniska - Mickiewicza": [
        "Siedlce",
        "Aniołki",
        "Wzgórze Mickiewicza",
        "Młyniska",
    ],
    "Przerobka - Stogi - Nowy Port": [
        "Stogi z Przeróbką",
    ],
    "Oliwa - Strzyza": [
        "Oliwa",
        "Strzyża",
    ],
}

# Flat lookup {OLX district name: unified label}, the form clear_olx passes
# to Series.map for the "gda" city.
olx_gda_districts_mapping = {
    olx_name: unified_label
    for unified_label, olx_names in _olx_gda_district_groups.items()
    for olx_name in olx_names
}

# Room-count labels -> number of rooms. Keys are in the normalized form that
# clear_olx produces (spaces removed, lower-cased).
olx_rooms_mapping = {
    "kawalerka": 1,   # studio flat
    "2pokoje": 2,
    "3pokoje": 3,
    "4iwięcej": 4,    # "4 and more" is stored as 4
}

# Raw "Prywatne" value -> private-seller flag; a missing value means "not private".
olx_private_seller_mapping = {
    1.0: True,
    np.nan: False
}

# Raw "Umeblowane" value -> furnished flag ("Tak" = yes).
olx_furnished_mapping = {
    "Tak": True,
}

# Floor labels -> numeric floor. Keys are in the normalized form that clear_olx
# produces (spaces removed, lower-cased).
olx_level_mapping = {
    "suterena": 0,    # basement level, grouped with the ground floor
    "parter": 0,      # ground floor
    "1": 1, "2": 2, "3": 3, "4": 4, "5": 5,
    "6": 6, "7": 7, "8": 8, "9": 9, "10": 10,
    # "above 10": stored as 11, the lowest floor the label can mean. It used to
    # be 1, which made top floors indistinguishable from the first floor.
    "powyżej10": 11,
}

# TODO: add olx_building_type_mapping for the "Rodzaj zabudowy" column.
# The target values are still undecided; the building types to cover are:
#     "Blok"
#     "Apartamentowiec"
#     "Kamienica"
#     "Dom wolnostojący"
#     "Szeregowiec"
#     "Loft"

In [58]:
def convert_numeric_to_float(convertable_string):
    """Parse an OLX price string into a float.

    Handles thousands separators written as whitespace, a decimal comma, the
    "zł" currency suffix and an optional trailing "do negocjacji" marker,
    e.g. "2 250 zł" -> 2250.0 and "1 234,50 zł" -> 1234.5.

    Args:
        convertable_string: Raw price text.

    Returns:
        The price as a float.

    Raises:
        ValueError: If what remains after cleaning is not a valid number.
    """
    # Remove every whitespace character in one pass. This covers plain and
    # non-breaking spaces as well as the line break before the negotiable marker,
    # so the marker is always left in its compact form "donegocjacji".
    compact = re.sub(r"\s+", "", convertable_string)
    number_text = (
        compact
        .replace("zł", "")
        .replace("donegocjacji", "")
        .replace(",", ".")
    )
    return float(number_text)

def clear_olx(df: pd.DataFrame, city: str, missing_level: float = 1001) -> pd.DataFrame:
    """Clean a raw OLX rental-listings frame and return the result as a new DataFrame.

    The input frame is left untouched. Steps:

    * drop the free-text and seller-metadata columns,
    * derive the boolean ``negotiable`` flag from the raw ``rent`` text,
    * rename the Polish columns to ``furnished``, ``private_seller``, ``surface``,
      ``level``, ``rooms`` and ``rent_extra`` and convert each to its final type,
    * for ``city == "gda"`` replace ``location`` via ``olx_gda_districts_mapping``.

    Relies on the module-level ``olx_*_mapping`` dictionaries and on
    ``convert_numeric_to_float``.

    Args:
        df: Raw OLX listings containing the columns dropped and renamed below.
        city: City code, case-insensitive. "gda" maps the districts; "krk" and
            any other value leave ``location`` unchanged.
        missing_level: Sentinel written to ``level`` when the floor is missing
            or its label is not a key of ``olx_level_mapping``.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: If one of the columns to drop is not present in ``df``.
    """
    # drop() without inplace returns a new frame, so the caller's raw data is
    # never modified and all later assignments act on our own copy.
    df = df.drop(
        columns=[
            'link',
            'username',
            'on_olx_since',
            'last_activity',
            'title',
            'description',
            'Firmowe',
            'add_date',
        ]
    )

    # negotiable: must be read from the raw rent text before it becomes a number.
    # Added first so it stays the last column of the result.
    df['negotiable'] = df['rent'].str.contains('do negocjacji', regex=False, na=False)

    df = df.rename(
        columns={
            "Umeblowane": "furnished",
            "Prywatne": "private_seller",
            "Powierzchnia": "surface",
            "Poziom": "level",
            "Liczba pokoi": "rooms",
            "Czynsz (dodatkowo)": "rent_extra",
        }
    )

    # furnished: True when the raw value contains "Tak".
    df['furnished'] = df['furnished'].str.contains('Tak', regex=False, na=False)

    # private_seller
    df['private_seller'] = df['private_seller'].map(olx_private_seller_mapping)

    # surface: e.g. "38,5 m²" -> 38.5
    df['surface'] = (
        df['surface']
        .str.replace("m²", "")
        .str.replace(" ", "")
        .str.replace(",", ".")
        .astype(float)
    )

    # level: normalize the label, map it, then fill missing/unknown floors.
    # The result is assigned back because an in-place fillna on df['level']
    # does not update the frame when pandas Copy-on-Write is active.
    df['level'] = (
        df['level']
        .str.replace(" ", "")
        .str.lower()
        .map(olx_level_mapping)
        .fillna(missing_level)
    )

    # rooms
    df['rooms'] = (
        df['rooms']
        .str.replace(" ", "")
        .str.lower()
        .map(olx_rooms_mapping)
        .astype(int)
    )

    # rent and rent_extra: price text -> float
    df['rent'] = df['rent'].apply(convert_numeric_to_float)
    df['rent_extra'] = df['rent_extra'].apply(convert_numeric_to_float)

    # location: districts absent from the mapping become NaN (Series.map semantics).
    if city.lower() == "gda":
        df['location'] = df['location'].map(olx_gda_districts_mapping)
    elif city.lower() == "krk":
        # No Kraków district mapping exists yet; location is left unchanged.
        pass
        # df['location'] = df['location'].map(olx_krk_districts_mapping)

    return df

In [59]:
# Always clean a copy of the raw frame kept in dataframes_08_23. Feeding
# olx_gda_df back into clear_olx made a second run of this cell fail with a
# KeyError, because the columns to drop were already gone.
olx_gda_df = clear_olx(dataframes_08_23["olx_gda"].copy(), city="gda")

In [61]:
# Check column names, dtypes and non-null counts of the cleaned Gdańsk frame.
olx_gda_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rent             374 non-null    float64
 1   location         374 non-null    object 
 2   private_seller   374 non-null    bool   
 3   level            374 non-null    float64
 4   furnished        374 non-null    bool   
 5   Rodzaj zabudowy  374 non-null    object 
 6   surface          374 non-null    float64
 7   rooms            374 non-null    int64  
 8   rent_extra       374 non-null    float64
 9   negotiable       374 non-null    bool   
dtypes: bool(3), float64(4), int64(1), object(2)
memory usage: 21.7+ KB


In [62]:
# Preview the first rows of the cleaned Gdańsk frame.
olx_gda_df.head()

Unnamed: 0,rent,location,private_seller,level,furnished,Rodzaj zabudowy,surface,rooms,rent_extra,negotiable
0,2250.0,Przerobka - Stogi - Nowy Port,True,2.0,True,Blok,38.0,2,750.0,False
1,2700.0,Chelm - Lostowice - Poludnie,True,2.0,True,Blok,51.0,2,650.0,False
2,2800.0,Siedlce - Aniolki - Mlyniska - Mickiewicza,True,1.0,True,Apartamentowiec,40.0,2,320.0,False
3,2900.0,Przymorze,True,2.0,True,Apartamentowiec,41.0,2,500.0,False
4,350.0,Przymorze,True,2.0,True,Blok,45.0,2,350.0,False


In [60]:
# Always clean a copy of the raw frame kept in dataframes_08_23. Feeding
# olx_krk_df back into clear_olx made a second run of this cell fail with a
# KeyError, because the columns to drop were already gone.
olx_krk_df = clear_olx(dataframes_08_23["olx_krk"].copy(), city="krk")

In [63]:
# Check column names, dtypes and non-null counts of the cleaned Kraków frame.
olx_krk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rent             629 non-null    float64
 1   location         629 non-null    object 
 2   level            629 non-null    float64
 3   furnished        629 non-null    bool   
 4   Rodzaj zabudowy  629 non-null    object 
 5   surface          629 non-null    float64
 6   rooms            629 non-null    int64  
 7   rent_extra       629 non-null    float64
 8   private_seller   629 non-null    bool   
 9   negotiable       629 non-null    bool   
dtypes: bool(3), float64(4), int64(1), object(2)
memory usage: 36.4+ KB


In [64]:
# Preview the first rows of the cleaned Kraków frame.
olx_krk_df.head()

Unnamed: 0,rent,location,level,furnished,Rodzaj zabudowy,surface,rooms,rent_extra,private_seller,negotiable
0,5000.0,Nowa Huta,5.0,True,Blok,94.0,4,900.0,False,True
1,3000.0,Dębniki,1.0,True,Blok,72.0,3,650.0,True,False
2,2500.0,Krowodrza,6.0,True,Apartamentowiec,51.0,2,490.0,True,False
3,2400.0,Nowa Huta,6.0,True,Blok,42.0,2,380.0,False,True
4,3000.0,Podgórze,5.0,True,Apartamentowiec,26.0,1,400.0,False,False


In [65]:
# clear_olx leaves location untouched for "krk" (that branch is a no-op), so
# list the raw Kraków district names and how often each occurs.
olx_krk_df["location"].value_counts()

location
Stare Miasto                93
Krowodrza                   71
Grzegórzki                  63
Prądnik Biały               51
Podgórze                    49
Bronowice                   49
Prądnik Czerwony            48
Dębniki                     39
Czyżyny                     30
Łagiewniki-Borek Fałęcki    29
Bieńczyce                   27
Podgórze Duchackie          26
Nowa Huta                   16
Bieżanów-Prokocim           15
Mistrzejowice               10
Zwierzyniec                  8
Swoszowice                   3
Wzgórza Krzesławickie        2
Name: count, dtype: int64