# Data Preview

## 1. Set up

### 1.3 Importing Data

In [1]:
import pandas as pd
import numpy as np

# Specify the file path of the CSV file
file_paths = {
    "olx": "..\\data\\raw\\2023_11_27_19_41_45_Mierzęcice__Będziński__Śląskie\\olx.pl.csv",
    "otodom": "..\\data\\raw\\2023_11_27_19_41_45_Mierzęcice__Będziński__Śląskie\\otodom.pl.csv"
}

# Read the CSV file into a pandas DataFrame
df_otodom = pd.read_csv(file_paths["otodom"])
df_olx = pd.read_csv(file_paths["olx"])

### 1.2 Functions

In [2]:
def count_and_percentage(df, column_name):
    """
    Function to calculate the count and percentage of unique values in a given column of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame to analyze.
    column_name (str): The name of the column in the DataFrame.

    Returns:
    pandas.DataFrame: A DataFrame with the count and percentage of each unique value in the specified column.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

    # Calculate count and normalized values
    count = df[column_name].value_counts(dropna=False)
    normalized = df[column_name].value_counts(dropna=False, normalize=True) * 100

    # Concatenate count and normalized values side by side
    result = pd.concat([count, normalized], axis=1)
    result.columns = ['Count', 'Percentage']

    return result

In [3]:
def count_comma_separated_values(df, column_name):
    """
    Counts the occurrences of individual elements in a comma-separated string column of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the column.
    column_name (str): The name of the column to analyze.

    Returns:
    pandas.DataFrame: A DataFrame with the count and percentage of each unique element found in the comma-separated values.

    Raises:
    ValueError: If the specified column is not found in the DataFrame.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame.")

    # Split the column values, explode to individual elements, and count
    exploded_items = df[column_name].dropna().str.split(', ').explode()
    exploded_df = pd.DataFrame({column_name: exploded_items})
    counts_and_percent = count_and_percentage(exploded_df, column_name)

    return counts_and_percent

## 2. Data preview

### 2.1 OLX

In [4]:
df_olx.head()

Unnamed: 0,link,title,price,summary_description,location,ownership,floor_level,is_furnished,building_type,square_meters,number_of_rooms,rent
0,https://www.olx.pl/d/oferta/kawalerka-tychy-os...,Kawalerka Tychy oś M,1 500 zł,OpisWynajmę kawalerkę na ul Moniuszki 1/16 .na...,"Śląskie, Zawada",Prywatne,Poziom: 3,Umeblowane: Nie,Rodzaj zabudowy: Apartamentowiec,Powierzchnia: 26 m²,Liczba pokoi: Kawalerka,Czynsz (dodatkowo): 400 zł


In [5]:
df_olx.columns

Index(['link', 'title', 'price', 'summary_description', 'location',
       'ownership', 'floor_level', 'is_furnished', 'building_type',
       'square_meters', 'number_of_rooms', 'rent'],
      dtype='object')

### 2.2 Otodom

### 2.2.1 Data Shape

In [6]:
df_otodom.head()

Unnamed: 0,link,title,location,price,summary_description,square_meters,rent,number_of_rooms,deposit,floor_level,...,equipment,media_types,heating,security,windows,elevator,parking_space,build_year,building_material,additional_information
0,https://www.otodom.pl/pl/oferta/mieszkania-2-p...,Mieszkania 2 pokojowe cena z ogrzewaniem,"ul. Karola Szymanowskiego 44, Zagórze Północ, ...",2 000 zł,Do wynajęcia 6 mieszkań o powierzchni od 42-58...,42 m²,,2,4 000 zł,parter/2,...,"lodówka, meble, piekarnik, kuchenka, pralka","telewizja kablowa, internet",gazowe,"drzwi / okna antywłamaniowe, teren zamknięty, ...",plastikowe,nie,garaż/miejsce parkingowe,brak informacji,cegła,"piwnica, oddzielna kuchnia, pom. użytkowe"
1,https://www.otodom.pl/pl/oferta/kawalerka-w-ka...,Kawalerka w Katowicach do wynajęcia od zaraz,"ul. Haliny Krahelskiej, Osiedle Paderewskiego-...",1 500 zł,"Nowe, (2018), komfortowe, jasne, w pełni wypos...",19 m²,250 zł/miesiąc,1,1 500 zł,3/3,...,"zmywarka, lodówka, meble, kuchenka, telewizor,...","telewizja kablowa, internet",gazowe,"teren zamknięty, domofon / wideofon",plastikowe,nie,garaż/miejsce parkingowe,2018,cegła,tylko dla niepalących
2,https://www.otodom.pl/pl/oferta/mam-do-wynajec...,Mam do wynajęcia mieszkanie Bezpośrednio !,"ul. Beskidzka, Chorzów II, Chorzów, śląskie",1 499 zł,"Witam.Mam do wynajęcia mieszkanie w Chorzowie,...",55 m²,799 zł/miesiąc,2,4 500 zł,parter/2,...,"zmywarka, lodówka, meble, piekarnik, kuchenka,...","telewizja kablowa, internet",miejskie,"drzwi / okna antywłamaniowe, teren zamknięty, ...",plastikowe,nie,garaż/miejsce parkingowe,2004,cegła,"piwnica, pom. użytkowe"
3,https://www.otodom.pl/pl/oferta/kawalerka-os-p...,Kawalerka os. Paderewskiego,"ul. Graniczna, Osiedle Paderewskiego-Muchowiec...",1 400 zł,Oferujemy do wynajęcia przytulną kawalerkę zlo...,31 m²,1 zł/miesiąc,1,1 400 zł,10/10,...,"lodówka, piekarnik, pralka",brak informacji,miejskie,brak informacji,brak informacji,nie,brak informacji,brak informacji,brak informacji,brak informacji
4,https://www.otodom.pl/pl/oferta/ul-1000-lecia-...,ul.1000-lecia Dąbrowa Górnicza Gołonóg 50m2 2 pok,"1000 lecia, Brodway, Gołonóg Północny, Dąbrowa...",1 500 zł,Do wynajęcia mieszkanie 50m2 DąbrowaGórnicza G...,50 m²,700 zł/miesiąc,2,2 000 zł,> 10/13,...,brak informacji,brak informacji,miejskie,brak informacji,plastikowe,tak,brak informacji,1980,wielka płyta,piwnica


In [7]:
df_otodom.sample(3)

Unnamed: 0,link,title,location,price,summary_description,square_meters,rent,number_of_rooms,deposit,floor_level,...,equipment,media_types,heating,security,windows,elevator,parking_space,build_year,building_material,additional_information
198,https://www.otodom.pl/pl/oferta/rezerwacja-dwu...,REZERWACJA Dwupokojowe os. Paderewskiego Katowic,"ul. gen. Władysława Sikorskiego, Osiedle Pader...",2 200 zł,"Nowoczesne, kompaktowe i ustawne dwupokojowe m...",43 m²,500 zł/miesiąc,2,3 000 zł,4/10,...,"zmywarka, lodówka, meble, piekarnik, pralka",brak informacji,miejskie,domofon / wideofon,plastikowe,tak,brak informacji,brak informacji,wielka płyta,"piwnica, tylko dla niepalących, oddzielna kuchnia"
409,https://www.otodom.pl/pl/oferta/bytom-ul-wrocl...,Bytom ul.Wrocławska - M do wynajęcia / 2 pokoje,"Śródmieście, Bytom, śląskie",1 550 zł,**** NAJNOWSZA OFERTA NA RYNKU ---- MIESZKANIE...,"37,63 m²",1 550 zł/miesiąc,2,,2/4,...,brak informacji,brak informacji,miejskie,"drzwi / okna antywłamaniowe, domofon / wideofon",plastikowe,nie,brak informacji,brak informacji,brak informacji,"piwnica, oddzielna kuchnia"
293,https://www.otodom.pl/pl/oferta/wynajme-2-poko...,"Wynajmę 2-pokojowe mieszkanie ,ul. Karoliny, 41m2","ul. Karoliny, Bogucice, Katowice, śląskie",2 300 zł,Wynajmę nowoczesne 2 pokojowe mieszkanie w blo...,41 m²,650 zł/miesiąc,2,,2/4,...,"zmywarka, lodówka, meble, piekarnik, kuchenka,...",brak informacji,miejskie,domofon / wideofon,plastikowe,tak,brak informacji,2019,brak informacji,brak informacji


In [8]:

# Count the number of duplicates in the DataFrame
duplicate_count = df_otodom.duplicated().sum()

# Print the count of duplicates
print("Number of duplicates:", duplicate_count)
del duplicate_count


Number of duplicates: 0


In [9]:
df_otodom.isna().sum()

link                        0
title                       0
location                    0
price                       0
summary_description         0
square_meters               0
rent                      513
number_of_rooms             0
deposit                   363
floor_level                23
building_type             105
available_from            958
balcony_garden_terrace    478
remote service              0
completion                173
ownership                   0
rent_to_students            0
equipment                   0
media_types                 0
heating                     0
security                    0
windows                     0
elevator                    0
parking_space               0
build_year                  0
building_material           0
additional_information      0
dtype: int64

In [10]:
len(df_otodom)

1105

In [11]:
df_otodom.columns


Index(['link', 'title', 'location', 'price', 'summary_description',
       'square_meters', 'rent', 'number_of_rooms', 'deposit', 'floor_level',
       'building_type', 'available_from', 'balcony_garden_terrace',
       'remote service', 'completion', 'ownership', 'rent_to_students',
       'equipment', 'media_types', 'heating', 'security', 'windows',
       'elevator', 'parking_space', 'build_year', 'building_material',
       'additional_information'],
      dtype='object')

In [12]:
df_otodom.dtypes

link                      object
title                     object
location                  object
price                     object
summary_description       object
square_meters             object
rent                      object
number_of_rooms            int64
deposit                   object
floor_level               object
building_type             object
available_from            object
balcony_garden_terrace    object
remote service            object
completion                object
ownership                 object
rent_to_students          object
equipment                 object
media_types               object
heating                   object
security                  object
windows                   object
elevator                  object
parking_space             object
build_year                object
building_material         object
additional_information    object
dtype: object

In [13]:
df_otodom.describe()

Unnamed: 0,number_of_rooms
count,1105.0
mean,2.120362
std,0.858141
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,6.0


### 2.2.2 Selected columns

#### 2.2.2.1 location

In [14]:
df_otodom["location"].unique()[:10]

array(['ul. Karola Szymanowskiego 44, Zagórze Północ, Sosnowiec, śląskie',
       'ul. Haliny Krahelskiej, Osiedle Paderewskiego-Muchowiec, Katowice, śląskie',
       'ul. Beskidzka, Chorzów II, Chorzów, śląskie',
       'ul. Graniczna, Osiedle Paderewskiego-Muchowiec, Katowice, śląskie',
       '1000 lecia, Brodway, Gołonóg Północny, Dąbrowa Górnicza, śląskie',
       'ul. rtm. Witolda Pileckiego, Centrum / Śródmieście, Sosnowiec, śląskie',
       'ul. Bytomska, Piekary Śląskie, śląskie',
       'ul. Bohaterów Monte Cassino, Zawodzie, Katowice, śląskie',
       'ul. Francuska, Osiedle Paderewskiego-Muchowiec, Katowice, śląskie',
       'ul. Litewska, Pogoń, Sosnowiec, śląskie'], dtype=object)

#### 2.2.2.2 price

In [48]:
count_and_percentage(df_otodom, "price")

Unnamed: 0,Count,Percentage
1 500 zł,76,6.877828
2 000 zł,56,5.067873
2 500 zł,55,4.977376
1 800 zł,48,4.343891
2 200 zł,46,4.162896
...,...,...
1 310 zł,1,0.090498
3 299 zł,1,0.090498
1 210 zł,1,0.090498
3 150 zł,1,0.090498


#### 2.2.2.3 rent (additional fee for the housing association or owner)

In [49]:
count_and_percentage(df_otodom, "rent")

Unnamed: 0,Count,Percentage
,513,46.425339
500 zł/miesiąc,41,3.710407
700 zł/miesiąc,35,3.167421
600 zł/miesiąc,33,2.986425
400 zł/miesiąc,33,2.986425
...,...,...
502 zł/miesiąc,1,0.090498
65 zł/miesiąc,1,0.090498
386 zł/miesiąc,1,0.090498
963 zł/miesiąc,1,0.090498


#### 2.2.2.5 deposit
The one-time deposit to secure the rental agreement typically amounts to one or two months' rent. This deposit is held by the landlord as a security against potential damages or unpaid rent, and is usually refundable at the end of the tenancy, provided the property is left in its original condition and all contractual obligations have been met.

In [50]:
count_and_percentage(df_otodom, "deposit")

Unnamed: 0,Count,Percentage
,363,32.850679
3 000 zł,141,12.760181
2 000 zł,109,9.864253
2 500 zł,77,6.968326
4 000 zł,53,4.796380
...,...,...
6 400 zł,1,0.090498
5 400 zł,1,0.090498
430 zł,1,0.090498
1 950 zł,1,0.090498


#### 2.2.2.6 floor_level

In [51]:
count_and_percentage(df_otodom, "floor_level")

Unnamed: 0,Count,Percentage
2/4,89,8.054299
1/4,82,7.420814
3/4,73,6.606335
1/3,69,6.244344
4/4,58,5.248869
...,...,...
6/17,1,0.090498
2/11,1,0.090498
3/12,1,0.090498
parter/6,1,0.090498


In [30]:
df_otodom["floor_level"].str.replace('[^a-zA-Z]', '', regex=True).unique()

array(['parter', '', 'poddasze', nan, 'suterena'], dtype=object)

- 'parter' - 'ground floor'
- 'poddasze' - 'attic'
- 'suterena' - 'semi-basement'

#### 2.2.2.7 building_type

In [52]:
count_and_percentage(df_otodom, "building_type")

Unnamed: 0,Count,Percentage
blok,482,43.61991
apartamentowiec,256,23.167421
kamienica,231,20.904977
,105,9.502262
dom wolnostojący,28,2.533937
szeregowiec,3,0.271493


- 'blok' - 'block of flats' or 'apartment block'
- 'apartamentowiec' - 'apartment building'
- 'kamienica' - 'tenement house' (often refers to a historic, multi-story building in urban areas)
- 'dom wolnostojący' - 'detached house'
- 'szeregowiec' - 'terraced house' or 'row house' (a type of house connected in a row with others)

#### 2.2.2.7 available_from

In [53]:
count_and_percentage(df_otodom, "available_from")

Unnamed: 0,Count,Percentage
,958,86.696833
2023-12-01,21,1.900452
2023-11-01,11,0.995475
2023-11-20,6,0.542986
2023-11-06,5,0.452489
...,...,...
2023-12-04,1,0.090498
2023-09-28,1,0.090498
2023-04-20,1,0.090498
2023-11-10,1,0.090498


In [33]:
# Last 5 dates
np.sort(df_otodom["available_from"].dropna().unique())[:5]

array(['2019-02-28', '2019-07-16', '2020-03-06', '2021-01-04',
       '2021-05-01'], dtype=object)

In [34]:
# First 5 dates
np.sort(df_otodom["available_from"].dropna().unique())[-6:-1]

array(['2023-12-04', '2023-12-15', '2024-01-01', '2024-01-02',
       '2024-01-08'], dtype=object)

#### 2.2.2.8 balcony_garden_terrace

In [55]:
count_comma_separated_values(df_otodom, "balcony_garden_terrace")

Unnamed: 0,Count,Percentage
balkon,563,81.358382
taras,88,12.716763
ogródek,41,5.924855


- 'balkon' - 'balcony'
- 'taras' - 'terrace'
- 'ogródek' - 'garden' (often refers to a small, private garden space)

#### 2.2.2.9 completion

In [56]:
count_and_percentage(df_otodom, "completion")

Unnamed: 0,Count,Percentage
do zamieszkania,915,82.80543
,173,15.656109
do remontu,10,0.904977
do wykończenia,7,0.633484


- 'do zamieszkania' - 'ready to move in' or 'move-in ready'
- 'do remontu' - 'in need of renovation' or 'requires renovation'
- 'do wykończenia' - 'to be finished' or 'unfinished' (indicating that the property needs final finishing works)

#### 2.2.2.10 ownership

In [57]:
count_and_percentage(df_otodom, "ownership")

Unnamed: 0,Count,Percentage
biuro nieruchomości,819,74.117647
prywatny,283,25.61086
deweloper,3,0.271493


- 'biuro nieruchomości' - 'real estate agency'
- 'prywatny' - 'private' (often indicating a private seller or individual owner)
- 'deweloper' - 'developer' (referring to a property development company or builder)

#### 2.2.2.11 equipment

In [59]:
count_comma_separated_values(df_otodom, "equipment")

Unnamed: 0,Count,Percentage
meble,728,18.637993
lodówka,597,15.284178
kuchenka,578,14.797747
pralka,553,14.157706
piekarnik,507,12.980031
zmywarka,351,8.986175
brak informacji,314,8.038914
telewizor,278,7.117256


- 'meble' - 'furniture'
- 'lodówka' - 'refrigerator'
- 'kuchenka' - 'cooker' or 'stove'
- 'pralka' - 'washing machine'
- 'piekarnik' - 'oven'
- 'zmywarka' - 'dishwasher'
- 'brak informacji' - 'no information' or 'not specified'
- 'telewizor' - 'television'

#### 2.2.2.12 media_types

In [61]:
count_comma_separated_values(df_otodom, "media_types")

Unnamed: 0,Count,Percentage
brak informacji,627,39.533417
internet,452,28.499369
telewizja kablowa,354,22.320303
telefon,153,9.64691


- 'brak informacji' - 'no information' or 'not specified'
- 'internet' - 'internet'
- 'telewizja kablowa' - 'cable TV'
- 'telefon' - 'telephone'

#### 2.2.2.13 heating

In [62]:
count_and_percentage(df_otodom, "heating")

Unnamed: 0,Count,Percentage
miejskie,681,61.628959
brak informacji,215,19.457014
gazowe,125,11.312217
elektryczne,40,3.61991
inne,26,2.352941
kotłownia,15,1.357466
piece kaflowe,3,0.271493


- 'miejskie' - 'municipal' or 'district heating'
- 'brak informacji' - 'no information' or 'not specified'
- 'gazowe' - 'gas'
- 'elektryczne' - 'electric'
- 'inne' - 'other'
- 'kotłownia' - 'boiler room'
- 'piece kaflowe' - 'tiled stoves' or 'ceramic stoves'

#### 2.2.2.14 windows


In [63]:
count_and_percentage(df_otodom, "windows")

Unnamed: 0,Count,Percentage
plastikowe,791,71.58371
brak informacji,284,25.701357
drewniane,26,2.352941
aluminiowe,4,0.361991


- 'plastikowe' - 'plastic'
- 'brak informacji' - 'no information' or 'not specified'
- 'drewniane' - 'wooden'
- 'aluminiowe' - 'aluminum'

#### 2.2.2.15 elevator

In [64]:
count_and_percentage(df_otodom, "elevator")

Unnamed: 0,Count,Percentage
nie,701,63.438914
tak,404,36.561086


- 'nie' - 'no'
- 'tak' - 'yes'

#### 2.2.2.16 parking_space


In [65]:
count_and_percentage(df_otodom, "parking_space")

Unnamed: 0,Count,Percentage
brak informacji,755,68.325792
garaż/miejsce parkingowe,350,31.674208


- 'brak informacji' - 'no information' or 'not specified'
- 'garaż/miejsce parkingowe' - 'garage/parking space'

#### 2.2.2.17 build_year


In [66]:
count_and_percentage(df_otodom, "build_year")

Unnamed: 0,Count,Percentage
brak informacji,629,56.923077
2022,61,5.520362
1980,60,5.429864
2021,47,4.253394
2023,39,3.529412
...,...,...
1893,1,0.090498
1992,1,0.090498
1935,1,0.090498
1956,1,0.090498


In [45]:
df_otodom_filtered = df_otodom[df_otodom['build_year'].notna() & (df_otodom['build_year'] != 'brak informacji')].astype({'build_year': 'int32'}).sort_values(by='build_year')
df_otodom_filtered['build_year'].tail()


80     2023
881    2023
220    2025
188    2025
36     2025
Name: build_year, dtype: int32

In [46]:
df_otodom_filtered['build_year'].head()

445    1893
840    1900
141    1900
381    1900
250    1900
Name: build_year, dtype: int32

#### 2.2.2.18 building_material

In [67]:
count_and_percentage(df_otodom, "building_material")

Unnamed: 0,Count,Percentage
brak informacji,548,49.59276
cegła,279,25.248869
wielka płyta,119,10.769231
pustak,65,5.882353
inne,42,3.800905
żelbet,17,1.538462
beton,16,1.447964
silikat,8,0.723982
beton komórkowy,7,0.633484
drewno,3,0.271493


- 'brak informacji' - 'no information' or 'not specified'
- 'cegła' - 'brick'
- 'wielka płyta' - 'large panel' (a type of panel building construction)
- 'pustak' - 'hollow brick' or 'concrete block'
- 'inne' - 'other'
- 'żelbet' - 'reinforced concrete'
- 'beton' - 'concrete'
- 'silikat' - 'silicate' (referring to silicate brick)
- 'beton komórkowy' - 'aerated concrete'
- 'drewno' - 'wood'
- 'keramzyt' - 'expanded clay' (a lightweight aggregate used in construction)