Imports

In [1]:
import kagglehub
import os
import pandas as pd
import numpy as np

Data Downloads

In [7]:
# Germany
path_germany = kagglehub.dataset_download("corrieaar/apartment-rental-offers-in-germany")
# os.listdir returns entries in arbitrary order; sort so the same file is
# picked on every run and on every machine.
germany_csvs = sorted(f for f in os.listdir(path_germany) if f.endswith('.csv'))
if not germany_csvs:
    raise FileNotFoundError(f"No CSV files found in {path_germany}")
df_germany = pd.read_csv(os.path.join(path_germany, germany_csvs[0]))

# Poland
path_poland = kagglehub.dataset_download("krzysztofjamroz/apartment-prices-in-poland")
poland_csvs = sorted(f for f in os.listdir(path_poland) if f.endswith('.csv'))

# Filter only rental-related files
rental_files = [f for f in poland_csvs if 'apartments_rent_pl' in f.lower()]
if not rental_files:
    # pd.concat on an empty list raises an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No 'apartments_rent_pl' CSV files found in {path_poland}")

# Load and combine them into a single DataFrame. Because the file list is
# sorted, the row order of the combined frame is reproducible.
df_poland = pd.concat(
    [pd.read_csv(os.path.join(path_poland, f)) for f in rental_files],
    ignore_index=True
)



Data Processing Pipeline
Comments:

- Only the Germany dataset has heatingType data.

- Description is not needed, in my opinion.

- Dates are not needed, in my opinion.

- Added floor and elevator data: both datasets have them, and they could be interesting.

In [3]:
# Germany
# Columns of interest in the raw German listings. Indexing with this list
# raises a KeyError if any of them is missing from the loaded frame.
selected_columns_germany = ['livingSpace', 'noRooms', 'balcony', 'heatingType', 'yearConstructed',
                           'totalRent', 'baseRent', 'serviceCharge', 'heatingCosts', 'date', 'floor', 'lift']
df_germany_selected = df_germany[selected_columns_germany]

# Fill missing totalRent with baseRent + serviceCharge + heatingCosts.
# Missing components are skipped (treated as 0), but min_count=1 keeps the
# fallback NaN when *all* three components are missing, so such rows are not
# recorded with a fabricated total rent of 0.
rent_components_sum = df_germany[['baseRent', 'serviceCharge', 'heatingCosts']].sum(axis=1, min_count=1)
df_germany['totalRent'] = df_germany['totalRent'].fillna(rent_components_sum)

# Select and rename columns to the shared schema used for both countries
df_germany_cleaned = pd.DataFrame({
    'size': df_germany['livingSpace'],
    'number_of_rooms': df_germany['noRooms'],
    'construction_year': df_germany['yearConstructed'],
    'total_rent': df_germany['totalRent'],
    'floor': df_germany['floor'],
    'balcony': df_germany['balcony'],
    'elevator': df_germany['lift'],
    'heating_type': df_germany['heatingType'],
    'country': 'Germany'
})

# Poland
# Columns of interest in the raw Polish listings. Indexing with this list
# raises a KeyError if any of them is missing from the loaded frame.
selected_columns_poland = ['squareMeters', 'rooms', 'hasBalcony', 'buildYear', 'price', 'floor', 'hasElevator']
df_poland_selected = df_poland[selected_columns_poland]


# Exchange rate: 1 Złoty = 0.23 Euro
zloty_euro_rate = 0.23
df_poland['price_eur'] = df_poland['price'] * zloty_euro_rate

# Format balcony and elevator as booleans.
# The mapping also accepts values that are already booleans: Series.map turns
# every value missing from the dict into NaN, so without the True/False keys
# re-running this cell would wipe both columns.
yes_no_to_bool = {"yes": True, "no": False, True: True, False: False}
df_poland['hasBalcony'] = df_poland['hasBalcony'].map(yes_no_to_bool)
df_poland['hasElevator'] = df_poland['hasElevator'].map(yes_no_to_bool)

# Select and rename columns to the shared schema used for both countries
df_poland_cleaned = pd.DataFrame({
    'size': df_poland['squareMeters'],
    'number_of_rooms': df_poland['rooms'],
    'construction_year': df_poland['buildYear'],
    'total_rent': df_poland['price_eur'],
    'floor': df_poland['floor'],
    'balcony': df_poland['hasBalcony'],
    'elevator': df_poland['hasElevator'],
    'heating_type': np.nan,  # Poland dataset has no heatingType
    'country': 'Poland'
})

# Combine both datasets
df_combined_rentals = pd.concat([df_germany_cleaned, df_poland_cleaned], ignore_index=True)

# Save to csv. Create the output directory first so the write does not fail
# on a fresh checkout where 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
df_combined_rentals.to_csv('data/cleaned_apartments_rentals.csv', index=False)

# Optional: preview. head() must be the last expression of the cell,
# otherwise Jupyter does not display it.
print(df_combined_rentals.shape)
df_combined_rentals.head()

(339697, 9)


Dimensional Model

In [6]:
# Country Dimension
# One record per country; CountryKey is a 1-based surrogate key derived from
# the row position.
country_records = [
    {'Country': 'Germany', 'Currency': 'EUR', 'PurchasingPowerParity': 1.0},
    {'Country': 'Poland', 'Currency': 'PLN', 'PurchasingPowerParity': 0.8},
]
dim_country = pd.DataFrame(country_records)
dim_country = dim_country.assign(CountryKey=dim_country.index + 1)

# Apartment Dimension
# One row per distinct combination of apartment attributes found in the
# combined rentals frame.
apartment_attribute_columns = ['size', 'number_of_rooms', 'balcony', 'construction_year', 'elevator', 'floor']
dim_apartment = (
    df_combined_rentals.loc[:, apartment_attribute_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

# 1-based surrogate key taken from the freshly reset row position
dim_apartment['ApartmentKey'] = dim_apartment.index + 1


# Date Dimension
