***Data Loading and Initial Inspection***

In [47]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the housing-price sample. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("DKHousingPricesSample100k.csv")

# First rows, (n_rows, n_columns), column dtypes / non-null counts, and
# summary statistics of the numeric columns.
print(df.head())
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()
print(df.describe())


         date quarter  house_id house_type    sales_type  year_build  \
0  2024-10-24  2024Q4        16      Villa  regular_sale        1997   
1  2024-10-24  2024Q4        13  Apartment  regular_sale        1885   
2  2024-10-23  2024Q4        60      Villa  regular_sale        1949   
3  2024-10-23  2024Q4        29      Villa  regular_sale        2001   
4  2024-10-22  2024Q4        92  Apartment  regular_sale        1965   

   purchase_price  %_change_between_offer_and_purchase  no_rooms    sqm  \
0         6500000                                 -3.0         5  142.0   
1         3400000                                  0.0         2   46.0   
2         4550000                                 -4.0         4  112.0   
3         1630000                                -12.0         4  186.0   
4         1975000                                -10.0         3   82.0   

   sqm_price                          address  zip_code             city  \
0  45774.650                     Spættev

***Data Cleaning***

In [48]:
# Columns removed from the working frame before any further processing.
# NOTE(review): the reason for each exclusion is not recorded in this notebook
# (presumably identifiers, fine-grained location and macro-economic series that
# are not wanted as features) -- confirm with the author.
cols_to_drop = [
    "zip_code",
    "address",
    "house_id",
    "date",
    "%_change_between_offer_and_purchase",
    "nom_interest_rate%",
    "area",
    "dk_ann_infl_rate%",
    "yield_on_mortgage_credit_bonds%"
]

# Rebind `df` to the reduced frame instead of mutating it with inplace=True:
# the result is the same for every later cell, but the step becomes a plain
# expression that can be chained and inspected.
# A KeyError here means a listed column is absent, e.g. because this cell was
# already run once against the same `df`.
df = df.drop(columns=cols_to_drop)

***Handling Missing Values***

In [49]:
# Per-column count of missing entries before any row is removed.
missing_before = df.isna().sum()
print(missing_before)

# Discard every row whose `city` is missing; all other columns are ignored
# when deciding which rows to drop.
df.dropna(axis="index", subset=["city"], inplace=True)

# Same report again to confirm that no missing values are left.
missing_after = df.isna().sum()
print(missing_after)

quarter            0
house_type         0
sales_type         0
year_build         0
purchase_price     0
no_rooms           0
sqm                0
sqm_price          0
city              11
region             0
dtype: int64
quarter           0
house_type        0
sales_type        0
year_build        0
purchase_price    0
no_rooms          0
sqm               0
sqm_price         0
city              0
region            0
dtype: int64


***Time Feature Extraction & Ordinal Encoding***

In [50]:
# Split the quarter label (e.g. "2024Q4") into two integer features and then
# discard the original string column:
#   year        <- first four characters
#   quarter_num <- last character
# This relies on every label having the "<4-digit year>Q<digit>" layout;
# astype(int) raises ValueError on anything that is not numeric.
# assign() appends the new columns in the order given and returns a new frame,
# so `df` is rebound rather than mutated in place.
df = df.assign(
    year=df["quarter"].str[:4].astype(int),
    quarter_num=df["quarter"].str[-1].astype(int),
).drop(columns=["quarter"])

***High-Cardinality Feature Handling***

In [51]:
# Compare the cardinality of the two location features.
print("Number of unique regions:", df["region"].nunique())
print("Number of unique cities:", df["city"].nunique())

# Drop the high-cardinality feature: `city` has hundreds of distinct values
# (versus a handful of regions), so one-hot encoding it would add roughly one
# indicator column per city. `region` is kept as the location feature.
# NOTE(review): rows with a missing `city` were removed in an earlier cell even
# though the column itself is discarded here -- confirm that losing those rows
# is intended.
df = df.drop(columns=["city"])

Number of unique regions: 4
Number of unique cities: 607


***Categorical Feature Encoding***

In [52]:
# Nominal features that get expanded into indicator (dummy) columns.
categorical_cols = ["house_type", "region", "sales_type"]

# One-hot encode the listed columns; every other column passes through
# unchanged. drop_first=True leaves out the first level of each feature, so a
# feature with k categories yields k-1 indicator columns.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

***Outlier Detection***

In [46]:
# df_encoded["purchase_price"].plot.box()