***Import Required Libraries***

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

***Load Dataset and Handle Missing Values***

In [3]:
# Placeholder strings that stand for a missing value in the raw file.
# They are passed to the parser as additional NaN markers.
missing_tokens = [
    "?",
    "unknown",
    "Unknown",
    "N/A",
    "na",
    "",
    " ",
]

# Read the CSV (path is relative to the notebook's working directory).
df = pd.read_csv("DKHousingPricesSample100k.csv", na_values=missing_tokens)

***Clean Invalid Numerical Values***

In [4]:
# A construction year, floor area or room count of zero or below is treated
# as invalid and replaced by NaN.
# Series.mask builds a new (upcast if necessary) column, which avoids writing
# NaN through .loc into a possibly integer-typed column -- newer pandas
# versions deprecate that as a silent dtype upcast.
# Values that are already NaN compare False against `<= 0` and stay NaN.
for col in ("year_build", "sqm", "no_rooms"):
    df[col] = df[col].mask(df[col] <= 0)


***Drop Irrelevant Columns***

In [5]:
# Columns excluded from the rest of the notebook.
cols_to_drop = [
    'date',
    'house_id',
    '%_change_between_offer_and_purchase',
    'sqm_price',
    'address',
    'zip_code',
    'nom_interest_rate%',
    'dk_ann_infl_rate%',
    'city'
]

# Rebind `df` to the reduced frame instead of mutating it in place; this is
# the same `df = df.drop(...)` form the later cells of this notebook use.
df = df.drop(columns=cols_to_drop)

***Inspect and Drop Rows with Missing Values***

In [7]:
# Show which columns remain and how many NaN entries each one holds.
print(df.columns)
print(df.isnull().sum())

# Remove every row that contains at least one NaN, then print the counts
# again to confirm that none are left.
df = df.dropna(axis=0, how="any")
print(df.isnull().sum())

# Last expression of the cell: summary statistics of the sale price.
df.loc[:, "purchase_price"].describe()

Index(['quarter', 'house_type', 'sales_type', 'year_build', 'purchase_price',
       'no_rooms', 'sqm', 'area', 'region', 'yield_on_mortgage_credit_bonds%'],
      dtype='object')
quarter                             0
house_type                          0
sales_type                          0
year_build                          0
purchase_price                      0
no_rooms                            0
sqm                                 0
area                                1
region                              1
yield_on_mortgage_credit_bonds%    78
dtype: int64
quarter                            0
house_type                         0
sales_type                         0
year_build                         0
purchase_price                     0
no_rooms                           0
sqm                                0
area                               0
region                             0
yield_on_mortgage_credit_bonds%    0
dtype: int64


Unnamed: 0,purchase_price
count,55388.0
mean,2308497.0
std,1937945.0
min,251300.0
25%,1035000.0
50%,1780000.0
75%,2975000.0
max,30492540.0


***Convert Price from DKK to USD***

In [8]:
# Fixed conversion factor applied to every purchase price.
# NOTE(review): a single constant rate is used for all rows -- confirm this
# is intended.
DKK_TO_USD = 0.15

# Price in USD, followed by its log(1 + x) transform.
df["price_usd"] = DKK_TO_USD * df["purchase_price"]
df["price_usd_log"] = df["price_usd"].pipe(np.log1p)

***Drop Raw Price Columns and Create House Age Feature***

In [9]:
# Drop the raw price columns; `price_usd_log` is the only price column kept.
df = df.drop(columns=["purchase_price", "price_usd"])

# Reference year used to turn the construction year into an age.
# NOTE(review): the age is measured against this fixed year, not against the
# year of the sale -- confirm this is intended.
CURRENT_YEAR = 2025

df["house_age"] = CURRENT_YEAR - df["year_build"]
df = df.drop(columns=["year_build"])

# An age of exactly 0 is raised to 1.
# NOTE(review): presumably to keep later transforms of the age well-defined;
# negative ages are left untouched -- confirm.
df.loc[df["house_age"] == 0, "house_age"] = 1

***Process Quarter Feature***

In [10]:
# Split the quarter label into two integer features: its first four
# characters become `year`, its last character becomes `quarter_num`.
# NOTE(review): assumes labels look like "<YYYY>...<digit>" -- confirm
# against the raw data.
df["year"] = df["quarter"].str[:4].astype(int)
df["quarter_num"] = df["quarter"].str[-1].astype(int)

# Rebind `df` rather than dropping in place, matching the other
# `df = df.drop(...)` calls in this notebook.
df = df.drop(columns=["quarter"])

***Filter Unrealistic House Ages***

In [11]:
# Keep only the rows whose house age is at most 200; older entries are
# discarded as unrealistic.
df = df.loc[df["house_age"].le(200)]

# List the columns that remain after filtering.
print(df.columns)

Index(['house_type', 'sales_type', 'no_rooms', 'sqm', 'area', 'region',
       'yield_on_mortgage_credit_bonds%', 'price_usd_log', 'house_age', 'year',
       'quarter_num'],
      dtype='object')


***Encode Categorical Variables***

In [13]:
# Categorical columns to expand into indicator (dummy) columns.
catag_cols = [
    "house_type",
    "region",
    "sales_type",
    "area",
]

# One-hot encode them; the first level of each column is dropped.
df = pd.get_dummies(data=df, columns=catag_cols, drop_first=True)

***Final Statistical Summary***

In [12]:
# Plain-text summary statistics of the numeric features.
print(df.loc[:, ["sqm", "no_rooms", "house_age"]].describe())

                sqm      no_rooms     house_age
count  54747.000000  54747.000000  54747.000000
mean     126.414014      4.280088     63.948326
std       55.741453      1.629397     39.660968
min       26.000000      1.000000      1.000000
25%       86.000000      3.000000     40.000000
50%      120.000000      4.000000     57.000000
75%      157.000000      5.000000     90.000000
max      955.000000     15.000000    200.000000
