# Loading Data

In [2]:
# Importing packages
import numpy as np
import pandas as pd

import re 
import warnings
from unidecode import unidecode

## Regions

In [3]:
# Load the list of regions; the sheet has no header row
regions = pd.read_excel("data/cz_regions.xlsx", header=None)
# Give the first column the name "region"
regions = regions.rename(columns={regions.columns[0]: "region"})
# ASCII transliteration of every region name (done by unidecode)
regions["regions_base"] = regions["region"].map(lambda name: unidecode(str(name)))
# Short name: the transliterated name with every "kraj" token removed (case-insensitive)
regions["name_base"] = regions["regions_base"].map(
    lambda name: " ".join(token for token in name.split() if token.upper() != "KRAJ")
)
# Rows 0 and 1 (Ceska republika and Hlavni mesto Praha) get hand-picked short names
regions.loc[0, "name_base"] = "CZ"
regions.loc[1, "name_base"] = "Praha"

In [13]:
# Write the processed regions table to disk without the row index
regions_csv = "data/processed/regions.csv"
regions.to_csv(regions_csv, index=False)

In [4]:
# Display the processed regions table
regions

Unnamed: 0,region,regions_base,name_base
0,Česká republika,Ceska republika,CZ
1,Hlavní město Praha,Hlavni mesto Praha,Praha
2,Středočeský kraj,Stredocesky kraj,Stredocesky
3,Jihočeský kraj,Jihocesky kraj,Jihocesky
4,Plzeňský kraj,Plzensky kraj,Plzensky
5,Karlovarský kraj,Karlovarsky kraj,Karlovarsky
6,Ústecký kraj,Ustecky kraj,Ustecky
7,Liberecký kraj,Liberecky kraj,Liberecky
8,Královéhradecký kraj,Kralovehradecky kraj,Kralovehradecky
9,Pardubický kraj,Pardubicky kraj,Pardubicky


## Housing

In [6]:
housing_dfs = []

# Column names for the 2006-2019 sheets, in sheet order
old_layout_names = ["region", "total", "family_houses", "apartments", "apartment_buildings", "plots"]
# Columns kept from the newer sheets, keyed by their position AFTER "year" is inserted at position 0
new_layout_names = {1: "region", 6: "family_houses", 11: "apartments"}


def _keep_column(label):
    """Column filter for read_excel: drop any column whose label contains 'Unnamed: 0'."""
    return 'Unnamed: 0' not in label


with warnings.catch_warnings():
    # Ignore UserWarnings attributed to the openpyxl.styles.stylesheet module
    warnings.filterwarnings("ignore", category=UserWarning, module=re.escape('openpyxl.styles.stylesheet'))

    # Years 2006-2019: six columns, renamed by position
    for yy in range(6, 20):
        suf = f"{yy:02d}"
        df = pd.read_excel(
            f"data/housing_indices/CEN{suf}.xlsx",
            sheet_name="DATA",
            skiprows=8,
            skipfooter=6,
            usecols=_keep_column,
        )
        df = df.rename(columns={df.columns[pos]: name for pos, name in enumerate(old_layout_names)})
        df.insert(0, "year", "20" + suf)
        housing_dfs.append(df)

    # Years 2021-2024: different table structure, so only three columns are renamed
    # and every column whose label starts with 'Q' is discarded.
    # NOTE(review): range(21, 25) starts at 2021, so no file for 2020 is read by either loop —
    # confirm whether skipping 2020 is intended.
    for yy in range(21, 25):
        suf = f"{yy:02d}"
        df = pd.read_excel(
            f"data/housing_indices/CEN{suf}.xlsx",
            sheet_name="DATA",
            skiprows=8,
            skipfooter=5,
            usecols=_keep_column,
        )
        # "year" is inserted first, which is why the positions in new_layout_names are shifted by one
        df.insert(0, "year", "20" + suf)
        df = df.rename(columns={df.columns[pos]: name for pos, name in new_layout_names.items()})
        df = df.loc[:, ~df.columns.str.startswith('Q')]
        housing_dfs.append(df)

# Stack all yearly tables into one frame with a fresh index
housing = pd.concat(housing_dfs, ignore_index=True)

### Fix Indexing

Because the base year changes within the data series, the indices have to be rebased so that every year is expressed relative to 2006 = 100.

In [7]:
# Load the edge year at which the index base changes
# 2005-2010: 2005=100
# 2011-2024: 2010=100

housing11_names = ["region", "total", "family_houses", "apartments", "apartment_buildings", "plots"]

with warnings.catch_warnings():
    # Ignore UserWarnings attributed to the openpyxl.styles.stylesheet module
    warnings.filterwarnings("ignore", category=UserWarning, module=re.escape('openpyxl.styles.stylesheet'))

    # 2011 (_o suffix = data compared to the same period in the previous year)
    housing11 = pd.read_excel(
        "data/housing_indices/CEN11_o.xlsx",
        sheet_name="DATA",
        skiprows=8,
        skipfooter=6,
        usecols=lambda label: 'Unnamed: 0' not in label,
    )
    # Rename the first six columns by position
    housing11 = housing11.rename(
        columns={housing11.columns[pos]: name for pos, name in enumerate(housing11_names)}
    )
    housing11.insert(0, "year", "2011")
    

In [8]:
# Make sure year is int
housing["year"] = housing["year"].astype(int)
housing11["year"] = housing11["year"].astype(int)

# Only these two index columns are rebased; all other columns keep their original values
index_cols = ["family_houses", "apartments"]

# NOTE: this cell rescales `housing` in place. Running it a second time multiplies the
# chained years by the 2010 factor again, so re-run the loading cell first if it must be repeated.
for region in housing["region"].unique():
    in_region = housing["region"] == region

    # Step 1 - rescale 2006-2010 so that 2006 == 100
    base_2006 = housing.loc[in_region & (housing["year"] == 2006), index_cols]
    if not base_2006.empty:
        factor = 100 / base_2006.values
        mask = in_region & housing["year"].between(2006, 2010)
        housing.loc[mask, index_cols] = housing.loc[mask, index_cols].values * factor

    # The 2010 value (already rescaled by step 1 when a 2006 row exists) anchors every later year
    val_2010 = housing.loc[in_region & (housing["year"] == 2010), index_cols]
    if val_2010.empty:
        continue

    # Step 2 - 2011 from housing11 (growth vs 2010): e.g. 102.1 means +2.1% -> scale = 1.021
    growth_2011 = housing11.loc[
        (housing11["year"] == 2011) & (housing11["region"] == region), index_cols
    ]
    first_chained_year = 2011
    if not growth_2011.empty:
        is_2011 = in_region & (housing["year"] == 2011)
        housing.loc[is_2011, index_cols] = val_2010.values * (growth_2011.values / 100)
        # 2011 is now on the 2006=100 base; it must not be scaled again in step 3
        first_chained_year = 2012

    # Step 3 - chain the remaining years forward: they are expressed relative to 2010=100,
    # so multiplying by (rebased 2010 / 100) anchors them to 2006=100.
    # 2011 is included only when housing11 had no row for this region.
    mask = in_region & (housing["year"] >= first_chained_year)
    housing.loc[mask, index_cols] = housing.loc[mask, index_cols].values * (val_2010.values / 100)


In [9]:
# Attach regions_base / name_base to every housing row (left join on the full region name)
housing = housing.merge(regions, how="left", on="region")

In [22]:
# List the column labels of the merged housing table
housing.columns

Index(['year', 'region', 'total', 'family_houses', 'apartments',
       'apartment_buildings', 'plots', 'regions_base', 'name_base'],
      dtype='object')

In [25]:
# Restrict the table to the two rebased index columns plus the identifying columns
housing_keep = ['year', 'region', 'family_houses', 'apartments', 'regions_base', 'name_base']
housing = housing[housing_keep]

# Write the result to disk without the row index
housing.to_csv("data/processed/housing.csv", index=False)

In [12]:
# Show 10 randomly chosen rows of the housing table
# (no random_state is passed, so the selected rows differ between runs)
housing.sample(10)

Unnamed: 0,year,region,total,family_houses,apartments,apartment_buildings,plots,regions_base,name_base
187,2018,Liberecký kraj,122.9,135.843429,167.17548,124.6,117.6,Liberecky kraj,Liberecky
186,2018,Ústecký kraj,120.1,154.096597,139.944747,140.3,125.8,Ustecky kraj,Ustecky
52,2009,Liberecký kraj,139.9,123.714286,140.439158,136.1,147.9,Liberecky kraj,Liberecky
11,2006,Jihomoravský kraj,109.3,100.0,100.0,116.2,113.8,Jihomoravsky kraj,Jihomoravsky
176,2017,Jihomoravský kraj,127.7,144.828707,185.892176,115.1,132.7,Jihomoravsky kraj,Jihomoravsky
159,2016,Pardubický kraj,108.0,126.169082,143.038581,107.7,116.6,Pardubicky kraj,Pardubicky
17,2007,Středočeský kraj,128.6,113.776493,133.242009,136.9,125.0,Stredocesky kraj,Stredocesky
162,2016,Olomoucký kraj,107.6,129.804706,160.992507,88.4,111.0,Olomoucky kraj,Olomoucky
181,2018,Hlavní město Praha,141.3,146.690406,189.330827,160.4,124.3,Hlavni mesto Praha,Praha
94,2012,Plzeňský kraj,100.8,119.781183,121.286239,95.9,102.9,Plzensky kraj,Plzensky


## Wages

In [14]:
# Target names for the first five columns, in sheet order
wage_columns = [
    "year",
    "avg_no_empl_adj",     # adjusted to full time equivalent
    "avg_month_wage_adj",  # adjusted to full time equivalent
    "avg_no_empl_hc",      # headcount
    "avg_month_wage_hc",   # headcount
]

# Load the wage table for the CZ aggregate
cz_wages = pd.read_excel(
    "data/wages/MZDCZ00_24.xlsx",
    sheet_name="DATA",
    skiprows=6,
    skipfooter=7,
    usecols=lambda label: "Unnamed: 0" not in label,
)
# Rename the columns by position
cz_wages = cz_wages.rename(
    columns={cz_wages.columns[pos]: name for pos, name in enumerate(wage_columns)}
)

# Cast year to int and tag every row with the region code "CZ"
cz_wages["year"] = cz_wages["year"].astype(int)
cz_wages.insert(0, "region", "CZ")
# Sort ascending by year and renumber the index
cz_wages = cz_wages.sort_values("year").reset_index(drop=True)


In [15]:
# Import regional wages
# Each region has two files; the number of footer rows to skip depends on the period
footer_rows = {"00_10": 6, "11_24": 8}
# "CZ" is left out here; the CZ series is read separately from its own file
regional_names = [name for name in regions["name_base"] if name != "CZ"]

wage_frames = []
with warnings.catch_warnings():
    # Ignore UserWarnings attributed to the openpyxl.styles.stylesheet module
    warnings.filterwarnings("ignore", category=UserWarning, module=re.escape('openpyxl.styles.stylesheet'))

    for region_name in regional_names:
        for period, n_footer in footer_rows.items():
            df = pd.read_excel(
                f"data/wages/MZD{region_name}{period}.xlsx",
                sheet_name="DATA",
                skiprows=6,
                skipfooter=n_footer,
                usecols=lambda label: "Unnamed: 0" not in label,
            )
            df = df.rename(columns={
                df.columns[0]: "year",
                df.columns[1]: "avg_no_empl_adj",     # adjusted to full time equivalent
                df.columns[2]: "avg_month_wage_adj",  # adjusted to full time equivalent
                df.columns[3]: "avg_no_empl_hc",      # headcount
                df.columns[4]: "avg_month_wage_hc",   # headcount
            })
            df.insert(0, "region", region_name)
            # Keep only the first four characters of the year, dropping any note appended to it
            df["year"] = df["year"].astype(str).str[:4].astype(int)
            wage_frames.append(df)

# Stack all regional tables into one frame with a fresh index
wages_regions = pd.concat(wage_frames, ignore_index=True)

In [17]:
# Stack the CZ aggregate and the regional series, then order rows by region and year
wages = (
    pd.concat([cz_wages, wages_regions], ignore_index=True)
    .sort_values(["region", "year"])
    .reset_index(drop=True)
)
# No year filter is applied in this cell.

### Fix Indexing

Because the data series contain nominal values, every column is converted to an index in which the year 2006 equals 100 for each region.

In [18]:
# --- Settings
base_year = 2006
cols_to_index = ['avg_no_empl_adj', 'avg_month_wage_adj', 'avg_no_empl_hc', 'avg_month_wage_hc']

# --- Types: year becomes nullable Int64 (unparseable values turn into <NA>), region becomes str
wages['year'] = pd.to_numeric(wages['year'], errors='coerce').astype('Int64')
wages['region'] = wages['region'].astype(str)

# --- Work on a copy limited to 2006-2024, with one row per (region, year):
#     exact duplicate rows go first, then the first row of each (region, year) pair is kept
w = wages[wages['year'].between(2006, 2024)].copy()
w = w.drop_duplicates()
w = w.sort_values(['region', 'year'])
w = w.drop_duplicates(['region', 'year'], keep='first')
w = w.reset_index(drop=True)

# --- Coerce the value columns to numeric (non-numeric entries become NaN)
for c in cols_to_index:
    w[c] = pd.to_numeric(w[c], errors='coerce')

# --- Per-region base values: the mean of each column over that region's base-year rows
base_df = (
    w.loc[w['year'] == base_year]
    .groupby('region', as_index=False)[cols_to_index]
    .mean()
    .rename(columns={c: f'{c}_base' for c in cols_to_index})
)

# --- Bring the base values alongside every row of the same region
w = w.merge(base_df, on='region', how='left')

# --- Overwrite each column with value / base * 100; rows whose base is missing or zero get NaN
for c in cols_to_index:
    denominator = w[f'{c}_base']
    usable = denominator.notna() & denominator.ne(0)
    w[c] = (w[c] / denominator * 100).where(usable)

# --- The helper *_base columns are no longer needed
w = w.drop(columns=[f'{c}_base' for c in cols_to_index])

# --- Sanity check: report every region that has a NaN in the first indexed column
first_col_missing = w[cols_to_index[0]].isna()
missing_regions = sorted(w.loc[first_col_missing, 'region'].unique().tolist())
if missing_regions:
    print("Warning: the following regions are missing 2006 base values (check spelling/duplicates or data completeness):")
    print(missing_regions)

# Store the result under its own name; `wages` itself only had its year/region dtypes changed above
wages_rebased = w.copy()


In [19]:
# The wage table stores the short name in "region": rename it to "name_base",
# then left-join the regions table on that key
wages_rebased = (
    wages_rebased
    .rename(columns={"region": "name_base"})
    .merge(regions, how="left", on="name_base")
)

In [29]:
# Restrict the table to the full-time-equivalent columns plus the identifying columns
wages_keep = ['name_base', 'year', 'avg_no_empl_adj', 'avg_month_wage_adj', 'region', 'regions_base']
wages_rebased = wages_rebased[wages_keep]

# Write the result to disk without the row index
wages_rebased.to_csv("data/processed/wages.csv", index=False)

In [20]:
# Show 10 randomly chosen rows of the rebased wage table
# (no random_state is passed, so the selected rows differ between runs)
wages_rebased.sample(10)

Unnamed: 0,name_base,year,avg_no_empl_adj,avg_month_wage_adj,avg_no_empl_hc,avg_month_wage_hc,region,regions_base
144,Olomoucky,2017,103.279446,151.569711,103.35871,151.498819,Olomoucký kraj,Olomoucky kraj
136,Olomoucky,2009,94.364896,116.740341,94.357367,116.777212,Olomoucký kraj,Olomoucky kraj
119,Moravskoslezsky,2011,99.134908,122.111123,99.1589,122.085719,Moravskoslezský kraj,Moravskoslezsky kraj
162,Pardubicky,2016,93.428723,142.135956,93.333333,142.233383,Pardubický kraj,Pardubicky kraj
72,Karlovarsky,2021,80.228928,194.778868,80.99837,192.815959,Karlovarský kraj,Karlovarsky kraj
266,Zlinsky,2006,100.0,100.0,100.0,100.0,Zlínský kraj,Zlinsky kraj
153,Pardubicky,2007,101.165872,107.106512,100.974359,107.302806,Pardubický kraj,Pardubicky kraj
217,Stredocesky,2014,97.166623,134.68598,97.227124,134.616837,Středočeský kraj,Stredocesky kraj
124,Moravskoslezsky,2016,99.625906,136.514971,99.863605,136.205459,Moravskoslezský kraj,Moravskoslezsky kraj
147,Olomoucky,2020,100.778077,188.443787,102.402193,185.483056,Olomoucký kraj,Olomoucky kraj
