## Imports

In [418]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import eurostat
import os

In [350]:
from functools import reduce
from eurostat import get_data_df

## Dataset Documentation

### Research Questions

#### 1. How does GDP per capita relate to tertiary education attainment?

#### 2. Is there a correlation between GDP per capita and greenhouse gas emissions?

#### 3. How does tertiary education attainment correlate with unemployment rate?

### Eurostat Datasets


| Dataset | Description |
|---------|-------------|
| [**namq_10_gdp**](https://ec.europa.eu/eurostat/databrowser/view/namq_10_gdp/default/table?lang=en) | Gross domestic product (GDP) and main components (output, expenditure and income). |
| [**tps00203**](https://ec.europa.eu/eurostat/databrowser/view/tps00203/default/table?lang=en) | Total unemployment rate anually. |
| [**tec00118**](https://ec.europa.eu/eurostat/databrowser/product/view/tec00118) | Harmonised Index of Consumer Prices (HICP) - inflation ratex. |
| [**tgs00109**](https://ec.europa.eu/eurostat/databrowser/view/tgs00109/default/table?lang=en) | Persons aged 25-64 with tertiary educational attainment level by sex and Nomenclature of Territorial Units for Statistics (NUTS) 2 region. |
| [**nama_10_pc**](https://ec.europa.eu/eurostat/databrowser/product/view/nama_10_pc) | Gross domestic product (GDP) and main components per capita. |
| [**tec00115**](https://ec.europa.eu/eurostat/databrowser/view/tec00115/default/line?lang=en) | Real GDP growth rate - volume. |
| [**prc_hicp_midx**](https://ec.europa.eu/eurostat/databrowser/view/prc_hicp_midx/default/table?lang=en) | Harmonized Index of Consumer Prices (HICP) - monthly data (index). |
| [**nrg_pc_204**](https://ec.europa.eu/eurostat/databrowser/view/nrg_pc_204/default/bar?lang=en) | Electricity prices for household consumers - bi-annual data (from 2007 onwards). |
| [**prc_hicp_mmor**](https://ec.europa.eu/eurostat/databrowser/view/prc_hicp_mmor/default/table?lang=en) | Harmonized Consumer Price Index (HICP) - monthly data (monthly rate of change). |
| [**prc_hicp_manr**](https://ec.europa.eu/eurostat/databrowser/view/prc_hicp_manr/default/table?lang=en) | Harmonized Consumer Price Index (HICP) - monthly data (annual rate of change). |
| [**env_air_gge**](https://ec.europa.eu/eurostat/databrowser/view/env_air_gge/default/table?lang=en) | Greenhouse gas emissions by source sector. |
| [**ilc_di01**](https://ec.europa.eu/eurostat/databrowser/view/ilc_di01/default/table?lang=en) | Distribution of income by quantiles. |
| [**demo_gind**](https://ec.europa.eu/eurostat/databrowser/view/demo_gind/default/table?lang=en) | Population change - Demographic balance and crude rates at national level. |
| [**une_rt_m**](https://ec.europa.eu/eurostat/databrowser/view/une_rt_m/default/table?lang=en) | Unemployment by sex and age - monthly data. |

### Final Dataset

| Column | Description |
|--------|-------------|
| **TIME_PERIOD** | Year, month and day of the observation. |
| **gdp** | Gross Domestic Product, total economic output of the country. Measured in millions of euros. |
| **gdp_quarterly** | Gross Domestic Product, total economic output of the country divided by quarters. Measured in millions of euros. |
| **total_unemployment_rate** | Percentage of the labor force that is unemployed. |
| **adult_unemployment_rate** | Percentage of the adult labor force (25-74) that is unemployed. |
| **youth_unemployment_rate** | Percentage of the youth labor force (<25) that is unemployed. |
| **inflation_rate_anually** | Annual inflation rate, showing yearly price changes. |
| **education_attainment** | Share of population aged 25-64 with tertiary education. |
| **gdp_per_capita** | GDP divided by total population, a measure of average economic output per person. Measured in euros. |
| **gdp_growth_rate** | Annual percentage change in GDP. |
| **price_index** | Index tracking general price levels over time. |
| **energy_prices** | Average energy costs for households. Measured in euros per KWH total |
| **inflation_rate_monthly_m** | Monthly inflation rate (monthly rate of change) based on consumer prices. |
| **inflation_rate_monthly_y** | Monthly inflation rate (annual rate of change) based on consumer prices. |
| **gas_emissions** | Greenhouse gas emissions, aggregates of CO2 related gases (GHG), measured in millions of tonnes. |
| **household_income_q1** | Income of the lowest household income quintile. |
| **household_income_q2** | Income of the second household income quintile. |
| **household_income_q3** | Income of the third household income quintile. |
| **net_migration** | Net migration (inflow minus outflow of people). |
| **total_population** | Total population of the country. |
| **net_population_growth** | Yearly population change (including births, deaths, and migration). |
| **growth_rate** | General population growth rate (percentage). |
| **births** | Number of live births anually. |
| **deaths** | Number of deaths anually. |


## Datasets

### Utils

In [351]:
COUNTRY = "SE"
OUTPUT_DIR = "data_eurostat"
DFS = {}
ANNUAL_COLS = []
INDICATORS = []

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [352]:
def get_dataset(indicator_name, local_path, country=None, unit=None, coicop=None) -> pd.DataFrame:
    """_summary_

    Args:
        indicator_name (_type_): _description_
        local_path (_type_): _description_
        country (_type_, optional): _description_. Defaults to None.
        unit (_type_, optional): _description_. Defaults to None.
        coicop (_type_, optional): _description_. Defaults to None.

    Returns:
        pd.DataFrame: _description_
    """    
    
    # Lookup the dataset inside the data_eurostat folder
    if os.path.exists(local_path):
        print(f"Found locally: {local_path}")
        df = pd.read_csv(local_path)
        return df
    
    # Download if no avilable csv was found
    try:
        print(f"Downloading from Eurostat: {indicator_name}")
        df: pd.DataFrame = eurostat.get_data_df(indicator_name) # pyright: ignore[reportAssignmentType]

        if "geo\\TIME_PERIOD" in df.columns:
            df = df.rename(columns={"geo\\TIME_PERIOD":"geo"})
        
        if country is not None and 'geo' in df.columns:
            df = df[df['geo'] == COUNTRY]

        if unit is not None and 'unit' in df.columns:
            df = df[df['unit'] == unit]
        
        if coicop is not None and 'coicop' in df.columns:
            df = df[df['coicop'] == coicop]

        df.to_csv(local_path, index=False)
        print(f"Saved locally: {local_path}")
    except Exception as e:
        print(f"Failed to download {indicator_name}: {e}")
        return pd.DataFrame()

    return df

In [353]:
def col_to_timestamp(df, ds_name):

    freq = list(df['freq'].unique())[0]

    try:
        if freq.upper() == 'S':
            df['year'] = df['TIME_PERIOD'].str[:4].astype(int)
            df['semester'] = df['TIME_PERIOD'].str[4:]
            df['month'] = df['semester'].map({'S1': 1, 'S2': 7})
            
            df['TIME_PERIOD'] = pd.to_datetime(df[['year', 'month']].assign(day=1))
            df = df.drop(columns=['year', 'semester', 'month'])
        elif freq.upper() == 'Q':
            df['TIME_PERIOD'] = pd.PeriodIndex(df['TIME_PERIOD'], freq='Q').to_timestamp()
        elif freq.upper() == 'M':
            df['TIME_PERIOD'] = pd.PeriodIndex(df['TIME_PERIOD'], freq='M').to_timestamp()
        else:
            ANNUAL_COLS.append(ds_name)
            df['TIME_PERIOD'] = pd.to_datetime(df['TIME_PERIOD'])
    
    except Exception as e:
        print(f"Could not parse TIME_PERIOD: {e}")

    return df

In [354]:
def preprocess_dataset(df, ds_name, filters, id_vars, index_column=None, indices=None):
    df_clean = df.copy()

    for col, val in filters.items():
        df_clean = df_clean[df_clean[col] == val]

    if index_column is not None:
        df_clean = df_clean[df_clean[index_column].isin(indices)]
    
    non_null_columns = df_clean.columns[~(df_clean.isnull().sum() > 0)]
    df_clean = df_clean[non_null_columns]

    value_vars = [col for col in df_clean.columns if col not in id_vars]

    df_clean = df_clean.melt(
        id_vars=id_vars,
        value_vars=value_vars,
        var_name='TIME_PERIOD',
        value_name='VALUE'
    )
       
    if indices is not None and index_column is not None:
        df_clean = df_clean.pivot_table(
            index=['TIME_PERIOD', 'freq'],
            columns=index_column,
            values='VALUE'
        ).reset_index()
    
    df_clean = df_clean.rename(columns={"VALUE": f"{ds_name}"})
    df_clean = col_to_timestamp(df_clean, ds_name)
    
    if f'{ds_name}' in df_clean.columns:
        df_clean = df_clean[['TIME_PERIOD', f'{ds_name}']]
    elif indices:
        df_clean = df_clean[['TIME_PERIOD', *indices]]
        df_clean = df_clean.rename(str.lower, axis='columns')
        df_clean = df_clean.rename(columns={'time_period': 'TIME_PERIOD'})
        df_clean.columns.name = None

    DFS[f"df_{ds_name}"] = df_clean
    
    return df_clean

### GDP Quarterly

In [355]:
indicator_name = "namq_10_gdp"
ds_name = "gdp_quarterly"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [356]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path, 
    country=COUNTRY,
    unit="CLV10_MEUR",
)

# CLV10_MEUR - Chain Linked Volumes, reference year 2010, Million Euro - Inflation adjusted 
# CLV05_MEUR - Chain Linked Volumes, reference year 2005, Million Euro - Inflation adjusted
# CP_MEUR - Current prices, million euros - Not iflation adjusted

df.head()

Found locally: data_eurostat/gdp_quarterly.csv


Unnamed: 0,freq,unit,s_adj,na_item,geo,1975-Q1,1975-Q2,1975-Q3,1975-Q4,1976-Q1,...,2023-Q2,2023-Q3,2023-Q4,2024-Q1,2024-Q2,2024-Q3,2024-Q4,2025-Q1,2025-Q2,2025-Q3
0,Q,CLV10_MEUR,CA,B1G,SE,,,,,,...,105644.7,97111.3,110123.1,103656.0,106134.4,98127.5,112437.8,104241.4,107534.1,
1,Q,CLV10_MEUR,CA,B1GQ,SE,,,,,,...,119210.5,109400.6,124109.1,115433.1,119536.1,110572.7,126648.8,116089.2,121263.2,
2,Q,CLV10_MEUR,CA,D21,SE,,,,,,...,14210.9,12903.8,14670.0,12364.1,14045.3,13092.8,14855.8,12380.5,14358.9,
3,Q,CLV10_MEUR,CA,D21X31,SE,,,,,,...,13558.7,12280.2,13976.4,11713.2,13400.8,12447.0,14210.7,11783.7,13735.4,
4,Q,CLV10_MEUR,CA,D31,SE,,,,,,...,635.4,616.2,681.4,655.7,628.5,641.1,619.9,589.2,599.2,


In [357]:
filters = {'na_item': 'B1GQ', 's_adj': 'SCA'}
id_vars = ['freq', 'geo', 'na_item', 'unit', 's_adj']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.tail()

Unnamed: 0,TIME_PERIOD,gdp_quarterly
125,2024-04-01,117243.0
126,2024-07-01,118212.1
127,2024-10-01,118729.0
128,2025-01-01,118509.7
129,2025-04-01,119066.0


### GDP Anually

In [358]:
indicator_name = "nama_10_gdp"
ds_name = "gdp"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [359]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path, 
    country=COUNTRY,
    unit="CLV10_MEUR",
)

# CLV10_MEUR - Chain Linked Volumes, reference year 2010, Million Euro - Inflation adjusted 
# CLV05_MEUR - Chain Linked Volumes, reference year 2005, Million Euro - Inflation adjusted
# CP_MEUR - Current prices, million euros - Not iflation adjusted

df.head()

Found locally: data_eurostat/gdp.csv


Unnamed: 0,freq,unit,na_item,geo,1975,1976,1977,1978,1979,1980,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,A,CLV10_MEUR,B1G,SE,,,,,,,...,365697.7,372153.4,379639.8,386562.7,397430.6,388429.9,409269.3,415188.5,416536.5,420134.1
1,A,CLV10_MEUR,B1GQ,SE,,,,,,,...,413175.5,421981.0,429926.2,437536.9,448948.9,440267.2,463274.9,469091.0,468133.8,471972.8
2,A,CLV10_MEUR,D21,SE,,,,,,,...,50064.8,52450.8,53037.1,53708.9,54144.2,54567.7,56681.1,56767.6,54173.8,54376.4
3,A,CLV10_MEUR,D21X31,SE,,,,,,,...,47483.8,49870.0,50317.1,51002.3,51541.3,51879.8,54026.4,53900.5,51557.2,51791.6
4,A,CLV10_MEUR,D31,SE,,,,,,,...,2569.8,2563.7,2709.4,2690.1,2575.2,2664.9,2613.5,2865.4,2585.8,2544.3


In [360]:
filters = {'na_item': 'B1GQ'}
id_vars = ['freq', 'geo', 'na_item', 'unit']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.head()

Unnamed: 0,TIME_PERIOD,gdp
0,1993-01-01,235464.4
1,1994-01-01,244920.1
2,1995-01-01,254948.1
3,1996-01-01,259294.2
4,1997-01-01,267241.5


### Unemployment Rate

In [361]:
# indicator_name = "tps00203"
# ds_name = "unemployment_rate"
# filename = f"{ds_name}.csv"
# local_path = os.path.join(OUTPUT_DIR, filename)

In [362]:
# df = get_dataset(indicator_name=indicator_name, local_path=local_path, country=COUNTRY)

# df.head()

In [363]:
# filters = {'unit': 'PC_POP'}
# id_vars = ['freq', 'geo', 'sex', 'unit', 'age']

# df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

# df_clean.head()

### Inflation Rate Anually

In [406]:
indicator_name = "tec00118"
ds_name = "inflation_rate_anually"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [407]:
df = get_dataset(indicator_name=indicator_name, local_path=local_path, country=COUNTRY)

df.head()

Found locally: data_eurostat/inflation_rate_anually.csv


Unnamed: 0,freq,unit,coicop,geo,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,A,RCH_A_AVG,CP00,SE,0.4,0.2,0.7,1.1,1.9,2.0,1.7,0.7,2.7,8.1,5.9,2.0


In [366]:
filters = {}
id_vars = ['freq', 'unit', 'coicop', 'geo']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.head(25)

Unnamed: 0,TIME_PERIOD,inflation_rate_anually
0,2013-01-01,0.4
1,2014-01-01,0.2
2,2015-01-01,0.7
3,2016-01-01,1.1
4,2017-01-01,1.9
5,2018-01-01,2.0
6,2019-01-01,1.7
7,2020-01-01,0.7
8,2021-01-01,2.7
9,2022-01-01,8.1


### Education Attainmanet (25-64)

In [367]:
indicator_name = "tgs00109"
ds_name = "education_attainment"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [368]:
df = get_dataset(indicator_name=indicator_name, local_path=local_path)

df.head()

Found locally: data_eurostat/education_attainment.csv


Unnamed: 0,freq,unit,isced11,age,sex,geo,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,A,PC,ED5-8,Y25-64,F,AT11,14.4,24.8,26.7,27.1,28.3,28.4,30.5,32.4,31.3,31.9,34.1,35.5
1,A,PC,ED5-8,Y25-64,F,AT12,15.5,27.7,28.1,28.7,31.1,30.3,31.9,33.2,33.8,33.4,36.2,38.7
2,A,PC,ED5-8,Y25-64,F,AT13,30.7,40.3,39.3,41.2,43.6,43.4,44.9,44.7,46.1,47.1,47.5,49.4
3,A,PC,ED5-8,Y25-64,F,AT21,17.0,29.5,28.2,28.3,30.2,30.6,32.0,32.8,34.7,34.1,34.4,36.7
4,A,PC,ED5-8,Y25-64,F,AT22,15.5,24.5,24.4,27.0,27.8,27.8,29.2,31.6,30.2,31.5,33.9,33.8


In [369]:
df_clean = df.copy()

sweden_geos = ['SE11', 'SE12', 'SE21', 'SE22', 'SE23', 'SE31', 'SE32', 'SE33']

df_clean = (df_clean[df_clean['geo'].isin(sweden_geos)])

df_clean = df_clean[
    (df_clean['sex'] == 'T')
]

non_null_columns = df_clean.columns[~(df_clean.isnull().sum() > 0)]
df_clean = df_clean[non_null_columns]

id_vars = ['freq', 'unit', 'isced11', 'age', 'sex', 'geo']
value_vars = [col for col in df_clean.columns if col not in id_vars]

df_clean = df_clean.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='TIME_PERIOD',
    value_name='VALUE'
)

df_clean = df_clean.rename(columns={"VALUE": f"{ds_name}"})

df_clean = (
    df_clean.groupby(['TIME_PERIOD', 'freq', 'sex', 'age', 'isced11'])
             [f'{ds_name}']
             .mean()
             .reset_index()
)

df_clean['geo'] = 'SE'
df_clean = col_to_timestamp(df_clean, ds_name)
df_clean = df_clean[['TIME_PERIOD', f'{ds_name}']]

DFS[f"df_{ds_name}"] = df_clean

df_clean.head(25)

Unnamed: 0,TIME_PERIOD,education_attainment
0,2013-01-01,34.7125
1,2014-01-01,36.1875
2,2015-01-01,37.3
3,2016-01-01,38.55
4,2017-01-01,39.275
5,2018-01-01,40.7375
6,2019-01-01,41.3875
7,2020-01-01,41.85
8,2021-01-01,43.7375
9,2022-01-01,45.7625


### GDP per Capita

In [370]:
indicator_name = "nama_10_pc"
ds_name = "gdp_per_capita"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [371]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY 
)

# df.head()

Found locally: data_eurostat/gdp_per_capita.csv


In [372]:
filters = {'na_item': 'B1GQ', 'unit': 'CLV10_EUR_HAB'}
id_vars = ['freq', 'geo', 'na_item', 'unit']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.tail()

Unnamed: 0,TIME_PERIOD,gdp_per_capita
27,2020-01-01,42520.0
28,2021-01-01,44480.0
29,2022-01-01,44560.0
30,2023-01-01,44170.0
31,2024-01-01,44400.0


### GDP Growth Rate

In [373]:
indicator_name = "tec00115"
ds_name = "gdp_growth_rate"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [374]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY
)

# df.head()

Found locally: data_eurostat/gdp_growth_rate.csv


In [375]:
filters = {'na_item': 'B1GQ', 'unit': 'CLV_PCH_PRE'}
id_vars = ['freq', 'geo', 'na_item', 'unit']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.tail()

Unnamed: 0,TIME_PERIOD,gdp_growth_rate
7,2020-01-01,-1.9
8,2021-01-01,5.2
9,2022-01-01,1.3
10,2023-01-01,-0.2
11,2024-01-01,0.8


### Price Index

In [376]:
indicator_name = "prc_hicp_midx"
ds_name = "price_index"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [377]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY,
    unit='I15', # index, base year 2015
    coicop='CP00'
)

df.head()

Found locally: data_eurostat/price_index.csv


Unnamed: 0,freq,unit,coicop,geo,1996-01,1996-02,1996-03,1996-04,1996-05,1996-06,...,2024-12,2025-01,2025-02,2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09
0,M,I15,CP00,SE,75.41,75.6,76.14,76.49,76.56,76.28,...,130.46,130.62,131.94,131.19,131.5,131.96,132.71,133.12,132.69,132.9


In [378]:
filters = {}
id_vars = ['freq', 'geo', 'coicop', 'unit']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.tail()

Unnamed: 0,TIME_PERIOD,price_index
352,2025-05-01,131.96
353,2025-06-01,132.71
354,2025-07-01,133.12
355,2025-08-01,132.69
356,2025-09-01,132.9


### Energy Prices

In [379]:
indicator_name = "nrg_pc_204"
ds_name = "energy_prices"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [380]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY
)

df.columns = df.columns.str.replace('-', '', regex=False)

Found locally: data_eurostat/energy_prices.csv


In [381]:
filters = {'tax': 'I_TAX', 'currency': 'EUR', 'nrg_cons': 'TOT_KWH'}
id_vars = ['freq', 'geo', 'product', 'nrg_cons', 'unit', 'tax', 'currency']
value_vars = [col for col in df.columns if col not in id_vars]

consumption_midpoints = {
    'KWH_LT1000': 500,
    'KWH1000-2499': 1749.5,
    'KWH2500-4999': 3749.5,
    'KWH5000-14999': 9999.5,
    'KWH_GE15000': 20000
}

df = df[(df['tax']=='I_TAX') & (df['currency']=='EUR')]
df['weight'] = df['nrg_cons'].map(consumption_midpoints)

In [382]:
tot_values = [(df[col] * df['weight']).sum() / df['weight'].sum() for col in value_vars] # pyright: ignore[reportUnboundVariable]

# Create a new row for TOT_KWH with the same metadata as the other rows
tot_row = df.iloc[-1].copy()
tot_row['nrg_cons'] = 'TOT_KWH'
tot_row[value_vars] = tot_values
df = df.drop(df.index[-1])

# Append it
df = pd.concat([df, pd.DataFrame([tot_row])], ignore_index=True)
df = df.drop(columns=['weight'])

df.tail()

Unnamed: 0,freq,product,nrg_cons,unit,tax,currency,geo,2007S1,2007S2,2008S1,...,2020S2,2021S1,2021S2,2022S1,2022S2,2023S1,2023S2,2024S1,2024S2,2025S1
1,S,6000,KWH2500-4999,KWH,I_TAX,EUR,SE,0.1558,0.1613,0.1698,...,0.2017,0.2114,0.2604,0.2278,0.274,0.2662,0.2181,0.2434,0.2351,0.2654
2,S,6000,KWH5000-14999,KWH,I_TAX,EUR,SE,0.1389,0.147,0.1485,...,0.1655,0.1695,0.2106,0.177,0.228,0.2166,0.1883,0.2034,0.1935,0.209
3,S,6000,KWH_GE15000,KWH,I_TAX,EUR,SE,0.1241,0.1311,0.1349,...,0.146,0.1506,0.1732,0.1524,0.1915,0.1924,0.1657,0.1808,0.1666,0.1834
4,S,6000,KWH_LT1000,KWH,I_TAX,EUR,SE,0.2715,0.2906,0.287,...,0.3564,0.4051,0.4701,0.4513,0.4747,0.4546,0.381,0.4471,0.4484,0.5118
5,S,6000,TOT_KWH,KWH,I_TAX,EUR,SE,0.135145,0.142933,0.146977,...,0.163785,0.170009,0.202345,0.17655,0.219686,0.215563,0.184139,0.201573,0.189821,0.209401


In [383]:
df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.head()

Unnamed: 0,TIME_PERIOD,energy_prices
0,2007-01-01,0.135145
1,2007-07-01,0.142933
2,2008-01-01,0.146977
3,2008-07-01,0.150458
4,2009-01-01,0.138676


### Inflation Rate Monthly (Monthly Rate of Change)

In [415]:
indicator_name = "prc_hicp_mmor"
ds_name = "inflation_rate_monthly_m"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)
# prc_hicp_mmor
# prc_hicp_manr

In [416]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY,
    coicop='AP'
)

df.head()

Found locally: data_eurostat/inflation_rate_monthly_m.csv


Unnamed: 0,freq,unit,coicop,geo,1996-02,1996-03,1996-04,1996-05,1996-06,1996-07,...,2024-12,2025-01,2025-02,2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09
0,M,RCH_M,AP,SE,,,,,,,...,0.4,2.8,0.5,0.2,0.4,0.0,-0.1,-0.8,0.2,0.4


In [417]:
filters = {}
id_vars = ['freq', 'geo', 'coicop', 'unit']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.tail(50)

Unnamed: 0,TIME_PERIOD,inflation_rate_monthly_m
247,2021-08-01,0.1
248,2021-09-01,0.5
249,2021-10-01,0.2
250,2021-11-01,0.0
251,2021-12-01,0.0
252,2022-01-01,1.4
253,2022-02-01,0.5
254,2022-03-01,0.2
255,2022-04-01,0.3
256,2022-05-01,0.3


### Inflation Rate Monthly (Annual Rate of Change)

In [408]:
indicator_name = "prc_hicp_manr"
ds_name = "inflation_rate_monthly_y"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [409]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY,
    coicop='AP'
)

df.head()

Found locally: data_eurostat/inflation_rate_monthly_y.csv


Unnamed: 0,freq,unit,coicop,geo,1997-01,1997-02,1997-03,1997-04,1997-05,1997-06,...,2024-12,2025-01,2025-02,2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09
0,M,RCH_A,AP,SE,,,,,,,...,5.9,5.7,5.3,4.9,4.8,4.7,4.6,4.8,4.7,4.7


In [412]:
filters = {}
id_vars = ['freq', 'geo', 'coicop', 'unit']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.tail(35)

Unnamed: 0,TIME_PERIOD,inflation_rate_monthly_y
240,2022-11-01,2.7
241,2022-12-01,2.9
242,2023-01-01,3.6
243,2023-02-01,3.8
244,2023-03-01,4.1
245,2023-04-01,4.5
246,2023-05-01,4.4
247,2023-06-01,4.8
248,2023-07-01,4.9
249,2023-08-01,5.0


### Gas Emissions

In [390]:
indicator_name = "env_air_gge"
ds_name = "gas_emissions"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [391]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY,
)

df.head()

Found locally: data_eurostat/gas_emissions.csv


Unnamed: 0,freq,unit,airpol,src_crf,geo,1990,1991,1992,1993,1994,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,A,MIO_T,CH4,CRF1,SE,0.01694,0.01792,0.01764,0.01705,0.01741,...,0.00988,0.00955,0.00994,0.0096,0.00968,0.00937,0.00943,0.0103,0.0092,0.00859
1,A,MIO_T,CH4,CRF1A,SE,0.0126,0.01309,0.01276,0.01212,0.01242,...,0.0067,0.00659,0.00682,0.00672,0.00635,0.00675,0.00671,0.00773,0.00703,0.00648
2,A,MIO_T,CH4,CRF1A1,SE,0.00045,0.00053,0.00054,0.00065,0.00075,...,0.00166,0.00169,0.00182,0.00183,0.00183,0.00181,0.00162,0.00233,0.00233,0.00221
3,A,MIO_T,CH4,CRF1A1A,SE,0.00041,0.0005,0.0005,0.00062,0.00071,...,0.00162,0.00165,0.00178,0.00179,0.00179,0.00178,0.00158,0.00193,0.00185,0.00181
4,A,MIO_T,CH4,CRF1A1B,SE,3e-05,3e-05,3e-05,3e-05,3e-05,...,3e-05,4e-05,3e-05,3e-05,3e-05,3e-05,0.0,0.00039,0.0,0.0


In [392]:
df['airpol'].unique()

array(['CH4', 'CH4_CO2E', 'CO2', 'GHG', 'HFC_CO2E', 'HFC_PFC_NSP_CO2E',
       'N2O', 'N2O_CO2E', 'NF3_CO2E', 'PFC_CO2E', 'SF6_CO2E'],
      dtype=object)

In [393]:
filters = {'src_crf': 'CRF1', 'unit': 'MIO_T', 'airpol': 'GHG'}
id_vars = ['freq', 'geo', 'src_crf', 'unit', 'airpol']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars)

df_clean.tail()

Unnamed: 0,TIME_PERIOD,gas_emissions
29,2019-01-01,34.81046
30,2020-01-01,32.02049
31,2021-01-01,33.2899
32,2022-01-01,30.84396
33,2023-01-01,30.43507


### Household Income

In [394]:
indicator_name = "ilc_di01"
ds_name = "household_income"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [395]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY,
)

df.head()

Found locally: data_eurostat/household_income.csv


Unnamed: 0,freq,quantile,indic_il,currency,geo,1995,1996,1997,1998,1999,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,A,D1,SHARE,EUR,SE,,,,,,...,3.2,3.1,3.2,3.3,2.9,3.2,3.3,2.8,2.6,2.7
1,A,D1,SHARE,NAC,SE,,,,,,...,3.2,3.1,3.2,3.3,2.9,3.2,3.3,2.8,2.6,2.7
2,A,D1,SHARE,PPS,SE,,,,,,...,3.2,3.1,3.2,3.3,2.9,3.2,3.3,2.8,2.6,2.7
3,A,D1,TC,EUR,SE,,,,,,...,12885.0,12765.0,13056.0,12915.0,12146.0,12706.0,13168.0,13621.0,13630.0,13741.0
4,A,D1,TC,NAC,SE,,,,,,...,117235.0,119401.0,123628.0,124442.0,124594.0,134546.0,138065.0,138204.0,144879.0,157726.0


In [396]:
df['quantile'].unique()

array(['D1', 'D10', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'P1',
       'P100', 'P2', 'P3', 'P4', 'P5', 'P95', 'P96', 'P97', 'P98', 'P99',
       'Q1', 'Q2', 'Q3', 'Q4', 'QU1', 'QU2', 'QU3', 'QU4', 'QU5'],
      dtype=object)

In [397]:
filters = {'currency': 'EUR', 'indic_il': 'TC'}
id_vars = ['freq', 'quantile', 'indic_il', 'currency', 'geo']
indices = ['Q1', 'Q2', 'Q3']
index_name = 'quantile'

df_clean = preprocess_dataset(df, ds_name, filters, id_vars, index_name, indices)

df_clean.head()

Unnamed: 0,TIME_PERIOD,q1,q2,q3
0,2004-01-01,13211.0,17329.0,22286.0
1,2005-01-01,13677.0,17498.0,22678.0
2,2006-01-01,13513.0,17991.0,23020.0
3,2007-01-01,14378.0,18845.0,24210.0
4,2008-01-01,14754.0,20217.0,26045.0


### Population

In [398]:
indicator_name = "demo_gind"
ds_name = "demographics"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [399]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY,
)

df.head()

Found locally: data_eurostat/demographics.csv


Unnamed: 0,freq,indic_de,geo,1960,1961,1962,1963,1964,1965,1966,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,A,AVG,SE,7484656.0,7519998.0,7561588.0,7604328.0,7661354.0,7733853.0,7807797.0,...,9923085.0,10057698.0,10175214.0,10278887.0,10353442.0,10415811.0,10486941.0,10536632.0,10569709.0,
1,A,CNMIGRAT,SE,-504.0,13115.0,8627.0,9916.0,21690.0,32694.0,25668.0,...,117693.0,101645.0,86296.0,71647.0,36753.0,50726.0,59233.0,24485.0,28820.0,
2,A,CNMIGRATRT,SE,-0.1,1.7,1.1,1.3,2.8,4.2,3.3,...,11.9,10.1,8.5,7.0,3.5,4.9,5.6,2.3,2.7,
3,A,DEATH,SE,75093.0,73555.0,76791.0,76460.0,76661.0,78194.0,78440.0,...,90982.0,91972.0,92185.0,88766.0,98124.0,91958.0,94737.0,94385.0,91268.0,
4,A,FAVG,SE,3750712.0,3768418.0,3788942.0,3810352.0,3838056.0,3872168.0,3906976.0,...,4950929.0,5009693.0,5062664.0,5109761.0,5144112.0,5174034.0,5207426.0,5231210.0,,


In [400]:
filters = {}
id_vars = ['freq', 'indic_de', 'geo']
indices = ['CNMIGRAT', 'AVG', 'GROW', 'GROWRT', 'LBIRTH', 'DEATH']
index_name = 'indic_de'

df_clean = preprocess_dataset(df, ds_name, filters, id_vars, index_name, indices)

df_clean.head()

Unnamed: 0,TIME_PERIOD,cnmigrat,avg,grow,growrt,lbirth,death
0,1960-01-01,-504.0,7484656.0,26622.0,3.6,102219.0,75093.0
1,1961-01-01,13115.0,7519998.0,44061.0,5.9,104501.0,73555.0
2,1962-01-01,8627.0,7561588.0,39120.0,5.2,107284.0,76791.0
3,1963-01-01,9916.0,7604328.0,46359.0,6.1,112903.0,76460.0
4,1964-01-01,21690.0,7661354.0,67693.0,8.8,122664.0,76661.0


### Unemployment Rate Monthly

In [401]:
indicator_name = "une_rt_m"
ds_name = "unemployment_rate_monthly"
filename = f"{ds_name}.csv"
local_path = os.path.join(OUTPUT_DIR, filename)

INDICATORS.append(indicator_name)

In [402]:
df = get_dataset(
    indicator_name=indicator_name, 
    local_path=local_path,
    country=COUNTRY,
)

df.head()

Found locally: data_eurostat/unemployment_rate_monthly.csv


Unnamed: 0,freq,s_adj,age,unit,sex,geo,1983-01,1983-02,1983-03,1983-04,...,2024-12,2025-01,2025-02,2025-03,2025-04,2025-05,2025-06,2025-07,2025-08,2025-09
0,M,NSA,TOTAL,PC_ACT,F,SE,3.9,3.7,3.6,3.1,...,8.6,10.6,9.0,8.8,8.8,10.3,8.5,7.8,9.0,8.2
1,M,NSA,TOTAL,PC_ACT,M,SE,3.8,4.0,4.0,3.1,...,7.5,10.2,9.7,8.2,9.0,9.1,10.1,8.2,7.8,8.4
2,M,NSA,TOTAL,PC_ACT,T,SE,3.8,3.8,3.8,3.1,...,8.0,10.4,9.4,8.5,8.9,9.7,9.4,8.0,8.4,8.3
3,M,NSA,TOTAL,THS_PER,F,SE,77.0,74.0,73.0,62.0,...,230.0,292.0,242.0,240.0,243.0,286.0,240.0,221.0,247.0,223.0
4,M,NSA,TOTAL,THS_PER,M,SE,88.0,93.0,93.0,72.0,...,223.0,300.0,290.0,245.0,270.0,276.0,319.0,253.0,239.0,252.0


In [403]:
filters = {'s_adj': 'SA', 'sex': 'T', 'unit': 'PC_ACT'}
id_vars = ['freq', 's_adj', 'geo', 'sex', 'unit', 'age']
index_name = 'age'
indices = ['TOTAL', 'Y25-74', 'Y_LT25']

df_clean = preprocess_dataset(df, ds_name, filters, id_vars, index_name, indices)

df_clean.head()

Unnamed: 0,TIME_PERIOD,total,y25-74,y_lt25
0,1983-01-01,3.4,2.5,8.1
1,1983-02-01,3.7,2.5,9.7
2,1983-03-01,3.7,2.5,10.0
3,1983-04-01,3.4,2.4,8.7
4,1983-05-01,3.7,2.6,9.6


## Merge Datasets

In [404]:
rename_columns_map = {
    'q1': 'household_income_q1', 
    'q2': 'household_income_q2',
    'q3': 'household_income_q3',
    'cnmigrat': 'net_migration',
    'avg': 'total_population',
    'grow': 'net_population_growth',
    'growrt': 'growth_rate',
    'lbirth': 'births',
    'death': 'deaths',
    'total': 'total_unemployment_rate',
    'y25-74': 'adult_unemployment_rate',
    'y_lt25': 'youth_unemployment_rate'
}

In [405]:
merged_df = pd.DataFrame()

for name, df_clean in DFS.items():
    if merged_df.empty:
        merged_df = df_clean
        continue

    merged_df = pd.merge(merged_df, df_clean, on="TIME_PERIOD", how="outer")

merged_df = merged_df[merged_df['TIME_PERIOD'] >= '2000']
merged_df = merged_df.sort_values("TIME_PERIOD").reset_index(drop=True)

# Rename unclear columns
merged_df = merged_df.rename(
    columns={old: new for old, new in rename_columns_map.items() if old in merged_df.columns}
)

merged_path = os.path.join(OUTPUT_DIR, "sweden_eurostat_data.csv")
merged_df.to_csv(merged_path, index=False)

merged_df.head()

Unnamed: 0,TIME_PERIOD,gdp_quarterly,gdp,inflation_rate_anually,education_attainment,gdp_per_capita,gdp_growth_rate,price_index,energy_prices,inflation_rate_monthly_m,...,household_income_q3,net_migration,total_population,net_population_growth,growth_rate,births,deaths,total_unemployment_rate,adult_unemployment_rate,youth_unemployment_rate
0,2000-01-01,73892.1,303735.8,,,34240.0,,78.83,,,...,,24386.0,8872109.0,21366.0,2.4,90441.0,93461.0,6.2,5.6,10.4
1,2000-02-01,,,,,,,79.2,,,...,,,,,,,,6.1,5.5,10.7
2,2000-03-01,,,,,,,79.59,,,...,,,,,,,,5.9,5.3,10.5
3,2000-04-01,76068.8,,,,,,79.49,,,...,,,,,,,,5.9,5.1,11.6
4,2000-05-01,,,,,,,79.96,,,...,,,,,,,,5.6,5.1,9.6
