In [28]:
import pandas as pd
import numpy as np

## Load Data

In [29]:
# Load the pre-processed, pivoted indicator table (one row per country/year
# pair is assumed by the later pivot -- TODO confirm against the dataset).
df = pd.read_csv("../DataSets/pivoted_filled_preprocessed_dataset.csv")

# expected column names
COL_COUNTRY = "Reference area"
COL_YEAR = "TIME_PERIOD"

col_earnings = "Average annual gross earnings [ PPP converted]"
col_diff_ends = "Difficulty making ends meet [Percentage of population aged 16 years or over]"
col_employment = "Employment rate [Percentage of population aged 25-64 years]"
col_safe = "Feeling safe at night [Percentage of population aged 15 years or over]"
col_disposable = "Households and NPISHs net adjusted disposable income per capita [ PPP converted]"
col_housing_cost = "Housing affordability [Percentage of household gross adjusted disposable income]"
col_life_exp = "Life expectancy at birth [Years]"
col_life_sat = "Life satisfaction [0-10 scale]"
col_long_unemp = "Long-term unemployment rate [Percentage of labour force]"
col_social_support = "Social support [Percentage of population aged 15 years or over]"
col_reading = "Student reading skills [Points]"

# Fail fast with a readable message if the CSV schema drifted: every name
# above is indexed later in the notebook, so a missing one would otherwise
# surface as a bare KeyError after df has already been partly modified.
_required_columns = [
    COL_COUNTRY, COL_YEAR,
    col_earnings, col_diff_ends, col_employment, col_safe, col_disposable,
    col_housing_cost, col_life_exp, col_life_sat, col_long_unemp,
    col_social_support, col_reading,
]
_missing_columns = [name for name in _required_columns if name not in df.columns]
if _missing_columns:
    raise KeyError(
        "Input dataset is missing expected columns: {}".format(_missing_columns)
    )

## Helper functions

In [30]:
def capped_linear(x, lo, hi):
    """Linearly rescale ``x`` to the 0-1 range, clipping outside [lo, hi].

    Values at or below ``lo`` map to 0, values at or above ``hi`` map to 1,
    and values in between are interpolated linearly.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to rescale; anything ``np.clip`` accepts.
    lo, hi : numeric
        Lower and upper bound of the scale; ``hi`` must be greater than ``lo``.

    Returns
    -------
    The rescaled value(s), in whatever container ``np.clip`` returns for ``x``.

    Raises
    ------
    ValueError
        If ``hi <= lo``. Equal bounds would divide by zero (yielding NaN for
        every input) and reversed bounds make the clip meaningless.
    """
    # NaN bounds compare False here and so still propagate NaN, as before.
    if np.any(np.asarray(hi) <= np.asarray(lo)):
        raise ValueError(
            "capped_linear requires hi > lo, got lo={!r}, hi={!r}".format(lo, hi)
        )
    x_clip = np.clip(x, lo, hi)
    return (x_clip - lo) / (hi - lo)

def capped_linear_reverse(x, lo, hi):
    """Inverted capped rescale for indicators where a high raw value is bad.

    Returns 1 for values at or below ``lo`` (the best score), 0 for values
    at or above ``hi`` (the worst score), and interpolates linearly in
    between. Equivalent to ``1 - capped_linear(x, lo, hi)``.
    """
    return 1.0 - capped_linear(x, lo, hi)

def percent_0_100(x):
    """Convert a percentage (0 = worst, 100 = best) into a 0-1 fraction.

    No clipping is applied here; inputs outside 0-100 yield values
    outside 0-1.
    """
    fraction = x / 100.0
    return fraction

def percent_0_100_reverse(x):
    """Convert a 'bad' percentage into a 0-1 score where 0 % scores 1.

    The percentage is turned into a fraction and subtracted from one, so
    100 % scores 0. No clipping is applied here.
    """
    fraction = x / 100.0
    return 1.0 - fraction

## Normalization

In [31]:
# Housing cost and long-term unemployment have no fixed scale here, so their
# bounds are the empirical 5th and 95th percentiles of the loaded data;
# values beyond those bounds are clipped by the capped helpers.
h_lo = df[col_housing_cost].quantile(0.05)
h_hi = df[col_housing_cost].quantile(0.95)
u_lo = df[col_long_unemp].quantile(0.05)
u_hi = df[col_long_unemp].quantile(0.95)

# One entry per normalised indicator, listed in the order the columns are
# appended to df. Every value is computed from the raw columns only.
normalised = {
    # life expectancy: 60–85 years
    "life_exp_norm": capped_linear(df[col_life_exp], lo=60, hi=85),
    # life satisfaction: 0–10 scale
    "life_sat_norm": df[col_life_sat] / 10.0,
    # earnings: 20k–80k PPP
    "earnings_norm": capped_linear(df[col_earnings], lo=20000, hi=80000),
    # disposable income: 10k–50k PPP
    "disp_income_norm": capped_linear(df[col_disposable], lo=10000, hi=50000),
    # percentages that are good when high
    "employment_norm": percent_0_100(df[col_employment]),
    "safe_norm": percent_0_100(df[col_safe]),
    "support_norm": percent_0_100(df[col_social_support]),
    # reading skills: 350–600 PISA points
    "reading_norm": capped_linear(df[col_reading], lo=350, hi=600),
    # difficulty making ends meet (percent, bad when high)
    "diff_ends_norm": percent_0_100_reverse(df[col_diff_ends]),
    # share of income spent on housing (bad when high)
    "housing_cost_norm": capped_linear_reverse(df[col_housing_cost], lo=h_lo, hi=h_hi),
    # long-term unemployment (percent of labour force, bad when high)
    "long_unemp_norm": capped_linear_reverse(df[col_long_unemp], lo=u_lo, hi=u_hi),
}
for norm_name, norm_values in normalised.items():
    df[norm_name] = norm_values

# Final safety net: force every "*_norm" column present in df into [0, 1].
norm_cols = [name for name in df.columns if name.endswith("_norm")]
df[norm_cols] = df[norm_cols].clip(0.0, 1.0)

## Assigning indicators to Maslow pyramid levels

In [32]:
# Normalised indicators grouped by the pyramid level they feed into.
level1_cols = ["life_exp_norm", "diff_ends_norm"]
level2_cols = [
    "employment_norm",
    "long_unemp_norm",
    "safe_norm",
    "earnings_norm",
    "disp_income_norm",
    "housing_cost_norm",
]
level3_cols = ["support_norm"]
level4_cols = ["life_sat_norm"]
level5_cols = ["reading_norm"]

# Each level score is the row-wise mean of its indicators; missing
# indicators are skipped, so a level is NaN only if all of them are missing.
level_members = {
    "L1_phys": level1_cols,
    "L2_safety": level2_cols,
    "L3_belonging": level3_cols,
    "L4_esteem": level4_cols,
    "L5_self_act": level5_cols,
}
for level_name, member_cols in level_members.items():
    df[level_name] = df[member_cols].mean(axis=1, skipna=True)

## Weighting of every layer

In [33]:
# Relative weight of each pyramid level in the overall index.
w = dict(
    L1_phys=0.35,
    L2_safety=0.25,
    L3_belonging=0.20,
    L4_esteem=0.12,
    L5_self_act=0.08,
)

# Level columns in pyramid order (same order as the weight table).
level_cols = list(w)


def weighted_index(row):
    """Weighted average of the level scores available in ``row``.

    Levels whose score is NaN are left out and the remaining weights are
    rescaled so they sum to 1. Returns NaN when every level is missing.
    """
    present = {
        level: row[level] for level in level_cols if pd.notna(row[level])
    }
    if len(present) == 0:
        return np.nan
    weight_sum = sum(w[level] for level in present)
    return sum(
        (w[level] / weight_sum) * score for level, score in present.items()
    )

# weighted_index only reads the five level columns, so apply it to that
# slice instead of the whole frame: each row Series then holds just those
# values rather than every column in df.
df["wellbeing_index"] = df[level_cols].apply(weighted_index, axis=1)

## Saving Results

In [34]:
# Output layout: identifiers, normalised indicators, level scores, index.
id_cols = [COL_COUNTRY, COL_YEAR]
indicator_cols = [
    "life_exp_norm", "diff_ends_norm",
    "earnings_norm", "disp_income_norm",
    "employment_norm", "long_unemp_norm",
    "safe_norm", "housing_cost_norm",
    "life_sat_norm", "support_norm",
    "reading_norm",
]
layer_cols = ["L1_phys", "L2_safety", "L3_belonging", "L4_esteem", "L5_self_act"]
cols_out = [*id_cols, *indicator_cols, *layer_cols, "wellbeing_index"]

# Write the selected columns, without the row index, to CSV.
export_frame = df[cols_out]
export_frame.to_csv("../DataSets/wellbeing_index_by_country_year.csv", index=False)

## Pivoting results with years as columns

In [35]:
# Wide table: one row per country, one column per year, cells hold the
# wellbeing index (pivot_table's default aggregation applies if a
# country/year pair occurs more than once). Both axes are sorted by label.
pivot_index = (
    df.pivot_table(
        values="wellbeing_index",
        index=COL_COUNTRY,
        columns=COL_YEAR,
    )
    .sort_index(axis=0)
    .sort_index(axis=1)
)
pivot_index.to_csv("../DataSets/pivot_wellbeing_index.csv")