In [37]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import imageio
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.stattools import adfuller

In [38]:
#Import function for cleaning, merged, and import data from cleaning_data.py and Import_and_Cleaning.py
sys.path.append('../src/Import_and_Cleaning')
from import_and_merge_data import load_and_merge_all_data

df_main = load_and_merge_all_data()

In [39]:
# --- Data Transformation and Preparation ---
df_analysis = df_main.copy()
# ---  Log‐transform GDP per capita ---
df_analysis['log_GDP_percapita'] = np.log(df_analysis['GDP_percapita'])

In [40]:
# Prepare the design‐matrix for VIF
features = [
    'Dependency_Ratio_Old',
    'Dependency_Ratio_Young',
    'log_GDP_percapita',
    'trade_gdp'
]

# Subset, drop any NaN/∞, force float
X = (
    df_analysis[features]
      .replace([np.inf, -np.inf], np.nan)
      .dropna()
      .astype(float)
)

In [41]:
# Build VIF table
vif = pd.DataFrame({
    'variable': features,
    'VIF': [variance_inflation_factor(X.values, i)
            for i in range(X.shape[1])]
})
print(vif)

                 variable        VIF
0    Dependency_Ratio_Old  10.926990
1  Dependency_Ratio_Young   7.580817
2       log_GDP_percapita  32.781942
3               trade_gdp   3.559165


**Multicollinearity diagnostics**  
- Variance inflation factors (VIF):  
  - Old-age dependency ratio: 10.93  
  - Youth dependency ratio: 7.58  
  - log (GDP per capita): 32.78  
  - Trade /GDP: 3.56  
- The elevated VIFs for the two dependency ratios (and especially for log GDPpc) reflect structural, demographic correlations rather than a sampling anomaly.  
- **Acknowledge & interpret with caution**: we retain both dependency ratios in the main TWFE model and discuss their coefficients as reflecting shifts in overall population structure, not “pure” ceteris-paribus effects.  
- As robustness checks, we will also estimate two simpler specifications—one with only the old-age ratio and one with only the youth ratio—to show how the coefficients move when the other ratio is omitted.  


In [42]:
# --- Panel Stationarity Check: ADF in Levels vs. First Differences ---
from statsmodels.tsa.stattools import adfuller
import numpy as np
import pandas as pd

# 1) Variables to test
test_vars = [
    'Health_Expenditure',
    'Dependency_Ratio_Old',
    'Dependency_Ratio_Young',
    'log_GDP_percapita',
    'trade_gdp'
]

# 2) Strip non‐numeric characters & coerce to float
for var in test_vars:
    # remove %, commas, any other non‐digit/dot/minus
    df_analysis[var] = (
        df_analysis[var]
          .astype(str)
          .str.replace('%',    '', regex=False)
          .str.replace(',',    '', regex=False)
          .str.replace(r'[^0-9\.\-]', '', regex=True)
    )
    df_analysis[var] = pd.to_numeric(df_analysis[var], errors='coerce')

# 4) Build a panel DataFrame
df_panel = df_analysis.set_index(['ISO3', 'Year']).sort_index()

# 5) Run per‐country ADF in levels & first differences
results = []
for var in test_vars:
    p_lv = []
    p_df = []
    for iso, grp in df_panel.groupby(level=0):
        series = grp[var].dropna()
        if len(series) < 10:
            continue
        # ADF on levels
        stat, pval, *_ = adfuller(series, autolag='AIC')
        p_lv.append(pval)
        # ADF on first differences
        stat, pval, *_ = adfuller(series.diff().dropna(), autolag='AIC')
        p_df.append(pval)

    results.append({
        'Variable':         var,
        'Avg. p-level':     np.mean(p_lv),
        'Avg. p-Δ':         np.mean(p_df),
        'Frac stationary Δ': np.mean([p < 0.05 for p in p_df])
    })

# 6) Display results
pd.DataFrame(results).round(3)


Unnamed: 0,Variable,Avg. p-level,Avg. p-Δ,Frac stationary Δ
0,Health_Expenditure,0.519,0.235,0.593
1,Dependency_Ratio_Old,0.873,0.565,0.093
2,Dependency_Ratio_Young,0.296,0.32,0.389
3,log_GDP_percapita,0.358,0.246,0.481
4,trade_gdp,0.457,0.372,0.442


**Stationarity diagnostics**  
Our per‐country ADF tests show that all five series—health expenditure, the old‐age and youth dependency ratios, log GDP per capita, and trade/GDP—fail to reject the null of a unit root in levels (average p-values > 0.05), confirming their non-stationary behavior. First differencing lowers the p-values. While not every series becomes strictly stationary after differencing, the Δ transformation substantially reduces persistence and mitigates the risk of spurious regression.

Accordingly, we proceed with a first‐difference specification in our TWFE models—recognizing in our write-up that some variables, particularly the old‐age dependency ratio, remain highly persistent even in Δ form.  


In [43]:
# --- Step 3: First‐difference for stationarity ---
# 3a) sort by country and year, work on a copy
df_diff = df_analysis.sort_values(['ISO3', 'Year']).copy()

# 3b) variables to difference (including the dependent)
to_diff = [
    'Dependency_Ratio_Old',
    'Dependency_Ratio_Young',
    'log_GDP_percapita',
    'trade_gdp',
    'Health_Expenditure'
]

# 3c) compute year‐on‐year change for each series
for v in to_diff:
    df_diff[f'd_{v}'] = df_diff.groupby('ISO3')[v].diff()

# 3d) drop the first‐difference NaNs
df_diff = df_diff.dropna(subset=[f'd_{v}' for v in to_diff])

# 3e) inspect the new Δ-variables
df_diff[['ISO3','Year'] + [f'd_{v}' for v in to_diff]].head()


Unnamed: 0,ISO3,Year,d_Dependency_Ratio_Old,d_Dependency_Ratio_Young,d_log_GDP_percapita,d_trade_gdp,d_Health_Expenditure
1,ARG,2001,0.066719,-0.464572,-0.067087,-0.770189,0.151788
2,ARG,2002,0.062658,-0.494729,-1.02213,19.900469,-0.978487
3,ARG,2003,0.037586,-0.531833,0.25632,-1.107976,-0.498795
4,ARG,2004,0.031726,-0.556617,0.244931,0.047898,0.334166
5,ARG,2005,0.058268,-0.560617,0.177838,-0.141375,0.382106


## OLS

In [44]:
import statsmodels.formula.api as smf

# 0) Subset your sample to Year ≤ 2022
df_model = df_analysis[df_analysis['Year'] <= 2022].copy()

# 1) Pooled OLS
formula_ols = (
    'Health_Expenditure ~ '
    'Dependency_Ratio_Old + Dependency_Ratio_Young + '
    'log_GDP_percapita + trade_gdp'
)

pooled_ols = smf.ols(formula_ols, data=df_model).fit()
print("=== Model 1: Pooled OLS ===")
print(pooled_ols.summary())


=== Model 1: Pooled OLS ===
                            OLS Regression Results                            
Dep. Variable:     Health_Expenditure   R-squared:                       0.501
Model:                            OLS   Adj. R-squared:                  0.499
Method:                 Least Squares   F-statistic:                     295.5
Date:                Wed, 18 Jun 2025   Prob (F-statistic):          5.02e-176
Time:                        15:18:32   Log-Likelihood:                -2448.6
No. Observations:                1184   AIC:                             4907.
Df Residuals:                    1179   BIC:                             4933.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------


## Country FE

In [45]:
# 2) Country FE
formula_fe_country = (
    'Health_Expenditure ~ '
    'Dependency_Ratio_Old + Dependency_Ratio_Young + '
    'log_GDP_percapita + trade_gdp + '
    'C(ISO3)'
)

fe_country = smf.ols(formula_fe_country, data=df_model).fit()
print("\n=== Model 2: Country FE ===")
print(fe_country.summary())



=== Model 2: Country FE ===
                            OLS Regression Results                            
Dep. Variable:     Health_Expenditure   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     221.3
Date:                Wed, 18 Jun 2025   Prob (F-statistic):               0.00
Time:                        15:18:32   Log-Likelihood:                -1399.0
No. Observations:                1184   AIC:                             2910.
Df Residuals:                    1128   BIC:                             3194.
Df Model:                          55                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------

## Two‐Way Fixed‐Effects (TWFE)

In [46]:
formula_twfe = (
    'Health_Expenditure ~ '
    'Dependency_Ratio_Old + Dependency_Ratio_Young + '
    'log_GDP_percapita + trade_gdp + '
    'C(ISO3) + C(Year)'
)

twfe = smf.ols(formula_twfe, data=df_model).fit()
print("\n=== Model 3: TWFE ===")
print(twfe.summary())



=== Model 3: TWFE ===
                            OLS Regression Results                            
Dep. Variable:     Health_Expenditure   R-squared:                       0.922
Model:                            OLS   Adj. R-squared:                  0.917
Method:                 Least Squares   F-statistic:                     170.1
Date:                Wed, 18 Jun 2025   Prob (F-statistic):               0.00
Time:                        15:18:32   Log-Likelihood:                -1348.5
No. Observations:                1184   AIC:                             2853.
Df Residuals:                    1106   BIC:                             3249.
Df Model:                          77                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Inter

## TWFE (Clustered)

In [47]:
# --- Model 4: TWFE with country-clustered standard errors ---
# 1) Define your formula and your full df_model (Year ≤ 2022) as before:

formula_twfe = (
    'Health_Expenditure ~ '
    'Dependency_Ratio_Old + Dependency_Ratio_Young + '
    'log_GDP_percapita + trade_gdp + '
    'C(ISO3) + C(Year)'
)

df_model = df_analysis[df_analysis['Year'] <= 2022].copy()

# 2) Drop any rows with missing values in *all* the variables used in the formula:
needed = [
    'Health_Expenditure',
    'Dependency_Ratio_Old',
    'Dependency_Ratio_Young',
    'log_GDP_percapita',
    'trade_gdp',
    'ISO3',
    'Year'
]
df_model2 = df_model.dropna(subset=needed)

# 3) Fit TWFE with country‐clustered SE, passing exactly df_model2['ISO3']:
twfe_clust = smf.ols(formula_twfe, data=df_model2).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_model2['ISO3'].values}
)

print(twfe_clust.summary())


                            OLS Regression Results                            
Dep. Variable:     Health_Expenditure   R-squared:                       0.922
Model:                            OLS   Adj. R-squared:                  0.917
Method:                 Least Squares   F-statistic:                     12.50
Date:                Wed, 18 Jun 2025   Prob (F-statistic):           3.15e-14
Time:                        15:18:32   Log-Likelihood:                -1348.5
No. Observations:                1184   AIC:                             2853.
Df Residuals:                    1106   BIC:                             3249.
Df Model:                          77                                         
Covariance Type:              cluster                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                  7



## First Difference

In [48]:
# --- Model 5: First‐Difference regression ---
# 5a) Subset the differenced data to Year ≤ 2022
df_diff_model = df_diff[df_diff['Year'] <= 2022].copy()

# 5b) Specify the formula on your Δ-variables
formula_diff = (
    'd_Health_Expenditure ~ '
    'd_Dependency_Ratio_Old + d_Dependency_Ratio_Young + '
    'd_log_GDP_percapita + d_trade_gdp'
)

# 5c) Fit the OLS on first-differences
fd_clust = smf.ols(formula_diff, data=df_diff_model).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_diff_model['ISO3'].values}
)

print("\n=== Model 5: Δ-model with country‐clustered SE ===")
print(fd_clust.summary())


=== Model 5: Δ-model with country‐clustered SE ===
                             OLS Regression Results                             
Dep. Variable:     d_Health_Expenditure   R-squared:                       0.041
Model:                              OLS   Adj. R-squared:                  0.037
Method:                   Least Squares   F-statistic:                     6.310
Date:                  Wed, 18 Jun 2025   Prob (F-statistic):           0.000333
Time:                          15:18:32   Log-Likelihood:                -679.38
No. Observations:                  1132   AIC:                             1369.
Df Residuals:                      1127   BIC:                             1394.
Df Model:                             4                                         
Covariance Type:                cluster                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------

In [49]:
# --- 1) Pick your differenced model DataFrame (Year ≤ 2022) ---
df_diff_model = df_diff[df_diff['Year'] <= 2022].copy()

# --- 2) Create a 1-year lag of the Δ old‐age ratio ---
df_diff_model['d_DepOld_lag1'] = (
    df_diff_model
      .groupby('ISO3')['d_Dependency_Ratio_Old']
      .shift(1)
)

# --- 3) Drop any rows with missing lag (or any other NaN) ---
needed = [
    'd_Health_Expenditure',
    'd_Dependency_Ratio_Old',
    'd_DepOld_lag1',
    'd_Dependency_Ratio_Young',
    'd_log_GDP_percapita',
    'd_trade_gdp'
]
df_ld = df_diff_model.dropna(subset=needed)

# --- 4) Specify & run the “lagged Δ‐model” ---
import statsmodels.formula.api as smf

formula_lag = (
    'd_Health_Expenditure ~ '
    'd_Dependency_Ratio_Old + d_DepOld_lag1 + '
    'd_Dependency_Ratio_Young + '
    'd_log_GDP_percapita + d_trade_gdp'
)

ld_ols = smf.ols(formula_lag, data=df_ld).fit(
    cov_type='cluster',
    cov_kwds={'groups': df_ld['ISO3'].values}
)

print(ld_ols.summary())


                             OLS Regression Results                             
Dep. Variable:     d_Health_Expenditure   R-squared:                       0.051
Model:                              OLS   Adj. R-squared:                  0.047
Method:                   Least Squares   F-statistic:                     7.978
Date:                  Wed, 18 Jun 2025   Prob (F-statistic):           1.30e-05
Time:                          15:22:42   Log-Likelihood:                -655.38
No. Observations:                  1080   AIC:                             1323.
Df Residuals:                      1074   BIC:                             1353.
Df Model:                             5                                         
Covariance Type:                cluster                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Inte