## integrate R&D and contribution revenue into a broader index

### 📝 Note: Health Expenditure Performance Index (HEP)

**HEE = Health Expenditure Efficiency Index (HEE)

Clean the following data (for missing data)
Life expectancy, Infant Mortality, Average Schooling (years), Learning Outcome, Health Expenditure,Education Expenditure

Replicate PSP model by using the following data to generate HEP(Health Expenditure Performance) 
Independent variables = Life expectancy, Infant Mortality, Average Schooling (years), Learning Outcome

After that we will generate HEE (Health Expenditure Efficiency) by divided the HEP with Expenditure
HEE = HEP/(Health Expenditure + Education Expenditure)



In [1]:
import pandas as pd
import os

# === Step 1: Load and clean merged data ===

# Adjusted path: go up one level from notebooks/ to access data/interim
df = pd.read_csv("../data/interim/merged_data.csv")

# Filter to only include years up to 2022
df = df[df["Year"] <= 2022]

# Sort by Country and Year (most recent first) for proper fill
df = df.sort_values(by=["Country", "Year"], ascending=[True, False])

# Fill missing values using most recent data available (backward then forward fill)
df_cleaned = (
    df.groupby("Country", group_keys=False)
      .apply(lambda g: g.bfill().ffill(), include_groups=False)
      .reset_index(drop=True)
)

# Save cleaned data
os.makedirs("../data/processed", exist_ok=True)
df_cleaned.to_csv("../data/processed/merged_data_clean.csv", index=False)

# === Step 2: Normalize performance indicators ===

def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

df_norm = df_cleaned.copy()

# Apply normalization (adjust columns to exact spelling)
df_norm["life_expectancy_norm"] = normalize(df_norm["Life_Expectancy"])
df_norm["infant_mortality_norm"] = 1 - normalize(df_norm["Mortality_Rate"])
df_norm["average_schooling_norm"] = normalize(df_norm["average_schooling"])
df_norm["learning_outcome_norm"] = normalize(df_norm["learning_scores"])

# === Step 3: Calculate HEP and HEE ===

df_norm["HEP"] = df_norm[[
    "life_expectancy_norm",
    "infant_mortality_norm",
    "average_schooling_norm",
    "learning_outcome_norm"
]].mean(axis=1)

df_norm["total_expenditure"] = (
    df_norm["Health_Expenditure"] + df_norm["Education_Expenditure"]
)

df_norm["HEE"] = df_norm["HEP"] / df_norm["total_expenditure"]

# === Step 4: Save result ===
result.to_csv("../data/processed/hep_hee_results.csv", index=False)

print("✅ HEP & HEE calculated and saved successfully.")
print(result.tail())


NameError: name 'result' is not defined

Add region


In [None]:
import pandas as pd
import os

# === Step 1: Load the data ===
data_path = "../data/processed/hep_hee_results.csv"
df = pd.read_csv(data_path)

# === Step 2: Define compact country-to-region mapping ===
emea = [
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Armenia", "Austria", "Azerbaijan", "Bahrain", 
    "Belarus", "Belgium", "Benin", "Bosnia and Herzegovina", "Botswana", "Bulgaria", "Burkina Faso", "Burundi",
    "Cameroon", "Cape Verde", "Central African Republic", "Chad", "Comoros", "Congo", "Croatia", "Cyprus",
    "Czechia", "Democratic Republic of the Congo", "Denmark", "Djibouti", "Egypt", "Equatorial Guinea", "Eritrea",
    "Estonia", "Eswatini", "Ethiopia", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana",
    "Greece", "Guinea", "Guinea-Bissau", "Hungary", "Iceland", "Iran", "Iraq", "Ireland", "Israel", "Italy",
    "Jordan", "Kazakhstan", "Kenya", "Kuwait", "Kyrgyzstan", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya",
    "Lithuania", "Luxembourg", "Madagascar", "Malawi", "Mali", "Malta", "Mauritania", "Mauritius", "Moldova",
    "Monaco", "Montenegro", "Morocco", "Mozambique", "Namibia", "Netherlands", "Niger", "Nigeria",
    "North Macedonia", "Norway", "Oman", "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda",
    "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Slovakia", "Slovenia", "Somalia",
    "South Africa", "South Sudan", "Spain", "Sudan", "Syrian Arab Republic", "Tajikistan", "Togo", "Tunisia",
    "Turkey", "Turkmenistan", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom",
    "United Republic of Tanzania", "Uzbekistan", "Western Sahara", "Yemen", "Zambia", "Zimbabwe"
]
apac = [
    "Australia", "Bangladesh", "Bhutan", "Brunei Darussalam", "Cambodia", "China", "Fiji", "India", "Indonesia",
    "Japan", "Lao People's Democratic Republic", "Malaysia", "Maldives", "Mongolia", "Myanmar", "Nepal",
    "New Zealand", "Pakistan", "Papua New Guinea", "Philippines", "Republic of Korea", "Singapore", "Sri Lanka",
    "Thailand", "Timor-Leste", "Viet Nam", "Vietnam", "South Korea"
]
latam = [
    "Argentina", "Bahamas", "Barbados", "Belize", "Bolivia", "Brazil", "Chile", "Colombia", "Costa Rica", "Cuba",
    "Dominican Republic", "Ecuador", "El Salvador", "Guatemala", "Guyana", "Haiti", "Honduras", "Jamaica",
    "Mexico", "Nicaragua", "Panama", "Paraguay", "Peru", "Suriname", "Trinidad and Tobago", "Uruguay",
    "Venezuela"
]
na = ["Canada", "United States"]

# Combine into a single dictionary
country_region_map = {
    **{country: "EMEA" for country in emea},
    **{country: "APAC" for country in apac},
    **{country: "LATAM" for country in latam},
    **{country: "NA" for country in na},
}

# === Step 3: Map region and validate ===
df["Region"] = df["Country"].map(country_region_map)
unmapped = df[df["Region"].isnull()]["Country"].unique()

if len(unmapped) > 0:
    print("⚠️ Warning: The following countries couldn't be mapped to a region:")
    for c in unmapped:
        print(f"- {c}")
    print("👉 Please update 'country_region_map' with these countries.")

# === Step 4: Save updated file ===
output_path = "../data/processed/hep_hee_results_with_region.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"✅ 'Region' column added and data saved to: {output_path}")


✅ 'Region' column added and data saved to: ../data/processed/hep_hee_results_with_region.csv


Visualize HEE and HEP

In [None]:
# !pip install plotly

import pandas as pd
import plotly.express as px
import os

# === Step 1: Load data ===
df = pd.read_csv("../data/processed/hep_hee_results_with_region.csv")

# === Step 2: Filter for 2020 and 2022 ===
df_filtered = df[df["Year"].isin([2020, 2022])]
df_wide = df_filtered.pivot(index="Country", columns="Year", values=["HEP", "HEE"]).dropna()
df_wide.columns = [f"{metric}_{year}" for metric, year in df_wide.columns]
df_wide = df_wide.reset_index()

# Merge back region info
regions = df[["Country", "Region"]].drop_duplicates()
df_wide = df_wide.merge(regions, on="Country", how="left")

# === Step 3: Plot HEP ===
fig_hep = px.scatter(
    df_wide,
    x="HEP_2020",
    y="HEP_2022",
    text="Country",
    color="Country",  # Unique color per country
    hover_name="Country",
    title="HEP Comparison: 2020 vs 2022",
    labels={"HEP_2020": "HEP in 2020", "HEP_2022": "HEP in 2022"},
    width=800,
    height=600
)
fig_hep.add_shape(type='line', x0=0, y0=0, x1=1, y1=1, line=dict(dash="dash", color="gray"))
fig_hep.update_traces(marker=dict(size=10), textposition='top center')
fig_hep.show()

# === Step 4: Plot HEE (with dynamic axis scaling) ===
hee_min = min(df_wide["HEE_2020"].min(), df_wide["HEE_2022"].min())
hee_max = max(df_wide["HEE_2020"].max(), df_wide["HEE_2022"].max())
hee_min_padded = max(0, hee_min - 0.1)
hee_max_padded = hee_max + 0.1

fig_hee = px.scatter(
    df_wide,
    x="HEE_2020",
    y="HEE_2022",
    text="Country",
    color="Country",
    hover_name="Country",
    title="HEE Comparison: 2020 vs 2022",
    labels={"HEE_2020": "HEE in 2020", "HEE_2022": "HEE in 2022"},
    width=800,
    height=600
)
fig_hee.add_shape(
    type='line',
    x0=hee_min_padded,
    y0=hee_min_padded,
    x1=hee_max_padded,
    y1=hee_max_padded,
    line=dict(dash="dash", color="gray")
)
fig_hee.update_traces(marker=dict(size=10), textposition='top center')
fig_hee.update_layout(
    xaxis=dict(range=[hee_min_padded, hee_max_padded]),
    yaxis=dict(range=[hee_min_padded, hee_max_padded])
)
fig_hee.show()
