In [18]:
import pandas as pd  
import os


In [19]:
csv_path = os.path.join('..', 'Web Scrapping', 'raw_ghi_data.csv')

# Read the CSV file
ghi_df = pd.read_csv(csv_path)  # Using pandas

In [20]:
ghi_df.head(50)

Unnamed: 0,Year,Rank,Country,Health Index
0,2012,1,Japan,96.1
1,2012,2,Indonesia,94.1
2,2012,3,Sri Lanka,93.5
3,2012,4,Thailand,92.6
4,2012,5,Israel,91.7
5,2012,6,Denmark,89.8
6,2012,7,Colombia,88.9
7,2012,8,Malta,87.0
8,2012,9,New Zealand,86.8
9,2012,10,Sweden,86.8


In [21]:
ghi_df.drop('Rank',axis=1,inplace=True)

In [22]:
ghi_df.isna().sum() # check any null values

Year            0
Country         0
Health Index    0
dtype: int64

In [23]:
print(len(ghi_df['Year'].unique())) # total unique year value
print(ghi_df['Year'].max()) #max year
print(ghi_df['Year'].min()) #min year
print(ghi_df['Year'].max()-ghi_df['Year'].min()+1) #looks like all years from 1990 to 2021 exist

12
2023
2012
12


In [24]:
all_years = range(2013, 2025)  # 2021 inclusive

# Create a pivot table with countries as rows and years as columns
pivot_df = (
    ghi_df.pivot_table(
        index='Country',
        columns='Year',
        values='Health Index',
        aggfunc='first'  # Takes the first occurrence if duplicates exist
    )
    .reindex(columns=all_years)  # Ensure all years are present as columns
    .sort_index(axis=0)  # Sort countries alphabetically
    .reset_index()       # Move Country from index to column
)

# Rename the columns to match your requirements
pivot_df.columns.name = None  # Remove the 'Year' label from columns
pivot_df = pivot_df.rename(columns={'Country': 'Country'})

In [25]:
pivot_df.head(50)

Unnamed: 0,Country,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Albania,44.4,61.4,67.4,,,,,53.0,50.7,50.5,49.6,
1,Algeria,,,57.2,,,,54.1,54.9,55.0,52.9,53.6,
2,Argentina,78.3,76.7,75.2,73.7,73.3,71.3,69.4,69.3,68.6,69.3,68.9,
3,Armenia,,,48.3,,,,,,,,,
4,Australia,75.6,73.6,71.9,73.7,74.2,76.2,76.4,77.4,77.7,78.1,75.3,
5,Austria,81.5,79.7,78.4,77.8,80.9,78.8,79.2,78.7,78.4,76.7,77.2,
6,Azerbaijan,28.7,,23.9,,,,,43.2,44.0,44.8,46.8,
7,Bahrain,,,84.4,,,,,,,,,
8,Bangladesh,,,43.3,,43.5,44.6,40.3,42.8,42.7,42.3,42.3,
9,Belarus,,48.0,45.4,,53.8,,58.0,59.0,44.4,45.6,47.1,


In [26]:
# Fill NaN values with the respective country's mean
pivot_df = pivot_df.set_index("Country").apply(lambda row: row.fillna(row.mean()), axis=1).reset_index()

In [27]:
pivot_df.to_csv('global_health_index.csv',index=False)