In [1]:
import pandas as pd  
import os


In [2]:
# Path to the scraped GPI data: the 'Web Scrapping' folder one level above
# the current working directory.
csv_path = os.path.join(os.pardir, 'Web Scrapping', 'raw_gpi_data.csv')

# Load the raw records into a DataFrame.
gpi_df = pd.read_csv(filepath_or_buffer=csv_path)

In [3]:
# Preview the first 50 rows of the raw data.
gpi_df.iloc[:50]

Unnamed: 0,Year,Rank,Country,GPI
0,2024,1,Iceland,1.112
1,2024,2,Ireland,1.303
2,2024,3,Austria,1.313
3,2024,4,New Zealand,1.323
4,2024,5,Singapore,1.339
5,2024,6,Switzerland,1.35
6,2024,7,Portugal,1.372
7,2024,8,Denmark,1.382
8,2024,9,Slovenia,1.395
9,2024,10,Malaysia,1.427


In [4]:
# Drop the 'Rank' column; the pivot further down only reads Year, Country and GPI.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once 'Rank' is already gone is a no-op rather
# than a KeyError.
gpi_df = gpi_df.drop(columns='Rank', errors='ignore')

In [5]:
# Count missing values per column.
gpi_df.isnull().sum()

Year       0
Country    0
GPI        0
dtype: int64

In [6]:
# Sanity-check the coverage of the 'Year' column.
years = gpi_df['Year']
earliest, latest = years.min(), years.max()

print(len(years.unique()))  # number of distinct years present
print(latest)               # max year
print(earliest)             # min year
# Length of the min..max span; when it equals the distinct-year count printed
# first, every year between the earliest and the latest is present.
print(latest - earliest + 1)

17
2024
2008
17


In [7]:
# Contiguous span of years covered by the data (both ends inclusive). Deriving
# it from the 'Year' column instead of hard-coding the bounds means a refreshed
# CSV with newer years is not silently truncated by the reindex below.
first_year = int(gpi_df['Year'].min())
last_year = int(gpi_df['Year'].max())
all_years = range(first_year, last_year + 1)

# Reshape long -> wide: one row per country, one column per year.
pivot_df = (
    gpi_df.pivot_table(
        index='Country',
        columns='Year',
        values='GPI',
        aggfunc='first'  # keep the first value if a (Country, Year) pair is duplicated
    )
    .reindex(columns=all_years)  # a year with no rows still gets an (all-NaN) column
    .sort_index(axis=0)          # sort countries alphabetically
    .reset_index()               # move Country from the index to a regular column
)

# Remove the leftover 'Year' label from the column axis.
pivot_df.columns.name = None

In [22]:
# Ten rows with the highest 2023 GPI values.
pivot_df.sort_values(by=2023, ascending=False).iloc[:10]

Unnamed: 0,Country,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
133,South Sudan,,,,,2.848,2.927,3.166,3.434,3.449,3.411,3.481,3.443,3.429,3.41,3.376,3.356,3.324
36,Democratic Republic of the Congo,2.798,2.85,2.915,2.806,2.843,2.907,3.067,3.06,2.943,3.028,3.233,3.12,3.131,3.224,3.202,3.348,3.264
160,Yemen,2.421,2.536,2.585,2.604,2.682,2.616,2.587,2.841,3.276,3.182,3.189,3.384,3.406,3.515,3.595,3.322,3.397
0,Afghanistan,3.218,3.296,3.201,3.212,3.265,3.293,3.272,3.363,3.457,3.552,3.614,3.619,3.657,3.663,3.62,3.287,3.294
139,Syria,2.101,2.184,2.203,2.21,2.741,3.29,3.347,3.386,3.396,3.423,3.394,3.329,3.332,3.269,3.209,3.202,3.173
136,Sudan,3.054,3.077,3.003,3.078,3.208,3.133,3.219,3.148,3.136,3.016,2.97,2.833,2.949,2.995,3.129,3.193,3.327
152,Ukraine,2.136,2.203,2.239,2.13,2.066,2.233,2.581,2.991,3.094,2.991,2.921,2.836,2.743,2.546,2.674,3.165,3.28
121,Russia,3.151,3.238,3.146,3.063,3.055,3.056,3.189,3.192,3.211,3.206,3.306,3.233,3.139,3.163,3.112,3.126,3.134
130,Somalia,3.175,3.242,3.342,3.208,3.276,3.206,3.153,3.132,3.152,3.137,3.169,3.126,3.153,3.21,3.156,3.114,3.091
97,Myanmar,2.297,2.296,2.303,2.161,2.248,2.24,2.186,2.206,2.116,2.037,2.254,2.22,2.29,2.343,2.782,3.088,2.943


 Handle missing values. Fill null values with each country's mean GPI across the years for which it has data

In [35]:
def _fill_with_row_mean(row):
    """Return the row with its NaN entries replaced by the mean of that row."""
    return row.fillna(row.mean())


# With Country as the index, each row holds one country's yearly values, so the
# row mean is that country's mean. Country is restored as a column afterwards.
pivot_df = (
    pivot_df
    .set_index("Country")
    .apply(_fill_with_row_mean, axis=1)
    .reset_index()
)

In [36]:
# Shorten the long-form country name; only exact matches of the key are replaced.
country_aliases = {'United States of America': 'United States'}
pivot_df['Country'] = pivot_df['Country'].replace(country_aliases)


In [37]:
# Write the wide table to the current working directory, without the row index.
pivot_df.to_csv(path_or_buf='global_peace_index.csv', index=False)