In [26]:
import pandas as pd  
import os


In [27]:
# Location of the scraped raw data, relative to this notebook's directory.
path_parts = ('..', 'Web Scrapping', 'raw_gpfi_data.csv')
csv_path = os.path.join(*path_parts)

# Load the raw table into a DataFrame.
gpfi_df = pd.read_csv(filepath_or_buffer=csv_path)

In [28]:
# Preview the first 50 rows of the raw data.
gpfi_df.head(n=50)

Unnamed: 0,Year,Country,Rank,Global Score
0,2024,Norway,1,91.89
1,2024,Denmark,2,89.6
2,2024,Sweden,3,88.32
3,2024,Netherlands,4,87.73
4,2024,Finland,5,86.55
5,2024,Estonia,6,86.44
6,2024,Portugal,7,85.9
7,2024,Ireland,8,85.59
8,2024,Switzerland,9,84.01
9,2024,Germany,10,83.84


In [29]:
# Remove the 'Rank' column; it is not used by any later cell in this notebook.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# once 'Rank' is already gone.
gpfi_df = gpfi_df.drop(columns='Rank', errors='ignore')

In [30]:
# Count missing values per column.
gpfi_df.isna().sum(axis=0)

Year            0
Country         0
Global Score    0
dtype: int64

In [31]:
# Year coverage check: if the number of distinct years equals the width of
# the min..max span, every year in between is present in the data.
year_col = gpfi_df['Year']
first_year, last_year = year_col.min(), year_col.max()

print(len(year_col.unique()))      # number of distinct years
print(last_year)                   # latest year
print(first_year)                  # earliest year
print(last_year - first_year + 1)  # number of years in the min..max span

12
2024
2013
12


In [32]:
# Inclusive year bounds for the output columns. range() excludes its end
# value, hence the +1 below so that LAST_YEAR itself gets a column.
FIRST_YEAR = 2013
LAST_YEAR = 2024
all_years = range(FIRST_YEAR, LAST_YEAR + 1)

# Reshape from long (one row per Country/Year) to wide
# (one row per Country, one column per year).
pivot_df = (
    gpfi_df.pivot_table(
        index='Country',
        columns='Year',
        values='Global Score',
        aggfunc='first'  # If a Country/Year pair is duplicated, keep its first score
    )
    .reindex(columns=all_years)  # One column per year in all_years; missing years become NaN
    .sort_index(axis=0)          # Sort countries alphabetically
    .reset_index()               # Move Country from the index back to a regular column
)

# Drop the leftover 'Year' label from the column axis so it does not show up
# as a header name when the frame is displayed.
pivot_df.columns.name = None

In [33]:
# Preview the first 50 countries of the wide table.
pivot_df.head(n=50)

Unnamed: 0,Country,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Afghanistan,62.64,62.93,62.56,62.25,60.54,62.72,63.45,62.3,59.81,38.27,39.75,19.09
1,Albania,69.12,70.08,71.23,70.08,70.08,70.51,70.16,69.75,69.41,56.41,57.86,54.1
2,Algeria,63.46,63.74,63.37,58.31,57.17,56.87,54.25,54.48,52.74,45.53,45.74,41.98
3,Andorra,93.18,93.18,80.13,80.13,78.97,77.79,75.37,76.77,76.68,68.79,75.05,61.44
4,Angola,62.2,63.5,62.16,60.11,59.58,61.65,65.04,66.08,65.94,57.17,48.3,52.44
5,Argentina,74.33,74.73,73.89,74.91,74.93,73.95,71.7,71.22,71.01,77.28,73.36,63.13
6,Armenia,71.96,70.93,71.57,71.21,69.62,70.01,71.02,71.4,71.17,68.97,70.61,71.6
7,Australia,84.76,83.09,82.97,82.16,83.98,84.54,83.45,79.79,80.21,73.77,78.24,73.42
8,Austria,90.6,89.99,89.15,86.82,86.53,85.96,84.67,84.22,83.66,76.74,77.3,74.69
9,Azerbaijan,52.27,47.13,41.59,42.11,43.6,40.27,40.87,41.52,41.23,39.4,39.93,27.99


In [34]:
# Fill each country's missing years with that country's own mean score,
# computed over the years it does have (mean() skips NaN by default).
scores = pivot_df.set_index("Country")
country_means = scores.mean(axis=1)

# where() keeps every existing score and, aligned on the Country index
# (axis=0), substitutes the country's mean wherever a score is missing.
# This is the vectorised equivalent of a row-wise apply + fillna.
pivot_df = scores.where(scores.notna(), country_means, axis=0).reset_index()

In [35]:
# Write the wide table to disk, omitting the positional row index.
pivot_df.to_csv(path_or_buf='global_press_freedom_index.csv', index=False)