In [92]:
# Import Libraries 

!pip3 install matplotlib
!pip3 install scikit-learn

import pandas as pd
import plotly.express as pe
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

# Load the raw indicator table from Google Sheets.
# The URL is the sheet's CSV export endpoint, so pandas can read it directly
# over HTTP without any Google API client.

google_sheets_url = "https://docs.google.com/spreadsheets/d/1E0lyCSxlC0ajNtzjpWo17TX5DEeEjd33E-j6c7fOBcg/export?format=csv&gid=199229195"

# Long-format table: one row per (country, indicator, year) with its value.
df_full_country_list = pd.read_csv(filepath_or_buffer=google_sheets_url)

df_full_country_list



Unnamed: 0,Country Name,Indicator Name,Year,Value
0,United States,National savings (% GDP),1980,22.059
1,United States,Unemployment levels (%),1980,7.175
2,United States,GDP per capita,1980,12552.943
3,United States,"Inflation (CPI, %))",1980,86.750
4,Germany,Unemployment levels (%),1980,3.359
...,...,...,...,...
5224,Indonesia,"Birth rate, crude (per 1,000 people)",2024,
5225,Japan,"Birth rate, crude (per 1,000 people)",2024,
5226,Poland,"Birth rate, crude (per 1,000 people)",2024,
5227,United States,"Birth rate, crude (per 1,000 people)",2024,


In [93]:
# 1. Restrict the data to the analysis window and drop excluded countries

# Filtering parameters
START_YEAR = 2000
END_YEAR = 2023
COUNTRIES_TO_DROP = ['China', "Cote d'Ivoire", "Ghana"]

# Two chained row filters on the full table:
#   - keep years in [START_YEAR, END_YEAR]; `between` includes both endpoints
#   - discard every row belonging to a country listed in COUNTRIES_TO_DROP
# The trailing .copy() gives an independent frame, so later edits to
# df_final_working never touch df_full_country_list.
df_final_working = (
    df_full_country_list
    .loc[lambda frame: frame['Year'].between(START_YEAR, END_YEAR)]
    .loc[lambda frame: ~frame['Country Name'].isin(COUNTRIES_TO_DROP)]
    .copy()
)

In [94]:
# ---------                                                        --------
#              Create economic success indicator (ESI)
# ---------                                                        --------

# 1. Pivot the long table to wide form so that every indicator gets its own
#    column, with one row per (country, year). pivot_table combines duplicate
#    (country, year, indicator) entries with its default aggregation (mean).

df_econ = df_final_working.pivot_table(
    index=['Country Name', 'Year'],
    columns='Indicator Name',
    values='Value'
).reset_index()

# 2. The three indicators that feed the ESI: GDP, unemployment and inflation.
#    The labels must match the 'Indicator Name' values exactly, including the
#    doubled closing parenthesis in the inflation label as it appears in the data.

Econ_indicators = [
    'GDP per capita',
    'Unemployment levels (%)',
    'Inflation (CPI, %))'
]

# 3. Keep the key columns plus the three indicators and give the indicators
#    short names for the PCA steps. Only names defined in this cell
#    (df_econ, Econ_indicators) are used, so the cell runs on a fresh kernel.
#    rename() returns a new frame, so df_econ itself is left unchanged.

df_esi = df_econ[['Country Name', 'Year'] + Econ_indicators].rename(columns={
    'GDP per capita': 'GDP',
    'Unemployment levels (%)': 'UNEMP',
    'Inflation (CPI, %))': 'INFLATION'
})



In [None]:
# 4. Prepare the PCA input: align indicator direction, then standardize

# Work on an independent copy of the three ESI columns.
X_econ = df_esi.loc[:, ['GDP', 'UNEMP', 'INFLATION']].copy()

# 4.1 Flip the sign of unemployment and inflation; GDP is left as is.
X_econ_recoded = X_econ.copy()
X_econ_recoded[['UNEMP', 'INFLATION']] *= -1

# 4.2 Z-score each recoded column. The scaler is fitted on the same data it
#     transforms, and is kept so the column means/scales can be inspected.
scaler = StandardScaler().fit(X_econ_recoded)
X_standardized = scaler.transform(X_econ_recoded)


In [None]:
# 5. Apply principal component analysis

# 5.1 Fit a PCA on the standardized indicators. No n_components is passed,
#     so every component is kept.
pca_esi = PCA()
pca_esi.fit(X_standardized)
# Same fitted object under the name this cell originally assigned, so any
# cell that refers to `pca_econ` still resolves.
pca_econ = pca_esi

# 5.2 Project the standardized data onto the components: one row of scores
#     per row of X_standardized (i.e. per country-year).
PC_scores = pca_esi.transform(X_standardized)

# 5.3 The first principal component (PC1) is the Economic Success Indicator.
#     X_standardized was built from df_esi row by row, so the scores line up
#     positionally with df_esi. They are attached to a copy so that df_esi is
#     not modified and re-running this cell gives the same result.
df_econ_working = df_esi.copy()
df_econ_working['Economic Success (PCA)'] = PC_scores[:, 0]


# 5.4 REVIEW PCA OUTPUT (Crucial for Documentation) ---

print("--- DataFrame Head with Economic Success (PCA) Score ---")
print(df_econ_working[['Country Name', 'Year', 'Economic Success (PCA)']].head())

print("\n--- PCA Loadings (Weights for ESI) ---")
# Row 0 of components_ holds the PC1 weights, one per input column.
# NOTE(review): in the recorded run the INFLATION loading is negative even
# after the sign flip in step 4.1, i.e. PC1 rises with the raw inflation
# value -- confirm this matches the intended reading of the ESI.
loadings = pd.Series(pca_esi.components_[0], index=X_econ_recoded.columns)
print(loadings)

print("\n--- Explained Variance Ratio ---")
# Share of total variance captured by each component; the first entry is
# the share explained by PC1, i.e. by the ESI itself.
print(pca_esi.explained_variance_ratio_)

--- DataFrame Head with Economic Success (PCA) Score ---
Indicator Name Country Name  Year  Economic Success (PCA)
0                     Chile  2000               -1.158580
1                     Chile  2001               -1.170196
2                     Chile  2002               -1.152127
3                     Chile  2003               -1.114838
4                     Chile  2004               -1.097073

--- PCA Loadings (Weights for ESI) ---
Indicator Name
GDP          0.628189
UNEMP        0.544070
INFLATION   -0.556207
dtype: float64

--- Explained Variance Ratio ---
[0.63846135 0.22725596 0.1342827 ]
