In [40]:
import pandas as pd
import plotly as pe
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

# Import Data from Google Sheets 

google_sheets_url = "https://docs.google.com/spreadsheets/d/1E0lyCSxlC0ajNtzjpWo17TX5DEeEjd33E-j6c7fOBcg/export?format=csv&gid=199229195"

df_full_country_list= pd.read_csv(google_sheets_url)

df_full_country_list


Unnamed: 0,Country Name,Indicator Name,Year,Value
0,United States,National savings (% GDP),1980,22.059
1,United States,Unemployment levels (%),1980,7.175
2,United States,GDP per capita,1980,12552.943
3,United States,"Inflation (CPI, %))",1980,86.750
4,Germany,Unemployment levels (%),1980,3.359
...,...,...,...,...
4261,Indonesia,"Birth rate, crude (per 1,000 people)",2024,
4262,Japan,"Birth rate, crude (per 1,000 people)",2024,
4263,Poland,"Birth rate, crude (per 1,000 people)",2024,
4264,United States,"Birth rate, crude (per 1,000 people)",2024,


In [42]:
# --- DATA CLEANING & PIVOTING ---

# Define the filtering parameters 

START_YEAR = 2000
END_YEAR = 2023
COUNTRIES_TO_DROP = ['China', "Cote d'Ivoire", "Ghana"]

## 1. Filter by Year AND Drop Countries

# Assuming df_full_country_list is defined
df_final_working = df_full_country_list[
    (df_full_country_list['Year'] >= START_YEAR) & 
    (df_full_country_list['Year'] <= END_YEAR) &
    (~df_full_country_list['Country Name'].isin(COUNTRIES_TO_DROP))
].copy()

# 2. Pivot the filtered data once to create the single wide data source for both indices
df_wide = df_final_working.pivot_table(
    index=['Country Name', 'Year'],
    columns='Indicator Name',
    values='Value'
).reset_index()

df_wide


Indicator Name,Country Name,Year,"Birth rate, crude (per 1,000 people)",Current health expenditure (% of GDP),GDP per capita,Gini index,"Inflation (CPI, %))","Life expectancy at birth, total (years)",National savings (% GDP),Poverty headcount ratio at national poverty lines (% of population),Unemployment levels (%),Urban population (% of total population)
0,Chile,2000,16.322,7.000207,5099.922,52.8,57.794,77.128000,22.505,36.0,9.708,86.073
1,Chile,2001,15.914,7.097331,4608.057,,59.317,77.580000,22.953,,9.867,86.363
2,Chile,2002,15.249,7.208338,4479.771,,60.993,77.840000,23.122,,9.800,86.606
3,Chile,2003,14.789,7.250478,4824.870,51.5,61.647,77.884000,23.573,35.4,9.533,86.665
4,Chile,2004,14.415,6.910286,6183.906,,63.144,78.276000,24.473,,10.017,86.725
...,...,...,...,...,...,...,...,...,...,...,...,...
232,United States,2019,11.400,16.661253,65561.320,41.9,258.353,78.787805,19.329,,3.675,82.459
233,United States,2020,10.900,18.813253,64518.063,40.0,262.389,76.980488,18.232,,8.100,82.664
234,United States,2021,11.000,17.506386,71365.337,39.7,281.782,76.329268,17.583,,5.358,82.873
235,United States,2022,11.000,16.496140,77944.095,41.7,299.845,77.434146,18.186,,3.642,83.084


In [None]:
# --------------------------------------------------------------------------
#              SECTION B: WELL-BEING INDICATOR (WTI) - FINAL FIX
# --------------------------------------------------------------------------

WTI_indicators = [
    'Life expectancy at birth, total (years)',
    'Gini index',
    'Birth rate, crude (per 1,000 people)'
]

# 1. Select, Rename, and Impute WTI Data
df_wti_working = df_wide[['Country Name', 'Year'] + WTI_indicators].copy()

df_wti_working.rename(columns={
    'Life expectancy at birth, total (years)': 'LIFE_EXP',
    'Gini index': 'GINI',
    'Birth rate, crude (per 1,000 people)': 'BIRTH_RATE'
}, inplace=True)

# Imputation: FFILL, LINEAR INTERPOLATION, and BFILL
for col in ['LIFE_EXP', 'GINI', 'BIRTH_RATE']:
    # Step 1: Forward Fill (FFILL)
    df_wti_working[col] = df_wti_working.groupby('Country Name')[col].ffill()
    
    # Step 2: Linear Interpolation 
    df_wti_working[col] = df_wti_working.groupby('Country Name')[col].apply(
        lambda x: x.interpolate(method='linear')
    )
    
    # Step 3: Backward Fill (BFILL)
    df_wti_working[col] = df_wti_working.groupby('Country Name')[col].bfill()
    
    # Step 4 (Fallback): Fill any remaining NaNs with the overall column mean
    df_wti_working[col].fillna(df_wti_working[col].mean(), inplace=True)
  
# --- CRITICAL FIX: Reset the index before PCA ---
# This ensures the DataFrame index is sequential (0, 1, 2, ...) 
# and matches the NumPy output index.
df_wti_working.reset_index(drop=True, inplace=True)
# -------------------------------------------------

# 2. Recode, Standardize, and Apply PCA
X_wti = df_wti_working[['LIFE_EXP', 'GINI', 'BIRTH_RATE']].copy()

# Recode (Invert) the "bad" indicators: GINI and BIRTH_RATE
X_wti_recoded = X_wti.copy()
X_wti_recoded['GINI'] = X_wti_recoded['GINI'] * -1             # Invert: Lower GINI = Higher WTI
X_wti_recoded['BIRTH_RATE'] = X_wti_recoded['BIRTH_RATE'] * -1 # Invert: Lower BR = Higher WTI

# Standardize
scaler_wti = StandardScaler()
X_wti_standardized = scaler_wti.fit_transform(X_wti_recoded)

# Apply PCA
pca_wti = PCA()
pca_wti.fit(X_wti_standardized)

# Transform and assign WTI score
PC_scores_wti = pca_wti.transform(X_wti_standardized)

# Assign WTI score - now simple assignment should work because the index is reset
df_wti_working['Well-Being (PCA)'] = PC_scores_wti[:, 0]

In [5]:
# --- 1. Display the DataFrame with the new ESI Scores ---
print("--- ESI Scores per Country-Year ---")
print(df_esi_working[['Country Name', 'Year', 'Economic Success (PCA)']].head(10))

# --- 2. Display the PCA Loadings (Indicator Weights) ---
print("\n--- PCA Loadings (Weights for ESI) ---")
# The first component (index 0) holds the weights for your ESI
loadings_esi = pd.Series(pca_econ.components_[0], index=X_econ_recoded.columns)
print(loadings_esi)

# --- 3. Display the Explained Variance Ratio ---
print("\n--- Explained Variance Ratio ---")
# The first number shows the percentage of variance captured by the ESI (PC1)
print(pca_econ.explained_variance_ratio_)

--- ESI Scores per Country-Year ---
Indicator Name Country Name  Year  Economic Success (PCA)
0                     Chile  2000               -1.048667
1                     Chile  2001               -1.060953
2                     Chile  2002               -1.043638
3                     Chile  2003               -1.006676
4                     Chile  2004               -0.987004
5                     Chile  2005               -0.868700
6                     Chile  2006               -0.694231
7                     Chile  2007               -0.540844
8                     Chile  2008               -0.537456
9                     Chile  2009               -0.814789

--- PCA Loadings (Weights for ESI) ---
Indicator Name
GDP          0.646417
UNEMP        0.560706
INFLATION   -0.517449
dtype: float64

--- Explained Variance Ratio ---
[0.62557104 0.25168055 0.1227484 ]


In [6]:

# --------------------------------------------------------------------------
#              SECTION B: WELL-BEING INDICATOR (WTI)
# --------------------------------------------------------------------------

WTI_indicators = [
    'Life expectancy at birth, total (years)',
    'Gini index',
    'Birth rate, crude (per 1,000 people)'
]

# 1. Select, Rename, and Impute WTI Data
df_wti_working = df_wide[['Country Name', 'Year'] + WTI_indicators].copy()

df_wti_working.rename(columns={
    'Life expectancy at birth, total (years)': 'LIFE_EXP',
    'Gini index': 'GINI',
    'Birth rate, crude (per 1,000 people)': 'BIRTH_RATE'
}, inplace=True)

# Imputation: FFILL and BFILL
for col in ['LIFE_EXP', 'GINI', 'BIRTH_RATE']:
    df_wti_working[col] = df_wti_working.groupby('Country Name')[col].ffill().bfill()
  

# 2. Recode, Standardize, and Apply PCA
X_wti = df_wti_working[['LIFE_EXP', 'GINI', 'BIRTH_RATE']].copy()

# Recode (Invert) the "bad" indicators: GINI and BIRTH_RATE
X_wti_recoded = X_wti.copy()
X_wti_recoded['GINI'] = X_wti_recoded['GINI'] * -1

scaler_wti = StandardScaler()
X_wti_standardized = scaler_wti.fit_transform(X_wti_recoded)

pca_wti = PCA()
pca_wti.fit(X_wti_standardized)

PC_scores_wti = pca_wti.transform(X_wti_standardized)
df_wti_working['Well-Being (PCA)'] = PC_scores_wti[:, 0]




In [7]:
# --------------------------------------------------------------------------
#              SECTION C: MERGE AND FINALIZE
# --------------------------------------------------------------------------

# Merge the ESI and WTI scores into a single final DataFrame
df_merged_scores = pd.merge(
    df_esi_working[['Country Name', 'Year', 'Economic Success (PCA)']],
    df_wti_working[['Country Name', 'Year', 'Well-Being (PCA)']],
    on=['Country Name', 'Year'],
    how='left'
)

print("--- Final Merged Scores Head (WTI and ESI ready for plotting) ---")
print(df_merged_scores.head())

df_merged_scores 


--- Final Merged Scores Head (WTI and ESI ready for plotting) ---
Indicator Name Country Name  Year  Economic Success (PCA)  Well-Being (PCA)
0                     Chile  2000               -1.048667          0.632306
1                     Chile  2001               -1.060953          0.551362
2                     Chile  2002               -1.043638          0.453000
3                     Chile  2003               -1.006676          0.333113
4                     Chile  2004               -0.987004          0.260487


Indicator Name,Country Name,Year,Economic Success (PCA),Well-Being (PCA)
0,Chile,2000,-1.048667,0.632306
1,Chile,2001,-1.060953,0.551362
2,Chile,2002,-1.043638,0.453000
3,Chile,2003,-1.006676,0.333113
4,Chile,2004,-0.987004,0.260487
...,...,...,...,...
232,United States,2019,3.194410,-0.592014
233,United States,2020,2.868758,-0.614702
234,United States,2021,3.471997,-0.570981
235,United States,2022,3.977010,-0.554299


In [8]:
import plotly.express as px
import pandas as pd

# 1. CALCULATE THE AVERAGE SCORES PER COUNTRY (from yearly data)
df_final_ranking = df_merged_scores.groupby('Country Name').agg(
    {'Economic Success (PCA)': 'mean',
     'Well-Being (PCA)': 'mean'}
).reset_index()

# Rename columns for plotting simplicity
df_final_ranking.rename(columns={
    'Economic Success (PCA)': 'Avg_ESI',
    'Well-Being (PCA)': 'Avg_WTI'
}, inplace=True)


# 2. CREATE THE SCATTER PLOT
fig = px.scatter(
    df_final_ranking,
    x='Avg_ESI',           # X-axis: Economic Success Index
    y='Avg_WTI',           # Y-axis: Well-Being Translation Index
    text='Country Name',   # Label points with the country name
    title='ESI vs. WTI: Simple Comparison (2000-2023 Average)',
    labels={
        'Avg_ESI': 'Economic Success Index (ESI)',
        'Avg_WTI': 'Well-Being Index (WTI)'
    }
)

# Optional: Add Quadrant lines at the sample average (Zero)
fig.add_vline(x=0, line_dash="dash", line_color="red")
fig.add_hline(y=0, line_dash="dash", line_color="blue")

fig.show()

In [9]:
import plotly.express as px
import pandas as pd

# 1. CALCULATE THE AVERAGE ESI SCORE PER COUNTRY (from yearly data)
# This step is necessary to create the final ranking
df_ranking = df_merged_scores.groupby('Country Name')['Economic Success (PCA)'].mean().reset_index()

# Rename the column
df_ranking.rename(columns={'Economic Success (PCA)': 'Avg_ESI'}, inplace=True)

# Sort the ranking from highest ESI to lowest
df_ranking_sorted = df_ranking.sort_values(by='Avg_ESI', ascending=False)


# 2. SELECT TOP AND BOTTOM COUNTRIES FOR VISUALIZATION
df_top = df_ranking_sorted.head(10)
df_bottom = df_ranking_sorted.tail(5)
# Concatenate the top 10 and bottom 5 for contrast
df_bar_viz = pd.concat([df_top, df_bottom])

# 3. CREATE THE HORIZONTAL BAR CHART
fig = px.bar(
    df_bar_viz,
    x='Avg_ESI',
    y='Country Name',
    orientation='h', # Makes the bars horizontal (better for country names)
    color='Avg_ESI',
    color_continuous_scale=px.colors.sequential.Teal,
    title='Top & Bottom ESI Ranking (2000-2023 Average)',
    labels={'Avg_ESI': 'Average Economic Success Index (ESI) Score'}
)

# Reverse the Y-axis so the highest score is at the top
fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    showlegend=False
)

# Add a vertical line at ESI = 0 (the sample average)
fig.add_vline(x=0, line_width=2, line_dash="dash", line_color="red", annotation_text="Sample Average")

fig.show()

In [10]:
import plotly.express as px
import pandas as pd

# 1. CALCULATE THE AVERAGE WTI SCORE PER COUNTRY (from yearly data)
# Use 'Well-Being (PCA)' column
df_ranking_wti = df_merged_scores.groupby('Country Name')['Well-Being (PCA)'].mean().reset_index()

# Rename the column
df_ranking_wti.rename(columns={'Well-Being (PCA)': 'Avg_WTI'}, inplace=True)

# Sort the ranking from highest WTI to lowest
df_ranking_wti_sorted = df_ranking_wti.sort_values(by='Avg_WTI', ascending=False)


# 2. SELECT TOP AND BOTTOM COUNTRIES FOR VISUALIZATION
df_top_wti = df_ranking_wti_sorted.head(10)
df_bottom_wti = df_ranking_wti_sorted.tail(5)
# Concatenate the top 10 and bottom 5 for contrast
df_bar_viz_wti = pd.concat([df_top_wti, df_bottom_wti])

# 3. CREATE THE HORIZONTAL BAR CHART
fig = px.bar(
    df_bar_viz_wti,
    x='Avg_WTI',
    y='Country Name',
    orientation='h',
    color='Avg_WTI',
    color_continuous_scale=px.colors.sequential.Plasma, # Using a different color scale
    title='Top & Bottom Well-Being (WTI) Ranking (2000-2023 Average)',
    labels={'Avg_WTI': 'Average Well-Being Translation Index (WTI) Score'}
)

# Reverse the Y-axis so the highest score is at the top
fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    showlegend=False
)

# Add a vertical line at WTI = 0 (the sample average)
fig.add_vline(x=0, line_width=2, line_dash="dash", line_color="blue", annotation_text="Sample Average")

fig.show()

In [11]:
import plotly.express as px
import pandas as pd

# 1. DEFINE AND FILTER DATA (Excluding China, Ghana, and Cote d'Ivoire)
countries_to_plot_final = [
    'Germany', 'Denmark', 'Poland', 'United States', 'Chile', 
    'Costa Rica', 'Japan', 'Indonesia', 'South Africa'
]

# Filter the yearly scores DataFrame (Assuming df_merged_scores is defined)
df_country_trend = df_merged_scores[
    df_merged_scores['Country Name'].isin(countries_to_plot_final)
].copy()

# 2. TRANSFORM DATA TO LONG FORMAT
# This is required to plot two separate lines (ESI and WTI) easily
df_long_country_trend = df_country_trend.melt(
    id_vars=['Country Name', 'Year'],
    value_vars=['Economic Success (PCA)', 'Well-Being (PCA)'],
    var_name='Index Type',
    value_name='Score'
)

# 3. CREATE THE FACETED LINE CHART
fig = px.line(
    df_long_country_trend,
    x='Year',
    y='Score',
    color='Index Type',      # Plot two lines (ESI vs WTI) on each facet
    line_dash='Index Type',
    facet_col='Country Name', # Creates a separate plot for each country
    facet_col_wrap=4,         # Arranges plots in 4 columns
    title='ESI vs. WTI Trend for Key Countries (2000-2023)',
    labels={
        'Score': 'PCA Score (Sample Mean = 0)',
        'Index Type': 'Index'
    }
)

# Add a horizontal line at the zero mark (the sample average) for clarity
fig.add_hline(y=0, line_width=1, line_dash="dot", line_color="gray")

# Adjust the layout to make titles and hover info clean
fig.update_layout(height=800)
# Clean up facet titles
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1])) 

fig.show()

In [12]:
import plotly.express as px
import pandas as pd

# 1. DEFINE AND FILTER DATA (Excluding China, Ghana, and Cote d'Ivoire)
countries_to_plot_final = [
    'Germany', 'Denmark', 'Poland', 'United States', 'Chile', 
    'Costa Rica', 'Japan', 'Indonesia', 'South Africa'
]

# Filter the yearly scores DataFrame
df_country_trend = df_merged_scores[
    df_merged_scores['Country Name'].isin(countries_to_plot_final)
].copy()

# 2. TRANSFORM DATA TO LONG FORMAT
df_long_country_trend = df_country_trend.melt(
    id_vars=['Country Name', 'Year'],
    value_vars=['Economic Success (PCA)', 'Well-Being (PCA)'],
    var_name='Index Type',
    value_name='Score'
)

# 3. CREATE THE SINGLE, COMBINED LINE CHART (18 lines)
fig = px.line(
    df_long_country_trend,
    x='Year',
    y='Score',
    color='Country Name',      # Distinguish countries by color
    line_dash='Index Type',    # Distinguish ESI (solid) vs WTI (dashed) by line style
    title='Combined ESI vs. WTI Trends for Key Countries (2000-2023)',
    labels={
        'Score': 'PCA Score (Sample Mean = 0)',
        'Index Type': 'Index'
    }
)

# Add a horizontal line at the zero mark (the sample average)
fig.add_hline(y=0, line_width=1, line_dash="dot", line_color="gray")

# Adjust the layout for readability
fig.update_layout(height=600, legend_title_text='Country / Index Type')

fig.show()