In [43]:
import pandas as pd
import requests
import plotly.express as px
import numpy as np

# Fetch the GDP per capita data
df_gdp = pd.read_csv("https://ourworldindata.org/grapher/gdp-per-capita-worldbank.csv?v=1&csvType=full&useColumnShortNames=true", storage_options = {'User-Agent': 'Our World In Data data fetch/1.0'})

# Fetch the metadata
metadata = requests.get("https://ourworldindata.org/grapher/gdp-per-capita-worldbank.metadata.json?v=1&csvType=full&useColumnShortNames=true").json()

In [44]:
# Load OECD data
oecd_data = pd.read_csv("/Users/ren/Documents/Ciencia de Datos/GDP shi/OECD,DF_BLI,+all.csv")

# Let's examine both datasets
print("GDP data shape:", df_gdp.shape)
print("OECD data shape:", oecd_data.shape)

# Check the first few rows of each dataset
print("\nGDP data preview:")
display(df_gdp.head())

print("\nOECD data preview:")
display(oecd_data.head())

GDP data shape: (7063, 4)
OECD data shape: (2369, 22)

GDP data preview:


Unnamed: 0,Entity,Code,Year,ny_gdp_pcap_pp_kd
0,Afghanistan,AFG,2000,1617.8264
1,Afghanistan,AFG,2001,1454.1108
2,Afghanistan,AFG,2002,1774.3087
3,Afghanistan,AFG,2003,1815.9282
4,Afghanistan,AFG,2004,1776.9182



OECD data preview:


Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,LOCATION,Country,INDICATOR,Indicator,MEASURE,Measure,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
0,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,AUT,Austria,JE_LMIS,Labour market insecurity,L,Value,...,2.3,,A,,PC,Percentage,0,Units,,
1,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,BEL,Belgium,JE_LMIS,Labour market insecurity,L,Value,...,2.4,,A,,PC,Percentage,0,Units,,
2,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,CAN,Canada,JE_LMIS,Labour market insecurity,L,Value,...,3.8,,A,,PC,Percentage,0,Units,,
3,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,CZE,Czechia,JE_LMIS,Labour market insecurity,L,Value,...,2.3,,A,,PC,Percentage,0,Units,,
4,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,DNK,Denmark,JE_LMIS,Labour market insecurity,L,Value,...,4.5,,A,,PC,Percentage,0,Units,,


In [45]:
# Check unique indicators in OECD data
indicators = oecd_data['Indicator'].unique()
print("Available indicators:", indicators)

# Get unique inequality values to understand the data structure
inequalities = oecd_data['Inequality'].unique()
print("\nInequality categories:", inequalities)

# Let's see what countries are in the dataset
oecd_countries = oecd_data['Country'].unique()
print(f"\nNumber of countries in OECD data: {len(oecd_countries)}")
print("Example countries:", oecd_countries[:10])

Available indicators: ['Labour market insecurity' 'Dwellings without basic facilities'
 'Feeling safe walking alone at night' 'Housing expenditure'
 'Employment rate' 'Long-term unemployment rate'
 'Quality of support network' 'Educational attainment' 'Water quality'
 'Voter turnout' 'Self-reported health'
 'Employees working very long hours'
 'Stakeholder engagement for developing regulations' 'Student skills'
 'Life satisfaction' 'Rooms per person'
 'Household net adjusted disposable income' 'Household net wealth'
 'Personal earnings' 'Years in education' 'Air pollution'
 'Life expectancy' 'Homicide rate'
 'Time devoted to leisure and personal care']

Inequality categories: [nan]

Number of countries in OECD data: 42
Example countries: ['Austria' 'Belgium' 'Canada' 'Czechia' 'Denmark' 'Finland' 'France'
 'Germany' 'Greece' 'Hungary']


In [46]:
# Process OECD data to extract the indicators we want
# We'll focus on TOT (total) values for all population

# Work-Life Balance (Employees working very long hours)
work_life_balance = oecd_data[(oecd_data['Indicator'] == 'Employees working very long hours') & 
                             (oecd_data['INEQUALITY'] == 'TOT')]

# Civic Engagement (Voter turnout)
civic_engagement = oecd_data[(oecd_data['Indicator'] == 'Voter turnout') & 
                           (oecd_data['INEQUALITY'] == 'TOT')]

# Life Satisfaction indicator
life_satisfaction = oecd_data[(oecd_data['Indicator'] == 'Self-reported health') & 
                            (oecd_data['INEQUALITY'] == 'TOT')]

# Check the shapes to make sure we have data
print("Work-Life Balance data shape:", work_life_balance.shape)
print("Civic Engagement data shape:", civic_engagement.shape)
print("Life Satisfaction data shape:", life_satisfaction.shape)

# Preview the data to verify we have the right information
print("\nWork-Life Balance data preview:")
display(work_life_balance.head())

print("\nCivic Engagement data preview:")
display(civic_engagement.head())

print("\nLife Satisfaction data preview:")
display(life_satisfaction.head())

Work-Life Balance data shape: (40, 22)
Civic Engagement data shape: (42, 22)
Life Satisfaction data shape: (40, 22)

Work-Life Balance data preview:


Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,LOCATION,Country,INDICATOR,Indicator,MEASURE,Measure,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
1096,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,AUS,Australia,WL_EWLH,Employees working very long hours,L,Value,...,12.5,,A,,PC,Percentage,0,Units,,
1097,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,AUT,Austria,WL_EWLH,Employees working very long hours,L,Value,...,5.3,,A,,PC,Percentage,0,Units,,
1098,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,BEL,Belgium,WL_EWLH,Employees working very long hours,L,Value,...,4.3,,A,,PC,Percentage,0,Units,,
1099,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,CAN,Canada,WL_EWLH,Employees working very long hours,L,Value,...,3.3,,A,,PC,Percentage,0,Units,,
1100,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,CZE,Czechia,WL_EWLH,Employees working very long hours,L,Value,...,4.5,,A,,PC,Percentage,0,Units,,



Civic Engagement data preview:


Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,LOCATION,Country,INDICATOR,Indicator,MEASURE,Measure,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
875,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,AUS,Australia,CG_VOTO,Voter turnout,L,Value,...,92.0,,A,,PC,Percentage,0,Units,,
876,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,AUT,Austria,CG_VOTO,Voter turnout,L,Value,...,76.0,,A,,PC,Percentage,0,Units,,
877,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,BEL,Belgium,CG_VOTO,Voter turnout,L,Value,...,88.0,,A,,PC,Percentage,0,Units,,
878,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,CAN,Canada,CG_VOTO,Voter turnout,L,Value,...,68.0,,A,,PC,Percentage,0,Units,,
879,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,CZE,Czechia,CG_VOTO,Voter turnout,L,Value,...,62.0,,A,,PC,Percentage,0,Units,,



Life Satisfaction data preview:


Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,LOCATION,Country,INDICATOR,Indicator,MEASURE,Measure,...,OBS_VALUE,Observation Value,OBS_STATUS,Observation Status,UNIT_MEASURE,Unit of Measures,UNIT_MULT,Multiplier,BASE_PER,Base reference period
914,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,AUS,Australia,HS_SFRH,Self-reported health,L,Value,...,85.0,,A,,PC,Percentage,0,Units,,
915,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,AUT,Austria,HS_SFRH,Self-reported health,L,Value,...,71.0,,A,,PC,Percentage,0,Units,,
916,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,BEL,Belgium,HS_SFRH,Self-reported health,L,Value,...,74.0,,A,,PC,Percentage,0,Units,,
917,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,CAN,Canada,HS_SFRH,Self-reported health,L,Value,...,89.0,,A,,PC,Percentage,0,Units,,
918,DATAFLOW,OECD:DF_BLI(1.0),Better Life Index,I,CZE,Czechia,HS_SFRH,Self-reported health,L,Value,...,62.0,,A,,PC,Percentage,0,Units,,


In [47]:
# Process the GDP data for merging
# First, let's identify the available years
numeric_columns = df_gdp.select_dtypes(include=['number']).columns
years = [col for col in numeric_columns if col != 'Year']
print("Available numeric columns in GDP data:", numeric_columns)
print(f"Latest available year column: {years[-1]}")

# For this analysis, we'll use 2021 GDP data as it's likely to have better coverage
# We need to reshape the GDP data to get one row per country with the latest value
latest_gdp = df_gdp.copy()

# Group by entity (country) and get the most recent non-null GDP value
latest_gdp = latest_gdp.sort_values(['Entity', 'Year']).groupby('Entity').last().reset_index()
latest_gdp = latest_gdp[['Entity', 'ny_gdp_pcap_pp_kd']]
latest_gdp.rename(columns={'Entity': 'Country', 'ny_gdp_pcap_pp_kd': 'GDP_per_capita'}, inplace=True)

print("\nLatest GDP data shape:", latest_gdp.shape)
print("Preview of latest GDP data:")
display(latest_gdp.head())

Available numeric columns in GDP data: Index(['Year', 'ny_gdp_pcap_pp_kd'], dtype='object')
Latest available year column: ny_gdp_pcap_pp_kd

Latest GDP data shape: (213, 2)
Preview of latest GDP data:


Unnamed: 0,Country,GDP_per_capita
0,Afghanistan,1992.4244
1,Albania,18244.293
2,Algeria,15159.324
3,Andorra,64631.297
4,Angola,7244.893


In [48]:
# Prepare simplified dataframes with just the country and indicator value
wlb_simplified = work_life_balance[['Country', 'OBS_VALUE']].copy()
wlb_simplified.rename(columns={'OBS_VALUE': 'Work_Life_Balance'}, inplace=True)

ce_simplified = civic_engagement[['Country', 'OBS_VALUE']].copy()
ce_simplified.rename(columns={'OBS_VALUE': 'Civic_Engagement'}, inplace=True)

ls_simplified = life_satisfaction[['Country', 'OBS_VALUE']].copy()
ls_simplified.rename(columns={'OBS_VALUE': 'Life_Satisfaction'}, inplace=True)

# Check that we have data
print("Work-Life Balance simplified shape:", wlb_simplified.shape)
print("Civic Engagement simplified shape:", ce_simplified.shape)
print("Life Satisfaction simplified shape:", ls_simplified.shape)

# Display the first few rows of each simplified dataframe
print("\nWork-Life Balance simplified preview:")
display(wlb_simplified.head())

print("\nCivic Engagement simplified preview:")
display(ce_simplified.head())

print("\nLife Satisfaction simplified preview:")
display(ls_simplified.head())

Work-Life Balance simplified shape: (40, 2)
Civic Engagement simplified shape: (42, 2)
Life Satisfaction simplified shape: (40, 2)

Work-Life Balance simplified preview:


Unnamed: 0,Country,Work_Life_Balance
1096,Australia,12.5
1097,Austria,5.3
1098,Belgium,4.3
1099,Canada,3.3
1100,Czechia,4.5



Civic Engagement simplified preview:


Unnamed: 0,Country,Civic_Engagement
875,Australia,92.0
876,Austria,76.0
877,Belgium,88.0
878,Canada,68.0
879,Czechia,62.0



Life Satisfaction simplified preview:


Unnamed: 0,Country,Life_Satisfaction
914,Australia,85.0
915,Austria,71.0
916,Belgium,74.0
917,Canada,89.0
918,Czechia,62.0


In [49]:
# Merge the datasets
# Start with GDP data
merged_df = latest_gdp.copy()

# Merge with Work-Life Balance
merged_df = pd.merge(merged_df, wlb_simplified, on='Country', how='left')

# Merge with Civic Engagement
merged_df = pd.merge(merged_df, ce_simplified, on='Country', how='left')

# Merge with Life Satisfaction
merged_df = pd.merge(merged_df, ls_simplified, on='Country', how='left')

# Check the final merged dataset
print("Final merged dataframe shape:", merged_df.shape)
print("\nPreview of merged dataframe:")
display(merged_df.head(10))

# Check for missing values
print("\nMissing values in each column:")
print(merged_df.isnull().sum())

# Get the list of OECD countries (excluding 'OECD - Total' which is not a country)
oecd_country_list = [country for country in oecd_countries if country != 'OECD - Total']
print(f"\nNumber of OECD countries: {len(oecd_country_list)}")

# Filter the merged dataframe to include only OECD countries
oecd_merged_df = merged_df[merged_df['Country'].isin(oecd_country_list)].copy()
print(f"\nOECD countries in merged dataframe: {len(oecd_merged_df)}")

# Find common OECD countries between all datasets (those with complete data)
oecd_complete_data = oecd_merged_df.dropna().copy()
print(f"\nNumber of OECD countries with complete data: {len(oecd_complete_data)}")
print("\nOECD countries with complete data:")
display(oecd_complete_data['Country'].values)

Final merged dataframe shape: (213, 5)

Preview of merged dataframe:


Unnamed: 0,Country,GDP_per_capita,Work_Life_Balance,Civic_Engagement,Life_Satisfaction
0,Afghanistan,1992.4244,,,
1,Albania,18244.293,,,
2,Algeria,15159.324,,,
3,Andorra,64631.297,,,
4,Angola,7244.893,,,
5,Antigua and Barbuda,28967.256,,,
6,Argentina,27104.98,,,
7,Armenia,19230.19,,,
8,Aruba,40516.81,,,
9,Australia,59552.88,12.5,92.0,85.0



Missing values in each column:
Country                0
GDP_per_capita         0
Work_Life_Balance    176
Civic_Engagement     175
Life_Satisfaction    177
dtype: int64

Number of OECD countries: 41

OECD countries in merged dataframe: 38

Number of OECD countries with complete data: 35

OECD countries with complete data:


array(['Australia', 'Austria', 'Belgium', 'Canada', 'Chile', 'Colombia',
       'Costa Rica', 'Czechia', 'Denmark', 'Estonia', 'Finland', 'France',
       'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Israel',
       'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Mexico',
       'Netherlands', 'New Zealand', 'Norway', 'Poland', 'Portugal',
       'Russia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland',
       'United Kingdom', 'United States'], dtype=object)

In [50]:
# If we don't have enough OECD countries with complete data, let's try a more flexible approach
# We'll create separate datasets for each indicator paired with GDP, but only for OECD countries

# Filter the individual dataframes to include only OECD countries
gdp_wlb_oecd = pd.merge(latest_gdp[latest_gdp['Country'].isin(oecd_country_list)], 
                        wlb_simplified[wlb_simplified['Country'].isin(oecd_country_list)], 
                        on='Country', how='inner')

gdp_ce_oecd = pd.merge(latest_gdp[latest_gdp['Country'].isin(oecd_country_list)], 
                      ce_simplified[ce_simplified['Country'].isin(oecd_country_list)], 
                      on='Country', how='inner')

gdp_ls_oecd = pd.merge(latest_gdp[latest_gdp['Country'].isin(oecd_country_list)], 
                      ls_simplified[ls_simplified['Country'].isin(oecd_country_list)], 
                      on='Country', how='inner')

print(f"OECD countries - GDP + Work-Life Balance: {len(gdp_wlb_oecd)} countries")
print(f"OECD countries - GDP + Civic Engagement: {len(gdp_ce_oecd)} countries")
print(f"OECD countries - GDP + Life Satisfaction: {len(gdp_ls_oecd)} countries")

# Let's use the merged dataframe with all data for OECD countries that have it
oecd_merged_df_clean = oecd_merged_df.dropna().copy()
print(f"\nFinal cleaned OECD dataframe has {len(oecd_merged_df_clean)} countries")
display(oecd_merged_df_clean.head())

OECD countries - GDP + Work-Life Balance: 37 countries
OECD countries - GDP + Civic Engagement: 38 countries
OECD countries - GDP + Life Satisfaction: 36 countries

Final cleaned OECD dataframe has 35 countries


Unnamed: 0,Country,GDP_per_capita,Work_Life_Balance,Civic_Engagement,Life_Satisfaction
9,Australia,59552.88,12.5,92.0,85.0
10,Austria,65015.133,5.3,76.0,71.0
17,Belgium,64185.848,4.3,88.0,74.0
32,Canada,55918.93,3.3,68.0,89.0
37,Chile,29462.64,7.7,47.0,60.0


In [None]:
# Create a normalized version of the dataset as requested
# First, create a copy of the merged dataframe with complete data for all three indicators
complete_data = merged_df.dropna().copy()

# Apply normalization to the three indicators
# For Work_Life_Balance: lower values are better, so we want 1 - normalized value
# 1 - (value - min) / (max - min)
def normalize(column):
    min_val = column.min()
    max_val = column.max()
    return 1 - (column - min_val) / (max_val - min_val)

# Create a dataframe with normalized values
normalized_df = merged_df.copy()

# Normalize Work_Life_Balance (lower values are better)
# For this metric, we'll invert the formula as lower percentages are better
wlb_min = normalized_df['Work_Life_Balance'].dropna().min()
wlb_max = normalized_df['Work_Life_Balance'].dropna().max()
normalized_df['Work_Life_Balance_Normalized'] = normalized_df['Work_Life_Balance'].apply(
    lambda x: 1 - (x - wlb_min) / (wlb_max - wlb_min) if pd.notnull(x) else np.nan
)

# Normalize Civic_Engagement (higher values are better)
# For this metric, higher percentages are better, so we use the direct formula
ce_min = normalized_df['Civic_Engagement'].dropna().min()
ce_max = normalized_df['Civic_Engagement'].dropna().max()
normalized_df['Civic_Engagement_Normalized'] = normalized_df['Civic_Engagement'].apply(
    lambda x: (x - ce_min) / (ce_max - ce_min) if pd.notnull(x) else np.nan
)

# Normalize Life_Satisfaction (higher values are better)
# For this metric, higher percentages are better, so we use the direct formula
ls_min = normalized_df['Life_Satisfaction'].dropna().min()
ls_max = normalized_df['Life_Satisfaction'].dropna().max()
normalized_df['Life_Satisfaction_Normalized'] = normalized_df['Life_Satisfaction'].apply(
    lambda x: (x - ls_min) / (ls_max - ls_min) if pd.notnull(x) else np.nan
)

# Create a final clean dataframe with original and normalized values
final_df = normalized_df[['Country', 'GDP_per_capita', 
                         'Work_Life_Balance', 'Work_Life_Balance_Normalized',
                         'Civic_Engagement', 'Civic_Engagement_Normalized',
                         'Life_Satisfaction', 'Life_Satisfaction_Normalized']]

# Display statistics about the normalized dataframe
print("Final dataframe with normalized values shape:", final_df.shape)
print("\nPreview of final dataframe with normalized values:")
display(final_df.head(10))

# Create a focused dataframe with only OECD countries and complete data
oecd_final_df = final_df[final_df['Country'].isin(oecd_country_list)].dropna()
print(f"\nOECD countries with complete normalized data: {len(oecd_final_df)}")
display(oecd_final_df.head())

# Save the final dataframe if needed
# final_df.to_csv("/Users/ren/Documents/Ciencia de Datos/GDP shi/normalized_indicators.csv", index=False)

Final dataframe with normalized values shape: (213, 8)

Preview of final dataframe with normalized values:


Unnamed: 0,Country,GDP_per_capita,Work_Life_Balance,Work_Life_Balance_Normalized,Civic_Engagement,Civic_Engagement_Normalized,Life_Satisfaction,Life_Satisfaction_Normalized
0,Afghanistan,1992.4244,,0.0,,0.0,,0.0
1,Albania,18244.293,,0.0,,0.0,,0.0
2,Algeria,15159.324,,0.0,,0.0,,0.0
3,Andorra,64631.297,,0.0,,0.0,,0.0
4,Angola,7244.893,,0.0,,0.0,,0.0
5,Antigua and Barbuda,28967.256,,0.0,,0.0,,0.0
6,Argentina,27104.98,,0.0,,0.0,,0.0
7,Armenia,19230.19,,0.0,,0.0,,0.0
8,Aruba,40516.81,,0.0,,0.0,,0.0
9,Australia,59552.88,12.5,0.539033,92.0,1.0,85.0,0.923077



OECD countries with complete normalized data: 35


Unnamed: 0,Country,GDP_per_capita,Work_Life_Balance,Work_Life_Balance_Normalized,Civic_Engagement,Civic_Engagement_Normalized,Life_Satisfaction,Life_Satisfaction_Normalized
9,Australia,59552.88,12.5,0.539033,92.0,1.0,85.0,0.923077
10,Austria,65015.133,5.3,0.806691,76.0,0.659574,71.0,0.653846
17,Belgium,64185.848,4.3,0.843866,88.0,0.914894,74.0,0.711538
32,Canada,55918.93,3.3,0.881041,68.0,0.489362,89.0,1.0
37,Chile,29462.64,7.7,0.717472,47.0,0.042553,60.0,0.442308


In [None]:
# Create a simplified version with just the country, GDP, and normalized indicators
simplified_df = pd.DataFrame({
    'Country': final_df['Country'],
    'GDP_per_capita': final_df['GDP_per_capita'],
    'Work_Life_Balance': final_df['Work_Life_Balance_Normalized'],
    'Civic_Engagement': final_df['Civic_Engagement_Normalized'],
    'Life_Satisfaction': final_df['Life_Satisfaction_Normalized']
})

# Filter for only rows with complete data
simplified_df_clean = simplified_df.dropna()

print("Simplified dataframe shape:", simplified_df.shape)
print("Simplified dataframe with complete data shape:", simplified_df_clean.shape)
print("\nPreview of simplified dataframe with complete data:")
display(simplified_df_clean.head(10))

# Calculate the average of the three normalized indicators as a combined well-being index
simplified_df_clean['Well_Being_Index'] = simplified_df_clean[['Work_Life_Balance', 
                                                             'Civic_Engagement', 
                                                             'Life_Satisfaction']].mean(axis=1)

# Sort by Well-Being Index in descending order
simplified_df_sorted = simplified_df_clean.sort_values('Well_Being_Index', ascending=False)
print("\nCountries sorted by Well-Being Index (highest to lowest):")
display(simplified_df_sorted[['Country', 'GDP_per_capita', 'Well_Being_Index']].head(15))

# Check missing values in the simplified dataframe
print("\nCount of NaN values in each column before cleaning:")
print(simplified_df.isna().sum())

# Check correlation after filtering for complete data only
print("\nNumber of countries with complete data for correlation analysis:", len(simplified_df_clean))

Simplified dataframe shape: (213, 5)
Simplified dataframe with complete data shape: (213, 5)

Preview of simplified dataframe with complete data:


Unnamed: 0,Country,GDP_per_capita,Work_Life_Balance,Civic_Engagement,Life_Satisfaction
0,Afghanistan,1992.4244,0.0,0.0,0.0
1,Albania,18244.293,0.0,0.0,0.0
2,Algeria,15159.324,0.0,0.0,0.0
3,Andorra,64631.297,0.0,0.0,0.0
4,Angola,7244.893,0.0,0.0,0.0
5,Antigua and Barbuda,28967.256,0.0,0.0,0.0
6,Argentina,27104.98,0.0,0.0,0.0
7,Armenia,19230.19,0.0,0.0,0.0
8,Aruba,40516.81,0.0,0.0,0.0
9,Australia,59552.88,0.539033,1.0,0.923077



Countries sorted by Well-Being Index (highest to lowest):


Unnamed: 0,Country,GDP_per_capita,Well_Being_Index
185,Sweden,63114.68,0.871292
112,Luxembourg,132846.58,0.843384
17,Belgium,64185.848,0.823433
9,Australia,59552.88,0.820703
49,Denmark,72097.305,0.816168
136,Netherlands,71446.695,0.81558
143,Norway,90469.81,0.794857
32,Canada,55918.93,0.790134
180,Spain,47141.543,0.738673
137,New Zealand,48826.574,0.737604


In [53]:
# Visualize the relationship between GDP per capita and the Well-Being Index
fig1 = px.scatter(
    simplified_df_clean, 
    x='GDP_per_capita', 
    y='Well_Being_Index',
    hover_name='Country',
    title='GDP per Capita vs. Well-Being Index',
    labels={'GDP_per_capita': 'GDP per Capita', 'Well_Being_Index': 'Well-Being Index (Normalized)'},
    trendline='ols'
)

fig1.update_layout(
    xaxis=dict(title=dict(font=dict(size=14))),
    yaxis=dict(title=dict(font=dict(size=14))),
    hoverlabel=dict(font_size=12),
    title=dict(font=dict(size=18))
)

fig1.show()

# Create a correlation matrix for GDP and all normalized indicators
correlation_matrix = simplified_df_clean[['GDP_per_capita', 'Work_Life_Balance', 
                                        'Civic_Engagement', 'Life_Satisfaction', 
                                        'Well_Being_Index']].corr()

print("\nCorrelation Matrix:")
display(correlation_matrix)

# Visualize the correlation matrix
fig2 = px.imshow(
    correlation_matrix,
    text_auto=True,
    title='Correlation Matrix of GDP per Capita and Normalized Well-Being Indicators',
    color_continuous_scale='RdBu_r',
    zmin=-1, zmax=1
)

fig2.show()


Correlation Matrix:


Unnamed: 0,GDP_per_capita,Work_Life_Balance,Civic_Engagement,Life_Satisfaction,Well_Being_Index
GDP_per_capita,1.0,0.498244,0.484819,0.505463,0.526441
Work_Life_Balance,0.498244,1.0,0.841328,0.824255,0.95136
Civic_Engagement,0.484819,0.841328,1.0,0.839136,0.938895
Life_Satisfaction,0.505463,0.824255,0.839136,1.0,0.938405
Well_Being_Index,0.526441,0.95136,0.938895,0.938405,1.0


In [54]:
# Create visualizations for each indicator vs. GDP (OECD countries only)
# Work-Life Balance vs. GDP
fig1 = px.scatter(
    gdp_wlb_oecd, 
    x='GDP_per_capita', 
    y='Work_Life_Balance',
    hover_name='Country',
    title='GDP per Capita vs. Work-Life Balance (OECD Countries)',
    labels={'GDP_per_capita': 'GDP per Capita', 'Work_Life_Balance': 'Work-Life Balance (% Working Very Long Hours)'},
    trendline='ols'
)

fig1.update_layout(
    xaxis=dict(title=dict(font=dict(size=14))),
    yaxis=dict(title=dict(font=dict(size=14))),
    hoverlabel=dict(font_size=12),
    title=dict(font=dict(size=18))
)

fig1.show()

In [55]:
# Civic Engagement vs. GDP (OECD countries only)
fig2 = px.scatter(
    gdp_ce_oecd, 
    x='GDP_per_capita', 
    y='Civic_Engagement',
    hover_name='Country',
    title='GDP per Capita vs. Civic Engagement (OECD Countries)',
    labels={'GDP_per_capita': 'GDP per Capita', 'Civic_Engagement': 'Civic Engagement (Voter Turnout %)'},
    trendline='ols'
)

fig2.update_layout(
    xaxis=dict(title=dict(font=dict(size=14))),
    yaxis=dict(title=dict(font=dict(size=14))),
    hoverlabel=dict(font_size=12),
    title=dict(font=dict(size=18))
)

fig2.show()

# Life Satisfaction vs. GDP (OECD countries only)
fig3 = px.scatter(
    gdp_ls_oecd, 
    x='GDP_per_capita', 
    y='Life_Satisfaction',
    hover_name='Country',
    title='GDP per Capita vs. Life Satisfaction (OECD Countries)',
    labels={'GDP_per_capita': 'GDP per Capita', 'Life_Satisfaction': 'Self-Reported Health (%)'},
    trendline='ols'
)

fig3.update_layout(
    xaxis=dict(title=dict(font=dict(size=14))),
    yaxis=dict(title=dict(font=dict(size=14))),
    hoverlabel=dict(font_size=12),
    title=dict(font=dict(size=18))
)

fig3.show()

In [56]:
# Create a correlation matrix for OECD countries that have all data points
if len(oecd_merged_df_clean) > 0:
    # Calculate and display correlations
    correlation_matrix = oecd_merged_df_clean.drop('Country', axis=1).corr()
    print("Correlation Matrix (OECD Countries):")
    display(correlation_matrix)

    # Visualize the correlation matrix
    fig4 = px.imshow(
        correlation_matrix,
        text_auto=True,
        title='Correlation Matrix of GDP per Capita and OECD Indicators (OECD Countries)',
        color_continuous_scale='RdBu_r',
        zmin=-1, zmax=1
    )
    fig4.show()
else:
    print("Not enough OECD countries with complete data to create a correlation matrix.")
    
    # Alternative: calculate pairwise correlations
    print("\nPairwise correlations with GDP per capita (OECD Countries):")
    
    gdp_wlb_corr = gdp_wlb_oecd['GDP_per_capita'].corr(gdp_wlb_oecd['Work_Life_Balance'])
    gdp_ce_corr = gdp_ce_oecd['GDP_per_capita'].corr(gdp_ce_oecd['Civic_Engagement'])
    gdp_ls_corr = gdp_ls_oecd['GDP_per_capita'].corr(gdp_ls_oecd['Life_Satisfaction'])
    
    print(f"GDP per capita vs Work-Life Balance: {gdp_wlb_corr:.4f}")
    print(f"GDP per capita vs Civic Engagement: {gdp_ce_corr:.4f}")
    print(f"GDP per capita vs Life Satisfaction: {gdp_ls_corr:.4f}")

Correlation Matrix (OECD Countries):


Unnamed: 0,GDP_per_capita,Work_Life_Balance,Civic_Engagement,Life_Satisfaction
GDP_per_capita,1.0,-0.419823,0.44276,0.374671
Work_Life_Balance,-0.419823,1.0,-0.097907,0.287201
Civic_Engagement,0.44276,-0.097907,1.0,0.337837
Life_Satisfaction,0.374671,0.287201,0.337837,1.0


# Normalized Indicators Analysis

This analysis examines the relationship between GDP per capita and three OECD indicators after normalization:

1. **Work-Life Balance** (measured by employees working very long hours) - normalized so that lower percentages receive higher scores
2. **Civic Engagement** (measured by voter turnout) - normalized so that higher percentages receive higher scores
3. **Life Satisfaction** (measured by self-reported health) - normalized so that higher percentages receive higher scores

All indicators are normalized to a scale of 0-1, where 1 is the best possible value. The normalization formula used is:

- For indicators where higher values are better (Civic Engagement, Life Satisfaction):
  - `normalized = (value - min) / (max - min)`

- For indicators where lower values are better (Work-Life Balance):
  - `normalized = 1 - (value - min) / (max - min)`

A combined "Well-Being Index" is calculated as the average of the three normalized indicators.

# Analysis Summary

This analysis explores the relationship between GDP per capita and three OECD indicators among OECD member countries:

1. **Work-Life Balance** (measured by percentage of employees working very long hours)
2. **Civic Engagement** (measured by voter turnout)
3. **Life Satisfaction** (measured by self-reported health)

By focusing specifically on OECD countries, we're analyzing a more homogenous group of mostly developed economies, which may reveal different patterns than a global analysis would.

The scatter plots show the relationship between GDP per capita and each indicator, with trend lines to visualize the general pattern. The correlation analysis quantifies the strength and direction of these relationships.

Key findings:
- [Add your observations about the relationships based on the actual visualizations]
- [Discuss any interesting patterns or outlier countries]
- [Note limitations in the data or analysis]