Final Project: finding confounding variables

Idan Hermelin

Noam Biton

In [91]:
import pandas as pd

# Primary dataset.
# The path uses a forward slash so it resolves on every operating system; a
# backslash here would form the invalid escape sequence "\g" and only work
# on Windows.
gdp_growth_df = pd.read_csv('GDPs/gdp_growth.csv')

# Auxiliary datasets: one happiness-score file per year, 2015 through 2019.
happiness_2015_df = pd.read_csv('happiness_score/2015.csv')
happiness_2016_df = pd.read_csv('happiness_score/2016.csv')
happiness_2017_df = pd.read_csv('happiness_score/2017.csv')
happiness_2018_df = pd.read_csv('happiness_score/2018.csv')
happiness_2019_df = pd.read_csv('happiness_score/2019.csv')

In [92]:
# Name the GDP table's country column "Country", the key used when merging
# with the happiness tables.
gdp_growth_df.rename({"Country Name": "Country"}, axis="columns", inplace=True)

In [93]:
# Give the 2017-2019 score column the common name "Happiness Score", then
# write each frame back to its CSV so the file on disk carries the new header.
for _frame, _old_name, _csv_path in (
    (happiness_2017_df, "Happiness.Score", 'happiness_score/2017.csv'),
    (happiness_2018_df, "Score", 'happiness_score/2018.csv'),
    (happiness_2019_df, "Score", 'happiness_score/2019.csv'),
):
    _frame.rename(columns={_old_name: "Happiness Score"}, inplace=True)
    _frame.to_csv(_csv_path, index=False)

In [114]:
# Inspect the 2015 column names; they include "Country" and "Happiness Score".
happiness_2015_df.columns

Index(['Country', 'Region', 'Happiness Rank', 'Happiness Score',
       'Standard Error', 'Economy (GDP per Capita)', 'Family',
       'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)',
       'Generosity', 'Dystopia Residual'],
      dtype='object')

In [94]:
# Give the 2018 and 2019 country column the common name "Country", then write
# each frame back to its CSV so the file on disk carries the new header.
for _frame, _csv_path in (
    (happiness_2018_df, 'happiness_score/2018.csv'),
    (happiness_2019_df, 'happiness_score/2019.csv'),
):
    _frame.rename(columns={"Country or region": "Country"}, inplace=True)
    _frame.to_csv(_csv_path, index=False)

Functions for creating DataFrames of GDP trends

In [95]:
def merge_happiness(gdp_growth_df, happiness_df):
    """Left-join a happiness table onto the GDP growth table by country.

    Every row of ``gdp_growth_df`` is kept. Columns of ``happiness_df`` are
    attached where the "Country" values match and are left missing otherwise.

    Returns:
        A new DataFrame holding the joined result.
    """
    join_keys = ["Country"]
    joined = gdp_growth_df.merge(happiness_df, how="left", on=join_keys)
    return joined

In [107]:
def GDP_happiness_df(year1, year2, merged_df):
    """Reduce a merged table to country, happiness score and a window of years.

    A column counts as a year column when its name consists only of digits
    and its integer value lies between ``year1`` and ``year2``, both inclusive.

    Returns:
        A new DataFrame with "Country", "Happiness Score" and the selected
        year columns, with every row that has a missing value removed.
    """
    year_cols = []
    for name in merged_df.columns:
        if name.isdigit() and year1 <= int(name) <= year2:
            year_cols.append(name)

    wanted = ["Country", "Happiness Score"] + year_cols
    return merged_df[wanted].dropna()

In [108]:
def calc_trend_growth(merged_df, year1, year2):
    """Add the mean of the year columns in ``[year1, year2]`` as a trend column.

    A column counts as a year column when its name consists only of digits
    and its integer value lies between ``year1`` and ``year2``, both inclusive.
    The row-wise mean of those columns (missing values skipped) is stored in
    'GDP Growth Trend Over 10 Years'. The column name is fixed; the actual
    window width is whatever ``year1`` and ``year2`` select.

    Args:
        merged_df: DataFrame with string-named year columns. It is not modified.
        year1: First year of the window (inclusive).
        year2: Last year of the window (inclusive).

    Returns:
        A copy of ``merged_df`` with the trend column added.
    """
    year_cols = [
        col for col in merged_df.columns
        if col.isdigit() and year1 <= int(col) <= year2
    ]
    # Work on a copy so the caller's frame is left untouched and no
    # SettingWithCopyWarning is raised when the input is a slice of another frame.
    result = merged_df.copy()
    result['GDP Growth Trend Over 10 Years'] = result[year_cols].mean(axis=1, skipna=True)
    return result

Create the DataFrames that contain the GDP trends:

In [112]:
# For each happiness year, join that year's scores onto the GDP table, keep the
# growth columns from ten years earlier through that year (both ends
# inclusive), and append the frame with its trend column to the result list.
merged_2015_to_2019_dfs = []
for year in range(2015, 2020):
    window_start = year - 10
    happiness_df = pd.read_csv(f'happiness_score/{year}.csv')
    merged_df = calc_trend_growth(
        GDP_happiness_df(
            window_start,
            year,
            merge_happiness(gdp_growth_df, happiness_df),
        ),
        window_start,
        year,
    )
    merged_2015_to_2019_dfs.append(merged_df)

In [113]:
# Preview the first rows of the first merged frame (built for happiness year 2015).
merged_2015_to_2019_dfs[0].head()

Unnamed: 0,Country,Happiness Score,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,GDP Growth Trend Over 10 Years
2,Afghanistan,3.575,11.229715,5.357403,13.82632,3.924984,21.390528,14.362441,0.426355,12.752287,5.600745,2.724543,1.451315,8.458785
4,Angola,4.033,15.028915,11.547683,14.010018,11.166138,0.858713,4.403933,3.471976,8.542188,4.954545,4.822628,0.943572,7.250028
5,Albania,4.959,5.526424,5.902659,5.98326,7.500041,3.354289,3.706938,2.545406,1.417243,1.002018,1.774449,2.218726,3.721041
8,United Arab Emirates,6.901,4.855141,9.83732,3.18439,3.191836,-5.242922,1.60285,6.928509,4.483792,5.053078,4.410085,5.060335,3.942219
9,Argentina,6.574,8.85166,8.047152,9.007651,4.057233,-5.918525,10.125398,6.003952,-1.02642,2.405324,-2.512615,2.73116,3.797452


In [100]:
#!pip install statsmodels

In [101]:
import statsmodels.api as sm

def rank_confounders(df, treatment, outcome, confounders):
    """Rank candidate confounders by how much each one raises the OLS R².

    For every candidate, two ordinary-least-squares models (with an intercept)
    are compared: ``outcome ~ treatment`` and ``outcome ~ treatment + candidate``.
    The candidate's score is the R² of the second model minus the R² of the
    first.

    Args:
        df: DataFrame holding the treatment, outcome and candidate columns.
        treatment: Name of the treatment column.
        outcome: Name of the outcome column.
        confounders: Iterable of candidate column names.

    Returns:
        A DataFrame with columns "Confounder" and "R² Change", sorted by
        "R² Change" in descending order. It is empty when ``confounders``
        is empty.
    """
    confounders = list(confounders)
    rankings = []

    if confounders:
        y = df[outcome]
        # The treatment-only model does not depend on the candidate, so it is
        # fitted once here instead of once per loop iteration.
        r2_without = sm.OLS(y, sm.add_constant(df[[treatment]])).fit().rsquared

        for confounder in confounders:
            # Model with the candidate added next to the treatment.
            model_with = sm.OLS(y, sm.add_constant(df[[treatment, confounder]])).fit()
            rankings.append((confounder, model_with.rsquared - r2_without))

    rankings.sort(key=lambda x: x[1], reverse=True)
    return pd.DataFrame(rankings, columns=["Confounder", "R² Change"])