In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import warnings
warnings.filterwarnings('ignore')

### <font color='blue'> OECD Data </font>

### Import Data

In [2]:
# to make a list of csvs titles that will be nested together
# dl = download

#to read csv of data

## data source: OECD
dl_avg_annual_hrs_worked_df = pd.read_csv('../data/OECD_Avg_annual_hours_worked_per_worker_original_file.csv')
dl_avg_annual_wages_df = pd.read_csv('../data/OECD_Average_annual_wages_original_file.csv')
dl_causes_of_mortality_df = pd.read_csv('../data/OECD_Causes_of_mortality_original_file.csv')
# dl_composite_leading_indicators_df = pd.read_csv('../data/OECD_Compsite_Leading_Indicators_original_file.csv')


### High-level Data Cleaning & Aggregation

In [3]:
## this dataset has two layer of filtering
dl_causes_of_mortality_df = dl_causes_of_mortality_df.loc[dl_causes_of_mortality_df['Measure'] == 'Number of total deaths']
dl_causes_of_mortality_df.head(10)

Unnamed: 0,VAR,Variable,UNIT,Measure,COU,Country,YEA,Year,Value,Flag Codes,Flags
154249,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2000,2000,128784.0,,
154250,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2001,2001,128657.0,,
154251,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2002,2002,133047.0,,
154252,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2003,2003,131784.0,,
154253,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2004,2004,132314.0,,
154254,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2006,2006,133739.0,,
154255,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2007,2007,137854.0,,
154256,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2008,2008,143946.0,,
154257,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2009,2009,140760.0,,
154258,CICDALLC,All causes of death,NBPOPUPC,Number of total deaths,AUS,Australia,2010,2010,143473.0,,


In [4]:
# to rename the 'value' column to indicate the content

# list of column headers by df 
value_cols = ['Avg. Work Hours (Annual)','Avg. Wages (Annual)','Mortality Causes','CLI Values (Monthly)']
# dict of dataframes 
oecd_dfs = {0: dl_avg_annual_hrs_worked_df, 
            1: dl_avg_annual_wages_df,
            2: dl_causes_of_mortality_df
#             3: dl_composite_leading_indicators_df
           }

## lists for editing dataframes
list_of_cols_to_drop = ["TIME",
                        "YEA",
                        "Unit Code",
                        "EMPSTAT",
                        "Frequency",
                        "FREQUENCY",
                        "Measure",
                        "PowerCode Code",
                        "PowerCode",
                        "SERIES",
                        "Reference Period Code",
                        "Reference Period",
                        "Flag Codes",
                        "SUBJECT",
                        "Reference",
                        "VAR",
                        "Flags"]


In [5]:
i = 0

# iterate through dataframes 
while i < len(oecd_dfs):
    # rename columns per dataframe
    ## all edited dataframes are now stored in oecd_dfs object 
    oecd_dfs[i] = oecd_dfs[i].rename(columns= {"COUNTRY":"COU",
                                 "LOCATION":"COU",
                                 "UNIT":"Unit",
                                 "Currency": "Unit",
                                 "Time":"Year",
                                 "Variable":"Description", 
                                 "Subject":"Description",
                                 "Employment status":"Description",
                                 "Series":"Description"
                                 })

    # change datat type of year col
    oecd_dfs[i]['Year'] = pd.to_numeric(oecd_dfs[i]['Year'])
    # to add a dataset col
    oecd_dfs[i]["Dataset"] = value_cols[i]
    
    # drop columns per dataframe
    for item in list_of_cols_to_drop: 
        # test if column exists in dataframe
        if item in oecd_dfs[i].columns:
            oecd_dfs[i] = oecd_dfs[i].drop(columns= item)
        else:
            continue

    
    # repostion columns in dataframe
    # get a list of of columns in the dataframe
    print(f"before columns {list(oecd_dfs[i].columns)}")
    
    # assign to df
    oecd_dfs[i] = oecd_dfs[i][["Dataset","COU","Country","Year","Description","Value","Unit"]]
    
    
    print(f"Completed df {i} of 3")
    print(f"final columns {list(oecd_dfs[i].columns)}")
    print("=========================")   
    
    
    i += 1

before columns ['COU', 'Country', 'Description', 'Year', 'Unit', 'Value', 'Dataset']
Completed df 0 of 3
final columns ['Dataset', 'COU', 'Country', 'Year', 'Description', 'Value', 'Unit']
before columns ['COU', 'Country', 'Description', 'Year', 'Unit', 'Value', 'Dataset']
Completed df 1 of 3
final columns ['Dataset', 'COU', 'Country', 'Year', 'Description', 'Value', 'Unit']
before columns ['Description', 'Unit', 'COU', 'Country', 'Year', 'Value', 'Dataset']
Completed df 2 of 3
final columns ['Dataset', 'COU', 'Country', 'Year', 'Description', 'Value', 'Unit']


In [6]:
# to concatenate dataframes
oecd_df = pd.concat([oecd_dfs[0],oecd_dfs[1],oecd_dfs[2]],ignore_index=True)
print(f"Combined all df")
print("=========================")

Combined all df


In [7]:
oecd_df = oecd_df.dropna(axis=0, how='any')

In [8]:
oecd_df.to_csv('../resources/OECD_Dataframes.csv', sep=',' , encoding= 'utf-8', index=False)

In [9]:
oecd_df.tail()

Unnamed: 0,Dataset,COU,Country,Year,Description,Value,Unit
36313,Mortality Causes,CRI,Costa Rica,2007,Drug use disorders,3.0,NBPOPUPC
36314,Mortality Causes,CRI,Costa Rica,2008,Drug use disorders,2.0,NBPOPUPC
36315,Mortality Causes,CRI,Costa Rica,2010,Drug use disorders,1.0,NBPOPUPC
36316,Mortality Causes,CRI,Costa Rica,2013,Drug use disorders,3.0,NBPOPUPC
36317,Mortality Causes,CRI,Costa Rica,2014,Drug use disorders,1.0,NBPOPUPC


### OECD  Merge Ready to Happiness Report

In [10]:
descriptions = ['Total employment','In 2018 constant prices at 2018 USD PPPs','All causes of death']

In [11]:
# to remove monthly report
oecd_yearly_reports_df = oecd_df.loc[oecd_df['Dataset'] != 'CLI Values (Monthly)']

print(descriptions)

# to filter the datasets by the descriptions
oecd_yearly_reports_df = oecd_yearly_reports_df.loc[oecd_df['Description'].isin(descriptions)].reset_index(drop=True)
oecd_yearly_reports_df.tail(40)

['Total employment', 'In 2018 constant prices at 2018 USD PPPs', 'All causes of death']


Unnamed: 0,Dataset,COU,Country,Year,Description,Value,Unit
2024,Mortality Causes,CRI,Costa Rica,2009,All causes of death,16652.0,NBPOPUPC
2025,Mortality Causes,CRI,Costa Rica,2010,All causes of death,18988.0,NBPOPUPC
2026,Mortality Causes,CRI,Costa Rica,2011,All causes of death,18458.0,NBPOPUPC
2027,Mortality Causes,CRI,Costa Rica,2012,All causes of death,18913.0,NBPOPUPC
2028,Mortality Causes,CRI,Costa Rica,2013,All causes of death,19365.0,NBPOPUPC
2029,Mortality Causes,CRI,Costa Rica,2014,All causes of death,20290.0,NBPOPUPC
2030,Mortality Causes,COL,Colombia,2000,All causes of death,187246.0,NBPOPUPC
2031,Mortality Causes,COL,Colombia,2001,All causes of death,191350.0,NBPOPUPC
2032,Mortality Causes,COL,Colombia,2002,All causes of death,192030.0,NBPOPUPC
2033,Mortality Causes,COL,Colombia,2003,All causes of death,189770.0,NBPOPUPC


### Reshaping Data Tables for Machine Learning

In [12]:
## show that the same amount of countries in the description .value_counts(sort=True)
oecd_dfs[0].loc[:,['Description',"COU"]].groupby("Description")['COU'].nunique().sort_values()

Description
Dependent employment    32
Total employment        39
Name: COU, dtype: int64

In [13]:
i = 0

actual_descriptions = ['Total employment','In 2018 constant prices at 2018 USD PPPs', 'All causes of death','Amplitude adjusted (CLI)']

while i < len(oecd_dfs):
    # show that the same amount of countries in the description
    results =  oecd_dfs[i].loc[:,['Description',"COU"]].groupby("Description")['COU'].nunique().sort_values(ascending=False)
    actual = results.filter(items = [actual_descriptions[i]])
    
    print('=========')
    print("Actual Description Used:")
    print(actual)
    print('-------------------')
    print("Unqiue COU by Description:")
    print(results)
    
    i+= 1

Actual Description Used:
Description
Total employment    39
Name: COU, dtype: int64
-------------------
Unqiue COU by Description:
Description
Total employment        39
Dependent employment    32
Name: COU, dtype: int64
Actual Description Used:
Description
In 2018 constant prices at 2018 USD PPPs    35
Name: COU, dtype: int64
-------------------
Unqiue COU by Description:
Description
In 2018 constant prices at 2018 USD PPPs    35
Current prices in NCU                       35
2018 constant prices and NCU                35
Name: COU, dtype: int64
Actual Description Used:
Description
All causes of death    41
Name: COU, dtype: int64
-------------------
Unqiue COU by Description:
Description
Tuberculosis                                                    41
Diseases of the blood and blood-forming organs                  41
Endocrine, nutritional and metabolic diseases                   41
Transport Accidents                                             41
Diseases of the respiratory syste

### Consolidating the OECD tables into a Pivot Table

In [14]:
# to make a list of years with a numeric data type for filtering tables
years = [*range(2010,2018,1)]
years

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]

In [15]:
len(oecd_dfs)-2
oecd_dfs[2].head()

Unnamed: 0,Dataset,COU,Country,Year,Description,Value,Unit
154249,Mortality Causes,AUS,Australia,2000,All causes of death,128784.0,NBPOPUPC
154250,Mortality Causes,AUS,Australia,2001,All causes of death,128657.0,NBPOPUPC
154251,Mortality Causes,AUS,Australia,2002,All causes of death,133047.0,NBPOPUPC
154252,Mortality Causes,AUS,Australia,2003,All causes of death,131784.0,NBPOPUPC
154253,Mortality Causes,AUS,Australia,2004,All causes of death,132314.0,NBPOPUPC


#### To Consolidate Pivot Tables

In [16]:
OECD_pivot_tables=[]

## CLI not included because it is an annual report

for dfs in range(len(oecd_dfs)):
    for year in range(len(years)):
        
        # to filter dataframes by description value
        df = oecd_dfs[dfs].loc[(oecd_dfs[dfs]['Year'] == years[year]) & (oecd_dfs[dfs]['Description'] == descriptions[dfs])]
        
        
        # to reshape the data
        oecd = pd.pivot_table(df, values='Value', index=['COU','Country'], columns=['Dataset'], aggfunc= np.sum)
    
        # data cleaning
        oecd_tables = oecd.dropna(axis=0).reset_index()
    
        # append to list
        OECD_pivot_tables.append(oecd_tables)
    
        print(f"Completed {dfs} Dataframe for year: {years[year]}")

Completed 0 Dataframe for year: 2010
Completed 0 Dataframe for year: 2011
Completed 0 Dataframe for year: 2012
Completed 0 Dataframe for year: 2013
Completed 0 Dataframe for year: 2014
Completed 0 Dataframe for year: 2015
Completed 0 Dataframe for year: 2016
Completed 0 Dataframe for year: 2017
Completed 1 Dataframe for year: 2010
Completed 1 Dataframe for year: 2011
Completed 1 Dataframe for year: 2012
Completed 1 Dataframe for year: 2013
Completed 1 Dataframe for year: 2014
Completed 1 Dataframe for year: 2015
Completed 1 Dataframe for year: 2016
Completed 1 Dataframe for year: 2017
Completed 2 Dataframe for year: 2010
Completed 2 Dataframe for year: 2011
Completed 2 Dataframe for year: 2012
Completed 2 Dataframe for year: 2013
Completed 2 Dataframe for year: 2014
Completed 2 Dataframe for year: 2015
Completed 2 Dataframe for year: 2016
Completed 2 Dataframe for year: 2017


In [17]:
# to review pivot tables
OECD_pivot_tables[23].head(10)

Dataset,COU,Country,Mortality Causes
0,AUT,Austria,83270.0
1,CZE,Czech Republic,111443.0
2,HUN,Hungary,131674.0
3,ISL,Iceland,2236.0
4,LTU,Lithuania,40142.0


In [18]:
# to slice list of pivot tables by year

dataframes = [*range(len(OECD_pivot_tables))]
print(dataframes)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]


In [19]:
def slice_per(source, step):
    return [source[i::step] for i in range(step)]

yearly_df = slice_per(dataframes,8)
yearly_df

[[0, 8, 16],
 [1, 9, 17],
 [2, 10, 18],
 [3, 11, 19],
 [4, 12, 20],
 [5, 13, 21],
 [6, 14, 22],
 [7, 15, 23]]

#### Join Pivot Tables for Machine Learning

In [20]:
# to inner join the pivot tables per the indexes
oecd_ml_df = []


try:
    for index in yearly_df:            
        # to merge 3 pivot tables per dataset column
        oecd_df_part1 = pd.merge(OECD_pivot_tables[index[0]], OECD_pivot_tables[index[(0+1)]], on=['COU','Country'], how='inner')

        # to merge 3 pivot tables per dataset column
        oecd_df_part2 = pd.merge(oecd_df_part1, OECD_pivot_tables[index[(0+2)]], on=['COU','Country'], how='inner')

        print(f"Completed merge of pivot table {index[0]} & {(index[0+1])} & {(index[0+2])}")

        oecd_ml_df.append(oecd_df_part2)
        print(f"Pivot Table created of indexes:{index}")
                   
except IndexError:
    pass

Completed merge of pivot table 0 & 8 & 16
Pivot Table created of indexes:[0, 8, 16]
Completed merge of pivot table 1 & 9 & 17
Pivot Table created of indexes:[1, 9, 17]
Completed merge of pivot table 2 & 10 & 18
Pivot Table created of indexes:[2, 10, 18]
Completed merge of pivot table 3 & 11 & 19
Pivot Table created of indexes:[3, 11, 19]
Completed merge of pivot table 4 & 12 & 20
Pivot Table created of indexes:[4, 12, 20]
Completed merge of pivot table 5 & 13 & 21
Pivot Table created of indexes:[5, 13, 21]
Completed merge of pivot table 6 & 14 & 22
Pivot Table created of indexes:[6, 14, 22]
Completed merge of pivot table 7 & 15 & 23
Pivot Table created of indexes:[7, 15, 23]


In [21]:
oecd_ml_df[4].tail()

Dataset,COU,Country,Avg. Work Hours (Annual),Avg. Wages (Annual),Mortality Causes
30,PRT,Portugal,1714.0,25360.07583,105219.0
31,SVK,Slovak Republic,1760.0,22510.86712,51345.0
32,SVN,Slovenia,1681.9,34282.48437,18886.0
33,SWE,Sweden,1470.0,42444.08578,89062.0
34,USA,United States,1784.0,60617.97202,2626418.0


### <font color = 'green'> World Bank Data </font>

#### Importing Data

In [22]:
#to read csv of data
df= pd.read_csv('../../../WDIData.csv')

#to read csv of country names
df_country_names= pd.read_csv('../data/WDI_Country_Code_and_Names.csv',encoding = "ISO-8859-1")


#to read csv of indicators
df_list_indicators = pd.read_csv('../data/WDI_list_of_reviewed_indicators.csv')

#### Data Cleaning

In [23]:
#to grab the headers of the dataset
df_headers=list(df.columns.values)

#only pull the headers that are years
df_years = df_headers[-59:]

#only select previous years before 2010
df_years_drop = df_years[0:50]
#to make column '2018' a list
df_years_drop_2018 = list([df_years[-1]])

# to combine lists of years into 1 drop line
df_drop = df_years_drop + df_years_drop_2018

#to drop the years and create a summarized df
df_columns_removed = df.drop(df_drop,axis=1)
df_columns_removed.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2010,2011,2012,2013,2014,2015,2016,2017
0,Arab World,ARB,"2005 PPP conversion factor, GDP (LCU per inter...",PA.NUS.PPP.05,,,,,,,,
1,Arab World,ARB,"2005 PPP conversion factor, private consumptio...",PA.NUS.PRVT.PP.05,,,,,,,,
2,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,82.407647,82.827636,83.169227,83.587141,83.954293,84.23063,84.570425,
3,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,86.136134,86.782683,87.288244,88.389705,88.076774,88.517967,88.768654,
4,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,74.543489,75.770972,76.772916,78.839139,77.487377,78.564439,78.95878,


Assigning Country Code

In [24]:
#to preview df_country_names
df_country_names.head()

#make a list of country short name
country_short_names = list(df_country_names['Short Name'])

#filter df by short names list
df_filter_cols_nd_cols_rem = df_columns_removed.loc[df_columns_removed['Country Name'].isin(country_short_names)]

In [25]:
df_filter_cols_nd_cols_rem.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2010,2011,2012,2013,2014,2015,2016,2017
75153,Afghanistan,AFG,"2005 PPP conversion factor, GDP (LCU per inter...",PA.NUS.PPP.05,,,,,,,,
75154,Afghanistan,AFG,"2005 PPP conversion factor, private consumptio...",PA.NUS.PRVT.PP.05,,,,,,,,
75155,Afghanistan,AFG,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,20.68,22.33,24.08,26.17,27.99,30.1,32.44,
75156,Afghanistan,AFG,Access to electricity (% of population),EG.ELC.ACCS.ZS,42.7,43.222019,69.1,67.259552,89.5,71.5,84.137138,
75157,Afghanistan,AFG,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,32.4,33.38011,63.8,58.423667,87.8,64.2,78.961074,


Filter indicators

In [26]:
df_list_indicators.head()

Unnamed: 0,Indicator Name
0,GDP (constant 2010 US$)
1,Population density (people per sq. km of land ...
2,"Literacy rate, adult total (% of people ages 1..."
3,"Probability of dying at age 5-14 years (per 1,..."
4,GINI index (World Bank estimate)


In [27]:
indicators =  list(df_list_indicators['Indicator Name'])
print(indicators)

#filter by indicators
df_countries_indiciator = df_filter_cols_nd_cols_rem.loc[df_filter_cols_nd_cols_rem['Indicator Name'].isin(indicators)]

#remove unnessecary columns
df_wdi = df_countries_indiciator.drop(columns='Indicator Code')

['GDP (constant 2010 US$)', 'Population density (people per sq. km of land area)', 'Literacy rate, adult total (% of people ages 15 and above)', 'Probability of dying at age 5-14 years (per 1,000 children age 5)', 'GINI index (World Bank estimate)']


#### Reshaping Data

In [28]:
df_wdi.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,2010,2011,2012,2013,2014,2015,2016,2017
75666,Afghanistan,AFG,GDP (constant 2010 US$),15856570000.0,15924180000.0,17954880000.0,18960480000.0,19477070000.0,19759740000.0,20206380000.0,20744940000.0
75693,Afghanistan,AFG,GINI index (World Bank estimate),,,,,,,,
75928,Afghanistan,AFG,"Literacy rate, adult total (% of people ages 1...",,31.74112,,,,,,
76284,Afghanistan,AFG,Population density (people per sq. km of land ...,44.11844,45.50531,47.0192,48.60412,50.17618,51.67493,53.08341,54.42221
76438,Afghanistan,AFG,"Probability of dying at age 5-14 years (per 1,...",11.5,,,,,10.4,,9.9


In [29]:
years = list(df_wdi.iloc[:,3:].columns)
numeric_years = [int(i) for i in years]

In [30]:
i = 0
world_bank_indicators = []


while i < len(years):
    
    # to create a df of series
    wdi_data = df_wdi[['Country Code','Country Name', 'Indicator Name', years[i]]]
    wdi_data = wdi_data.rename(columns={'Country Code':'COU','Country Name':'Country'})
    
    
    # to reshape the data
    wdi = pd.pivot_table(wdi_data, values=years[i], index=['COU','Country'], columns=['Indicator Name'], aggfunc= np.sum)
    wdi.reset_index()
    
    # data cleaning
    wdi_df = wdi.dropna(axis=0, how='all').reset_index()    
    
    # append to list
    world_bank_indicators.append(wdi_df)
    
    # to create csv
    world_bank_indicators[i].to_csv(f'../../../WDI_Data_Happiness_Table_{years[i]}.csv', sep=',' , encoding= 'utf-8', index=False)
    
    print(f"Complete Dataframe for year: {years[i]}")

    i += 1

Complete Dataframe for year: 2010
Complete Dataframe for year: 2011
Complete Dataframe for year: 2012
Complete Dataframe for year: 2013
Complete Dataframe for year: 2014
Complete Dataframe for year: 2015
Complete Dataframe for year: 2016
Complete Dataframe for year: 2017


In [31]:
world_bank_indicators[6].head()

Indicator Name,COU,Country,GDP (constant 2010 US$),GINI index (World Bank estimate),"Literacy rate, adult total (% of people ages 15 and above)",Population density (people per sq. km of land area),"Probability of dying at age 5-14 years (per 1,000 children age 5)"
0,ABW,Aruba,2546020000.0,0.0,0.0,582.344444,0.0
1,AFG,Afghanistan,20206380000.0,0.0,0.0,53.083405,0.0
2,AGO,Angola,101823000000.0,0.0,0.0,23.111786,0.0
3,ALB,Albania,13470270000.0,0.0,0.0,104.96719,0.0
4,AND,Andorra,3319880000.0,0.0,0.0,164.42766,0.0


In [32]:
world_bank_indicators[4].dtypes

Indicator Name
COU                                                                   object
Country                                                               object
GDP (constant 2010 US$)                                              float64
GINI index (World Bank estimate)                                     float64
Literacy rate, adult total (% of people ages 15 and above)           float64
Population density (people per sq. km of land area)                  float64
Probability of dying at age 5-14 years (per 1,000 children age 5)    float64
dtype: object

### <font color='red'> To Combine both datasets </font>

In [33]:
print(len(world_bank_indicators))
print(len(oecd_ml_df))

## Both datasets are the same length, thus we can merge them to each other 

8
8


OECD data has less rows, thus we will perform an inner merge between the World Bank Data and the OECD data.

In [34]:
i = 0 

ml_datasets = []

while i < len(oecd_ml_df):
    
    # to merge 3 pivot tables per dataset column
    ml_df = pd.merge(oecd_ml_df[i], world_bank_indicators[i], on=['COU','Country'], how='inner')
    ml_df['Year'] = numeric_years[i]
    ml_datasets.append(ml_df)
    
    # to create csv
    ml_datasets[i].to_csv(f'../resources/Machine_Learning_Dataset_{years[i]}.csv', sep=',' , encoding= 'utf-8', index=False)
    
    print(f'Completed dataset {i} of 7')
    
    i +=1
    

Completed dataset 0 of 7
Completed dataset 1 of 7
Completed dataset 2 of 7
Completed dataset 3 of 7
Completed dataset 4 of 7
Completed dataset 5 of 7
Completed dataset 6 of 7
Completed dataset 7 of 7


In [35]:
ml_datasets[0].head()

Unnamed: 0,COU,Country,Avg. Work Hours (Annual),Avg. Wages (Annual),Mortality Causes,GDP (constant 2010 US$),GINI index (World Bank estimate),"Literacy rate, adult total (% of people ages 15 and above)",Population density (people per sq. km of land area),"Probability of dying at age 5-14 years (per 1,000 children age 5)",Year
0,AUS,Australia,1699.94,52252.97944,143473.0,1144260000000.0,34.7,0.0,2.867859,1.0,2010
1,AUT,Austria,1557.0,50096.71444,77199.0,391893000000.0,30.3,0.0,101.28743,1.0,2010
2,BEL,Belgium,1546.0,50872.487,105152.0,483548000000.0,28.4,0.0,359.827807,1.0,2010
3,CAN,Canada,1715.0,45502.56024,240075.0,1613460000000.0,33.6,0.0,3.73951,1.1,2010
4,CHE,Switzerland,1624.3,62002.87617,62649.0,583783000000.0,32.6,0.0,198.018747,0.9,2010


### <font color='magenta'> UN Happiness Report </font>

In [36]:
#to read csv un happiness data names
un_happiness_report = pd.read_csv('../data/world_happiness_report_2019_original_file.csv',encoding = "ISO-8859-1")

In [37]:
# years is from earlier in the report
years
numeric_years

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]

In [38]:
un_happiness_report = un_happiness_report.rename(columns={'Country name':'Country'})

In [39]:
un_happiness_report.head()

Unnamed: 0,COU,Country,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,...,GINI index (World Bank estimate),"GINI index (World Bank estimate), average 2000-16","gini of household income reported in Gallup, by wp5-year","Most people can be trusted, Gallup","Most people can be trusted, WVS round 1981-1984","Most people can be trusted, WVS round 1989-1993","Most people can be trusted, WVS round 1994-1998","Most people can be trusted, WVS round 1999-2004","Most people can be trusted, WVS round 2005-2009","Most people can be trusted, WVS round 2010-2014"
0,AFG,Afghanistan,2008,3.72359,7.16869,0.450662,50.799999,0.718114,0.177889,0.881686,...,,,,,,,,,,
1,AFG,Afghanistan,2009,4.401778,7.33379,0.552308,51.200001,0.678896,0.200178,0.850035,...,,,0.441906,0.286315,,,,,,
2,AFG,Afghanistan,2010,4.758381,7.386629,0.539075,51.599998,0.600127,0.134353,0.706766,...,,,0.327318,0.275833,,,,,,
3,AFG,Afghanistan,2011,3.831719,7.415019,0.521104,51.919998,0.495901,0.172137,0.731109,...,,,0.336764,,,,,,,
4,AFG,Afghanistan,2012,3.782938,7.517126,0.520637,52.240002,0.530935,0.244273,0.77562,...,,,0.34454,,,,,,,


In [40]:
i = 0

happiness_datasets = []

while i < len(years):
    
    # to filter by year to fit other datasets 2010 to 2017
    un_happiness_report_merge = un_happiness_report.loc[un_happiness_report['Year'] == numeric_years[i]]
    happiness_merge_ready = un_happiness_report_merge.drop(columns=['Year'])
    
    
    # to merge 3 pivot tables per dataset column
    life_ml_df = pd.merge(ml_datasets[i], happiness_merge_ready, on=['COU','Country'], how='inner')
    
    happiness_datasets.append(life_ml_df)
    
    # to create csv
    happiness_datasets[i].to_csv(f'../resources/Machine_Learning_Dataset_{years[i]}.csv', sep=',' , encoding= 'utf-8', index=False)
    
    
    print(f"Merged UN Year {years[i]} with OECD & WDI Data")
    
    i+=1
    

Merged UN Year 2010 with OECD & WDI Data
Merged UN Year 2011 with OECD & WDI Data
Merged UN Year 2012 with OECD & WDI Data
Merged UN Year 2013 with OECD & WDI Data
Merged UN Year 2014 with OECD & WDI Data
Merged UN Year 2015 with OECD & WDI Data
Merged UN Year 2016 with OECD & WDI Data
Merged UN Year 2017 with OECD & WDI Data


In [41]:
happiness_datasets[7]

Unnamed: 0,COU,Country,Avg. Work Hours (Annual),Avg. Wages (Annual),Mortality Causes,GDP (constant 2010 US$),GINI index (World Bank estimate)_x,"Literacy rate, adult total (% of people ages 15 and above)",Population density (people per sq. km of land area),"Probability of dying at age 5-14 years (per 1,000 children age 5)",...,GINI index (World Bank estimate)_y,"GINI index (World Bank estimate), average 2000-16","gini of household income reported in Gallup, by wp5-year","Most people can be trusted, Gallup","Most people can be trusted, WVS round 1981-1984","Most people can be trusted, WVS round 1989-1993","Most people can be trusted, WVS round 1994-1998","Most people can be trusted, WVS round 1999-2004","Most people can be trusted, WVS round 2005-2009","Most people can be trusted, WVS round 2010-2014"
0,AUT,Austria,1511.0,50851.39713,83270.0,432754000000.0,0.0,0.0,106.607445,0.8,...,,0.302692,0.322942,,,,,,,
1,CZE,Czech Republic,1784.0,25804.10436,111443.0,241074000000.0,0.0,0.0,137.198109,0.8,...,,0.264167,0.282946,,,0.302275,0.285192,,,
2,HUN,Hungary,1741.3,22492.67745,131674.0,153629000000.0,0.0,0.0,108.11848,1.0,...,,0.297917,0.330753,,0.331434,,0.224615,,0.288842,
3,ISL,Iceland,1483.0,64415.10845,2236.0,17512770000.0,0.0,0.0,3.425436,0.8,...,,0.280833,0.332438,,,,,,,
4,LTU,Lithuania,1608.0,25207.13875,40142.0,47626190000.0,0.0,0.0,45.151862,1.6,...,,0.355,0.428504,,,,0.213082,,,


In [42]:
happiness_datasets[7].dtypes

COU                                                                   object
Country                                                               object
Avg. Work Hours (Annual)                                             float64
Avg. Wages (Annual)                                                  float64
Mortality Causes                                                     float64
GDP (constant 2010 US$)                                              float64
GINI index (World Bank estimate)_x                                   float64
Literacy rate, adult total (% of people ages 15 and above)           float64
Population density (people per sq. km of land area)                  float64
Probability of dying at age 5-14 years (per 1,000 children age 5)    float64
Year                                                                   int64
Life Ladder                                                          float64
Log GDP per capita                                                   float64

### <font color = 'orange'> To Combine OECD Data with UN Happiness Report </font>

In [43]:
# to check to see if OECD has countries that UN report does not

oecd_cou_series= list(set(oecd_yearly_reports_df['COU']))
UN_cou_series= list(set(un_happiness_report['COU']))

print(f"OECD COU: {len(oecd_cou_series)}")
print(f"UN COU: {len(UN_cou_series)}")

same_cou = len(set(oecd_cou_series) & set(UN_cou_series))

print(f"Number countries that are the same {same_cou}")

OECD COU: 42
UN COU: 163
Number countries that are the same 41


In [44]:
un_happiness_report.dtypes

COU                                                          object
Country                                                      object
Year                                                          int64
Life Ladder                                                 float64
Log GDP per capita                                          float64
Social support                                              float64
Healthy life expectancy at birth                            float64
Freedom to make life choices                                float64
Generosity                                                  float64
Perceptions of corruption                                   float64
Positive affect                                             float64
Negative affect                                             float64
Confidence in national government                           float64
Democratic Quality                                          float64
Delivery Quality                                

In [45]:
oecd_yearly_reports_df.dtypes

Dataset         object
COU             object
Country         object
Year             int64
Description     object
Value          float64
Unit            object
dtype: object

In [46]:
oecd_yearly_reports_df.tail(40)

Unnamed: 0,Dataset,COU,Country,Year,Description,Value,Unit
2024,Mortality Causes,CRI,Costa Rica,2009,All causes of death,16652.0,NBPOPUPC
2025,Mortality Causes,CRI,Costa Rica,2010,All causes of death,18988.0,NBPOPUPC
2026,Mortality Causes,CRI,Costa Rica,2011,All causes of death,18458.0,NBPOPUPC
2027,Mortality Causes,CRI,Costa Rica,2012,All causes of death,18913.0,NBPOPUPC
2028,Mortality Causes,CRI,Costa Rica,2013,All causes of death,19365.0,NBPOPUPC
2029,Mortality Causes,CRI,Costa Rica,2014,All causes of death,20290.0,NBPOPUPC
2030,Mortality Causes,COL,Colombia,2000,All causes of death,187246.0,NBPOPUPC
2031,Mortality Causes,COL,Colombia,2001,All causes of death,191350.0,NBPOPUPC
2032,Mortality Causes,COL,Colombia,2002,All causes of death,192030.0,NBPOPUPC
2033,Mortality Causes,COL,Colombia,2003,All causes of death,189770.0,NBPOPUPC


In [47]:
# to merge OECD Data with UN Happiness Report
oecd_happiness_df = pd.merge(oecd_yearly_reports_df, un_happiness_report, on=['COU','Country','Year'], how='inner')
oecd_happiness_df['Year'] = pd.to_numeric(oecd_happiness_df['Year'])
oecd_happiness_df.to_csv('../resources/OECD_Happiness_Merge_Dataframes.csv', sep=',' , encoding= 'utf-8', index=False)

In [48]:
oecd_happiness_df.dtypes

Dataset                                                      object
COU                                                          object
Country                                                      object
Year                                                          int64
Description                                                  object
Value                                                       float64
Unit                                                         object
Life Ladder                                                 float64
Log GDP per capita                                          float64
Social support                                              float64
Healthy life expectancy at birth                            float64
Freedom to make life choices                                float64
Generosity                                                  float64
Perceptions of corruption                                   float64
Positive affect                                 

In [49]:
oecd_happiness_df.groupby('Country')['Year'].unique()

Country
Australia         [2005, 2007, 2008, 2010, 2011, 2012, 2013, 201...
Austria           [2006, 2008, 2010, 2011, 2012, 2013, 2014, 201...
Belgium           [2005, 2007, 2008, 2010, 2011, 2012, 2013, 201...
Brazil            [2005, 2007, 2008, 2009, 2010, 2011, 2012, 201...
Canada            [2005, 2007, 2008, 2009, 2010, 2011, 2012, 201...
Chile             [2006, 2007, 2008, 2009, 2010, 2011, 2012, 201...
Colombia          [2006, 2007, 2008, 2009, 2010, 2011, 2012, 201...
Costa Rica        [2006, 2007, 2008, 2009, 2010, 2011, 2012, 201...
Czech Republic    [2005, 2007, 2010, 2011, 2012, 2013, 2014, 201...
Denmark           [2005, 2007, 2008, 2009, 2010, 2011, 2012, 201...
Estonia           [2006, 2007, 2008, 2009, 2011, 2012, 2013, 201...
Finland           [2006, 2008, 2010, 2011, 2012, 2013, 2014, 201...
France            [2005, 2006, 2008, 2009, 2010, 2011, 2012, 201...
Germany           [2005, 2007, 2008, 2009, 2010, 2011, 2012, 201...
Greece            [2005, 2007, 2009, 201

In [50]:
oecd_happiness_df.tail(40)

Unnamed: 0,Dataset,COU,Country,Year,Description,Value,Unit,Life Ladder,Log GDP per capita,Social support,...,GINI index (World Bank estimate),"GINI index (World Bank estimate), average 2000-16","gini of household income reported in Gallup, by wp5-year","Most people can be trusted, Gallup","Most people can be trusted, WVS round 1981-1984","Most people can be trusted, WVS round 1989-1993","Most people can be trusted, WVS round 1994-1998","Most people can be trusted, WVS round 1999-2004","Most people can be trusted, WVS round 2005-2009","Most people can be trusted, WVS round 2010-2014"
1131,Mortality Causes,TUR,Turkey,2016,All causes of death,420124.0,NBPOPUPC,5.326222,10.075611,0.879995,...,0.419,0.4058,0.33758,,,0.098058,0.064664,0.187987,0.048552,0.118725
1132,Mortality Causes,BRA,Brazil,2005,All causes of death,1006375.0,NBPOPUPC,6.636771,9.41724,0.882923,...,0.563,0.547286,,,,0.06602,,,0.093811,0.070638
1133,Mortality Causes,BRA,Brazil,2007,All causes of death,1047309.0,NBPOPUPC,6.320673,9.493141,0.886402,...,0.549,0.547286,,,,0.06602,,,0.093811,0.070638
1134,Mortality Causes,BRA,Brazil,2008,All causes of death,1074686.0,NBPOPUPC,6.691425,9.532628,0.878108,...,0.54,0.547286,,,,0.06602,,,0.093811,0.070638
1135,Mortality Causes,BRA,Brazil,2009,All causes of death,1102614.0,NBPOPUPC,7.000832,9.521485,0.912818,...,0.537,0.547286,0.428937,0.134151,,0.06602,,,0.093811,0.070638
1136,Mortality Causes,BRA,Brazil,2010,All causes of death,1136514.0,NBPOPUPC,6.837331,9.584492,0.905528,...,,0.547286,0.430274,0.153934,,0.06602,,,0.093811,0.070638
1137,Mortality Causes,BRA,Brazil,2011,All causes of death,1169966.0,NBPOPUPC,7.037817,9.614011,0.916253,...,0.529,0.547286,0.410126,,,0.06602,,,0.093811,0.070638
1138,Mortality Causes,BRA,Brazil,2012,All causes of death,1180628.0,NBPOPUPC,6.660004,9.623768,0.890314,...,0.526,0.547286,0.51713,,,0.06602,,,0.093811,0.070638
1139,Mortality Causes,BRA,Brazil,2013,All causes of death,1209863.0,NBPOPUPC,7.140283,9.644257,0.910422,...,0.528,0.547286,0.400359,,,0.06602,,,0.093811,0.070638
1140,Mortality Causes,BRA,Brazil,2014,All causes of death,1226284.0,NBPOPUPC,6.980999,9.64045,0.898316,...,0.515,0.547286,0.394136,,,0.06602,,,0.093811,0.070638
