In [1]:
## Dependencies ##

# Data 
import numpy as np
import pandas as pd

# Create Data
from sklearn.datasets import make_blobs


# Charting
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import graphviz 
import pydotplus


import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True)

import seaborn as sb

# Data Preprocessing
from sklearn.preprocessing import StandardScaler

# Linear Regression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Multi Linear Regression
from mpl_toolkits.mplot3d import Axes3D

# Lasso Model
from sklearn.linear_model import Lasso

# Ridge & Elastic Model
# Note: Use an alpha of .01 when creating the model
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

# Decision Trees
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
## dataset
from sklearn.datasets import load_iris



# warnings
import warnings
warnings.simplefilter('ignore')



In [2]:
# Load the happiness dataset; the path is relative to the notebook's working directory.
happiness_data = pd.read_csv(
    filepath_or_buffer='../resources/Complete_Happiness_Data.csv',
)

In [3]:
# Preview the last rows of the loaded frame.
happiness_data.tail()

Unnamed: 0,COU,Country,Year,Avg. Work Hours (Annual),Avg. Wages (Annual),GDP (constant 2010 US$),Population density (people per sq. km of land area),Life Ladder,Log GDP per capita,Social support,...,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,Standard deviation of ladder by country-year,Standard deviation/Mean of ladder by country-year,"gini of household income reported in Gallup, by wp5-year"
226,LVA,Latvia,2012,1738.0,18272.87632,26293020000.0,32.716613,5.125025,9.944049,0.851195,...,-0.040608,0.894979,0.560013,0.232225,0.19122,0.614571,0.721495,1.890645,0.368905,0.321799
227,LVA,Latvia,2013,1732.0,19151.96466,26931900000.0,32.362872,5.06977,9.978767,0.834023,...,-0.075946,0.836554,0.642102,0.227449,0.2338,0.682029,0.75729,1.719731,0.339213,0.30594
228,LVA,Latvia,2014,1741.0,20429.45208,27432360000.0,32.064683,5.729115,10.006597,0.881256,...,-0.046076,0.803688,0.652273,0.225979,0.229045,0.668352,0.854488,1.9629,0.342618,0.406468
229,LVA,Latvia,2016,1709.0,23591.68628,28830700000.0,31.513943,5.940446,10.075054,0.917074,...,-0.159757,0.86764,0.653751,0.231384,0.315261,0.660759,0.869054,1.70512,0.287036,0.384025
230,LVA,Latvia,2017,1695.0,24633.97485,30167430000.0,31.235896,5.977818,10.129182,0.895099,...,-0.15859,0.798378,0.623313,0.231753,0.264001,0.628388,0.881661,1.814803,0.30359,0.362185


In [4]:
# Inspect the dtype pandas inferred for each column.
happiness_data.dtypes

COU                                                          object
Country                                                      object
Year                                                          int64
Avg. Work Hours (Annual)                                    float64
Avg. Wages (Annual)                                         float64
GDP (constant 2010 US$)                                     float64
Population density (people per sq. km of land area)         float64
Life Ladder                                                 float64
Log GDP per capita                                          float64
Social support                                              float64
Healthy life expectancy at birth                            float64
Freedom to make life choices                                float64
Generosity                                                  float64
Perceptions of corruption                                   float64
Positive affect                                 

In [5]:
# Non-null value count per column (a quick check for missing data).
happiness_data.count()

COU                                                         231
Country                                                     231
Year                                                        231
Avg. Work Hours (Annual)                                    231
Avg. Wages (Annual)                                         231
GDP (constant 2010 US$)                                     231
Population density (people per sq. km of land area)         231
Life Ladder                                                 231
Log GDP per capita                                          231
Social support                                              231
Healthy life expectancy at birth                            231
Freedom to make life choices                                231
Generosity                                                  231
Perceptions of corruption                                   231
Positive affect                                             231
Negative affect                         

In [6]:
# Distinct values of the 'Year' column, in order of first appearance.
years = [*happiness_data.loc[:, 'Year'].unique()]

In [7]:
# Keep everything from column position 4 onward, then record those column names.
happiness_data_1 = happiness_data.iloc[:, slice(4, None)]
depedent_var = happiness_data_1.columns.tolist()

In [8]:
# Position of 'Life Ladder' within the sliced columns, wrapped in a list so it can
# be concatenated with the remaining positions later.
life_ladder = [depedent_var.index('Life Ladder')]
life_ladder

[3]

In [9]:
# Column labels of the sliced frame as a plain list, displayed for reference.
columns = happiness_data_1.columns.tolist()
columns

['Avg. Wages (Annual)',
 'GDP (constant 2010 US$)',
 'Population density (people per sq. km of land area)',
 'Life Ladder',
 'Log GDP per capita',
 'Social support',
 'Healthy life expectancy at birth',
 'Freedom to make life choices',
 'Generosity',
 'Perceptions of corruption',
 'Positive affect',
 'Negative affect',
 'Confidence in national government',
 'Democratic Quality',
 'Delivery Quality',
 'Standard deviation of ladder by country-year',
 'Standard deviation/Mean of ladder by country-year',
 'gini of household income reported in Gallup, by wp5-year']

In [10]:
# Positional index (0 .. n-1) for every column of the sliced frame.
items_in_list = happiness_data_1.shape[1]

index_of_columns = list(range(items_in_list))

In [11]:
# Drop the 'Life Ladder' position from the positional index.  The position comes
# from `life_ladder` rather than a hard-coded 3, and the list is rebuilt instead of
# mutated in place so that re-running this cell does not raise ValueError.
index_of_columns = [position for position in index_of_columns if position not in life_ladder]

In [12]:
# 'Life Ladder' position first, followed by every other column position.
column_arrangement = [*life_ladder, *index_of_columns]

In [13]:
# Show the resulting positional order.
column_arrangement

[3, 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [14]:
# Translate the positional order into the matching column labels.
column_order = [columns[position] for position in column_arrangement]
column_order

['Life Ladder',
 'Avg. Wages (Annual)',
 'GDP (constant 2010 US$)',
 'Population density (people per sq. km of land area)',
 'Log GDP per capita',
 'Social support',
 'Healthy life expectancy at birth',
 'Freedom to make life choices',
 'Generosity',
 'Perceptions of corruption',
 'Positive affect',
 'Negative affect',
 'Confidence in national government',
 'Democratic Quality',
 'Delivery Quality',
 'Standard deviation of ladder by country-year',
 'Standard deviation/Mean of ladder by country-year',
 'gini of household income reported in Gallup, by wp5-year']

In [15]:
# Reorder the frame's columns to follow `column_order`.
happiness_data_1 = happiness_data_1.loc[:, column_order]

### Data Analysis

In [16]:
# Pearson correlation matrix for each year, collected in the same order as `years`.
results = []

for year in years:
    # Rows for this year only.  reset_index() keeps the old index as an extra leading
    # column, so position 4 onward begins one column earlier here than it does on
    # `happiness_data` itself.
    is_this_year = happiness_data['Year'] == year
    annual_happiness_data = happiness_data.loc[is_this_year].reset_index()
    yearly_data = annual_happiness_data.iloc[:, 4:]

    # Keep every row of the matrix but only the ordered columns, minus the two
    # ladder-dispersion columns.
    corr_matrix = (
        yearly_data
        .corr(method='pearson')
        .loc[:, column_order]
        .drop(columns=[
            'Standard deviation of ladder by country-year',
            'Standard deviation/Mean of ladder by country-year',
        ])
    )

    results.append(corr_matrix)
    print(f"Completed year {year}")

Completed year 2010
Completed year 2011
Completed year 2012
Completed year 2013
Completed year 2014
Completed year 2016
Completed year 2017


In [17]:
# Correlation matrix for the fourth entry in `years`.
results[3]

Unnamed: 0,Life Ladder,Avg. Wages (Annual),GDP (constant 2010 US$),Population density (people per sq. km of land area),Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,"gini of household income reported in Gallup, by wp5-year"
Avg. Work Hours (Annual),-0.35177,-0.633037,0.047232,-0.21591,-0.703352,-0.64697,-0.076833,-0.562359,-0.366074,0.474857,-0.172443,0.485688,-0.532944,-0.734256,-0.670649,0.074241
Avg. Wages (Annual),0.695395,1.0,0.312365,0.24266,0.928908,0.613728,0.178235,0.649153,0.650605,-0.672509,0.515375,-0.426342,0.640131,0.613056,0.782058,0.340105
GDP (constant 2010 US$),0.125108,0.312365,1.0,0.044552,0.221957,0.095158,0.081371,0.022381,0.166148,0.04692,0.182325,-0.024883,-0.08379,-0.058061,0.052523,0.638913
Population density (people per sq. km of land area),0.090064,0.24266,0.044552,1.0,0.208105,0.09092,0.198045,0.038576,0.003895,0.002359,0.003634,0.119713,0.197156,-0.078453,0.062597,-0.245257
Life Ladder,1.0,0.695395,0.125108,0.090064,0.606104,0.517876,0.159234,0.724002,0.708992,-0.731761,0.659153,-0.538055,0.681317,0.369858,0.662075,0.341798
Log GDP per capita,0.606104,0.928908,0.221957,0.208105,1.0,0.626338,0.14803,0.575541,0.535622,-0.666295,0.345439,-0.450423,0.693424,0.679169,0.785042,0.225297
Social support,0.517876,0.613728,0.095158,0.09092,0.626338,1.0,0.106645,0.69385,0.582816,-0.433856,0.348891,-0.548597,0.392664,0.682809,0.70644,0.135031
Healthy life expectancy at birth,0.159234,0.178235,0.081371,0.198045,0.14803,0.106645,1.0,0.10064,0.236672,-0.02255,0.153246,0.092526,-0.159392,-0.010634,0.138534,0.073982
Freedom to make life choices,0.724002,0.649153,0.022381,0.038576,0.575541,0.69385,0.10064,1.0,0.687713,-0.707734,0.595853,-0.721348,0.59878,0.628732,0.804273,0.321321
Generosity,0.708992,0.650605,0.166148,0.003895,0.535622,0.582816,0.236672,0.687713,1.0,-0.550184,0.640219,-0.355646,0.41041,0.418259,0.681066,0.532821


In [19]:
# Build one small table per year holding each factor's correlation with
# 'Life Ladder' (the happiness index), its rank within that year, and the year.
yearly_corr_table = []

# `results` was built in the same order as `years`, so pair them directly instead
# of tracking a manual counter.
for year, table in zip(years, results):
    # One column ('Life Ladder') indexed by factor; drop the self-correlation and
    # the two ladder-dispersion rows.
    life_ladder_corr = table['Life Ladder'].to_frame().drop(
        index=[
            'Life Ladder',
            'Standard deviation of ladder by country-year',
            'Standard deviation/Mean of ladder by country-year',
        ]
    )

    # Rank factors within the year: 1 = most positive correlation.
    life_ladder_corr['Average Rank'] = life_ladder_corr['Life Ladder'].rank(
        method='average', ascending=False
    )
    life_ladder_corr['Year'] = int(year)

    # Transpose so factors become columns and the three measures become rows.
    yearly_corr_table.append(life_ladder_corr.T)

    print(f"Completed Concat of year {year}")

# Concatenate once after the loop rather than on every iteration.
result = pd.concat(yearly_corr_table).round(3)

result # print result of loop

Completed Concat of year 2010
Completed Concat of year 2011
Completed Concat of year 2012
Completed Concat of year 2013
Completed Concat of year 2014
Completed Concat of year 2016
Completed Concat of year 2017


Unnamed: 0,Avg. Work Hours (Annual),Avg. Wages (Annual),GDP (constant 2010 US$),Population density (people per sq. km of land area),Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,"gini of household income reported in Gallup, by wp5-year"
Life Ladder,-0.364,0.749,0.056,0.077,0.441,0.627,-0.424,0.73,0.647,-0.725,0.746,-0.377,0.735,0.463,0.644,0.165
Average Rank,13.0,1.0,12.0,11.0,9.0,7.0,15.0,4.0,5.0,16.0,2.0,14.0,3.0,8.0,6.0,10.0
Year,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0,2010.0
Life Ladder,-0.429,0.783,0.085,0.11,0.643,0.587,0.01,0.729,0.716,-0.753,0.816,-0.422,0.685,0.44,0.726,0.303
Average Rank,15.0,2.0,12.0,11.0,7.0,8.0,13.0,3.0,5.0,16.0,1.0,14.0,6.0,9.0,4.0,10.0
Year,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0,2011.0
Life Ladder,-0.396,0.709,0.074,0.044,0.63,0.478,0.496,0.786,0.678,-0.759,0.779,-0.578,0.748,0.441,0.702,0.197
Average Rank,14.0,4.0,12.0,13.0,7.0,9.0,8.0,1.0,6.0,16.0,2.0,15.0,3.0,10.0,5.0,11.0
Year,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0,2012.0
Life Ladder,-0.352,0.695,0.125,0.09,0.606,0.518,0.159,0.724,0.709,-0.732,0.659,-0.538,0.681,0.37,0.662,0.342


In [None]:
# The two ladder-dispersion labels are already removed when the yearly tables are
# built, so they are normally absent from `result`; errors='ignore' keeps this cell
# from raising KeyError while still dropping them if they are present.
result_v1 = result.drop(
    columns=[
        'Standard deviation/Mean of ladder by country-year',
        'Standard deviation of ladder by country-year',
    ],
    errors='ignore',
)

result_v1

In [None]:
# Summarise each factor's correlation with 'Life Ladder' across years.
# `result_v1` interleaves three kinds of rows per year ('Life Ladder',
# 'Average Rank', 'Year'); only the 'Life Ladder' rows hold correlation values, so
# restrict to those before aggregating -- otherwise ranks and years would be
# averaged in as well.  The list indexer keeps the selection a DataFrame even when
# there is a single year.
life_ladder_rows = result_v1.loc[['Life Ladder']]

average_per_col = life_ladder_rows.mean(axis=0).round(3)
std_per_col = life_ladder_rows.std(axis=0).round(2)

# Resulting columns are labelled 0 (mean) and 1 (standard deviation).
analysis_per_col = pd.concat([average_per_col, std_per_col], axis=1)
analysis_per_col

In [None]:
# Rank the factors by the first column of `analysis_per_col`, order the table by
# that rank, then transpose so each factor becomes a column and give the two
# statistic rows readable labels.
averages = analysis_per_col.iloc[:, 0]
analysis_per_col = analysis_per_col.assign(
    **{'Average Rank': averages.rank(method='average', ascending=False)}
).sort_values('Average Rank')
summaries_per_col = analysis_per_col.T.rename(
    index={0: 'Average per Col', 1: 'Std per Col'}
)

summaries_per_col

In [None]:
# Factor names: the columns of the combined table.
x = result.columns.tolist()

# First row of each yearly table, one list per year.  A dictionary keyed by year
# could automate the indexing of yearly_corr_table, but it doesn't seem to add
# value here, so seven separate names are kept.
y_0, y_1, y_2, y_3, y_4, y_5, y_6 = (
    list(yearly_corr_table[k].values[0]) for k in range(7)
)

In [None]:
# Stacked panels of each factor's correlation with 'Life Ladder', one panel per
# year for the first three years.  The data comes from `yearly_corr_table`, so the
# cell runs on a fresh kernel without relying on any otherwise-undefined variable.
n_panels = 3
fig, axs = plt.subplots(n_panels, sharex=True, sharey=True, gridspec_kw={'hspace': 0})
fig.suptitle('Life Ladder correlation by factor')

for ax, year, table in zip(axs, years, yearly_corr_table):
    ax.plot(x, table.loc['Life Ladder'].values, 'o')
    ax.set_ylabel(str(year))
    # Hide x labels and tick labels for all but the bottom plot.
    ax.label_outer()

# Factor names are long; rotate them so they stay readable.
axs[-1].tick_params(axis='x', labelrotation=90)

In [None]:
## to plot annual results in a scatter plot

# One colour per year; reused cyclically if there are more years than colours.
trace_colors = ['#db5153', '#db51d0', '#44d4db', '#fc9935', '#16d977', '#91e344', '#f5f371']

fig = go.Figure()

# One trace per year: correlation value on the x axis, factor name on the y axis.
# Both come from the same yearly table, so the two axes always have equal length,
# and each trace is labelled with its year in the legend.
for k, (year, table) in enumerate(zip(years, yearly_corr_table)):
    fig.add_trace(go.Scatter(
        x=list(table.values[0]),   # first row: correlations with 'Life Ladder'
        y=list(table.columns),     # factor names
        name=str(year),
        mode='markers',
        marker_color=trace_colors[k % len(trace_colors)],
    ))

# Correlations lie in [-1, 1]; pad slightly so markers at the edge are not clipped.
fig.update_xaxes(range=[-1.05, 1.05], nticks=20)

# Set options common to all traces with fig.update_traces
fig.update_traces(mode='markers', marker_line_width=2, marker_size=10)
fig.update_layout(title='Correlation with Life Ladder by factor and year',
                  xaxis_title='Pearson correlation with Life Ladder',
                  yaxis_title='Factor',
                  yaxis_zeroline=False, xaxis_zeroline=False)


fig.show()



In [None]:
# Analysis frame: every column from position 3 onward, i.e. starting at
# 'Avg. Work Hours (Annual)' according to the dtypes listing.
data = happiness_data.iloc[:,3:]

In [None]:
# Correlate every column of `data`, then pull out the 'Life Ladder' column ordered
# from the most positive to the most negative correlation.
corr_matrix = data.corr()
correlation_values_to_life_ladder = (
    corr_matrix
    .loc[:, "Life Ladder"]
    .sort_values(ascending=False)
)

In [None]:
# NOTE(review): this is a Series (one column selected from the correlation matrix),
# so `.T` presumably changes nothing in what is displayed -- confirm whether a
# one-row frame was intended.
correlation_values_to_life_ladder.T

In [None]:
# Scatter-plot matrix of a few happiness factors, coloured by 'Life Ladder'.
# go.Figure() does not accept `dimensions` / `color` keyword arguments, so the
# matrix is built explicitly as a go.Splom trace; `dimensions` is defined here so
# the cell does not depend on a name from elsewhere.
dimensions = [
    'Life Ladder',
    'Log GDP per capita',
    'Social support',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]

fig = go.Figure(
    go.Splom(
        dimensions=[dict(label=name, values=happiness_data[name]) for name in dimensions],
        marker=dict(
            color=happiness_data['Life Ladder'],
            colorscale='Viridis',
            showscale=True,
        ),
        diagonal_visible=False,
    )
)
fig.update_layout(title='Scatter matrix of selected happiness factors')
fig.show()

In [None]:
# to assign X & y axis values
# X: every column from position 3 onward (starts at 'Avg. Work Hours (Annual)').
# NOTE(review): y is column position 1, which the dtypes listing shows as 'Country',
# and X still contains 'Life Ladder' -- confirm these are the intended target and
# features before fitting a model.
X =  happiness_data.iloc[:,3:]
y =  happiness_data.iloc[:,1]

In [None]:
# encoding categorical data

In [None]:
# Hold out 20% of the rows for testing.  random_state is fixed so the split (and
# anything fitted on it) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Inspect the training targets.
y_train

In [None]:
# Inspect the held-out test targets.
y_test