In [1]:
## Dependancies ##

# Data 
import numpy as np
import pandas as pd

# Create Data
from sklearn.datasets import make_blobs


# Charting
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import graphviz 
import pydotplus


import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True)

import seaborn as sb

# Data Preprocessing
from sklearn.preprocessing import StandardScaler

# Linear Regression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Math
from statistics import mean

# Multi Linear Regresssion
from mpl_toolkits.mplot3d import Axes3D

# Lasso Model
from sklearn.linear_model import Lasso

# Ridge & Elastic Model
# Note: Use an alpha of .01 when creating the model
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet

# Decision Trees
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
## dataset
from sklearn.datasets import load_iris

# warnings
import warnings
warnings.simplefilter('ignore')



In [2]:
# to assign figure size for Matplotlib charts
fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 20
fig_size[1] = 12
plt.rcParams["figure.figsize"] = fig_size

In [3]:
# read csv data from Data_Cleaning_&_Consolidation_Annual_Reporting.ipynb
happiness_data = pd.read_csv('../resources/Complete_Happiness_Data.csv')

In [4]:
happiness_data.tail()

Unnamed: 0,COU,Country,Year,Avg. Work Hours (Annual),Avg. Wages (Annual),GDP (constant 2010 US$),Population density (people per sq. km of land area),Life Ladder,Log GDP per capita,Social support,...,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,Standard deviation of ladder by country-year,Standard deviation/Mean of ladder by country-year,"gini of household income reported in Gallup, by wp5-year"
226,LVA,Latvia,2012,1738.0,18272.87632,26293020000.0,32.716613,5.125025,9.944049,0.851195,...,-0.040608,0.894979,0.560013,0.232225,0.19122,0.614571,0.721495,1.890645,0.368905,0.321799
227,LVA,Latvia,2013,1732.0,19151.96466,26931900000.0,32.362872,5.06977,9.978767,0.834023,...,-0.075946,0.836554,0.642102,0.227449,0.2338,0.682029,0.75729,1.719731,0.339213,0.30594
228,LVA,Latvia,2014,1741.0,20429.45208,27432360000.0,32.064683,5.729115,10.006597,0.881256,...,-0.046076,0.803688,0.652273,0.225979,0.229045,0.668352,0.854488,1.9629,0.342618,0.406468
229,LVA,Latvia,2016,1709.0,23591.68628,28830700000.0,31.513943,5.940446,10.075054,0.917074,...,-0.159757,0.86764,0.653751,0.231384,0.315261,0.660759,0.869054,1.70512,0.287036,0.384025
230,LVA,Latvia,2017,1695.0,24633.97485,30167430000.0,31.235896,5.977818,10.129182,0.895099,...,-0.15859,0.798378,0.623313,0.231753,0.264001,0.628388,0.881661,1.814803,0.30359,0.362185


In [5]:
#check types
happiness_data.dtypes

COU                                                          object
Country                                                      object
Year                                                          int64
Avg. Work Hours (Annual)                                    float64
Avg. Wages (Annual)                                         float64
GDP (constant 2010 US$)                                     float64
Population density (people per sq. km of land area)         float64
Life Ladder                                                 float64
Log GDP per capita                                          float64
Social support                                              float64
Healthy life expectancy at birth                            float64
Freedom to make life choices                                float64
Generosity                                                  float64
Perceptions of corruption                                   float64
Positive affect                                 

In [6]:
# check for NaN values
happiness_data.count()

COU                                                         231
Country                                                     231
Year                                                        231
Avg. Work Hours (Annual)                                    231
Avg. Wages (Annual)                                         231
GDP (constant 2010 US$)                                     231
Population density (people per sq. km of land area)         231
Life Ladder                                                 231
Log GDP per capita                                          231
Social support                                              231
Healthy life expectancy at birth                            231
Freedom to make life choices                                231
Generosity                                                  231
Perceptions of corruption                                   231
Positive affect                                             231
Negative affect                         

In [7]:
#establish observed year list
years = list(happiness_data['Year'].unique())

In [8]:
happiness_data_1 = happiness_data.iloc[:,4:]
depedent_var = list(happiness_data_1.columns)

In [9]:
life_ladder = [depedent_var.index('Life Ladder')]
life_ladder

[3]

In [10]:
columns = list(happiness_data_1.columns)
columns

['Avg. Wages (Annual)',
 'GDP (constant 2010 US$)',
 'Population density (people per sq. km of land area)',
 'Life Ladder',
 'Log GDP per capita',
 'Social support',
 'Healthy life expectancy at birth',
 'Freedom to make life choices',
 'Generosity',
 'Perceptions of corruption',
 'Positive affect',
 'Negative affect',
 'Confidence in national government',
 'Democratic Quality',
 'Delivery Quality',
 'Standard deviation of ladder by country-year',
 'Standard deviation/Mean of ladder by country-year',
 'gini of household income reported in Gallup, by wp5-year']

In [11]:
items_in_list = len(happiness_data_1.columns)

index_of_columns = [*range(0,items_in_list,1)]

In [12]:
index_of_columns.remove(3)

In [13]:
column_arrangement = life_ladder + index_of_columns

In [14]:
column_arrangement

[3, 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [15]:
# to rearrange columns 
column_order  = [columns[i] for i in column_arrangement]
column_order

['Life Ladder',
 'Avg. Wages (Annual)',
 'GDP (constant 2010 US$)',
 'Population density (people per sq. km of land area)',
 'Log GDP per capita',
 'Social support',
 'Healthy life expectancy at birth',
 'Freedom to make life choices',
 'Generosity',
 'Perceptions of corruption',
 'Positive affect',
 'Negative affect',
 'Confidence in national government',
 'Democratic Quality',
 'Delivery Quality',
 'Standard deviation of ladder by country-year',
 'Standard deviation/Mean of ladder by country-year',
 'gini of household income reported in Gallup, by wp5-year']

In [16]:
happiness_data_1 = happiness_data_1[column_order]

### Data Analysis

Objective: To review correlation between the Life Ladder & Dependent Variables in the dataset

### Country Correlation Analysis

In [17]:
country_names = list(happiness_data["Country"].unique())
country_names

['Australia',
 'Austria',
 'Belgium',
 'Canada',
 'Czech Republic',
 'Denmark',
 'Finland',
 'France',
 'Germany',
 'Greece',
 'Hungary',
 'Iceland',
 'Ireland',
 'Italy',
 'Japan',
 'Luxembourg',
 'Mexico',
 'Netherlands',
 'New Zealand',
 'Norway',
 'Poland',
 'Portugal',
 'Spain',
 'Sweden',
 'Switzerland',
 'United Kingdom',
 'United States',
 'Israel',
 'Chile',
 'Slovenia',
 'Estonia',
 'Lithuania',
 'Latvia']

In [18]:
len(country_names)

33

In [19]:
country_results = []

for country in country_names:
    
    # to make a dataframe per year
    country_happiness_data = happiness_data.loc[happiness_data['Country'] == country].reset_index()
    
    country_data = country_happiness_data.iloc[:,4:]
    
    country_corr_matrix = country_data.corr(method ='pearson')
    country_corr_matrix = country_corr_matrix[column_order]
    country_corr_matrix = country_corr_matrix.drop(columns=['Standard deviation of ladder by country-year','Standard deviation/Mean of ladder by country-year'])
    print(f"Completed year {country}")
    

    country_results.append(country_corr_matrix)

Completed year Australia
Completed year Austria
Completed year Belgium
Completed year Canada
Completed year Czech Republic
Completed year Denmark
Completed year Finland
Completed year France
Completed year Germany
Completed year Greece
Completed year Hungary
Completed year Iceland
Completed year Ireland
Completed year Italy
Completed year Japan
Completed year Luxembourg
Completed year Mexico
Completed year Netherlands
Completed year New Zealand
Completed year Norway
Completed year Poland
Completed year Portugal
Completed year Spain
Completed year Sweden
Completed year Switzerland
Completed year United Kingdom
Completed year United States
Completed year Israel
Completed year Chile
Completed year Slovenia
Completed year Estonia
Completed year Lithuania
Completed year Latvia


 Going to put the 'Life Ladder' data into a dictionary per country

In [20]:
country_results[1].head()

Unnamed: 0,Life Ladder,Avg. Wages (Annual),GDP (constant 2010 US$),Population density (people per sq. km of land area),Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,"gini of household income reported in Gallup, by wp5-year"
Avg. Work Hours (Annual),0.573199,-0.735416,-0.795017,-0.864139,-0.572492,0.410419,-0.868564,0.657674,0.249246,0.491501,0.738751,-0.857811,0.127151,0.379303,0.228542,0.565526
Avg. Wages (Annual),-0.543786,1.0,0.787219,0.91012,0.414597,-0.537686,0.872639,-0.775779,-0.508349,-0.798402,-0.76389,0.926506,0.376824,-0.84136,-0.217338,-0.673026
GDP (constant 2010 US$),-0.304245,0.787219,1.0,0.960098,0.879723,-0.29814,0.978521,-0.418787,-0.304038,-0.426622,-0.869123,0.7639,-0.151272,-0.634452,-0.687714,-0.737444
Population density (people per sq. km of land area),-0.483555,0.91012,0.960098,1.0,0.715631,-0.445331,0.994099,-0.621306,-0.395505,-0.631362,-0.906569,0.897475,0.047999,-0.735626,-0.495576,-0.726138
Life Ladder,1.0,-0.543786,-0.304245,-0.483555,0.009904,0.750044,-0.422756,0.834279,0.697922,0.662874,0.419227,-0.66181,-0.129352,0.491941,-0.125525,0.415062


In [None]:
country_corr_value_life_ladder

In [26]:
# to create a dictionary of the correlation values by country
countries_corr_values = []
country_corr_value_life_ladder =  {}

i = 0
j = 0


while i < len(country_results): # iterates over 33 country tables
    while j < len(columns): # iterates over 18 columns
        key_value = country_results[i]['Life Ladder'].index[j] # to get index value of series
        corr_value = country_results[i]['Life Ladder'].values[j] # to get corr value of series
        
        country_corr_value_life_ladder[key_value]: corr_value # to append data to dictionary
        
        j += 1
        
    # once all the data is appended, then it appends the country name
    country_corr_value_life_ladder['Country Name']: country_names[i]
    
    countries_corr_values.append(country_corr_value_life_ladder)
    

    print(f"Completed updating dictionary with {country_names[i]}'s data")
    
    i += 1

Completed updating dictionary with Australia's data
Completed updating dictionary with Austria's data
Completed updating dictionary with Belgium's data
Completed updating dictionary with Canada's data
Completed updating dictionary with Czech Republic's data
Completed updating dictionary with Denmark's data
Completed updating dictionary with Finland's data
Completed updating dictionary with France's data
Completed updating dictionary with Germany's data
Completed updating dictionary with Greece's data
Completed updating dictionary with Hungary's data
Completed updating dictionary with Iceland's data
Completed updating dictionary with Ireland's data
Completed updating dictionary with Italy's data
Completed updating dictionary with Japan's data
Completed updating dictionary with Luxembourg's data
Completed updating dictionary with Mexico's data
Completed updating dictionary with Netherlands's data
Completed updating dictionary with New Zealand's data
Completed updating dictionary with Nor

In [27]:
countries_corr_values

[{},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {}]

In [24]:
hi

NameError: name 'hi' is not defined

In [None]:
item = list(countries_corr_values[3].values())
values = item[:-1]

# to create criteria per the values being analyzed
max_score = max(values).round(3)
min_score = min(values).round(3)

variance_score = max_score - min_score # second value of tuple
average_score =  mean(values).round(3) # first value of tuple

tuple_values = (average_score, variance_score)
print(tuple_values)

In [None]:
# to determine which country has the highest correlated values to the life ladder
tupled_results = []

for data in countries_corr_values:
    values = list(data.values())
    quant_values = values[:-1]
    
    print(values[-1])
    
    
    # to create criteria per the values being analyzed
    max_score = max(quant_values).round(3)
    min_score = min(quant_values).round(3)
    
    variance_score = max_score - min_score # second value of tuple
    average_score =  mean(quant_values).round(3) # first value of tuple
    
    tuple_values = (average_score, variance_score)
    print(tuple_values)
    tupled_results.append(tuple_values)   

In [None]:
tupled_results

### Annual Correlation Analysis

In [None]:
results = []

for year in years:
    
    # to make a dataframe per year
    annual_happiness_data = happiness_data.loc[happiness_data['Year'] == year].reset_index()
    
    yearly_data = annual_happiness_data.iloc[:,4:]
    
    corr_matrix = yearly_data.corr(method ='pearson')
    corr_matrix = corr_matrix[column_order]
    corr_matrix = corr_matrix.drop(columns=['Standard deviation of ladder by country-year','Standard deviation/Mean of ladder by country-year'])
    print(f"Completed year {year}")
    

    results.append(corr_matrix)

In [None]:
results[3]

In [None]:
yearly_corr_table = [] 

i=0 # to iterate through years to provide detail to the table
 
# we are currently just interested in how the life ladder (happiness index) Correlates to the other values in the table
while i < len(years):
    for table in results:
        corr_table = table['Life Ladder'].to_frame() # to reshape data for matrix scatter plot.
        corr_table_rev_1 = corr_table.drop(index=['Life Ladder','Standard deviation of ladder by country-year','Standard deviation/Mean of ladder by country-year'])

        # to rank the columns based on the average of the annual values
        corr_table_rev_1['Average Rank'] = corr_table_rev_1.rank(method='average',ascending=False)
#         corr_table_rev_1['Year'] = int(years[i])
        
        # to tranpose the final table to have rows equal to the year
        corr_table_rev_2 = corr_table_rev_1.T
        corr_table_rev_2['Year'] = int(years[i])
        
        
        # to append results of both the corr table & the analysis table to a list 
        yearly_corr_table.append(corr_table_rev_2) 

        # to concat the correlation table & the analysis into a df
        result = pd.concat(yearly_corr_table).round(3)
        
        print(f"Completed Concat of year {years[i]}")
        
        i += 1

result # print result of loop

In [None]:
annual_life_ladder_corr_table = result.loc[['Life Ladder']]
annual_avg_rank_table = result.loc[['Average Rank']]

In [None]:
annual_life_ladder_corr_table

In [None]:
avg_rank_per_col = annual_avg_rank_table.mean().round(1).sort_values()
avg_rank_per_col

In [None]:
ordered_columns = list(avg_rank_per_col.index)
ordered_columns

In [25]:
# rearranged columns for charting
annual_life_ladder_corr_table_rev1 = annual_life_ladder_corr_table[ordered_columns]
annual_avg_rank_table_rev1 = annual_avg_rank_table[ordered_columns]

NameError: name 'annual_life_ladder_corr_table' is not defined

In [None]:
# to assign the depedent variable columns to a list
dep_var_columns = list(annual_life_ladder_corr_table_rev1.iloc[:,:-1].columns)

In [None]:
# colors for the graph
colors = ['#f21f1f','#066378','#b8f227','#ed61e4','#5f9e11','#61edda','#9ca7ff','#666c6e','#f52c7c','#5f9e11','#543d82','#f5ed16','#a18f08','#ce1dd1','#cdd164','#ed6618']

In [None]:
len(colors)

In [None]:
# to graph the variance in life ladder correlation values year over year
y = dep_var_columns
y_pos = np.arange(len(years))

#chart
ax = annual_life_ladder_corr_table_rev1.plot(x='Year', y=y, kind='bar', color=colors, width=.75)

#legend
ax.legend(bbox_to_anchor=(1, 1.25),loc='lower', ncol=2)

# to highlight no correlation
ax.axhline(y=0, color='mediumvioletred',linewidth=1)

# to insert comment
ax.text(0,1, 'Max Correlation = 1',ha='center', fontsize=14)
ax.text(0,-1, 'Inverse Correlation = -1', ha='center', fontsize=14)
ax.text(-.7,-.05, 'No Correlation = 0', fontsize=14)
ax.text(6,-1.05, '* Variables were ordered by Average Correlation Ranking Year over Year', ha='center', color='midnightblue', fontsize=14)


#axis labels
ax.set_xlabel('Years', fontsize = 16)
ax.set_ylabel('Correlation Value', fontsize=14)
ax.set_title(r'Life Ladder Correlation Value by Variable', fontsize=16);

plt.savefig('../charts/Life_Ladder_Correlation_Value.svg',format="svg",bbox_inches='tight')

#### Conclusion Summary
----------------------------------

The Life Ladder correlation between the dependent variables provides a high-level understanding of the variables' magnitude and volatility regarding 35 countries being reviewed. 

Variables that are suited for a linear regression model would depend on a variable's consistent linear magnitude; year over year. 

Thus, in the chart above, we are looking variables that hold a consistent correlation year over year for a multi-variant linear regression model. 

The magnitude of correlation is a factor, but it can be viewed that values that have low correlation have a higher rate of change year over year. 

Finally, those low and volatle variables would be interpreted as not applicable to our model. 

### <font color='orange'> Correlation Analysis </font>
-----------------------------------------------------------------------------


#### <font color = 'blue'> Step 1:  Spliting Variables into Categories </font>
--------------------------------------------------------

I would be interested in spliting the variables into categories such as economic, socio-political, and social factors. 

From there, I would like to make each category into a plot object of subplots.

Each structure will provide 2 insights:

1. If the categories have varying levels of linear properties or are properties consistent
2. If we can generalize the linear trends by reviewing slope or the qty / the types of minimas and maximas in a line to determine which variables would be appropriate for the model

#### <font color = 'green'> Additonal Notes </font>
----------------------------------

Now that we have compared correlation between Log GDP & GDP per Capita, we notice that Log GDP has a higher consistent correlation than GDP. 

Therefore, we will remove GDP (constant 2010 US$) from the options to avoid repetitive analysis. 

In [None]:
# to review list of columns
columns_to_analyze = list(annual_life_ladder_corr_table_rev1.columns)

In [None]:
columns_to_analyze

In [None]:
# list of columns' indexes
# not including year yet so we can check for duplicates once completed with sublists

# index values
# economic : 2,6,9,13
# socio-political : 3,4,5,8,14,15
# social : 0,1,7,10,11

# lists
economic_cols = ['Avg. Wages (Annual)',
                 'Log GDP per capita',
                 'gini of household income reported in Gallup, by wp5-year',
                 'Avg. Work Hours (Annual)']


socio_political_cols = ['Delivery Quality',
                        'Confidence in national government',
                        'Positive affect',
                        'Positive affect',
                        'Negative affect',
                        'Perceptions of corruption']


social_cols = ['Freedom to make life choices','Generosity','Social support','Healthy life expectancy at birth','Population density (people per sq. km of land area)']

In [None]:
# to combine all the lists for comparison
created_list = economic_cols + socio_political_cols + social_cols
created_list

In [None]:
# to remove 'GDP (constant 2010 US$)' from list before analysis review
 ## We wanted to keep it before, so our index values above aligned with the original data table 
 ## - the second method of assessing the values is there as a proactive measure -  

columns_to_analyze.remove('GDP (constant 2010 US$)')
columns_to_analyze.remove('Year')

In [None]:
columns_to_analyze

In [None]:
# to test value Error in loop below
# created_list.append('Test') 

In [None]:
# If they have the same length & they 
length_comparison = len(created_list) - len(columns_to_analyze)
length_comparison

In [None]:
# to check for differences between list of indexes & created lists
list_differences = list(set(created_list) - set(columns_to_analyze))
list_differences

In [None]:
# to check for differences between list of indexes & created lists
list_differences = list(set(created_list) - set(columns_to_analyze))

# to confirm that each list has all the indexes accounted for and that there are no duplicates
if not list_differences and \
    length_comparison == 0:
    print("There are no duplicates & no values missing")
else:
    print('Missing an Item')
    raise ValueError('list should be empty & may be duplicates')

In [None]:
order_of_analysis = [economic_cols, socio_political_cols, social_cols]
order_of_analysis

In [None]:
## to also include the year in the final dataframe table 
 ## we are including the last column of the annual_life_ladder_corr_table_rev1, 'Year'. 
i = 0

while i < 3:
    order_of_analysis[i].append('Year')
    
    print(f'Completed list {i} of 2')
    
    i += 1

In [None]:
# make dataframes from the indexed list for columns
economic_df = annual_life_ladder_corr_table_rev1.loc[:,order_of_analysis[0]]
social_political_df = annual_life_ladder_corr_table_rev1.loc[:,order_of_analysis[1]]
social_df = annual_life_ladder_corr_table_rev1.loc[:,order_of_analysis[2]]

In [None]:
economic_df = economic_df.rename(columns={'gini of household income reported in Gallup, by wp5-year': 'GINI index'})

In [None]:
economic_df

#### <font color='darkblue'> Step 2: Plot Linear Regression Line & Review Variables Volatility</font>
-------------------------


In [None]:
economic_columns = list(economic_df.columns[:-1])
economic_columns

In [None]:
dep_var_columns

In [None]:
colors

In [None]:
# to use consistent colors for variables across charts
colors_dict = {}

i = 0

while i < len(dep_var_columns):
    colors_dict.update({dep_var_columns[i]:colors[i]})
    
    print(f"Updated Color Dictionary to have variable with color: {dep_var_columns[i]}/{colors[i]}")
    
    i += 1

colors_dict

In [None]:
# to create a color pallet for economic variables that match master correlation chart
economic_colors = []
economic_index_values = [2,6,9,13]

for index in economic_index_values:
    economic_colors.append(colors[index])

economic_colors

In [None]:
X = list(economic_df['Year'].values)
y_avg = list(economic_df['Avg. Wages (Annual)'].values)
y_log_gdp = list(economic_df['Log GDP per capita'].values)
y_gini = list(economic_df['GINI index'].values)
y_hrs = list(economic_df['Avg. Work Hours (Annual)'].values)

In [None]:
# to make a list of category attributes
y_values = [y_avg, y_log_gdp, y_gini, y_hrs]
set_face_colors = ['#828181','#828181','#ffffff','#ffffff']
set_titles_values = ['Avg. Wages (Annual)','Log GDP per capita','Avg. Work Hours (Annual)','GINI of household income']

# to plot a chart for all economic factors
fig = plt.figure()
fig.suptitle('Economic Correlation Trend from 2010 - 2017', fontsize=16)
fig.subplots_adjust(hspace=0.2)
for i in range(1, 5):
    ax = fig.add_subplot(2, 2, i)
    ax.plot(X, y_values[(i-1)], color= economic_colors[(i-1)])
    ax.grid(False)
    ax.set_facecolor(set_face_colors[(i-1)])
    ax.set_title(set_titles_values[(i-1)])

Regression Line Calculation

In [None]:
xs = economic_df['Year'].values
ys = economic_df['Avg. Wages (Annual)'].values

def best_fit_slope_and_intercept(xs,ys):
    m = (((mean(xs)*mean(ys)) - mean(xs*ys)) /
         ((mean(xs)*mean(xs)) - mean(xs*xs)))
    
    b = mean(ys) - m*mean(xs)
    
    return m, b

m, b = best_fit_slope_and_intercept(xs,ys)

print(f'slope: {m}')
print(f'y-intercept {b}')


regression_line = [(m*x)+b for x in xs]

print(f'y-int regression points {regression_line}')

Next steps are to make the the calculation into a loop for all variables per category and then to make subplots per category and determine next steps for analysis



### Multi-Variant Linear Regression

y = mx + b with multiple mx variables