In [None]:
# Start of John's code

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read 'cleaned_happiness_2021.csv' (resolved relative to the working
# directory) into the DataFrame every later cell works from.
happiness_2021 = pd.read_csv(filepath_or_buffer='cleaned_happiness_2021.csv')

In [None]:
# Display-only view keyed by country. The result is intentionally NOT assigned
# back: later cells still select 'Country' as an ordinary column.
happiness_2021.set_index(keys='Country')

### Healthy Life Expectancy
Per the World Happiness Report: 
>Healthy Life Expectancy (HLE). Healthy life expectancies at birth are based
on the data extracted from the World Health Organization’s (WHO) Global
Health Observatory data repository (Last updated: 2020-09-28)

In [None]:
# Predictor (x) and response (y) series used by the outlier check,
# the regression and the correlation cells below.
x_values, y_values = (
    happiness_2021[column]
    for column in ('Healthy life expectancy', 'Happiness score')
)

In [None]:
# Quartiles of healthy life expectancy and the 1.5 * IQR fences
# used afterwards to look for outliers.
quartiles = x_values.quantile([0.25, 0.5, 0.75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq
lower_bound = lowerq - 1.5 * iqr
upper_bound = upperq + 1.5 * iqr
print(
    f"Q1: {lowerq}",
    f"Q3: {upperq}",
    f"IQR: {iqr}",
    f"Lower Bound: {lower_bound}",
    f"Upper Bound: {upper_bound}",
    sep="\n",
)

In [None]:
# Rows whose healthy life expectancy falls outside the 1.5 * IQR fences.
# Both fences are checked: testing only the lower bound would silently miss
# unusually high values even though upper_bound is computed above.
is_outlier = (x_values < lower_bound) | (x_values > upper_bound)
outliers = happiness_2021.loc[is_outlier, :]

# Show the flagged rows (if any) ordered from lowest to highest expectancy.
outliers.sort_values('Healthy life expectancy')

In [None]:
# Box plot of healthy life expectancy with the mean marked.
plt.boxplot(x_values, showmeans=True)
plt.title("Life Expectancy")

# Label each outlier with its country name just to the right of the box
# (the single box is drawn at x = 1). Iterating row-wise yields a scalar
# y-coordinate per label and uses the loop variable itself as the text.
for country, expectancy in zip(outliers['Country'], outliers['Healthy life expectancy']):
    plt.annotate(country, (1.05, expectancy), fontsize=10, color="blue")
plt.show()

### There are no outliers in Healthy life expectancy

In [None]:
# Ordinary least-squares fit of happiness score on healthy life expectancy.
slope, intercept, rvalue, pvalue, stderr = st.linregress(x_values, y_values)

# Fitted happiness score for every observed life expectancy.
regress_values = slope * x_values + intercept

# Human-readable equation of the fitted line, used as the plot annotation.
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

In [None]:
# Observations as points, with the fitted regression line drawn in red on top.
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")

# Titles and axis labels so the figure can be read on its own.
plt.title("Life Expectancy & Happiness")
plt.xlabel("Healthy Life Expectancy")
plt.ylabel("Happiness Score")

# Equation of the fitted line, placed at data coordinates (52, 7).
plt.annotate(line_eq, xy=(52, 7), fontsize=15, color="indigo")
plt.show()

# Hypothesis:

>When healthy life expectancy is high, there is a measurable increase in the Cantril happiness score. <br>Null Hypothesis (H<sub>0</sub>):  When healthy life expectancy is high, there is no measurable impact on the Cantril happiness score whatsoever.<br>If the p-value is < 0.05, then we reject the null hypothesis. 


In [None]:
# Pearson correlation coefficient between life expectancy and happiness,
# rounded to two decimals for reporting.
pr = round(st.pearsonr(x_values, y_values)[0], 2)

# Strength depends on the magnitude of r, not its sign: a coefficient of
# -0.9 is as strong a linear relationship as +0.9.
link = "strong" if abs(pr) > 0.7 else "not strong"

print(f'The correlation between Life Expectancy and Happiness is {pr}, suggesting a {link} link between the two factors.')

In [None]:
# Keep only the columns needed for the group analysis and order the rows
# from the lowest to the highest healthy life expectancy.
hle_test = (
    happiness_2021
    .loc[:, ['Country', 'Happiness score', 'Healthy life expectancy']]
    .sort_values(by='Healthy life expectancy')
)

# Non-null count per column (count() returns a Series, not a single number).
# Author's note: 110 countries in total, so the five grade groups
# (A, B, C, D and F) hold 22 countries each.
size = hle_test.count()

In [None]:
# Grade F: the first 22 rows of the ascending-sorted table (lowest expectancy).
F = hle_test.iloc[:22]
F.loc[:, 'Healthy life expectancy'].std()  # author's note: highest deviation

In [None]:
# Grade D: rows 22-43 of the ascending-sorted table.
D = hle_test.iloc[22:44]
D.loc[:, 'Healthy life expectancy'].std()  # author's note: high deviation

In [None]:
# Grade C: rows 44-65 of the ascending-sorted table (the middle group).
C = hle_test.iloc[44:66]
C.loc[:, 'Healthy life expectancy'].std()  # author's note: lowest deviation

In [None]:
# Grade B: rows 66-87 of the ascending-sorted table.
B = hle_test.iloc[66:88]
B.loc[:, 'Healthy life expectancy'].std()  # author's note: low deviation

In [None]:
# Grade A: rows 88-109 of the ascending-sorted table (highest expectancy).
A = hle_test.iloc[88:110]
A.loc[:, 'Healthy life expectancy'].std()  # author's note: low deviation

In [None]:
# Shapiro-Wilk normality test on healthy life expectancy for all countries.
st.shapiro(happiness_2021.loc[:, 'Healthy life expectancy'])
        

### Overall, the Shapiro-Wilk test rejects its H<sub>0</sub> (that healthy life expectancy is normally distributed)

In [None]:
# Shapiro-Wilk normality test restricted to group F.
st.shapiro(F.loc[:, 'Healthy life expectancy'])

In [None]:
# Shapiro-Wilk normality test restricted to group D.
st.shapiro(D.loc[:, 'Healthy life expectancy'])

In [None]:
# Shapiro-Wilk normality test restricted to group C.
st.shapiro(C.loc[:, 'Healthy life expectancy'])

In [None]:
# Shapiro-Wilk normality test restricted to group B.
st.shapiro(B.loc[:, 'Healthy life expectancy'])

In [None]:
# Shapiro-Wilk normality test restricted to group A.
st.shapiro(A.loc[:, 'Healthy life expectancy'])

#### Granularly, we fail to reject the Shapiro-Wilk H<sub>0</sub> (normality) in the subset of nations with the 20% lowest healthy life expectancy,<br>and as life expectancy increases, the p-value creeps closer to our 0.05 threshold until we reach the nations<br>with the 20% highest expectancy, where we do reject the null hypothesis even within that subset.

In [None]:
# Happiness scores split into five healthy-life-expectancy bands for the
# group-comparison tests below. The bands are half-open so that they never
# overlap: a country sitting exactly on a cut point (60, 65.40 or 67.20)
# lands in exactly one group instead of being counted in two.
#   F: (-inf, 60)   D: [60, 65.40)   C: [65.40, 67.20)
#   B: [67.20, 70.33]   A: (70.33, +inf)
grF = happiness_2021[happiness_2021['Healthy life expectancy'] < 60]["Happiness score"]
grD = happiness_2021[happiness_2021['Healthy life expectancy'].between(60, 65.40, inclusive='left')]["Happiness score"]
grC = happiness_2021[happiness_2021['Healthy life expectancy'].between(65.40, 67.20, inclusive='left')]["Happiness score"]
grB = happiness_2021[happiness_2021['Healthy life expectancy'].between(67.20, 70.33, inclusive='both')]["Happiness score"]
grA = happiness_2021[happiness_2021['Healthy life expectancy'] > 70.33]["Happiness score"]

In [None]:
# One-way ANOVA comparing happiness scores across the five bands.
st.f_oneway(*(grA, grB, grC, grD, grF))

### Using the ANOVA test, H<sub>0</sub> is rejected.

In [None]:
# Kruskal-Wallis H-test (rank-based) on the same five bands.
st.kruskal(*(grA, grB, grC, grD, grF))

### Using the Kruskal-Wallis H-test, H<sub>0</sub> is rejected.

In [None]:
# Happiness score for all countries, one box per distinct
# healthy-life-expectancy value.
happiness_2021.boxplot(column='Happiness score', by='Healthy life expectancy', figsize=(12, 5))
plt.xticks(rotation=90)  # vertical tick labels
plt.ylabel('Happiness Score')
plt.show()

In [None]:
# Happiness score within group F, one box per distinct life-expectancy value.
F.boxplot(column='Happiness score', by='Healthy life expectancy', figsize=(12, 5))
plt.xticks(rotation=90)  # vertical tick labels
plt.ylabel('Happiness Score')
plt.show()

#### Group F - Lowest 22 life expectancy nations<br>Range 51.3 yrs to 59.4 yrs<br>Happiness scores between 3.0 and 5.5

In [None]:
# Happiness score within group D, one box per distinct life-expectancy value.
D.boxplot(column='Happiness score', by='Healthy life expectancy', figsize=(12, 5))
plt.xticks(rotation=90)  # vertical tick labels
plt.ylabel('Happiness Score')
plt.show()

#### Group D - 2nd Lowest 22 life expectancy nations<br>Range 60.0 yrs to 65.3 yrs<br>Happiness scores between 3.5 and 6.5

In [None]:
# Happiness score within group C, one box per distinct life-expectancy value.
C.boxplot(column='Happiness score', by='Healthy life expectancy', figsize=(12, 5))
plt.xticks(rotation=90)  # vertical tick labels
plt.ylabel('Happiness Score')
plt.show()

#### Group C - Middle 22 life expectancy nations<br>Range 65.5 yrs to 67.1 yrs<br>Happiness scores between 4.0 and 7.0 with one stray scoring 2.2

In [None]:
# Happiness score within group B, one box per distinct life-expectancy value.
B.boxplot(column='Happiness score', by='Healthy life expectancy', figsize=(12, 5))
plt.xticks(rotation=90)  # vertical tick labels
plt.ylabel('Happiness Score')
plt.show()

#### Group B - 2nd Highest 22 life expectancy nations<br>Range 67.3 yrs to 70.3 yrs<br>Happiness scores between 4.0 and 7.0

In [None]:
# Happiness score within group A, one box per distinct life-expectancy value.
A.boxplot(column='Happiness score', by='Healthy life expectancy', figsize=(12, 5))
plt.xticks(rotation=90)  # vertical tick labels
plt.ylabel('Happiness Score')
plt.show()

#### Group A - Highest 22 life expectancy nations<br>Range 70.3 yrs to 74.3 yrs<br>Happiness scores between 6.0 and 8.0

In [None]:
# End of John's code

In [None]:
# Start of Amanda's code

In [None]:
# End of Amanda's code

In [None]:
# Start of Nathan's code

In [None]:
# End of Nathan's code

In [None]:
# Start of Joey's code

In [None]:
# End of Joey's code