In [None]:
#from imports import *

In [1]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np


from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
#from sklearn.preprocessing import StandardScaler

from scipy import stats
from scipy.stats import pearsonr, spearmanr

from sklearn.model_selection import train_test_split

# specify decimal places to display
pd.set_option('display.float_format', lambda x: '%.2f' % x)

#import pursuit_of_happiness

<h1 style = "border:10px; border-style:groove; border-color:maroon; padding: 1em; text-align: center;" >  Pursuit of Happyness <br> <img src= "cali.webp" width = "700" align = "center"> </h1>

# Project Planning

- Acquire data from the Codeup Database and store the process as a function for replication. Save the function in a wrangle.py file to import into the Final Report Notebook.

- View data to gain understanding of the dataset and to create the readme.
    
- Create README.md with data dictionary, project and business goals, documentation of the initial hypotheses.
    
- Clean and prepare data for the first iteration through the data pipeline. Store this as a function to automate the process, store the function in the wrangle.py module, and prepare data in Final Report Notebook by importing and using the funtion.

- Clearly define at least two hypotheses, set an alpha, run the statistical tests needed, reject or fail to reject the Null Hypothesis, and document findings and takeaways.
   
- Establish a baseline accuracy and document well.

- Train four different regression models.
    
- Evaluate models on train and validate datasets.
    
- Choose the model with that performs the best and evaluate that single model on the test dataset.    

- Document executive summary, conclusions, takeaways, and next steps in the Final Report Notebook.

- Upload README.md, Data Dictionary, wrangle.py, Project Scratch Notebook, Final Report Notebook

<hr style="border-top: 10px groove lightsteelblue; margin-top: 1px; margin-bottom: 2px"></hr>

# Executive Summary

- The regression models had similar performance, with the GLM Model using Tweedie Regression resulting in the lowest overall Root Mean Squared Error.

    - The features included in this model:
    
        - Area
        - Age 
        - Bedrooms/Bathrooms
   
- Using clustering for feature engineering and feature development proved inconclusive. 

- The model outperformed the baseline accuracy.

- Several insights and statistical testing during the exploratory data analysis revealed that analysis by absolute log error is beneficial for gaining insights into which counties and features produce the most log error volatility.

<h1 style="border-bottom: 10px groove lightsteelblue; margin-top: 1px; margin-bottom: 2px; text-align: left;">
Data Preparation </hr>

#### How it started:

- The original dataframe was ~ 77,381 rows and 67 columns

    - Redundant and unnecessary columns and columns missing entire rows were dropped.
    - Outliers were handled using IQR.
    - Features were added.
    - Some nulls were imputed.
    
#### How it's going:

- In the end, the dataframe before the train, test, and split is 39687 rows and 45 columns.

<h1 style="border-bottom: 10px groove lightsteelblue; margin-top: 1px; margin-bottom: 2px; text-align: left;">
Data Aquisition </hr>

In [2]:
df = pd.read_csv('happy.csv')

df = df.fillna(0)

#train, test, split
train_validate, test = train_test_split(df, test_size = .2, random_state = 123)
train, validate = train_test_split(train_validate, test_size = .3, random_state = 123)

train.head()

Unnamed: 0,country,happiness_rank,happiness_score,gdp_per_cap,social_support,healthy_life_exp,freedom,generosity,corruption_perception,year
806,Taiwan,25.0,6.46,10.78,0.89,69.6,0.77,-0.07,0.73,2020
876,Turkmenistan,95.0,5.12,9.75,0.96,62.21,0.83,0.19,0.88,2020
500,Uruguay,31.0,6.38,1.09,1.46,0.77,0.62,0.13,0.15,2018
1018,China,84.0,5.34,9.67,0.81,69.59,0.9,-0.15,0.76,2021
278,Armenia,121.0,4.36,0.86,0.62,0.64,0.14,0.08,0.04,2016


In [8]:
train.shape

(606, 10)

In [20]:
uber_happy = train[['country', 'happiness_score']].groupby('country').mean().sort_values(by = 'happiness_score', ascending = False)
uber_happy = uber_happy.head(100)

In [21]:
uber_happy.happiness_score.max()

7.682233315890301

In [24]:
so_sad = train[['country', 'happiness_score']].groupby('country').mean().sort_values(by = 'happiness_score', ascending = False)
so_sad = so_sad.tail(100)

In [25]:
so_sad.happiness_score.max()

5.72493342986263

<h1 style="border-bottom: 10px groove lightsteelblue; margin-top: 1px; margin-bottom: 2px; text-align: left;">
Exploratory Analysis/Statistical Testing <br></hr>

#### Initial Questions:

   - What are some of the drivers of happiness? 
   - Are some factors more significant than others?
   - Is it better to be rich in a poor country or poor in a rich country? - Hans Rosling

<hr style="border-top: 5px groove blanchedalmond; margin-top: 1px; margin-bottom: 1px"></hr>

In [3]:
train.describe()

Unnamed: 0,happiness_rank,happiness_score,gdp_per_cap,social_support,healthy_life_exp,freedom,generosity,corruption_perception,year
count,606.0,606.0,606.0,606.0,606.0,606.0,606.0,606.0,606.0
mean,78.31,5.41,3.16,1.01,17.7,0.51,0.15,0.29,2017.95
std,45.15,1.13,3.79,0.31,28.66,0.22,0.16,0.3,1.97
min,1.0,2.52,0.0,0.0,0.0,0.0,-0.26,0.0,2015.0
25%,39.25,4.52,0.77,0.81,0.55,0.36,0.07,0.07,2016.0
50%,78.0,5.37,1.19,0.95,0.76,0.5,0.16,0.14,2018.0
75%,119.0,6.21,7.23,1.26,52.8,0.65,0.25,0.44,2020.0
max,157.0,7.81,11.65,1.62,76.95,0.96,0.82,0.94,2021.0


### Is the happiness score in happy countries significantly higher than the happiness score in sad countries?

Two Sample T-Test

$\alpha$ = .05

$𝐻_{0}$: Happy countries mean happiness score is <= than the mean log error of sad countries.

𝐻𝑎: Happy countries mean happiness score is > than the mean log error of sad countries.

In [26]:
#test for equal variance 
#H0 is that the variances are equal
#Ha is that the variances are not equal
# if p > .05, variances are not significantly different and set argument to equal_var = True
#if p < .05, variances are significantly different and set argument to equal_var = False
#Levene test on two groups

group_1 = so_sad.happiness_score
group_2 = uber_happy.happiness_score

#set alpha
α = 0.05

#perform test to determine variance
f, p = stats.levene(so_sad.happiness_score,
             uber_happy.happiness_score)

#evaluate coefficient and p-value
print(f'Levene\'s F-statistic: {f:.3f}\nP-value: {p:.3f}')


#evaluate if 
if p < α:
    print('Reject the null hypothesis.')
else:
    print('Fail to reject the null hypothesis.')

Levene's F-statistic: 0.184
P-value: 0.669
Fail to reject the null hypothesis.


In [27]:
#set alpha
α = 0.05

#perform test
t, p = stats.ttest_ind(uber_happy.happiness_score, so_sad.happiness_score, equal_var = True)

#print p-value
print(f'P Value: {p/2:.3f}')

#evaluate if mean of the uber_happy countries is significantly higher than so_sad, is p/2 < a and t > 0?
if p/2 < α and t > 0:
    print('Reject the null hypothesis.')
else:
    print('Fail to reject the null hypothesis.')

P Value: 0.000
Reject the null hypothesis.


### 

One Sample T-Test

$\alpha$ = .05

$𝐻_{0}$: Orange County mean log error is <= than the combined mean log errors of Ventura, Los Angeles, and Orange counties.

𝐻𝑎: Orange County mean log error is > than the combined mean log errors of Ventura, Los Angeles, and Orange counties.

In [5]:
#set alpha
α = 0.05

#get sample
happy_sample = train[train.uber_happy].happiness_rank

#get mean
overall_mean = train.happiness_rank.mean()

#perform test
t, p = stats.ttest_1samp(happy_sample, overall_mean)

#print p-value
print(f'P Value: {p/2:.3f}')

#evaluate if mean of countries that scored well on happiness is significantly lower than all of the countries, is p/2 < a and t < 0?
if p/2 < α and t < 0:
    print('Reject the null hypothesis.')
else:
    print('Fail to reject the null hypothesis.')

P Value: 0.000
Reject the null hypothesis.
