In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the life-expectancy CSV from the sibling data directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/life_expectancy_data_cleaned.csv')
df.head(n=5)

In [None]:
# Regression target is the `life_expectancy` column; every other column is a predictor.
y = df['life_expectancy']
X = df.drop('life_expectancy', axis=1)

In [None]:
# Ordinary least squares of y on X, with an intercept column added by add_constant.
model = sm.OLS(endog=y, exog=add_constant(X))

In [None]:
# Estimate the coefficients; the results object is reused by the cells that follow.
model_fit = model.fit()

In [None]:
# Show the regression summary table for the model fitted on all predictors.
model_fit.summary()

## P-Hacking and model improvement

In [None]:
# Helper: optionally drop predictor column(s), refit the OLS model and show its summary

def phacking(column,X):
    """Refit the OLS regression after optionally removing ``column`` from ``X``.

    The target is read from the notebook-level variable ``y``. An intercept is
    added with ``add_constant`` before fitting, and the summary of the fitted
    model is rendered with ``display``.

    Parameters
    ----------
    column : label or list of labels
        Column(s) to remove from ``X``. A falsy value (e.g. ``''``) leaves
        ``X`` untouched.
    X : pandas.DataFrame
        Predictor matrix.

    Returns
    -------
    pandas.DataFrame
        ``X`` without ``column``, or ``X`` itself when nothing was dropped.
    """
    predictors = X.drop(columns=column) if column else X
    fitted = sm.OLS(y, add_constant(predictors)).fit()
    display(fitted.summary())
    return predictors

# Testing the function: an empty column name refits on all predictors.
# The trailing ';' keeps the returned DataFrame from being echoed under the summary.

phacking('',X);

In [None]:
# Running list of predictor columns removed from the model so far.
dropped_cols = list()

In [None]:
# Queue 'alcohol' for removal. The membership check keeps this cell idempotent:
# re-running it no longer appends the same column a second time.
if 'alcohol' not in dropped_cols:
    dropped_cols.append('alcohol')
dropped_cols[-1]

In [None]:
# Rebuild the predictors from `df` and drop every queued column at once.
# Dropping only the last queued column from an already-reduced X raised KeyError
# when the cell was run a second time; starting from `df` makes it re-runnable.
X = phacking(dropped_cols, df.drop(columns='life_expectancy'))

In [None]:
"""
Summary of the model:

The p-value of the F-statistic is under 0.05, so we have a small chance of being wrong
if we assume the parameters are not all equal to 0 at the same time.

All our parameters have a p-value under 0.05, which means we have a small chance of being wrong by keeping them.

R-squared value is high (81.7%) and increased slightly when we removed the parameters.

AIC and BIC are almost zero so we can assume they are low. 

We still have some warnings displayed at the end of the model, especially about multicollinearity,
so we should check our assumptions and transform our data if needed.

"""

## Quick check of Assumptions

In [None]:
# Refit outside of the helper so the results object itself is kept, then persist it

model = sm.OLS(endog=y, exog=add_constant(X))
model_fit = model.fit()
model_fit.save(fname='fitted_model_1.pickle')
model_fit.summary()

In [None]:
# In-sample predictions from the fitted model on the constant-augmented predictors

y_pred = model_fit.predict(exog=add_constant(X))
y_pred

In [None]:
# Checking if predictions seem to be linear: actual values on x, predictions on y.
# Labels and a title are set so the figure can be read on its own.
plt.scatter(y, y_pred)
plt.xlabel('Actual life expectancy (y)')
plt.ylabel('Predicted life expectancy (y_pred)')
plt.title('Predicted vs. actual values');

In [None]:
# Mean of the residuals (actual minus predicted)
y.sub(y_pred).mean()

In [None]:
# Checking how residuals are spread across observations.
# The dashed line at 0 marks a perfect prediction; labels make the axes explicit.
plt.plot(y - y_pred)
plt.axhline(0, color='grey', linestyle='--')
plt.xlabel('Observation index')
plt.ylabel('Residual (y - y_pred)')
plt.title('Residuals by observation');

In [None]:
# Residuals of the fitted model (actual minus predicted).
resid = y - y_pred
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density scale
# is the documented replacement and draws the same kind of figure.
sns.histplot(resid, kde=True, stat='density')
plt.xlabel('Residual (y - y_pred)')
plt.title('Distribution of residuals');

In [None]:
"""
Summary of first checking: 

The predictions seem to be linear, even if we can still see some outliers at the beginning.

The average residual value is almost 0, so our errors seem to be minimized.
In the meantime, we can see the noise is not very regular and we can clearly identify outliers.

Then, we can see the residuals look normally distributed, but we should still confirm that with a hypothesis test.

Finally, we should check the assumptions mathematically. 

"""

## Iteration on model-1

Following a check of assumptions on Model-1 ([assumptions-model-1.ipynb](assumptions-model-1.ipynb)), the model doesn't meet the assumptions, so we should iterate to create a new model and check the assumptions again.

In [None]:
# Work on a deep copy so that changes made to X2 leave X untouched.
X2 = X.copy(deep=True)