In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy.linalg import toeplitz
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.tools.tools import add_constant

# Building GLS model for each country's status

With so much autocorrelation, the best solution should be to apply the GLS (Generalized Least Squares) method.

We saw with the OLS model that fitting a separate model for each group of countries, based on their status, performed better (the AIC was lower, even if R2 was also smaller but still acceptable), so we should apply the GLS model to each country status separately.

## 1. Developed countries

In [None]:
# Load the developed-countries dataset and preview its first rows.
df_dev = pd.read_csv('../data/expectancy_dev.csv')
df_dev.head()

### 1.1 Build the model

In [None]:
# Target is life expectancy; the features are every other column except the status label.
y_dev = df_dev['life_expectancy']
X_dev = df_dev.drop(['life_expectancy', 'status'], axis=1)

In [None]:
# Step 1: fit an OLS model (with an added constant) and keep only its residuals.
ols_resid = sm.OLS(endog=y_dev, exog=add_constant(X_dev)).fit().resid

In [None]:
# Step 2: regress each residual on the previous one (no intercept).
# The fitted slope is used as the autocorrelation coefficient rho.
resid_fit = sm.OLS(endog=ols_resid.values[1:], exog=ols_resid.values[:-1]).fit()
rho = resid_fit.params
rho

In [None]:
# Step 3: build the error covariance structure used by GLS.
# `order` holds the distance |i - j| between rows i and j, so entry (i, j) of
# `sigma` is rho**|i - j|: correlation decays with the distance in row order.
order = toeplitz(np.arange(X_dev.shape[0]))
sigma = rho**order
sigma

In [None]:
# Step 4: fit the GLS model with the estimated error covariance.
# NOTE(review): unlike the OLS step that produced the residuals, no constant is
# added to X_dev here, so this fit has no intercept unless X_dev already
# contains one -- confirm this is intended.
gls_model = sm.GLS(endog=y_dev, exog=X_dev, sigma=sigma)
gls_results = gls_model.fit()
gls_results.summary()

### 1.2 Phacking and model improvement

In [None]:
# Redefining the p-hacking helper so that it refits a GLS model after dropping a column

def gls_phacking(column, X, y, cov=None):
    """Optionally drop column(s) from X, fit a GLS model and display its summary.

    Parameters
    ----------
    column : label or list of labels
        Column(s) to remove from ``X`` before fitting. A falsy value
        (``None``, ``''``, ``[]``) leaves ``X`` unchanged.
    X : pandas.DataFrame
        Design matrix, used as given (no constant is added here).
    y : array-like
        Response variable passed to ``sm.GLS``.
    cov : optional
        Error covariance forwarded to ``sm.GLS`` as ``sigma``. When omitted,
        the notebook-level ``sigma`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the dropped column(s), so the caller can keep iterating
        on the reduced design matrix.
    """
    if column:
        # DataFrame.drop returns a new frame; the caller's object is not mutated.
        X = X.drop(column, axis=1)

    # Fall back to the notebook-level covariance only when none is supplied.
    error_cov = sigma if cov is None else cov

    gls_fit = sm.GLS(y, X, sigma=error_cov).fit()

    display(gls_fit.summary())

    return X

In [None]:
# Keep track of the columns removed from the model, in the order they are dropped
dropped_cols = list()

In [None]:
# Record the next column to remove and echo it back.
dropped_cols.extend(['polio'])
dropped_cols[-1]

In [None]:
# Drop the most recently recorded high p-value column and refit the GLS model.
X_dev = gls_phacking(column=dropped_cols[-1], X=X_dev, y=y_dev)

### 1.3 Checking Multicollinearity (assumptions)

Checking the Variance Inflation Factor (VIF) for each parameter. The threshold is 10: if a parameter's VIF is above 10, we should drop that parameter.

In [None]:
def drop_check_vif(column, X, threshold=10):
    """Optionally drop column(s) from X, then display the VIFs above a threshold.

    Parameters
    ----------
    column : label or list of labels
        Column(s) to remove from ``X`` first. A falsy value (``None``, ``''``,
        ``[]``) leaves ``X`` unchanged.
    X : pandas.DataFrame
        Design matrix whose columns are checked for multicollinearity.
    threshold : number, default 10
        Only columns whose VIF is strictly greater than this are displayed.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the dropped column(s).
    """
    if column:
        X = X.drop(column, axis=1)

    # Convert once instead of on every iteration of the comprehension.
    # NOTE(review): no constant column is added before computing the VIFs;
    # check the statsmodels VIF notes on whether one is expected.
    design = X.values
    vifs = pd.Series(
        [VIF(design, i) for i in range(design.shape[1])],
        index=X.columns,
    )
    display(vifs[vifs > threshold])
    return X

In [None]:
# Record the next column to remove and echo it back.
dropped_cols.extend(['schooling'])
dropped_cols[-1]

In [None]:
# Drop the most recently recorded column and show the VIFs still above the threshold.
X_dev = drop_check_vif(column=dropped_cols[-1], X=X_dev)

In [None]:
# Refit the GLS model on the reduced feature set, outside of the helper,
# so the fitted model and its results stay available in the notebook namespace.
gls_model = sm.GLS(endog=y_dev, exog=X_dev, sigma=sigma)
gls_results = gls_model.fit()
gls_results.summary()

In [None]:
"""
I should check the assumptions for GLS; I am not sure whether VIF needs to be checked, or whether the GLS model
should be built after working on the first OLS model and satisfying its assumptions, except for autocorrelation.
"""