# Creating a Model of Life Expectancy with Non-sensitive Data

### Feature engineering and model building follow the core model created previously; we repeat the same steps to choose which features to include in this non-sensitive model.

In [35]:
# Import necessary packages
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.tools
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pickle

In [36]:
# Load the cleaned life-expectancy dataset from the working directory
df = pd.read_csv(filepath_or_buffer='Life-Expectancy-Data-Updated.csv')
df.head(n=5)  # Preview the first five rows

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,...,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Turkiye,Middle East,2015,11.1,13.0,105.824,1.32,97,65,27.8,...,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5
1,Spain,European Union,2015,2.7,3.3,57.9025,10.35,97,94,26.0,...,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
2,India,Asia,2007,51.5,67.9,201.0765,1.57,60,35,21.2,...,64,0.13,1076,1183.21,27.1,28.0,5.0,0,1,65.4
3,Guyana,South America,2006,32.8,40.5,222.1965,5.68,93,74,25.3,...,93,0.79,4146,0.75,5.7,5.5,7.9,0,1,67.0
4,Israel,Middle East,2012,3.4,4.3,57.951,2.89,97,89,27.0,...,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7


In [37]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched
sen_df = df.copy(deep=True)

# Removing columns that can be regarded as sensitive for the user
The following columns are removed because they contain sensitive information about alcohol usage and diseases:
'Alcohol_consumption', 'Measles', 'Diphtheria', 'Polio', 'Incidents_HIV', 'Hepatitis_B'.
'Country' and 'Year' are dropped in the same step.

In [38]:
# Drop the sensitive columns, plus 'Country' and 'Year', which are not used as features.
# The result is rebuilt from `df` (no inplace=True) so this cell can be re-run
# without raising a KeyError for columns that were already removed.
sen_df = df.drop(columns=['Country','Year', 'Alcohol_consumption', 'Measles', 'Diphtheria', 'Polio', 'Incidents_HIV', 'Hepatitis_B'])

In [39]:
# Display the columns that remain in sen_df after the drop
sen_df.columns

Index(['Region', 'Infant_deaths', 'Under_five_deaths', 'Adult_mortality',
       'BMI', 'GDP_per_capita', 'Population_mln',
       'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Schooling',
       'Economy_status_Developed', 'Economy_status_Developing',
       'Life_expectancy'],
      dtype='object')

# Train/Test Split and Feature Engineering

In [40]:
# Every column except the target 'Life_expectancy' is a candidate feature
feature_cols = [col for col in sen_df.columns if col != 'Life_expectancy']

# Feature matrix (X) and target vector (y)
X = sen_df.loc[:, feature_cols]
y = sen_df.loc[:, 'Life_expectancy']

In [41]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Define feature engineering function
def feature_eng(df, fit_on = None):
    """Encode, scale and add an intercept column to a feature frame.

    Steps, in order:
      1. one-hot encode 'Region' (columns prefixed 'Region_'),
      2. MinMax-scale 'BMI' and 'Schooling',
      3. Robust-scale the mortality, GDP, population and thinness columns,
      4. prepend a 'const' column of ones with sm.add_constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is copied first, so the caller's frame is
        left unmodified.
    fit_on : pandas.DataFrame, optional
        Frame the scalers are fitted on. Defaults to `df` itself, which is
        the original behaviour. Pass the training frame when transforming
        validation/test data so that it is scaled with the training
        statistics instead of its own.

    Returns
    -------
    pandas.DataFrame
        The engineered frame, with 'const' as its first column.
    """
    df = df.copy() # Copy dataframe
    if fit_on is None:
        fit_on = df # Fit the scalers on the frame being transformed

    # One hot encoding.
    # dtype=int keeps the dummies numeric: newer pandas versions default to
    # bool, which turns the design matrix into an object array for statsmodels.
    df = pd.get_dummies(df, columns = ['Region'], prefix = 'Region', dtype = int)

    # MinMax scaling
    minmax_list = ['BMI', 'Schooling']
    minmax = MinMaxScaler().fit(fit_on[minmax_list]) # Fit scaler on the reference frame
    df[minmax_list] = minmax.transform(df[minmax_list])

    # Robust scaling
    robust_list = ['Infant_deaths',
                   'Under_five_deaths',
                   'Adult_mortality',
                   'GDP_per_capita',
                   'Population_mln',
                   'Thinness_ten_nineteen_years',
                   'Thinness_five_nine_years',]
    rob = RobustScaler().fit(fit_on[robust_list]) # Fit scaler on the reference frame
    df[robust_list] = rob.transform(df[robust_list])

    # Add constant.
    # has_constant='add' forces the intercept column even when some feature
    # happens to be constant in this frame (e.g. a single-row input); the
    # default 'skip' would silently omit 'const' in that case.
    df = sm.add_constant(df, has_constant = 'add')

    return df

In [43]:
# Apply the feature-engineering pipeline to the training features
X_train_fe = feature_eng(df=X_train)
X_train_fe.head(n=5) # Preview the first 5 engineered rows

Unnamed: 0,const,Infant_deaths,Under_five_deaths,Adult_mortality,BMI,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,...,Economy_status_Developing,Region_Africa,Region_Asia,Region_Central America and Caribbean,Region_European Union,Region_Middle East,Region_North America,Region_Oceania,Region_Rest of Europe,Region_South America
2026,1.0,-0.298246,-0.244969,-0.369456,0.252033,-0.045581,0.564994,2.051724,2.017241,0.75969,...,1,0,1,0,0,0,0,0,0,0
651,1.0,-0.403509,-0.327209,-0.34836,0.552846,0.85527,0.0932,-0.206897,-0.189655,0.813953,...,0,0,0,0,1,0,0,0,0,0
2225,1.0,-0.110276,-0.092738,-0.147051,0.552846,1.027696,0.977926,-0.293103,-0.310345,0.689922,...,1,0,0,0,0,0,0,0,0,1
2357,1.0,-0.200501,-0.174978,-0.581719,0.512195,-0.055739,-0.231884,-0.327586,-0.310345,0.635659,...,1,0,0,0,0,0,0,0,1,0
670,1.0,0.588972,0.894138,2.319636,0.276423,-0.079962,-0.277815,1.87931,1.896552,0.364341,...,1,1,0,0,0,0,0,0,0,0


In [44]:
# Feature engineer X-test
# NOTE(review): feature_eng(X_test) fits its scalers on the frame it receives,
# so the test set is scaled with its own statistics rather than the training
# set's - consider reusing scalers fitted on X_train.
# The reindex aligns the test design matrix with the training one: a Region
# dummy that is absent from the test split is added as all zeros, and any
# column unseen during training is discarded, so later column selections
# cannot fail with a KeyError.
X_test_fe = feature_eng(X_test).reindex(columns = X_train_fe.columns, fill_value = 0)
X_test_fe.head() # Check first 5 rows

Unnamed: 0,const,Infant_deaths,Under_five_deaths,Adult_mortality,BMI,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,...,Economy_status_Developing,Region_Africa,Region_Asia,Region_Central America and Caribbean,Region_European Union,Region_Middle East,Region_North America,Region_Oceania,Region_Rest of Europe,Region_South America
1590,1.0,-0.131868,-0.111969,0.102023,0.603306,3.081819,-0.374263,-0.173077,-0.169811,0.751938,...,1,0,0,1,0,0,0,0,0,0
1752,1.0,-0.417582,-0.34749,-0.726046,0.595041,3.930425,3.07185,-0.5,-0.54717,0.891473,...,0,0,0,0,0,0,0,0,1,0
772,1.0,0.793956,0.936293,0.415246,0.31405,-0.297444,-0.383914,0.807692,0.792453,0.24031,...,1,0,0,1,0,0,0,0,0,0
1735,1.0,1.81044,2.206564,0.940366,0.181818,-0.315033,-0.023592,1.211538,1.169811,0.108527,...,1,1,0,0,0,0,0,0,0,0
387,1.0,-0.406593,-0.339768,-0.643476,0.578512,1.746186,0.19571,-0.5,-0.490566,0.651163,...,0,0,0,0,1,0,0,0,0,0


# Drop columns based on VIF and p-values
## Dropping columns by their VIF

In [45]:
# Define function that drops columns if VIF scores are higher than 5 in a stepwise manner
def calculate_vif(X, thresh = 5.0):
    """Iteratively drop the column with the highest VIF until none exceeds `thresh`.

    On every pass the variance inflation factor of each remaining column is
    computed against all other remaining columns. If the largest VIF is above
    `thresh`, that column is removed and the process repeats.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    thresh : float, default 5.0
        Columns are dropped while the largest VIF is strictly greater than
        this value.

    Returns
    -------
    pandas.DataFrame
        `X` restricted to the columns that were kept. A frame without columns
        is returned unchanged.
    """
    variables = list(range(X.shape[1])) # Positional indices of the columns still kept
    dropped = True
    # Also stop when no column is left, otherwise max() would fail on an empty list
    while dropped and variables:
        dropped = False
        remaining = X.iloc[:, variables] # Slice once per pass instead of once per column
        values = remaining.values
        # Gather the VIF value of every remaining column
        vif = [variance_inflation_factor(values, ix) for ix in range(values.shape[1])]

        max_vif = max(vif)
        maxloc = vif.index(max_vif) # Position of the (first) highest VIF value
        if max_vif > thresh:
            print('dropping \'' + remaining.columns[maxloc] +
                  '\' at index: ' + str(maxloc))
            del variables[maxloc] # Remove the offending column from the kept set
            dropped = True # Something was dropped, so run another pass

            print('Remaining variables:')
            print(X.columns[variables]) # Finally, print the variables that are still in our set
    return X.iloc[:, variables] # Return our X cut down to the remaining variables

In [46]:
# Columns screened by VIF: the engineered features, without the 'const' column
vif_candidate_cols = [
    'Infant_deaths', 'Under_five_deaths', 'Adult_mortality',
    'BMI', 'GDP_per_capita', 'Population_mln',
    'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Schooling',
    'Economy_status_Developed', 'Economy_status_Developing',
    'Region_Africa', 'Region_Asia', 'Region_Central America and Caribbean',
    'Region_European Union', 'Region_Middle East', 'Region_North America',
    'Region_Oceania', 'Region_Rest of Europe', 'Region_South America',
]
X_train_fe_2 = calculate_vif(X_train_fe.loc[:, vif_candidate_cols])

  vif = 1. / (1. - r_squared_i)


dropping 'Economy_status_Developed' at index: 9
Remaining variables:
Index(['Infant_deaths', 'Under_five_deaths', 'Adult_mortality', 'BMI',
       'GDP_per_capita', 'Population_mln', 'Thinness_ten_nineteen_years',
       'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developing',
       'Region_Africa', 'Region_Asia', 'Region_Central America and Caribbean',
       'Region_European Union', 'Region_Middle East', 'Region_North America',
       'Region_Oceania', 'Region_Rest of Europe', 'Region_South America'],
      dtype='object')
dropping 'Under_five_deaths' at index: 1
Remaining variables:
Index(['Infant_deaths', 'Adult_mortality', 'BMI', 'GDP_per_capita',
       'Population_mln', 'Thinness_ten_nineteen_years',
       'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developing',
       'Region_Africa', 'Region_Asia', 'Region_Central America and Caribbean',
       'Region_European Union', 'Region_Middle East', 'Region_North America',
       'Region_Oceania', 'Region_Re

In [47]:
# Re-attach the intercept column, which was left out of the VIF screening
X_train_fe_2 = sm.add_constant(X_train_fe_2, prepend=True, has_constant='skip')

In [48]:
# Display the first 5 rows of the VIF-filtered training features (with 'const' added back)
X_train_fe_2.head()

Unnamed: 0,const,Infant_deaths,Adult_mortality,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Region_Asia,Region_Central America and Caribbean,Region_European Union,Region_Middle East,Region_North America,Region_Oceania,Region_Rest of Europe,Region_South America
2026,1.0,-0.298246,-0.369456,-0.045581,0.564994,2.051724,1,0,0,0,0,0,0,0
651,1.0,-0.403509,-0.34836,0.85527,0.0932,-0.206897,0,0,1,0,0,0,0,0
2225,1.0,-0.110276,-0.147051,1.027696,0.977926,-0.293103,0,0,0,0,0,0,0,1
2357,1.0,-0.200501,-0.581719,-0.055739,-0.231884,-0.327586,0,0,0,0,0,0,1,0
670,1.0,0.588972,2.319636,-0.079962,-0.277815,1.87931,0,0,0,0,0,0,0,0


## Dropping columns by their p-value

In [49]:
# Define function that drops columns by their p-value in a stepwise manner
def stepwise_selection(X, y, threshold_in = 0.01, threshold_out = 0.05, verbose = True):
    """Forward-backward stepwise feature selection based on OLS p-values.

    Each iteration first tries to add the excluded column with the lowest
    p-value (if it is below `threshold_in`), then removes the included column
    with the highest p-value (if it is above `threshold_out`). The loop ends
    when an iteration neither adds nor removes a column.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target values, passed to sm.OLS as the dependent variable.
    threshold_in : float, default 0.01
        A column is added only if its p-value is below this value.
    threshold_out : float, default 0.05
        A column is removed if its p-value rises above this value. Keep
        threshold_in < threshold_out, otherwise a column whose p-value lies
        between the two could be added and removed again on every iteration.
    verbose : bool, default True
        Print every addition and removal.

    Returns
    -------
    list
        Names of the selected columns, in the order they were added.
    """
    included = [] # This is going to be the list of features we keep

    while True:
        changed = False

        # Forward step.
        # Candidates are taken in X's column order (not from a set) so that
        # ties between equal p-values are broken the same way on every run.
        excluded = [column for column in X.columns if column not in included]
        new_pval = pd.Series(index = excluded, dtype = 'float64') # Create empty series
        for new_column in excluded: # Iterate through each excluded column
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit() # Fit model using included columns and new_column
            new_pval[new_column] = model.pvalues[new_column] # Put p-value of each column into series
        best_pval = new_pval.min() # NaN when nothing is left to add, so the comparison below is False
        # Add the feature with the lowest (best) p-value under the threshold to our 'included' list
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin() # 'Lowest' p-value
            included.append(best_feature) # Append feature to 'included' list
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval)) # Specifying the verbose text

        # Backward step: removing features if new features added to the list make them statistically insignificant
        model = sm.OLS(y, sm.add_constant(X[included])).fit() # Fit model using all included columns
        # Use all coefs except the intercept. It is excluded by name rather
        # than by position, because when X itself carries a 'const' column the
        # intercept is not necessarily the first coefficient.
        pvalues = model.pvalues.drop('const', errors = 'ignore')
        worst_pval = pvalues.max() # NaN if pvalues is empty, so the comparison below is False
        # If the p-value exceeds the upper threshold, the feature will be dropped from the 'included' list
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval)) # Specifying the verbose text
        if not changed:
            break
    return included

In [50]:
# Stepwise p-value selection over every column kept after the VIF screening
final_columns = stepwise_selection(X_train_fe_2, y_train)
print('resulting features:')
print(final_columns)

Add  const                          with p-value 0.0
Add  Adult_mortality                with p-value 0.0
Add  Infant_deaths                  with p-value 0.0
Add  GDP_per_capita                 with p-value 2.62469e-74
Add  Region_Central America and Caribbean with p-value 2.49218e-29
Add  Region_South America           with p-value 5.48137e-23
Add  Region_European Union          with p-value 3.18086e-22
Add  Region_Asia                    with p-value 5.40027e-14
Add  Region_North America           with p-value 8.75508e-15
Add  Region_Rest of Europe          with p-value 3.07946e-15
Add  Region_Oceania                 with p-value 0.0058721
resulting features:
['const', 'Adult_mortality', 'Infant_deaths', 'GDP_per_capita', 'Region_Central America and Caribbean', 'Region_South America', 'Region_European Union', 'Region_Asia', 'Region_North America', 'Region_Rest of Europe', 'Region_Oceania']


In [52]:
# Keep only the columns that survived both the VIF and the p-value screening
X_train_fe_final = X_train_fe_2.loc[:, final_columns]

In [53]:
# Fit an OLS regression of y_train on the selected training features
lin_reg = sm.OLS(endog=y_train, exog=X_train_fe_final)
results = lin_reg.fit()
results.summary() # Display the regression summary table

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.98
Model:,OLS,Adj. R-squared:,0.98
Method:,Least Squares,F-statistic:,11340.0
Date:,"Mon, 10 Jul 2023",Prob (F-statistic):,0.0
Time:,10:41:44,Log-Likelihood:,-3904.2
No. Observations:,2291,AIC:,7830.0
Df Residuals:,2280,BIC:,7894.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,70.5172,0.067,1059.621,0.000,70.387,70.648
Adult_mortality,-6.4329,0.060,-106.619,0.000,-6.551,-6.315
Infant_deaths,-5.4887,0.074,-74.113,0.000,-5.634,-5.343
GDP_per_capita,0.4880,0.024,20.154,0.000,0.441,0.536
Region_Central America and Caribbean,1.9015,0.109,17.455,0.000,1.688,2.115
Region_South America,1.7785,0.124,14.386,0.000,1.536,2.021
Region_European Union,1.4385,0.108,13.302,0.000,1.226,1.651
Region_Asia,0.8358,0.093,9.026,0.000,0.654,1.017
Region_North America,1.9310,0.223,8.658,0.000,1.494,2.368

0,1,2,3
Omnibus:,1.6,Durbin-Watson:,2.037
Prob(Omnibus):,0.449,Jarque-Bera (JB):,1.58
Skew:,-0.007,Prob(JB):,0.454
Kurtosis:,3.128,Cond. No.,15.1


In [54]:
# Evaluate the fitted model on the training data.
# (The former leading results.summary() call was removed: its value was
# discarded because it was not the last expression of the cell.)
y_pred = results.predict(X_train_fe_final)  # In-sample predictions
rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)  # Training RMSE
print(f'RMSE: {rmse}')

RMSE: 1.3300564619503905


- All p-values are < 0.05
- Condition number: 15.1 (no sign of multicollinearity)
- Training RMSE: 1.33

# Test Model on Test Data

In [55]:
# Predict on the test set using only the selected features, then score with RMSE
y_test_pred = results.predict(X_test_fe.loc[:, final_columns])
rmse = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)
print(f'RMSE: {rmse}')

RMSE: 1.6190894476146727
