In [75]:
#imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler
import statsmodels.api as sm
import numpy as np

In [76]:
# Read the cleaned life-expectancy dataset and preview its first rows.
DATA_FILE = 'Clean_Life_Expectancy_Data.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,Region,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Middle East,11.1,13.0,105.824,1.32,97,65,27.8,97,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5
1,European Union,2.7,3.3,57.9025,10.35,97,94,26.0,97,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
2,Asia,51.5,67.9,201.0765,1.57,60,35,21.2,67,64,0.13,1076,1183.21,27.1,28.0,5.0,0,1,65.4
3,South America,32.8,40.5,222.1965,5.68,93,74,25.3,92,93,0.79,4146,0.75,5.7,5.5,7.9,0,1,67.0
4,Middle East,3.4,4.3,57.951,2.89,97,89,27.0,94,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7


In [77]:
#LifeExpectancy()

In [78]:
# Recreating the full model.
#
# One-hot encode 'Region'. get_dummies already returns a new frame, so no
# prior copy of df is needed. dtype=int keeps the dummy columns numeric:
# pandas 2.x emits bool dummies by default, which can make sm.OLS fail with
# an object-dtype cast error once they are mixed with float columns.
df1 = pd.get_dummies(df, columns = ['Region'], prefix = 'Region', dtype = int)

# Features used in the full model. Their order (after the constant) is the
# column order the fitted model expects at prediction time.
feature_cols = ['Infant_deaths', 'Incidents_HIV', 'GDP_per_capita', 'Region_Asia', 'Region_Central America and Caribbean', 'Region_South America', 'Region_European Union', 'Region_Middle East', 'Region_North America', 'Measles', 'Hepatitis_B']

# Design matrix and target
X = df1[feature_cols]
y = df1['Life_expectancy']

# Train/test split (seeded so the fit is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# The split returns slices of X; take an explicit copy before writing the
# scaled columns back so pandas does not raise SettingWithCopyWarning.
X_train = X_train.copy()

# Columns to be robust scaled
robust_list = ['Infant_deaths', 'Hepatitis_B', 'Measles', 'Incidents_HIV', 'GDP_per_capita']

# Fit the scaler on the training rows only. `rob` stays available so new
# inputs can be transformed with the same centre/scale the model was trained on.
rob = RobustScaler()
X_train[robust_list] = rob.fit_transform(X_train[robust_list])

X_train = sm.add_constant(X_train) # Add the intercept column ('const')
lin_reg = sm.OLS(y_train, X_train) # Ordinary least squares on the training data
results = lin_reg.fit()            # Fitted full model

In [79]:
# Recreating the ethical model (no HIV / vaccination features).
#
# Features used in the model. Their order (after the constant) is the column
# order the fitted model expects at prediction time.
feature_cols2 =['Adult_mortality', 'Infant_deaths', 'GDP_per_capita', 'Region_Central America and Caribbean', 'Region_South America', 'Region_European Union', 'Region_Asia', 'Region_North America', 'Region_Rest of Europe', 'Region_Oceania']

# Design matrix and target. Casting to float makes this cell independent of
# how the Region dummies in df1 were encoded (bool / uint8 / int), so sm.OLS
# always receives a purely numeric frame.
X2 = df1[feature_cols2].astype(float)
y2 = df1['Life_expectancy']

# Train/test split (seeded so the fit is reproducible)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

# The split returns slices of X2; take an explicit copy before writing the
# scaled columns back so pandas does not raise SettingWithCopyWarning.
X_train2 = X_train2.copy()

# Columns to be robust scaled
robust_list2 = ['Infant_deaths', 'Adult_mortality', 'GDP_per_capita']

# Fit the scaler on the training rows only. `rob2` stays available so new
# inputs can be transformed with the same centre/scale the model was trained on.
rob2 = RobustScaler()
X_train2[robust_list2] = rob2.fit_transform(X_train2[robust_list2])

X_train2 = sm.add_constant(X_train2) # Add the intercept column ('const')
lin_reg2 = sm.OLS(y_train2, X_train2) # Ordinary least squares on the training data
results2 = lin_reg2.fit()             # Fitted ethical model

# Life Expectancy Function


In [80]:
def _ask_consent(default):
    """Ask the Y/N consent question and return 'Y' or 'N'.

    An empty answer falls back to `default` (compared case-insensitively).
    If neither the answer nor the fallback is Y/N, the question is repeated.
    """
    question = 'Do you consent to using advanced population data, which may include protected information, for better accuracy? (Y/N)'
    fallback = str(default).upper()
    msg = question
    while True:
        raw = input(msg)
        answer = raw.upper() if raw else fallback       #No input: go with passed value
        if answer in ('Y', 'N'):
            return answer
        msg = 'Please enter either Y or N.\n' + question


def _ask_number(prompt, retry_prompt, upper = None):
    """Prompt until the user types a finite number in [0, upper].

    `upper = None` means no upper bound. `retry_prompt` replaces `prompt`
    after the first rejected entry. Returns the accepted value as a float.
    """
    msg = prompt
    while True:
        raw = input(msg)
        try:
            value = float(raw)
        except ValueError:                              #Not a number at all
            msg = retry_prompt
            continue
        #NaN fails every comparison and +inf fails the strict '<', so both are rejected
        if 0 <= value < float('inf') and (upper is None or value <= upper):
            return value
        msg = retry_prompt


def _ask_region():
    """Prompt until the user picks a region code from 1 to 9; return it as an int."""
    legend = 'Please enter your region according to the legend:\nAsia - 1\nCentral America and Caribbean - 2\nSouth America - 3\n European Union - 4\nRest of Europe - 5\nNorth America - 6\n Oceania - 7\n Middle East - 8\n Other - 9'
    msg = legend
    while True:
        raw = input(msg)
        try:
            choice = int(raw)
        except ValueError:
            choice = None
        if choice is not None and 1 <= choice <= 9:
            return choice
        msg = 'Value must be a whole number between 1 and 9.\n' + legend


def _region_dummies(region_index):
    """Return the one-hot Region_* columns for a region code (9 = Other -> all zeros)."""
    columns = {1: 'Region_Asia',
               2: 'Region_Central America and Caribbean',
               3: 'Region_South America',
               4: 'Region_European Union',
               5: 'Region_Rest of Europe',
               6: 'Region_North America',
               7: 'Region_Oceania',
               8: 'Region_Middle East'}
    return {name: int(code == region_index) for code, name in columns.items()}


def LifeExpectancy(consent = 'N'):                      #Consent set to N by default
    """Interactively collect country indicators and predict life expectancy.

    The user is first asked whether protected/advanced data may be used.
    With consent ('Y') the full model is used, otherwise ('N') the ethical
    model. Every value is read with input() and re-requested until valid.

    Relies on notebook globals created by the model cells: the fitted
    scalers `rob` / `rob2` and the fitted OLS results `results` / `results2`.
    Inputs are transformed with those already-fitted scalers so they are
    scaled exactly like the training data (refitting a scaler on other rows
    would shift the centre/scale and bias the prediction).

    Parameters
    ----------
    consent : str
        Fallback used when the consent question is answered with an empty
        line. Case-insensitive 'Y' or 'N'; any other value simply forces the
        user to answer the question explicitly.

    Returns
    -------
    The object returned by the fitted model's predict() for the single input
    row (a one-element pandas Series for a DataFrame input). The rounded
    prediction is also printed.
    """
    consent = _ask_consent(consent)

    #Gathering data (same question order regardless of which model is used)
    Infant_deaths = _ask_number(
        'Please enter infant deaths per 1000 population',
        'Value must be a number between 0 and 1000.\nPlease enter infant deaths per 1000 population.',
        upper = 1000)
    region_index = _ask_region()
    GDP_per_capita = _ask_number(
        'Please enter GDP per capita ($).',
        'Value must be a number greater than or equal to 0.\nPlease enter GDP per capita ($).')
    #Only the ethical model uses adult mortality; it is still asked in both
    #modes so the sequence of prompts stays the same as before
    Adult_mortality = _ask_number(
        'Please enter adult mortality per 1000 population.',
        'Value must be a number greater than or equal to 0.\nPlease enter adult mortality per 1000 population.')

    #One row holding everything collected so far; each model picks its own columns below
    row = {'const': 1.0,
           'Infant_deaths': Infant_deaths,
           'GDP_per_capita': GDP_per_capita,
           'Adult_mortality': Adult_mortality}
    row.update(_region_dummies(region_index))           #Converting region input into OHC

    if consent == 'Y':
        row['Incidents_HIV'] = _ask_number(
            'Please enter incidents of HIV per 1000 population, aged 15-49 ',
            'Value must be a number between 0 and 1000.\nPlease enter incidents of HIV per 1000 population, aged 15-49.',
            upper = 1000)
        row['Measles'] = _ask_number(
            'Please enter % of coverage of Measles containing vaccine first dose (MCV1) immunization among 1-year-olds.',
            'Value must be a number between 0 and 100.\nPlease enter % of coverage of Measles containing vaccine first dose (MCV1) immunization among 1-year-olds.',
            upper = 100)
        row['Hepatitis_B'] = _ask_number(
            'Please enter % of coverage of Hepatitis B (HepB3) immunization among 1-year-olds.',
            'Value must be a number between 0 and 100.\nPlease enter % of coverage of Hepatitis B (HepB3) immunization among 1-year-olds.',
            upper = 100)
        scaler, fitted, label = rob, results, 'full'
        #Same columns, same order, as the scaler was fitted with
        scaled_cols = ['Infant_deaths', 'Hepatitis_B', 'Measles', 'Incidents_HIV', 'GDP_per_capita']
    else:
        scaler, fitted, label = rob2, results2, 'ethical'
        scaled_cols = ['Infant_deaths', 'Adult_mortality', 'GDP_per_capita']

    #Grouping data, scaling and predicting
    df_input = pd.DataFrame(row, index = [0])
    df_input[scaled_cols] = scaler.transform(df_input[scaled_cols])
    #predict() matches columns by position, not by name: select exactly the
    #fitted model's regressors ('const' included) in the fitted order
    df_input = df_input[list(fitted.model.exog_names)]
    pred_age = fitted.predict(df_input)
    print(f'The predicted life expectancy under the {label} model is {np.round(pred_age[0], 2)}')      #print and return predicted age
    return pred_age

In [81]:
# Run the interactive predictor. No argument is passed, so consent falls back
# to 'N' unless the user answers Y or N at the first prompt.
LifeExpectancy()

The predicted life expectancy under the full model is 78.08


0    78.079831
dtype: float64