In [1]:
# Importing the relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Use the train-test-split functionality from sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Use statsmodels for both the model & its evaluation
import statsmodels.api as sm # Where we'll get the model from
import statsmodels.tools     # Get the evaluation metrics
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

from IPython.display import clear_output

# Code for the function

In [2]:
# Column lists for the two models. 'const' (the intercept column) is always first;
# the prompting code skips it by slicing from index 1.

# Columns selected from the engineered frame for the complete model.
main_columns = [
    'const',
    'Infant_deaths',
    'Under_five_deaths',
    'Adult_mortality',
    'Economy_status_Developed',
    'Schooling',
    'BMI',
    'Incidents_HIV',
    'Thinness_ten_nineteen_years',
]

# Columns the user is asked for when the adjusted model is chosen ('Region' is still raw text here).
adj_columns = [
    'const',
    'Adult_mortality',
    'GDP_per_capita',
    'BMI',
    'Region',
]

# Columns selected for the adjusted model once 'Region' has been expanded into 'cut_*' indicators.
adj_columns_output = [
    'const',
    'Adult_mortality',
    'GDP_per_capita',
    'BMI',
    'cut_Asia',
    'cut_Central America and Caribbean',
    'cut_European Union',
    'cut_Middle East',
    'cut_North America',
    'cut_Oceania',
    'cut_Rest of Europe',
    'cut_South America',
]

In [3]:
# World regions in alphabetical order. The position in this list is what the
# numbered Region menu shows to the user, so the order must not change.
regions = [
    'Africa',
    'Asia',
    'Central America and Caribbean',
    'European Union',
    'Middle East',
    'North America',
    'Oceania',
    'Rest of Europe',
    'South America',
]

In [4]:
# Path of the CSV file holding the dataset (relative to the notebook's working directory).
location = "Life Expectancy Data.csv"

In [5]:
# ANSI escape codes for emphasising printed text, adapted from
# https://stackoverflow.com/questions/8924173/how-can-i-print-bold-text-in-python
class color:
    '''Namespace of ANSI escape sequences; wrap text as color.X + text + color.END.'''

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # Resets every attribute set above
    END = '\033[0m'
# Adapted from https://stackoverflow.com/questions/26704806/python-string-join-list-last-entry-with-and
def join_words(words):
    '''
    input(s):
    words (list of str): the items to list in a sentence.

    output(s):
    (str): the items joined as English text followed by a verb and a trailing space.

    Description:
    Three or more items give 'a, b, and c were ', two items give 'a and b were ',
    and fewer than two items give the joined text followed by ' was '.
    '''
    count = len(words)
    if count < 2:
        # Singular (or empty) list takes the singular verb.
        return ' and '.join(words) + ' was '
    if count == 2:
        return ' and '.join(words) + ' were '
    leading = ', '.join(words[:-1])
    return f'{leading}, and {words[-1]!s}' + ' were '

## Model from external notebooks

In [6]:

def data_collection(location):
    '''
    input(s):
    location (str): the location of the CSV file, passed to pandas.read_csv.

    output(s):
    The value returned by splitting() for the loaded data, i.e. the tuple
    (X_train, X_test, y_train, y_test).

    Description:
    Reads the file into a dataframe and hands it straight to splitting().
    '''
    return splitting(pd.read_csv(location))

def splitting(df):
    '''
    input(s):
    df (pandas.DataFrame): the complete set of data, including 'Life_expectancy'.

    output(s):
    X_train (pandas.DataFrame): training features.
    X_test (pandas.DataFrame): test features.
    y_train (pandas.Series): training target ('Life_expectancy').
    y_test (pandas.Series): test target ('Life_expectancy').

    Description:
    Separates the target column from the features and makes an 80/20
    train/test split with a fixed random_state so the split is repeatable.
    '''
    target_name = 'Life_expectancy'
    features = df.drop(columns=target_name)
    target = df[target_name]
    train_X, test_X, train_y, test_y = train_test_split(
        features, target, test_size = 0.2, random_state = 43
    )
    return train_X, test_X, train_y, test_y

def comp_feature_eng(df):
    '''
    input(s):
    df (pandas.DataFrame): feature data; must contain 'Adult_mortality',
        'Infant_deaths' and 'BMI' and must not already contain 'const'.

    output(s):
    (pandas.DataFrame): a copy of df with a leading 'const' column of ones and
        the three columns above replaced by their RobustScaler-transformed values.

    Description:
    Feature engineering for the complete model. The input frame is left
    untouched. The scaler is fitted on the same rows it transforms.
    Only these three columns are scaled; per an earlier note in this function,
    'Population_mln' and 'GDP_per_capita' were removed from the scaled set.
    '''
    scaled_cols = ['Adult_mortality', 'Infant_deaths', 'BMI']

    engineered = df.copy()
    engineered.insert(loc=0, column='const', value=1)  # intercept column for OLS

    # Fit and apply the robust scaler in one step.
    engineered[scaled_cols] = RobustScaler().fit_transform(engineered[scaled_cols])
    return engineered

def comp_fit_feature_eng(df,X_train):
    '''
    input(s):
    df (pandas.DataFrame): the row(s) to predict for; must contain the
        complete-model columns except 'const'.
    X_train (pandas.DataFrame): the training features; expected to be the same
        frame the complete model was fitted on.

    output(s):
    (pandas.DataFrame): a copy of df with a leading 'const' column of ones,
        the scaled columns transformed, restricted to main_columns.

    Description:
    Prediction-time feature engineering for the complete model. The
    RobustScaler is fitted on X_train only, so new rows are scaled with the
    same centre and spread that comp_feature_eng applies to the training data.
    Every row of df is transformed, so the function works for one or many rows.
    '''
    scaled_cols = ['Adult_mortality', 'Infant_deaths', 'BMI']

    df_fe = df.copy()
    df_fe.insert(loc=0, column='const', value=1)  # intercept column for OLS

    rob = RobustScaler()
    rob.fit(X_train[scaled_cols])  # training statistics only; the new rows must not shift them

    df_fe[scaled_cols] = rob.transform(df_fe[scaled_cols])
    return df_fe[main_columns]


def adj_feature_eng(df):
    '''
    input(s):
    df (pandas.DataFrame): feature data; must contain 'Region',
        'GDP_per_capita', 'Adult_mortality' and 'BMI' and must not already
        contain 'const'.

    output(s):
    (pandas.DataFrame): a new frame with a leading 'const' column of ones,
        'Region' replaced by 'cut_<region>' indicator columns (first category
        dropped), and the three numeric columns RobustScaler-transformed.

    Description:
    Feature engineering for the adjusted model. The caller's frame is not
    modified, so the function can be called repeatedly on the same data.
    '''
    scaled_cols = ['GDP_per_capita','Adult_mortality','BMI']

    # get_dummies returns a new frame, so the caller's df is never touched.
    encoded = pd.get_dummies(df, columns = ['Region'], drop_first = True, prefix = 'cut', dtype=int)
    encoded.insert(loc=0, column='const', value=1)  # intercept column for OLS

    rob = RobustScaler()
    rob.fit(encoded[scaled_cols])
    encoded[scaled_cols] = rob.transform(encoded[scaled_cols])

    return encoded

def adj_fit_feature_eng(df,X_train):
    '''
    input(s):
    df (pandas.DataFrame): the row(s) to predict for; must contain 'Region',
        'GDP_per_capita', 'Adult_mortality' and 'BMI'.
    X_train (pandas.DataFrame): the training features; expected to be the same
        frame the adjusted model was fitted on.

    output(s):
    (pandas.DataFrame): a new frame restricted to adj_columns_output.

    Description:
    Prediction-time feature engineering for the adjusted model. A 'const'
    column is added, each row's 'Region' is expanded into 'cut_<region>'
    indicators, and the numeric columns are scaled with a RobustScaler fitted
    on X_train only so that they match the training-time scaling. The caller's
    frame is not modified and every row is handled individually.
    '''
    scaled_cols = ['GDP_per_capita','Adult_mortality','BMI']

    df = df.copy()
    df.insert(loc=0, column='const', value=1)  # intercept column for OLS

    # One indicator per known region, evaluated row by row.
    for region in regions:
        df['cut_'+region] = (df['Region'] == region).astype(int)
    df = df.drop(columns='Region')

    rob = RobustScaler()
    rob.fit(X_train[scaled_cols])  # training statistics only; the new rows must not shift them
    df[scaled_cols] = rob.transform(df[scaled_cols])

    # Selecting adj_columns_output also discards the indicator of the dropped baseline region.
    return df[adj_columns_output]

In [7]:
def modelselector(val,X_train,y_train):
    '''
    input(s):
    val (str): 'comp' for the complete model or 'adj' for the adjusted model.
    X_train (pandas.DataFrame): training features.
    y_train (pandas.Series): training target.

    output(s):
    The fitted model returned by complete_model() or adjusted_model().

    Raises:
    ValueError: if val is neither 'comp' nor 'adj'.

    Description:
    Clears the cell output and then fits the requested model.
    '''
    clear_output(wait=False)
    if val=='comp':
        return complete_model(X_train,y_train)
    elif val=='adj':
        return adjusted_model(X_train,y_train)
    else:
        # A built-in exception is used here; the previous name was undefined and produced a NameError.
        raise ValueError('There has not been a valid model chosen. This execution has been halted.')

def complete_model(X_train,y_train):
    '''
    input(s):
    X_train (pandas.DataFrame): training features.
    y_train (pandas.Series): training target.

    output(s):
    The fitted statsmodels OLS results object.

    Description:
    Applies comp_feature_eng() to the training features, keeps main_columns,
    and fits an ordinary least squares regression of y_train on them.
    '''
    design_matrix = comp_feature_eng(X_train)[main_columns]
    return sm.OLS(y_train, design_matrix).fit()

def adjusted_model(X_train,y_train):
    '''
    input(s):
    X_train (pandas.DataFrame): training features.
    y_train (pandas.Series): training target.

    output(s):
    The fitted statsmodels OLS results object.

    Description:
    Applies adj_feature_eng() to the training features, keeps
    adj_columns_output, and fits an ordinary least squares regression of
    y_train on them.
    '''
    design_matrix = adj_feature_eng(X_train)[adj_columns_output]
    return sm.OLS(y_train, design_matrix).fit()

def model_view(model):
    '''
    input(s):
    model: a fitted model object exposing a summary() method.

    output(s):
    Whatever model.summary() returns.

    Description:
    Convenience wrapper for displaying a fitted model's summary.
    '''
    summary = model.summary()
    return summary

def model_fit(model, input_list, columns, pick,X_train):
    '''
    input(s):
    model: a fitted model exposing predict().
    input_list (list): one value per entry of columns, in the same order.
    columns (list of str): the column names for input_list.
    pick (str): 'comp' selects the complete-model feature engineering; any
        other value selects the adjusted-model feature engineering.
    X_train (pandas.DataFrame): training features, forwarded to the feature
        engineering function.

    output(s):
    The first element of the model's prediction for the single input row.

    Description:
    Builds a one-row dataframe from the inputs, engineers its features and
    returns the predicted value.
    '''
    #Remember to add +- RMSE years.
    single_row = pd.DataFrame([input_list],columns = columns)
    engineer = comp_fit_feature_eng if pick == 'comp' else adj_fit_feature_eng
    prediction = model.predict(engineer(single_row,X_train))
    return prediction[0]

def mean_value(df, column):
    '''
    input(s):
    df (pandas.DataFrame): the data.
    column (str): the column to summarise.

    output(s):
    The mean of df[column].

    Description:
    Thin wrapper around the column's mean() method.
    '''
    selected = df[column]
    return selected.mean()

def max_value(df, column):
    '''
    input(s):
    df (pandas.DataFrame): the data.
    column (str): the column to summarise.

    output(s):
    The largest value of df[column].
    '''
    return df[column].max()

def min_value(df, column):
    '''
    input(s):
    df (pandas.DataFrame): the data.
    column (str): the column to summarise.

    output(s):
    The smallest value of df[column].
    '''
    return df[column].min()

In [8]:
def model_choice_output(choice, columns):
    '''
    input(s):
    choice (str): the model name shown to the user.
    columns (list of str): the model's columns; the first entry is skipped.

    output(s):
    (str): the confirmation prompt, including a numbered list of the columns.

    Description:
    Column names are made readable by turning underscores into spaces,
    '%' into ' in %', and title-casing the result.
    '''
    numbered = []
    for position, col in enumerate(columns[1:], start=1):
        label = col.replace('_',' ').replace('%',' in %').title()
        numbered.append(f"{position}. {label}")

    pieces = [
        f"You have selected the {choice} model. Please ensure you have data for these columns before proceeding:\n\n",
        "\n".join(numbered),
        "\n\nNote: If you have missing values, the mean of the data will be used. This may affect the result.\n",
        "Please type 'yes' to proceed:",
    ]
    return "".join(pieces)

def starting_function():
    '''
    input(s):
    None (the user is prompted on standard input).

    output(s):
    (tuple): (model key, the user's reply to the confirmation prompt, column list),
        where the model key is 'comp' with main_columns or 'adj' with adj_columns.

    Raises:
    Exception: if the user types anything other than 1 or 2 (spaces are ignored).

    Description:
    Asks which model to use, clears the output, then shows the confirmation
    prompt for that model and returns the reply without checking it.
    '''
    start = input("The models have been created to include different data due to sensitivity. Please confirm whether you would like to use the: \n1:Complete model with all data. \n2:Adjusted model with reduced features.\n")
    selection = start.replace(" ","")

    # Every path clears the previous prompt before doing anything else.
    clear_output(wait=False)

    if selection == '1':
        return 'comp', input(model_choice_output('complete', main_columns)), main_columns
    if selection == '2':
        return 'adj', input(model_choice_output('adjusted', adj_columns)), adj_columns

    bold_one = color.BOLD + '1' + color.END
    bold_two = color.BOLD + '2' + color.END
    raise Exception(f'You typed {start}. As it was not {bold_one} or {bold_two} , the code will therefore be stopped. \n \n \t \t \t Please restart')

def column_collection(columns,X_train):
    '''
    input(s):
    columns (list of str): the model's columns; the first entry is skipped.
    X_train (pandas.DataFrame): training features, used to supply the mean
        of a column when the user's answer cannot be used.

    output(s):
    input_list (list): one value per prompted column, in prompt order.
    exception_list (list of str): display names of the columns whose answer
        was replaced by the training mean.

    Raises:
    ValueError: if the answer for 'Region' is neither a known region name nor
        its menu number.

    Description:
    Prompts for each column in turn. Columns containing 'Economy_status' must
    be 0 or 1; 'Region' must be a region name (case and surrounding spaces are
    ignored) or its number; every other column must be a finite, non-negative
    number. Invalid numeric answers fall back to the rounded training mean.
    After each answer the inputs collected so far are printed.
    '''
    clear_output(wait=False)
    input_list = []
    col_list = []
    exception_list = []

    def _use_mean(col, col_output):
        # Substitute the rounded training mean and remember which column was affected.
        input_list.append(round(X_train[col].mean(),2))
        exception_list.append(col_output)

    for col in columns[1:]:
        col_output = col.replace("_"," ").replace("%"," in %").title()
        if 'Economy_status' in col:
            new_input = input(f'Please enter the value for {col_output}. This has to be either 1 (True) or 0 (False):')
            clear_output(wait=False)
            col_list.append(col)
            try:
                value = int(new_input)
            except ValueError:
                value = None
            if value in (0, 1):
                input_list.append(value)
            else:
                _use_mean(col, col_output)
        elif 'Region'==col:
            new_input = input(f'Please enter the value for {col_output} (either as text or the associated number) \n' +\
                              "\n".join([f"{i+1}. {region.replace('_',' ').replace('%',' in %').title()}" for i, region in enumerate(regions)])+"\n")
            clear_output(wait=False)
            col_list.append(col)
            answer = new_input.strip()
            regions_by_name = {region.lower(): region for region in regions}
            if answer.lower() in regions_by_name:
                # Store the canonical spelling so it matches the 'cut_<region>' column names.
                input_list.append(regions_by_name[answer.lower()])
            else:
                try:
                    number = int(answer)
                except ValueError:
                    number = 0  # not a number either; rejected by the range check below
                if 1 <= number <= len(regions):
                    input_list.append(regions[number-1])
                else:
                    raise ValueError('This is not a valid Region. The function has been stopped.')
        else:
            new_input = input(f'Please enter the value for {col_output}:')
            clear_output(wait=False)
            col_list.append(col)
            try:
                value = float(new_input)
            except ValueError:
                value = None
            # float() accepts 'nan' and 'inf'; those must not reach the model.
            if value is None or not np.isfinite(value) or value<0:
                _use_mean(col, col_output)
            else:
                input_list.append(value)

        if exception_list:
            print(f"Note: Inputs of {join_words(exception_list)}not valid, thus the mean has been used.")
        print("Current inputs: \n" +"\n".join([f"{name}: {inp}" for name, inp in zip(col_list,input_list)]))
    return input_list,exception_list


In [9]:
#order:
def main_function():
    '''
    input(s):
    None (the user is prompted on standard input).

    output(s):
    None (the prediction is printed).

    Raises:
    ValueError: if the user does not confirm with 'yes' or 'y'.

    Description:
    Runs the whole workflow in order: load and split the data, let the user
    pick a model and confirm, collect the input values, fit the chosen model
    on the training data, and print the predicted life expectancy. If any
    input was replaced by the training mean, a note is printed afterwards.
    '''
    X_train, _, y_train, _ = data_collection(location)

    satisfy = starting_function()
    model_key, confirmation, model_columns = satisfy

    # Accept 'yes' / 'y' regardless of case or surrounding spaces.
    if confirmation.strip().lower() in ('yes', 'y'):
        inputs = column_collection(model_columns,X_train)
        model = modelselector(model_key,X_train, y_train)
        # model_columns[1:] drops 'const', which the feature engineering adds itself.
        prediction = model_fit(model, inputs[0], model_columns[1:],model_key,X_train)
        print(f"The life expectency in this country is {round(prediction,2)} years old.")
        if inputs[1]:
            # join_words() already ends with ' was ' / ' were ', so the sentence continues with a participle.
            print(f"Please note that the inputs of {join_words(inputs[1])}replaced by the mean. This will have impacted the prediction.")
    else:
        raise ValueError(f'You have stated {satisfy[1]}. As this is not yes, the function has been stopped')

# Function Running

In [13]:
# Run the interactive workflow: it prompts for the model choice and input values, then prints the prediction.
main_function()

The life expectency in this country is 80.4 years old.
Please note that the inputs of Bmi was used were used as the mean. This will have impacted the prediction.
