#### This notebook contains the full prediction function for both models. Please run all cells before use, and ensure that the .csv data file ('Life Expectancy Data.csv') is in the same local directory as this notebook.


#### Import functions from standard libraries

In [2]:
# Import libraries
import sys #import System-specific functions
import numpy as np  # Numerical computing library
import seaborn as sns  # Data visualisation library based on Matplotlib
import matplotlib.pyplot as plt  # Plotting library
import pandas as pd  # Data analysis and manipulation library

# Import machine learning tools from scikit-learn
from sklearn.model_selection import train_test_split  # Splits data into training and testing sets
from sklearn.preprocessing import RobustScaler  # Scales features using robust statistics to handle outliers
from sklearn.preprocessing import StandardScaler  # Scales features to have zero mean and unit variance
# Import statistical modeling tools from statsmodels
import statsmodels.api as sm  # Provides classes for estimating different statistical models
import statsmodels.tools  # Utility functions for statistical modeling
from statsmodels.stats.outliers_influence import variance_inflation_factor #import VIF

#### Full function

In [1]:
def main(csv_path='Life Expectancy Data.csv'):
    """Train the selected life-expectancy model and predict from user input.

    Reads the dataset, builds the lower-cased country/region lookup, makes an
    80/20 train/test split, asks which model to use, fits an OLS regression on
    the feature-engineered training data, then prompts for the user's metrics
    and prints the predicted life expectancy.

    Parameters
    ----------
    csv_path : str, optional
        Location of the life-expectancy CSV. Defaults to the file name that was
        previously hard-coded, so ``main()`` behaves as before.

    Returns
    -------
    None
        The prediction is printed. Returns early, before fitting anything,
        when the user chooses to exit at the model prompt.

    Raises
    ------
    Exception
        If the model selector returns anything other than '0', '1' or None.
    """
    # The helper functions read these names at module level, so they are
    # published as globals rather than passed around.
    global X_train_fe, X_train, base_df, Country_df, feature_columns, safe_columns, model_number

    # read in dataframe from csv file
    base_df = pd.read_csv(csv_path)

    # generate country dataframe; .copy() so that lower-casing the names writes
    # to an independent frame instead of a slice of base_df
    # (avoids pandas' SettingWithCopyWarning)
    Country_df = base_df[['Country', 'Region']].copy()
    Country_df['Country'] = Country_df['Country'].str.lower()

    # columns to be used in safe and unsafe models
    feature_columns = ['Region', 'Under_five_deaths',
                       'Adult_mortality', 'BMI', 'Incidents_HIV',
                       'GDP_per_capita', 'Schooling']

    safe_columns = ['BMI', 'GDP_per_capita',
                    'Adult_mortality', 'Schooling']

    X = base_df[feature_columns]
    y = base_df['Life_expectancy']

    # train test split (the held-out 20% is not used by this function)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # selects the safe or unsafe model
    model_number = select_model()
    if model_number is None:
        # select_model() returns None when the user asked to exit; stop quietly
        # instead of reporting it as an incorrect value
        return

    if model_number == '0':
        # feature engineer training data with the sensitive columns included
        X_train_fe = unsafe_feature_engineering(X_train)
    elif model_number == '1':
        # feature engineer training data with the non-sensitive columns only
        X_train_fe = safe_feature_engineering(X_train)
    else:
        # raise error when incorrect value entered
        raise Exception("Incorrect value entered")

    # Fit the LinReg model
    output = sm.OLS(y_train, X_train_fe).fit()
    # NOTE: use print(output.summary()) here to inspect the fit; a bare
    # output.summary() inside a function displays nothing.

    # gather the user's metrics as a single, feature-engineered row
    User_input = predict_values()

    # Using the model to predict on the user's input row
    y_pred = output.predict(User_input)
    # print prediction
    print('predicted life expectancy: ', y_pred.iloc[0])


def select_model():
    """Ask the user which model to use until a valid answer is given.

    Returns
    -------
    str or None
        '0' when the user consents to the use of sensitive data (unsafe model),
        '1' when consent is revoked (safe model), or None when the user types
        'exit' (any letter case) or submits a blank answer.
    """
    confirmations = {
        "0": "Unsafe model selected",
        "1": "Safe model selected",
    }

    # the question is repeated until a valid input is received or the user exits
    while True:
        model = input("Type 0 to consent to use of sensitive data or type 1 to revoke consent :\n(or type 'exit' to quit):").strip()

        # Allow user exit
        if model.lower() in ('exit', ''):
            print("Exiting model selection...")
            return None

        if model in confirmations:
            print(confirmations[model])
            return model

        # anything else is rejected and the user is re-prompted
        print("Please enter a valid input")

def predict_values():
    """Collect the user's metrics and turn them into a model-ready row.

    Reads the module-level ``model_number`` to decide which column list to
    prompt for and which feature-engineering path to apply.

    Returns
    -------
    The value returned by ``run_unsafe_model`` (model '0') or
    ``run_safe_model`` (model '1') for the metrics the user entered.

    Raises
    ------
    Exception
        If ``model_number`` is neither '0' nor '1'.
    """
    # unsafe model: prompt for every feature column, including the region
    if model_number == '0':
        metrics = ask_for_metrics(feature_columns)
        return run_unsafe_model(metrics)

    # safe model: prompt for the non-sensitive columns only
    if model_number == '1':
        metrics = ask_for_metrics(safe_columns)
        return run_safe_model(metrics)

    # raise error when incorrect value entered
    raise Exception("Incorrect value entered")


def ask_for_metrics(columns_list):
    """Prompt the user for a value for every column in ``columns_list``.

    * 'Region' is not asked for directly: the user enters a country and the
      region is looked up in the module-level ``Country_df``.
    * 'Economy_status_Developed' / 'Economy_status_Developing' are answered by
      one 1/0 question; both flags are derived from that single answer, and
      the question is asked only once even when both columns are listed.
    * Every other column is read as a number. The min/max of that column in
      the module-level ``base_df`` are used to warn about values that look
      far too large or too small (the value is still accepted).

    Parameters
    ----------
    columns_list : list of str
        Names of the columns to collect values for.

    Returns
    -------
    dict
        ``{'const': 1.0}`` plus one entry per answered column.

    Raises
    ------
    IndexError
        If the entered country is not present in ``Country_df``.
    Exception
        If a numeric field receives non-numeric text, or the economy question
        is answered with anything other than '1' or '0'.
    SystemExit
        If the user types 'exit' or leaves a numeric field blank.
    """
    economy_columns = ('Economy_status_Developed', 'Economy_status_Developing')

    # make output dictionary
    column_values = {'const': 1.0}

    # iterate through all requested columns asking for inputs
    for col in columns_list:

        # When region is needed ask user for country then find appropriate region
        if col == 'Region':
            # fetch user input and pass to lower
            User_Country = input('Please input you country ').lower().strip()
            # find region that matches inputted country
            matches = Country_df.loc[Country_df['Country'] == User_Country, 'Region']
            if matches.empty:
                # same exception type the bare .iloc[0] used to raise, but with
                # a message that says what actually went wrong
                raise IndexError(f'Country "{User_Country}" was not found in the dataset. Please check the spelling and try again.')
            # add region to prediction dictionary
            column_values[col] = matches.iloc[0]

        # Economy status is binary (either developed or developing)
        elif col in economy_columns:
            if col in column_values:
                # already filled in when the sibling economy column was handled
                continue

            # ask user for input
            User_developed = input('Please enter 1 for developed, 0 for developing ').strip()
            if User_developed not in ('0', '1'):
                raise Exception('value Inputted for economy status was not valid. Error: Please enter 1 for developed or 0 for developing. ')

            # the two flags are complements of each other
            economy_flags = {
                'Economy_status_Developed': User_developed,
                'Economy_status_Developing': '0' if User_developed == '1' else '1',
            }
            # only store the flags that were actually requested
            for flag_name, flag_value in economy_flags.items():
                if flag_name in columns_list:
                    column_values[flag_name] = flag_value

        else:
            # min and max values are gained to give a suggested range
            max_value = base_df[col].max()
            min_value = base_df[col].min()

            # ask user for input
            if col in ('Adult_mortality', 'Under_five_deaths', 'Incidents_HIV'):
                value = input(f"Please input your {col}. Values are expected to be scaled per 1,000 ").strip()
            elif col == 'BMI':
                value = input(f"Please input your {col}. Values expected between: 0 and 40 ").strip()
            elif col == 'Schooling':
                value = input(f"Please input your {col}. Values expected between: 0 and 15 ").strip()
            else:
                value = input(f"Please input your {col}. Values expected between: 0 and {max_value} ").strip()

            # Allow early exit
            if value.lower() in ('exit', ''):
                sys.exit("Exiting feature input...")

            try:
                numeric_value = float(value)
            except ValueError as err:
                raise Exception(f'value Inputted in {col} was not numeric. Error: Please enter a numeric value or type "exit" to quit. ') from err

            # users will be warned when the input value is unexpectedly large or small
            if numeric_value > (max_value*1.5):
                print(f'****WARNING VALUE INPUT FOR FEILD {col} IS LARGER THAN EXPECTED****')
            elif numeric_value < (min_value*0.1):
                print(f'****WARNING VALUE INPUT FOR FEILD {col} IS SMALLER THAN EXPECTED****')

            column_values[col] = numeric_value

    # return output as dictionary
    return column_values


def unsafe_feature_engineering(df):
    """Feature-engineer the training data for the unsafe model.

    Keeps the unsafe model's columns, one-hot encodes 'Region' (first level
    dropped, 'region' prefix, integer dtype), standardises every resulting
    column and adds the regression constant. The fitted scaler is stored in
    the module-level ``unsafe_scaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features; must contain the columns selected below.

    Returns
    -------
    pandas.DataFrame
        Scaled features with the constant column added by ``sm.add_constant``.
    """
    global unsafe_scaler

    wanted = ['Under_five_deaths', 'Adult_mortality', 'Schooling', 'BMI',
              'GDP_per_capita', 'Incidents_HIV', 'Region']

    # one-hot encode the region; the dropped first level becomes the baseline
    encoded = pd.get_dummies(
        df[wanted],
        columns=['Region'],
        prefix='region',
        drop_first=True,
        dtype=int,
    )

    # Standardize features, keeping the original row labels and column names
    unsafe_scaler = StandardScaler()
    standardised = unsafe_scaler.fit_transform(encoded)
    scaled_frame = pd.DataFrame(standardised, index=encoded.index, columns=encoded.columns)

    # Add constant for regression
    return sm.add_constant(scaled_frame)

def safe_feature_engineering(df):
    """Feature-engineer the training data for the safe model.

    Keeps the four safe columns, appends the natural log of 'GDP_per_capita'
    as 'gdp_log', standardises all five columns and adds the regression
    constant. The fitted scaler is stored in the module-level ``safe_scaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features; must contain the columns selected below.

    Returns
    -------
    pandas.DataFrame
        Scaled features with the constant column added by ``sm.add_constant``.
    """
    global safe_scaler

    base_features = ['Adult_mortality', 'Schooling', 'BMI', 'GDP_per_capita']
    selected = df[base_features]

    safe_scaler = StandardScaler()

    # assign() returns a new frame, with gdp_log appended as the last column
    features = selected.assign(gdp_log=np.log(selected['GDP_per_capita']))

    # Standardize features, keeping the original row labels and column names
    standardised = safe_scaler.fit_transform(features)
    scaled_frame = pd.DataFrame(standardised, index=features.index, columns=features.columns)

    # Add constant for regression
    return sm.add_constant(scaled_frame)

def safe_feature_engineering_input(df):
    """Feature-engineer the user's input row for the safe model.

    Applies the same steps as the safe training path: keep the four safe
    columns, append 'gdp_log', scale with the already-fitted module-level
    ``safe_scaler`` and add a 'const' column of ones.

    Parameters
    ----------
    df : pandas.DataFrame
        User input; must contain 'Adult_mortality', 'Schooling', 'BMI' and
        'GDP_per_capita'.

    Returns
    -------
    pandas.DataFrame
        Scaled features plus the 'const' column.

    Raises
    ------
    ValueError
        If 'GDP_per_capita' is zero, negative or missing, because its
        logarithm would be -inf/NaN and the prediction would be meaningless.
    """
    features = df[['Adult_mortality', 'Schooling', 'BMI', 'GDP_per_capita']].copy()

    # log() is only defined for positive numbers; reject the value here rather
    # than letting -inf/NaN flow silently into the prediction
    if not (features['GDP_per_capita'] > 0).all():
        raise ValueError('GDP_per_capita must be greater than 0 to be used in the safe model.')

    features['gdp_log'] = np.log(features['GDP_per_capita'])

    # Standardize features with the scaler fitted on the training data
    df_scaled = pd.DataFrame(safe_scaler.transform(features), columns=features.columns, index=features.index)

    # Add constant for regression
    df_scaled['const'] = 1

    return df_scaled

def unsafe_feature_engineering_input(df):
    """Feature-engineer the user's input row for the unsafe model.

    Keeps the unsafe model's columns, replaces 'Region' with its one-hot
    columns via ``find_region_from_input``, scales with the already-fitted
    module-level ``unsafe_scaler`` and adds a 'const' column of ones.

    Parameters
    ----------
    df : pandas.DataFrame
        User input; must contain the columns selected below.

    Returns
    -------
    pandas.DataFrame
        Scaled features plus the 'const' column.
    """
    model_inputs = ['Under_five_deaths', 'Adult_mortality', 'Schooling', 'BMI',
                    'GDP_per_capita', 'Incidents_HIV', 'Region']

    # work on an independent copy so the caller's frame is left untouched
    selected = df.loc[:, model_inputs].copy()
    encoded = find_region_from_input(selected)

    # Standardize features with the scaler fitted on the training data
    scaled_values = unsafe_scaler.transform(encoded)
    result = pd.DataFrame(scaled_values, index=encoded.index, columns=encoded.columns)

    # Add constant for regression
    result['const'] = 1

    return result

def run_safe_model(column_values):
    """Turn the user's 'safe' answers into a row aligned with the training data.

    Parameters
    ----------
    column_values : dict
        Answers keyed by column name, as built by ``ask_for_metrics``.

    Returns
    -------
    pandas.DataFrame
        One feature-engineered row whose columns follow the order of the
        module-level ``X_train_fe``.
    """
    # single-row frame holding the user's answers
    user_row = pd.DataFrame(column_values, index=[0])

    # apply the safe model's feature engineering to that row
    engineered = safe_feature_engineering_input(user_row)

    # line the columns up with the training set
    return engineered.reindex(columns=X_train_fe.columns)

def run_unsafe_model(column_values):
    """Turn the user's 'unsafe' answers into a row aligned with the training data.

    Parameters
    ----------
    column_values : dict
        Answers keyed by column name, as built by ``ask_for_metrics``.

    Returns
    -------
    pandas.DataFrame
        One feature-engineered row whose columns follow the order of the
        module-level ``X_train_fe``.
    """
    # single-row frame holding the user's answers
    user_row = pd.DataFrame(column_values, index=[0])

    # apply the unsafe model's feature engineering to that row
    engineered = unsafe_feature_engineering_input(user_row)

    # line the columns up with the training set
    return engineered.reindex(columns=X_train_fe.columns)

def find_region_from_input(column_values, training_columns=None):
    """Replace the 'Region' column of the user's row with one-hot columns.

    One column is produced for every 'region_*' column found in the training
    columns, in the same order. The column matching the user's region is set
    to 1 and all others to 0.

    A region with no matching training column is encoded as all zeros. This is
    how the baseline region (the level removed by ``drop_first=True`` during
    training) must be represented; creating a new column for it would give
    the row a column that the training data does not have.

    Parameters
    ----------
    column_values : pandas.DataFrame
        User input containing a 'Region' column. It is not modified.
    training_columns : iterable of str, optional
        Column names of the feature-engineered training data. Defaults to the
        columns of the module-level ``X_train_fe``.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Region' and with the one-hot region columns
        appended.
    """
    if training_columns is None:
        training_columns = X_train_fe.columns

    # get all one-hot encoded columns known to the trained model
    region_columns = [col for col in training_columns if col.startswith('region_')]

    # extract region from inputted dataframe
    user_region = column_values['Region'].iloc[0]

    # start with every region switched off
    region_encoding = {col: 0 for col in region_columns}

    # switch on the user's region only when the model has a column for it
    region_col_name = f'region_{user_region}'
    if region_col_name in region_encoding:
        region_encoding[region_col_name] = 1

    # drop() without inplace returns a new frame, leaving the caller's untouched
    return column_values.drop(columns=['Region']).assign(**region_encoding)



#### Run the main function

In [3]:
# Run the interactive workflow: choose a model, enter the metrics, print the prediction
main()

FileNotFoundError: [Errno 2] No such file or directory: 'Life Expectancy Data.csv'