# Analytica - Modelling

## Import Libraries

In [None]:
## Import core libraries

# For data
import pandas as pd
import numpy as np
import joblib

# For train/test splitting and scaling processes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# For modelling (specifically for generating the constant column as part of FE)
import statsmodels.api as sm
import statsmodels.tools

## Importing Data

In [None]:
## Load the data set

# Build the working data frame from the WHO CSV file
who = pd.read_csv(filepath_or_buffer="life_expectancy_data.csv")

In [None]:
## Preview the first five rows of the data frame

who.head(n=5)

## Train/Test Splitting

In [None]:
## Separate features and target ahead of train/test splitting

# Every column except the target is treated as a feature
feature_cols = who.columns.tolist()
feature_cols.remove('Life_expectancy')

# X holds the features, y holds the target
X = who.loc[:, feature_cols]
y = who.loc[:, 'Life_expectancy']

In [None]:
## Split into train and test sets with sklearn's train_test_split

# 20% of the rows go to the test set; the split is seeded and stratified on Region
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=who['Region'],
)

In [None]:
## Print the shapes of X and y for Train and Test so the record counts can be compared by eye

print(f"X_train:   {X_train.shape}\ny_train:   {y_train.shape}\n")
print(f"X_test:    {X_test.shape}\ny_test:    {y_test.shape}")

In [None]:
## Assert that the indices and number of records of X and y match (for both Train and Test)

# Indices: Index.equals returns a single bool even when the lengths differ,
# whereas an element-wise == raises ValueError before the assertion message is shown
assert X_train.index.equals(y_train.index), "There is some index mismatch in Train"
assert X_test.index.equals(y_test.index), "There is some index mismatch in Test"

# Number of records
assert X_train.shape[0] == y_train.shape[0], "There is some records mismatch in Train"
assert X_test.shape[0] == y_test.shape[0], "There is some records mismatch in Test"

## Feature Engineering

In [None]:
# Run code from the FE notebook

In [None]:
## Feature Engineering process

# Custom function for all FE processes
def feature_eng(who, *, fit=True):
    """Apply the feature-engineering steps to a feature frame.

    Steps, in order: one-hot encode ``Region`` (first level dropped), add
    natural-log versions of ``Incidents_HIV`` and ``GDP_per_capita``, scale
    six columns, then add the ``const`` column needed by statsmodels.

    Parameters
    ----------
    who : pandas.DataFrame
        Frame containing at least ``Region``, ``Incidents_HIV``,
        ``GDP_per_capita``, ``BMI``, ``Schooling``, ``Under_five_deaths``
        and ``Adult_mortality``. The frame is copied, never modified.
    fit : bool, default True
        If True, fit a fresh scaler per column on ``who`` and save each one
        under ``scr/`` (the behaviour when the argument is omitted).
        If False, load the previously saved scalers and only transform, so
        held-out data is scaled with the statistics of the fitted data and
        the saved scaler files are left untouched.

    Returns
    -------
    pandas.DataFrame
        The engineered copy of ``who`` with a ``const`` column added.
    """
    import os

    # Work on a copy so the caller's frame is left as it was
    who = who.copy()

    # One Hot Encoding (OHE)
    # NOTE: the dummy columns are derived from the Region values present in
    # this frame, so a frame that lacks a region yields different columns.
    who = pd.get_dummies(who, columns=['Region'], drop_first=True, prefix='Region', dtype=int)

    # Converting features into logarithm
    who['Incidents_HIV_log'] = np.log(who['Incidents_HIV'])
    who['GDP_per_capita_log'] = np.log(who['GDP_per_capita'])

    # (column, scaler class, file the fitted scaler is saved to / loaded from)
    scaler_specs = [
        # Normally distributed features
        ('BMI', StandardScaler, 'scr/standard_scaler_bmi.pkl'),
        ('Schooling', StandardScaler, 'scr/standard_scaler_schooling.pkl'),
        # MinMax scaling for bounded features
        ('GDP_per_capita_log', MinMaxScaler, 'scr/minmax_scaler_gdp.pkl'),
        ('Incidents_HIV_log', MinMaxScaler, 'scr/minmax_scaler_hiv.pkl'),
        # Robust scaling for features with outliers
        ('Under_five_deaths', RobustScaler, 'scr/robust_scaler_under_five.pkl'),
        ('Adult_mortality', RobustScaler, 'scr/robust_scaler_adult_mortality.pkl'),
    ]

    if fit:
        # Fit every scaler first, then save, so nothing is written to disk
        # if one of the fits fails
        fitted_scalers = []
        for column, scaler_cls, path in scaler_specs:
            scaler = scaler_cls()
            who[[column]] = scaler.fit_transform(who[[column]])
            fitted_scalers.append((scaler, path))

        # joblib.dump does not create missing directories
        os.makedirs('scr', exist_ok=True)
        for scaler, path in fitted_scalers:
            joblib.dump(scaler, path)
    else:
        # Reuse the saved scalers; nothing is refitted or overwritten
        for column, _, path in scaler_specs:
            who[[column]] = joblib.load(path).transform(who[[column]])

    # Created for statsmodeling. Must always be present, hence
    # has_constant='add': the default 'skip' omits the column whenever the
    # frame already contains a constant column.
    who = sm.add_constant(who, has_constant='add')

    # Return the results
    return who

In [None]:
## Run the feature engineering on the X train and X test data (train first, then test)

# NOTE(review): feature_eng fits its scalers on whichever frame it receives, so
# X_test is scaled with statistics fitted on X_test itself, and the scaler files
# saved during the X_train call are overwritten by the X_test call.
X_train_fe, X_test_fe = [feature_eng(split) for split in (X_train, X_test)]

In [None]:
## Preview the first five rows of the transformed train set

X_train_fe.head(n=5)

In [None]:
## Confirm X_train_fe has no nulls and inspect the data types before modelling

# Total null count across all columns
print(f"Sum of nulls: {X_train_fe.isnull().sum().sum()}")

# Data type of each column
X_train_fe.dtypes

In [None]:
## Feature columns used by the precise model

feature_cols_pre = [
    # Constant column
    'const',
    # Scaled numeric features
    'Under_five_deaths', 'Adult_mortality', 'BMI', 'Schooling',
    # One-hot encoded regions
    'Region_Asia',
    'Region_Central America and Caribbean',
    'Region_European Union',
    'Region_Middle East',
    'Region_North America',
    'Region_Oceania',
    'Region_Rest of Europe',
    'Region_South America',
    # Log-transformed features
    'Incidents_HIV_log', 'GDP_per_capita_log',
]

In [None]:
## Feature columns used by the minimalistic model

# Constant column plus three scaled numeric features
feature_cols_min = ['const', 'Under_five_deaths', 'Adult_mortality', 'BMI']

## Modelling

### Precise Model Train

In [None]:
## Fit the precise model on the train set and show its summary

lin_reg = sm.OLS(endog=y_train, exog=X_train_fe[feature_cols_pre])
results = lin_reg.fit()
results.summary()

### Precise Model Test


In [None]:
## Predict with the precise model and compare train and test RMSE

# Predictions on the train set and the test set
y_pred = results.predict(X_train_fe[feature_cols_pre])
y_test_pred = results.predict(X_test_fe[feature_cols_pre])

# RMSE of each set of predictions against its targets
y_pred_rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)
y_test_pred_rmse = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)

# Print RMSE values
print(f'Train RMSE  = {y_pred_rmse}')
print(f'Test RMSE   = {(y_test_pred_rmse)}')

### Minimalistic Model

In [None]:
## Fit the minimalistic model on the train set and show its summary

lin_reg = sm.OLS(endog=y_train, exog=X_train_fe[feature_cols_min])
results = lin_reg.fit()
results.summary()

In [None]:
## Predict with the minimalistic model and compare train and test RMSE

# Predictions on the train set and the test set
y_pred = results.predict(X_train_fe[feature_cols_min])
y_test_pred = results.predict(X_test_fe[feature_cols_min])

# RMSE of each set of predictions against its targets
y_pred_rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)
y_test_pred_rmse = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)

# Print RMSE values
print(f'Train RMSE  = {y_pred_rmse}')
print(f'Test RMSE   = {(y_test_pred_rmse)}')