# Model Training notebook

## Setup

In [1]:
import sys
# Put ../src on the import path so the project modules imported below can be found.
# A relative entry is resolved against the kernel's working directory, so this
# presumably expects the notebook to be run from its own folder — TODO confirm.
sys.path.append('../src')

from preprocessing import load_data
from features import splitting_data, feature_engineering, scaling, add_constant_column
from models import train_model

## Load and prepare our data

In [2]:
# Load the full dataset through the project's preprocessing.load_data helper,
# called with its default arguments (the data source is defined inside that helper).
df = load_data()


In [3]:
# Columns used by the minimal model. 'Life_expectancy' is included here because
# the subsets built from these lists still need to carry the column being predicted.
minimal_cols = [
    'Region',
    'Under_five_deaths',
    'Adult_mortality',
    'GDP_per_capita',
    'Schooling',
    'Economy_status_Developed',
    'Life_expectancy',
]

# The elaborate model uses every minimal column plus the extra indicators below.
# Unpacking builds a new list, so minimal_cols itself is left unchanged.
elaborate_cols = [
    *minimal_cols,
    'Alcohol_consumption',
    'Hepatitis_B',
    'Measles',
    'BMI',
    'Polio',
    'Diphtheria',
    'Incidents_HIV',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
]

In [4]:
# Select each column subset and take an independent copy, so that working on
# the per-model frames cannot write back into the loaded `df`.
df_m = df.loc[:, minimal_cols].copy()
df_e = df.loc[:, elaborate_cols].copy()

In [5]:
# Both feature sets are split against the same target column.
target_col = 'Life_expectancy'

# Split each frame into train/test features and train/test target.
# NOTE(review): no seed is passed here — confirm splitting_data fixes one
# internally so that a fresh run reproduces the same split.
X_train_m, X_test_m, y_train_m, y_test_m = splitting_data(df_m, target_col)
X_train_e, X_test_e, y_train_e, y_test_e = splitting_data(df_e, target_col)

In [8]:
# Execute feature engineering and normalisation for the minimal model
X_train_m_fe = feature_engineering(X_train_m)
X_train_m_fe = scaling(X_train_m_fe)

# Add a constant (intercept)
X_train_m_fe = add_constant_column(X_train_m_fe)

# Check that features and target are still row-aligned after the transforms.
# Index.equals returns False when the lengths differ, whereas the elementwise
# `all(a.index == b.index)` raises ValueError in exactly the situation this
# check is meant to report (rows added or dropped along the way).
print(X_train_m_fe.index.equals(y_train_m.index))

True


In [7]:
# Execute feature engineering and normalisation for the elaborate model
X_train_e_fe = feature_engineering(X_train_e)
X_train_e_fe = scaling(X_train_e_fe)

# Add a constant (intercept)
X_train_e_fe = add_constant_column(X_train_e_fe)

# Record the column names of the final elaborate design matrix
# (including the added constant column) for later reference.
feature_cols_final = X_train_e_fe.columns

# Check that features and target are still row-aligned after the transforms.
# Index.equals returns False when the lengths differ, whereas the elementwise
# `all(a.index == b.index)` raises ValueError in exactly the situation this
# check is meant to report (rows added or dropped along the way).
print(X_train_e_fe.index.equals(y_train_e.index))

True


## Train our Models!

In [9]:
# Fit one model per feature set on the training data only.
# train_model is called with the target first and the prepared feature matrix
# second — presumably mirroring statsmodels' OLS(endog, exog); confirm in src/models.
elaborate_results = train_model(y_train_e, X_train_e_fe)
minimal_results = train_model(y_train_m, X_train_m_fe)

In [10]:
# Show the fit summary for the elaborate model; it must stay the last
# expression in the cell so the notebook renders it as the cell output.
elaborate_results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.979
Model:,OLS,Adj. R-squared:,0.978
Method:,Least Squares,F-statistic:,4922.0
Date:,"Thu, 23 Oct 2025",Prob (F-statistic):,0.0
Time:,22:24:57,Log-Likelihood:,-4003.1
No. Observations:,2291,AIC:,8050.0
Df Residuals:,2269,BIC:,8176.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,69.9099,0.109,642.493,0.000,69.696,70.123
Adult_mortality,-6.6787,0.105,-63.604,0.000,-6.885,-6.473
Schooling,0.7662,0.114,6.749,0.000,0.544,0.989
Economy_status_Developed,0.4747,0.190,2.499,0.013,0.102,0.847
Alcohol_consumption,-0.2455,0.090,-2.737,0.006,-0.421,-0.070
BMI,0.1606,0.084,1.919,0.055,-0.004,0.325
Asia,0.9041,0.126,7.195,0.000,0.658,1.150
Central America and Caribbean,2.2987,0.144,16.009,0.000,2.017,2.580
European Union,-0.7046,0.213,-3.306,0.001,-1.123,-0.287

0,1,2,3
Omnibus:,49.234,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,54.919
Skew:,-0.321,Prob(JB):,1.19e-12
Kurtosis:,3.405,Cond. No.,1300.0


In [11]:
# Show the fit summary for the minimal model; it must stay the last
# expression in the cell so the notebook renders it as the cell output.
minimal_results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.978
Model:,OLS,Adj. R-squared:,0.978
Method:,Least Squares,F-statistic:,7682.0
Date:,"Thu, 23 Oct 2025",Prob (F-statistic):,0.0
Time:,22:24:59,Log-Likelihood:,-4045.5
No. Observations:,2291,AIC:,8119.0
Df Residuals:,2277,BIC:,8199.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,70.0104,0.093,755.741,0.000,69.829,70.192
Adult_mortality,-6.4505,0.072,-89.181,0.000,-6.592,-6.309
Schooling,0.6874,0.103,6.673,0.000,0.485,0.889
Economy_status_Developed,0.3494,0.169,2.069,0.039,0.018,0.680
Asia,0.7564,0.117,6.462,0.000,0.527,0.986
Central America and Caribbean,2.1585,0.132,16.316,0.000,1.899,2.418
European Union,-1.0087,0.195,-5.160,0.000,-1.392,-0.625
Middle East,0.1884,0.151,1.248,0.212,-0.108,0.484
North America,1.3493,0.261,5.170,0.000,0.838,1.861

0,1,2,3
Omnibus:,55.551,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,67.405
Skew:,-0.314,Prob(JB):,2.31e-15
Kurtosis:,3.559,Cond. No.,14.6
