# Model Training notebook

## Setup

In [14]:
import sys
sys.path.append('../src')

from preprocessing import load_data
from features import splitting_data, feature_engineering, scaling, add_constant_column
from models import train_model

## Load and prepare our data

In [15]:
# Load the dataset through the project's preprocessing.load_data helper,
# called with no arguments (i.e. whatever defaults it defines).
df = load_data()


In [16]:
# Column selection for the "minimal" model. 'Life_expectancy' is the column
# later handed to splitting_data as the target name, so it has to stay in
# the list alongside the other columns.
minimal_cols = [
    'Region',
    'Under_five_deaths',
    'Adult_mortality',
    'GDP_per_capita',
    'Schooling',
    'Economy_status_Developed',
    'Life_expectancy',
]

# The "elaborate" model uses every minimal column plus the additional
# columns below. Unpacking builds a new list, so minimal_cols is untouched.
elaborate_cols = [
    *minimal_cols,
    'Alcohol_consumption',
    'Hepatitis_B',
    'Measles',
    'BMI',
    'Polio',
    'Diphtheria',
    'Incidents_HIV',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
]

In [17]:
# One working frame per model variant: select the listed columns (all rows)
# and take a copy so each frame is independent of `df`.
df_m = df.loc[:, minimal_cols].copy()
df_e = df.loc[:, elaborate_cols].copy()

In [18]:
# Name of the target column handed to splitting_data for both variants.
target = 'Life_expectancy'

# Split each variant into train/test features and target.
# NOTE(review): the two frames are split by two separate calls; whether both
# splits contain the same rows depends on how splitting_data seeds its
# split, which is not visible here. Confirm before comparing the models.
X_train_m, X_test_m, y_train_m, y_test_m = splitting_data(df_m, target)
X_train_e, X_test_e, y_train_e, y_test_e = splitting_data(df_e, target)

In [19]:
# Execute feature engineering and normalisation for the minimal model
# (training split only).
X_train_m_fe = feature_engineering(X_train_m)
# scaling() returns a pair; only the first element is kept.
# NOTE(review): the discarded second value is presumably the fitted scaler /
# scaling parameters needed to transform X_test_m the same way. Confirm, and
# keep it under a name if the test split is scaled later.
X_train_m_fe, _ = scaling(X_train_m_fe)

# Add a constant (intercept)
X_train_m_fe = add_constant_column(X_train_m_fe)

# Check indices still match.
# Index.equals compares the labels and their order and returns a single bool;
# unlike all(a == b) it reports False instead of raising when the two
# indexes have different lengths.
print(X_train_m_fe.index.equals(y_train_m.index))

True


In [20]:
# Execute feature engineering and normalisation for the elaborate model
# (training split only).
X_train_e_fe = feature_engineering(X_train_e)
# scaling() returns a pair; only the first element is kept.
# NOTE(review): the discarded second value is presumably the fitted scaler /
# scaling parameters needed to transform X_test_e the same way. Confirm, and
# keep it under a name if the test split is scaled later.
X_train_e_fe, _ = scaling(X_train_e_fe)

# Add a constant (intercept)
X_train_e_fe = add_constant_column(X_train_e_fe)

# Record the column labels of the fully prepared elaborate design matrix
# (taken after the constant column has been added).
feature_cols_final = X_train_e_fe.columns

# Check indices still match.
# Index.equals compares the labels and their order and returns a single bool;
# unlike all(a == b) it reports False instead of raising when the two
# indexes have different lengths.
print(X_train_e_fe.index.equals(y_train_e.index))

True


## Train our Models!

In [21]:
# Fit one model per feature set on the prepared training data.
# train_model is called with the target first, then the feature matrix
# (already feature-engineered, scaled and given a constant column above).
elaborate_results = train_model(y_train_e, X_train_e_fe)
minimal_results = train_model(y_train_m, X_train_m_fe)

In [22]:
# Show the fit summary for the elaborate model; as the cell's last expression
# it is rendered as the cell output.
elaborate_results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.978
Model:,OLS,Adj. R-squared:,0.978
Method:,Least Squares,F-statistic:,4825.0
Date:,"Sun, 26 Oct 2025",Prob (F-statistic):,0.0
Time:,15:06:56,Log-Likelihood:,-4012.8
No. Observations:,2291,AIC:,8070.0
Df Residuals:,2269,BIC:,8196.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,70.0680,0.109,640.847,0.000,69.854,70.282
Adult_mortality,-6.4719,0.103,-62.642,0.000,-6.674,-6.269
Schooling,0.7460,0.114,6.525,0.000,0.522,0.970
Economy_status_Developed,0.5015,0.190,2.639,0.008,0.129,0.874
Alcohol_consumption,-0.2314,0.091,-2.544,0.011,-0.410,-0.053
BMI,0.1860,0.084,2.212,0.027,0.021,0.351
Asia,0.8791,0.125,7.030,0.000,0.634,1.124
Central America and Caribbean,2.1816,0.141,15.484,0.000,1.905,2.458
European Union,-0.8679,0.213,-4.072,0.000,-1.286,-0.450

0,1,2,3
Omnibus:,62.852,Durbin-Watson:,1.929
Prob(Omnibus):,0.0,Jarque-Bera (JB):,73.26
Skew:,-0.357,Prob(JB):,1.24e-16
Kurtosis:,3.508,Cond. No.,1320.0


In [23]:
# Show the fit summary for the minimal model; as the cell's last expression
# it is rendered as the cell output.
minimal_results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.977
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,7534.0
Date:,"Sun, 26 Oct 2025",Prob (F-statistic):,0.0
Time:,15:06:56,Log-Likelihood:,-4054.6
No. Observations:,2291,AIC:,8137.0
Df Residuals:,2277,BIC:,8218.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,70.0885,0.092,759.639,0.000,69.908,70.269
Adult_mortality,-6.3419,0.070,-89.991,0.000,-6.480,-6.204
Schooling,0.6478,0.104,6.255,0.000,0.445,0.851
Economy_status_Developed,0.3887,0.168,2.315,0.021,0.059,0.718
Asia,0.7737,0.117,6.636,0.000,0.545,1.002
Central America and Caribbean,2.1190,0.130,16.253,0.000,1.863,2.375
European Union,-1.0889,0.195,-5.584,0.000,-1.471,-0.706
Middle East,0.1848,0.148,1.246,0.213,-0.106,0.476
North America,1.2807,0.269,4.766,0.000,0.754,1.808

0,1,2,3
Omnibus:,74.199,Durbin-Watson:,1.932
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96.554
Skew:,-0.358,Prob(JB):,1.08e-21
Kurtosis:,3.707,Cond. No.,14.7
