# Model Training notebook

## Setup

In [7]:
import sys
sys.path.append('../src')

from preprocessing import load_data
from features import splitting_data, feature_engineering, scaling, add_constant_column
from models import train_model

## Load and prepare our data

In [8]:
df = load_data()


In [9]:
minimal_cols = ['Region',
                'Under_five_deaths',
                'Adult_mortality',
                'GDP_per_capita',
                'Schooling',
                'Economy_status_Developed',
                'Life_expectancy'
                ]

elaborate_cols = minimal_cols + ['Alcohol_consumption',
                  'Hepatitis_B',
                  'Measles',
                  'BMI',
                  'Polio',
                  'Diphtheria',
                  'Incidents_HIV',
                  'Thinness_ten_nineteen_years',
                  'Thinness_five_nine_years',
                 ]

In [10]:
df_copy = df[elaborate_cols].copy()

In [11]:
X_train, X_test, y_train, y_test = splitting_data(df_copy, 'Life_expectancy')

In [12]:
# Execute feature engineering and normalisation
X_train_fe = feature_engineering(X_train)
X_train_fe = scaling(X_train_fe)

# Add a constant (intercept)
X_train_fe = add_constant_column(X_train_fe)

# Update feature_cols
feature_cols_final = X_train_fe.columns

# Check indices still match
print(all(X_train_fe.index == y_train.index))

True


## Train our Models!

In [16]:
X_train_fe

Unnamed: 0,const,Adult_mortality,Schooling,Economy_status_Developed,Alcohol_consumption,BMI,Asia,Central America and Caribbean,European Union,Middle East,...,Rest of Europe,South America,Under_five_deaths_log,Incidents_HIV_log,GDP_per_capita_log,Thinness_metric_log,Hepatitis_B_exp,Measles_exp,Polio_exp,Diphtheria_exp
2026,1.0,-0.369456,0.566038,0.0,-0.243902,-0.81250,1.0,0.0,0.0,0.0,...,0.0,0.0,-0.460733,-0.439473,-0.059503,1.110856,0.595282,0.623665,0.401822,0.401822
651,1.0,-0.348360,0.698113,1.0,1.428354,0.34375,0.0,0.0,1.0,0.0,...,0.0,0.0,-0.787833,-0.226796,0.539377,-0.290270,0.533040,0.581694,0.197898,0.333166
2225,1.0,-0.147051,0.396226,0.0,0.388720,0.34375,0.0,0.0,0.0,0.0,...,0.0,1.0,-0.132067,0.596838,0.598510,-0.460829,-0.589593,0.000000,-0.848928,-0.905138
2357,1.0,-0.581719,0.264151,0.0,0.126524,0.18750,0.0,0.0,0.0,0.0,...,1.0,0.0,-0.284608,-0.377240,-0.073915,-0.506439,0.595282,0.581694,0.401822,0.401822
670,1.0,2.319636,-0.396226,0.0,-0.268293,-0.71875,0.0,0.0,0.0,0.0,...,0.0,0.0,0.607809,7.063437,-0.110276,1.061779,-0.329621,-0.621975,-0.676884,-0.848928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,1.0,0.145921,-0.584906,0.0,-0.277439,1.03125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.251181,0.027243,-0.168630,-1.042421,-0.112078,-0.592102,-0.499600,-0.499600
1095,1.0,0.838154,-1.056604,0.0,-0.400915,-1.00000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.597023,0.185965,-0.571886,1.300224,0.000000,-0.209321,-0.316918,-0.316918
1130,1.0,-0.203470,-0.584906,0.0,0.030488,-0.43750,0.0,0.0,0.0,0.0,...,0.0,0.0,0.085577,0.287535,-0.141041,0.592624,0.533040,0.499003,0.401822,0.401822
1294,1.0,0.788393,0.622642,0.0,0.422256,0.15625,0.0,0.0,0.0,0.0,...,1.0,0.0,-0.241354,0.596838,-0.303080,-0.140475,0.471417,0.499003,0.131271,0.197898


In [13]:
elaborate_results = train_model(y_train, X_train_fe)
minimal_results = train_model(y_train, X_train_fe[minimal_cols])

KeyError: "['Region', 'Under_five_deaths', 'GDP_per_capita', 'Life_expectancy'] not in index"

In [None]:
elaborate_results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.985
Model:,OLS,Adj. R-squared:,0.985
Method:,Least Squares,F-statistic:,6746.0
Date:,"Thu, 23 Oct 2025",Prob (F-statistic):,0.0
Time:,13:38:34,Log-Likelihood:,-3595.7
No. Observations:,2291,AIC:,7237.0
Df Residuals:,2268,BIC:,7369.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,70.5437,0.090,783.890,0.000,70.367,70.720
Infant_deaths,-3.4587,0.140,-24.773,0.000,-3.733,-3.185
Adult_mortality,-6.3392,0.088,-71.843,0.000,-6.512,-6.166
BMI,-0.1207,0.073,-1.656,0.098,-0.264,0.022
Schooling,0.2536,0.094,2.694,0.007,0.069,0.438
Economy_status_Developed,1.5513,0.161,9.620,0.000,1.235,1.868
Asia,0.5514,0.106,5.195,0.000,0.343,0.760
Central America and Caribbean,2.0346,0.117,17.341,0.000,1.804,2.265
European Union,-0.9651,0.172,-5.624,0.000,-1.302,-0.629

0,1,2,3
Omnibus:,1.853,Durbin-Watson:,2.062
Prob(Omnibus):,0.396,Jarque-Bera (JB):,1.905
Skew:,-0.065,Prob(JB):,0.386
Kurtosis:,2.946,Cond. No.,32.5


In [14]:
minimal_results.summary()

NameError: name 'minimal_results' is not defined