# Model Training

### Enviroment


In [1]:
import logging
import pandas as pd
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
print("Current directory before change:", os.getcwd())

try:
    
    os.chdir("../")
    print("Current directory after change:", os.getcwd())

except FileNotFoundError:
    print("""
        FileNotFoundError - The specified directory does not exist or you are already in the root.
        If the code already worked once, do not run it again.
    """)

Current directory before change: c:\Users\Administrador\Desktop\workshop_03_kafka\notebooks
Current directory after change: c:\Users\Administrador\Desktop\workshop_03_kafka


### Load Combined Data


In [3]:
df = pd.read_csv('data/combined_happiness.csv')
logger.info(f'df shape: {df.shape}')

INFO:__main__:df shape: (781, 10)


In [4]:
df.head()

Unnamed: 0,country,freedom,gdp_per_capita,generosity,happiness_rank,happiness_score,healthy_life_expectancy,social_support,trust_government_corruption,year
0,Switzerland,0.66557,1.39651,0.29678,1,7.587,0.94143,1.34951,0.41978,2015
1,Iceland,0.62877,1.30232,0.4363,2,7.561,0.94784,1.40223,0.14145,2015
2,Denmark,0.64938,1.32548,0.34139,3,7.527,0.87464,1.36058,0.48357,2015
3,Norway,0.66973,1.459,0.34699,4,7.522,0.88521,1.33095,0.36503,2015
4,Canada,0.63297,1.32629,0.45811,5,7.427,0.90563,1.32261,0.32957,2015


In [5]:
feature_cols = [
    'freedom',
    'gdp_per_capita',
    'healthy_life_expectancy',
    'social_support',
    'generosity',
    'trust_government_corruption',
    'year'
]

X = df[feature_cols]
y = df['happiness_score']

### Train-Test Split (70-30)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
logger.info(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')

INFO:__main__:X_train shape: (546, 7), X_test shape: (235, 7)


### Pipeline with .fit() and entretemint

In [7]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])
pipeline.fit(X_train, y_train)
logger.info('Pipeline fitted with Ridge(alpha=1.0)')

INFO:__main__:Pipeline fitted with Ridge(alpha=1.0)


### metrics

In [8]:
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2  = r2_score(y_test, y_pred)
logger.info(f'MAE: {mae:.4f}, MSE: {mse:.4f}, R²: {r2:.4f}')

INFO:__main__:MAE: 0.4202, MSE: 0.2957, R²: 0.7632


In [9]:
logger = logging.getLogger(__name__)

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(alpha=0.1),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

results = []
for name, model in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results.append({
        'model': name,
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    })

results_df = pd.DataFrame(results)
logger.info('Model comparison results:')
logger.info(f'\n{results_df}')


INFO:__main__:Model comparison results:
INFO:__main__:
              model       MAE       MSE        R2
0  LinearRegression  0.420102  0.295687  0.763202
1             Lasso  0.455495  0.328934  0.736577
2      RandomForest  0.374348  0.230232  0.815621
3  GradientBoosting  0.392682  0.247517  0.801779


### Other way
Let's try to use the __'country'__ column, because it may help the model and make its prediction more effective.
then, we can map the unique values of __‘country’__ and group them by continents: Europe, America, Asia, Africa, Oceania.

In [10]:
unique_countries = df['country'].unique()
print(unique_countries)

['Switzerland' 'Iceland' 'Denmark' 'Norway' 'Canada' 'Finland'
 'Netherlands' 'Sweden' 'New Zealand' 'Australia' 'Israel' 'Costa Rica'
 'Austria' 'Mexico' 'United States' 'Brazil' 'Luxembourg' 'Ireland'
 'Belgium' 'United Arab Emirates' 'United Kingdom' 'Oman' 'Venezuela'
 'Singapore' 'Panama' 'Germany' 'Chile' 'Qatar' 'France' 'Argentina'
 'Czech Republic' 'Uruguay' 'Colombia' 'Thailand' 'Saudi Arabia' 'Spain'
 'Malta' 'Taiwan' 'Kuwait' 'Suriname' 'Trinidad and Tobago' 'El Salvador'
 'Guatemala' 'Uzbekistan' 'Slovakia' 'Japan' 'South Korea' 'Ecuador'
 'Bahrain' 'Italy' 'Bolivia' 'Moldova' 'Paraguay' 'Kazakhstan' 'Slovenia'
 'Lithuania' 'Nicaragua' 'Peru' 'Belarus' 'Poland' 'Malaysia' 'Croatia'
 'Libya' 'Russia' 'Jamaica' 'North Cyprus' 'Cyprus' 'Algeria' 'Kosovo'
 'Turkmenistan' 'Mauritius' 'Hong Kong' 'Estonia' 'Indonesia' 'Vietnam'
 'Turkey' 'Kyrgyzstan' 'Nigeria' 'Bhutan' 'Azerbaijan' 'Pakistan' 'Jordan'
 'Montenegro' 'China' 'Zambia' 'Romania' 'Serbia' 'Portugal' 'Latvia'
 'Philip

### Country-to-Continent Lists


In [11]:
europe = [
    'Switzerland','Iceland','Denmark','Norway','Finland',
    'Netherlands','Sweden','Austria','Luxembourg','Ireland',
    'Belgium','United Kingdom','France','Germany',
    'Czech Republic','Slovakia','Slovenia','Croatia',
    'Cyprus','North Cyprus','Northern Cyprus','Macedonia',
    'North Macedonia','Montenegro','Kosovo','Serbia',
    'Poland','Lithuania','Latvia','Estonia','Hungary',
    'Romania','Bulgaria','Belarus','Moldova','Ukraine','Russia',
    'Spain','Portugal','Italy','Malta','Greece'
]

america = [
    'Canada','United States','Mexico','Costa Rica','Panama',
    'Colombia','Venezuela','Ecuador','Peru','Bolivia','Chile',
    'Brazil','Argentina','Uruguay','Paraguay','Suriname',
    'Trinidad and Tobago','Trinidad & Tobago','Belize','Guatemala',
    'Honduras','El Salvador','Nicaragua','Jamaica','Haiti',
    'Dominican Republic','Puerto Rico'
]

asia = [
    'Israel','United Arab Emirates','Oman','Qatar','Saudi Arabia',
    'Kuwait','Bahrain','Turkey','Kazakhstan','Uzbekistan',
    'Kyrgyzstan','Turkmenistan','Azerbaijan','Armenia','Georgia',
    'India','Pakistan','Bangladesh','Nepal','Sri Lanka','Bhutan',
    'Afghanistan','China','Hong Kong','Hong Kong S.A.R., China',
    'Taiwan','Taiwan Province of China','Japan','South Korea',
    'Singapore','Malaysia','Thailand','Philippines','Indonesia',
    'Vietnam','Cambodia','Laos','Myanmar','Iraq','Iran','Syria',
    'Lebanon','Jordan','Palestinian Territories'
]

africa = [
    'Algeria','Libya','Morocco','Tunisia','Egypt','Sudan',
    'South Sudan','Mauritania','Mali','Niger','Chad',
    'Central African Republic','Burkina Faso','Ivory Coast',
    'Ghana','Togo','Benin','Nigeria','Cameroon','Gabon',
    'Congo (Brazzaville)','Congo (Kinshasa)','Angola','Zambia',
    'Zimbabwe','Mozambique','Madagascar','Tanzania','Kenya',
    'Uganda','Rwanda','Burundi','Ethiopia','Somalia',
    'Somaliland region','Somaliland Region','Lesotho','Swaziland',
    'South Africa','Djibouti','Comoros','Mauritius','Namibia',
    'Senegal','Sierra Leone','Guinea','Liberia'
]

oceania = [
    'Australia','New Zealand'
]

### Map Countries to Continents

In [12]:
continent_map = {c: 'Europe' for c in europe}
continent_map.update({c: 'America' for c in america})
continent_map.update({c: 'Asia' for c in asia})
continent_map.update({c: 'Africa' for c in africa})
continent_map.update({c: 'Oceania' for c in oceania})

df['continent'] = df['country'].map(continent_map).fillna('Other')

### Create Interaction Feature


In [13]:
df['gdp_support'] = df['gdp_per_capita'] * df['social_support']

### Train-Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols + ['continent', 'year', 'gdp_support']],
    df['happiness_score'],
    test_size=0.3,
    random_state=42
)

### One-Hot Encode `continent` and `year`


In [15]:
X_train_enc = pd.get_dummies(X_train, columns=['continent','year'], drop_first=True)
X_test_enc  = pd.get_dummies(X_test,  columns=['continent','year'], drop_first=True)
X_train_enc, X_test_enc = X_train_enc.align(X_test_enc, join='left', axis=1, fill_value=0)

## Final Random Forest Training and Evaluation


In [16]:
pipeline_rf_final = pipeline
pipeline_rf_final.fit(X_train_enc, y_train)
y_pred = pipeline_rf_final.predict(X_test_enc)
mae = mean_absolute_error(y_test, y_pred)
r2  = r2_score(y_test, y_pred)
logger.info(f'RF MAE: {mae:.4f}, R²: {r2:.4f}')

INFO:__main__:RF MAE: 0.3370, R²: 0.8474


Save Final Model to `model/model_random_forest.pkl`


In [17]:
joblib.dump(pipeline_rf_final, 'model/model_random_forest.pkl')
logger.info('Saved final RF model to model_random_forest.pkl')

INFO:__main__:Saved final RF model to model_random_forest.pkl
