# Age Difference: XGB Experiment based on 64 Variables 1911k Rows 14 Waves

In [1]:
%pwd

'/home/GPU/esg09-wellbeing/Code'

In [2]:
# Move from the Code/ folder up to the project root so that the relative
# "Data" and "Results" paths used later in the notebook resolve.
# Not idempotent: re-running this cell moves up one more directory level.
%cd ..

/home/GPU/esg09-wellbeing


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Packages

In [57]:
import numpy as np
import os 
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

## Load and Make Datasets

In [58]:
# Location of the input parquet file, relative to the current working directory.
data_dir, data_file = "Data", "GallupWB_Ml64var1911k14wave_v1.parquet"
Df_Filename = os.path.join(data_dir, data_file)

In [59]:
# Read the parquet file named by Df_Filename into a DataFrame.
Df = pd.read_parquet(Df_Filename)

In [60]:
Df.shape

(1911212, 64)

In [61]:
Df.columns

Index(['wave', 'INCOME_2', 'Cantril_ladder', 'Health_disable', 'Relative_have',
       'Living_standard_change', 'Enough_food', 'Enough_shelter',
       'Well_rested', 'Respected', 'Smile', 'Interesting_thing', 'Enjoyment',
       'Physical_pain', 'Worry', 'Sadness', 'Stress', 'Anger',
       'City_satisficied', 'Economic_change', 'Goodtime_job', 'Sat_pubtran',
       'Sat_road', 'Sat_edu', 'Sat_qualityair', 'Sat_qualitywater',
       'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend', 'Good_minorities',
       'Good_gayles', 'Good_immigrants', 'Donated', 'Volunteer',
       'Help_stranger', 'Voice_official', 'Local_police', 'Safety_walk',
       'Stolen', 'Assualted', 'Religion_importance', 'Children_respected',
       'Children_learn', 'Women_respected', 'Sat_dealpoor', 'Sat_perserveenv',
       'Freedom_chooselife', 'Conf_military', 'Conf_judicial',
       'Conf_government', 'Conf_financial', 'Conf_honestyelections',
       'Freedom_media', 'Corruption_business', 'Corruption_governm

### Check the Difference between Age Groups

In [62]:
# Mean Cantril ladder score of respondents aged 40 or younger.
young_cantrilladder = Df['Cantril_ladder'][Df['Age'] <= 40].mean()

In [63]:
young_cantrilladder

np.float64(5.46157389538526)

In [64]:
# Mean Cantril ladder score of respondents older than 40 and up to 65.
middle_age_mask = (Df['Age'] > 40) & (Df['Age'] <= 65)
middle_cantrilladder = Df['Cantril_ladder'][middle_age_mask].mean()

In [65]:
middle_cantrilladder

np.float64(5.546090633585152)

In [66]:
# Mean Cantril ladder score of respondents older than 65.
old_cantrilladder = Df['Cantril_ladder'][Df['Age'] > 65].mean()

In [67]:
old_cantrilladder

np.float64(5.727577206476798)

In [68]:
# Independent two-sample t-test: young (Age <= 40) vs. middle (40 < Age <= 65).
# Called with SciPy's defaults, i.e. the pooled-variance (equal_var=True) test.
young_scores = Df['Cantril_ladder'][Df['Age'] <= 40]
middle_scores = Df['Cantril_ladder'][(Df['Age'] > 40) & (Df['Age'] <= 65)]
t_stat, p_value = stats.ttest_ind(young_scores, middle_scores)

In [69]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -22.394787205917154, P-value: 4.592108979551353e-111


In [70]:
# Independent two-sample t-test: young (Age <= 40) vs. old (Age > 65).
# Called with SciPy's defaults, i.e. the pooled-variance (equal_var=True) test.
young_scores = Df['Cantril_ladder'][Df['Age'] <= 40]
old_scores = Df['Cantril_ladder'][Df['Age'] > 65]
t_stat, p_value = stats.ttest_ind(young_scores, old_scores)

In [71]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -46.54255508076877, P-value: 0.0


In [72]:
# Independent two-sample t-test: old (Age > 65) vs. middle (40 < Age <= 65).
# Called with SciPy's defaults, i.e. the pooled-variance (equal_var=True) test.
old_scores = Df['Cantril_ladder'][Df['Age'] > 65]
middle_scores = Df['Cantril_ladder'][(Df['Age'] > 40) & (Df['Age'] <= 65)]
t_stat, p_value = stats.ttest_ind(old_scores, middle_scores)

In [73]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: 30.141361019016323, P-value: 1.760592592819179e-199


### Shuffle and Type Conversion

In [74]:
# Shuffle all rows with a fixed seed, then renumber the index from 0.
# Re-running this cell reshuffles the already shuffled frame.
shuffled_rows = Df.sample(frac=1, random_state=42)
Df = shuffled_rows.reset_index(drop=True)

In [75]:
# Store the country code as a pandas categorical; the XGBoost models below are
# built with enable_categorical=True.
country_codes = Df['COUNTRY_ISO3'].astype('category')
Df['COUNTRY_ISO3'] = country_codes

In [76]:
Df.isna().any().any()

np.False_

### Df_young

In [77]:
# Young group: respondents aged 40 or younger (all columns kept).
Df_young = Df[Df['Age'] <= 40]

In [78]:
Df_young.shape

(1031174, 64)

In [79]:
# Regression target for the young group.
yy = Df_young.loc[:, 'Cantril_ladder']

In [80]:
# Feature matrix for the young group: every column except the target.
Xy = Df_young.drop('Cantril_ladder', axis=1)

In [81]:
# Hold out 10% of the young group as a test split (fixed seed for repeatability).
Xy_train, Xy_test, yy_train, yy_test = train_test_split(
    Xy,
    yy,
    test_size=0.1,
    random_state=42,
)

### Df_middle

In [82]:
# Middle group: respondents older than 40 and up to 65 (all columns kept).
middle_age_mask = (Df['Age'] > 40) & (Df['Age'] <= 65)
Df_middle = Df[middle_age_mask]

In [83]:
Df_middle.shape

(663573, 64)

In [84]:
# Regression target for the middle group.
ym = Df_middle.loc[:, 'Cantril_ladder']

In [85]:
# Feature matrix for the middle group: every column except the target.
Xm = Df_middle.drop('Cantril_ladder', axis=1)

In [86]:
# Hold out 10% of the middle group as a test split (fixed seed for repeatability).
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    Xm,
    ym,
    test_size=0.1,
    random_state=42,
)

### Df_old

In [87]:
# Old group: respondents older than 65 (all columns kept).
Df_old = Df[Df['Age'] > 65]

In [88]:
Df_old.shape

(216465, 64)

In [89]:
# Regression target for the old group.
yo = Df_old.loc[:, 'Cantril_ladder']

In [90]:
# Feature matrix for the old group: every column except the target.
Xo = Df_old.drop('Cantril_ladder', axis=1)

In [91]:
# Hold out 10% of the old group as a test split (fixed seed for repeatability).
Xo_train, Xo_test, yo_train, yo_test = train_test_split(
    Xo,
    yo,
    test_size=0.1,
    random_state=42,
)

## Model and Hyperparameter Tuning

### Young Model

In [95]:
# Baseline XGBoost regressor fitted on the young training split.
# enable_categorical=True is set so category-typed columns are accepted as features.
baseline_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'n_estimators': 500,
    'learning_rate': 0.01,
    'max_depth': 5,
    'random_state': 42,
    'enable_categorical': True,
}
model = xgb.XGBRegressor(**baseline_params)
model.fit(Xy_train, yy_train)

AttributeError: `np.NaN` was removed in the NumPy 2.0 release. Use `np.nan` instead.

In [27]:
# Predictions for the middle-age test split.
# NOTE(review): the preceding fit cell trains `model` on the young split
# (Xy_train / yy_train), yet it is evaluated here on Xm_test — confirm which
# age group this baseline is meant to use.
ym_pred = model.predict(Xm_test)

In [28]:
r2_score(ym_test, ym_pred) * 100

35.345814761344485

In [29]:
# In-sample predictions on the middle-age training split, scored in the next
# cell so train R2 can be compared with test R2.
ym_train_pred = model.predict(Xm_train)

In [30]:
r2_score(ym_train, ym_train_pred) * 100

36.214464768212785

In [31]:
# Candidate values for the first-stage grid search (10 x 3 x 13 x 5 combinations).
n_estimators_list = [100 * step for step in range(1, 11)]   # 100, 200, ..., 1000
learning_rate_list = [0.001, 0.01, 0.1]
max_depth_list = list(range(3, 16))                          # 3, 4, ..., 15
subsample_list = [0.6, 0.7, 0.8, 0.9, 1]

In [32]:
# First-stage grid search on the middle-age split: fit one model per
# (n_estimators, learning_rate, max_depth, subsample) combination and record
# train / test R2 in percent.
# The grid is enumerated with n_estimators varying slowest and subsample fastest.
search_grid = [
    (n_estimators, learning_rate, max_depth, subsample)
    for n_estimators in n_estimators_list
    for learning_rate in learning_rate_list
    for max_depth in max_depth_list
    for subsample in subsample_list
]

results_list = []
for n_estimators, learning_rate, max_depth, subsample in search_grid:
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        tree_method='gpu_hist',  # on linux: device='cuda', tree_method='hist'
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        random_state=42,
        enable_categorical=True,
    )
    model.fit(Xm_train, ym_train)

    # Score on the held-out split and on the training split.
    ym_pred = model.predict(Xm_test)
    ym_train_pred = model.predict(Xm_train)
    test_r2 = r2_score(ym_test, ym_pred) * 100
    train_r2 = r2_score(ym_train, ym_train_pred) * 100

    row = [n_estimators, learning_rate, max_depth, subsample, train_r2, test_r2]
    print(row)
    results_list.append(row)

    # Drop references to the model and prediction arrays before the next fit.
    model = None
    ym_pred = None
    ym_train_pred = None

[100, 0.001, 3, 0.6, -346.9465146057738, -345.6614864532208]
[100, 0.001, 3, 0.7, -346.94511815918577, -345.6588582299794]
[100, 0.001, 3, 0.8, -346.945979315638, -345.66037200174094]
[100, 0.001, 3, 0.9, -346.9450669262052, -345.65906585005706]
[100, 0.001, 3, 1, -346.9437464794587, -345.65778195979783]
[100, 0.001, 4, 0.6, -346.46283379904037, -345.1830119581036]
[100, 0.001, 4, 0.7, -346.45976203634837, -345.18123516374874]
[100, 0.001, 4, 0.8, -346.46218569776295, -345.18408347586444]
[100, 0.001, 4, 0.9, -346.4604923609569, -345.1822236620651]
[100, 0.001, 4, 1, -346.4580370637822, -345.1797350393475]
[100, 0.001, 5, 0.6, -346.128897802993, -344.84157661217193]
[100, 0.001, 5, 0.7, -346.127099842922, -344.83896621101104]
[100, 0.001, 5, 0.8, -346.1288610316, -344.84036834428633]
[100, 0.001, 5, 0.9, -346.12730377965005, -344.8380758381987]
[100, 0.001, 5, 1, -346.12494584822446, -344.834050753722]
[100, 0.001, 6, 0.6, -345.89156683411034, -344.59520775945026]
[100, 0.001, 6, 0.7, 

In [33]:
# Tabulate the grid-search rows; column order matches the order used when each row was built.
result_columns = ['n_estimators', 'learning_rate', 'max_depth', 'subsample', 'train_r2', 'test_r2']
results_list_df = pd.DataFrame(data=results_list, columns=result_columns)

In [40]:
# Persist the grid-search results so the long search does not have to be re-run.
# The output folder is created first: writing into a missing directory fails,
# which would leave the results only in kernel memory.
# NOTE(review): the file name says "GenderMale", but this search was fitted on the
# middle-age split (Xm_train) — confirm the intended name so results from another
# experiment are not overwritten.
results_dir = 'Results'
os.makedirs(results_dir, exist_ok=True)
results_list_df.to_parquet(os.path.join(results_dir, "Hyperparameter_GenderMaleXgboost_ML.parquet"))

In [41]:
results_list_df.head()

Unnamed: 0,n_estimators,learning_rate,max_depth,subsample,train_r2,test_r2
0,100,0.001,3,0.6,-346.946515,-345.661486
1,100,0.001,3,0.7,-346.945118,-345.658858
2,100,0.001,3,0.8,-346.945979,-345.660372
3,100,0.001,3,0.9,-346.945067,-345.659066
4,100,0.001,3,1.0,-346.943746,-345.657782


In [43]:
# Show the 20 configurations with the highest test R2.
# NOTE(review): test_r2 is computed on Xm_test / ym_test, so picking the best row here
# selects hyperparameters on the same split later used to report performance;
# consider a separate validation split or cross-validation.
results_list_df.sort_values(by='test_r2', ascending=False).head(20)

Unnamed: 0,n_estimators,learning_rate,max_depth,subsample,train_r2,test_r2
1868,1000,0.01,12,0.9,65.259347,37.972949
1865,1000,0.01,12,0.6,63.688727,37.949387
1862,1000,0.01,11,0.8,56.286785,37.944039
1867,1000,0.01,12,0.8,64.856366,37.932941
1673,900,0.01,12,0.9,63.580487,37.917952
1872,1000,0.01,13,0.8,74.39485,37.917832
1866,1000,0.01,12,0.7,64.327114,37.917203
1670,900,0.01,12,0.6,62.041453,37.906973
1873,1000,0.01,13,0.9,74.956253,37.905652
1677,900,0.01,13,0.8,72.50999,37.900075


**Fine-tune other hyperparameters**

In [45]:
# Candidate values for the second-stage search (5 x 4 x 4 x 4 x 4 combinations).
# Four of the five parameters share the same coarse grid; gamma adds 0.01.
coarse_grid = (0, 0.1, 1, 10)
gamma_list = [0, 0.01, *coarse_grid[1:]]
min_child_weight_list = list(coarse_grid)
max_delta_step_list = list(coarse_grid)
reg_lambda_list = list(coarse_grid)
reg_alpha_list = list(coarse_grid)

In [None]:
# Second-stage grid search on the middle-age split. The best first-stage setting
# (n_estimators=1000, learning_rate=0.01, max_depth=12, subsample=0.9;
#  train R2 65.259347, test R2 37.972949) is held fixed while gamma,
# min_child_weight, max_delta_step, reg_lambda and reg_alpha are varied.
# The grid is enumerated with gamma varying slowest and reg_alpha fastest.
regularisation_grid = [
    (gamma, min_child_weight, max_delta_step, reg_lambda, reg_alpha)
    for gamma in gamma_list
    for min_child_weight in min_child_weight_list
    for max_delta_step in max_delta_step_list
    for reg_lambda in reg_lambda_list
    for reg_alpha in reg_alpha_list
]

results_list_otherhyper = []
for gamma, min_child_weight, max_delta_step, reg_lambda, reg_alpha in regularisation_grid:
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        tree_method='gpu_hist',  # on linux: device='cuda', tree_method='hist'
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=12,
        subsample=0.9,
        gamma=gamma,
        min_child_weight=min_child_weight,
        max_delta_step=max_delta_step,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        random_state=42,
        enable_categorical=True,
    )
    model.fit(Xm_train, ym_train)

    # Score on the held-out split and on the training split.
    ym_pred = model.predict(Xm_test)
    ym_train_pred = model.predict(Xm_train)
    test_r2 = r2_score(ym_test, ym_pred) * 100
    train_r2 = r2_score(ym_train, ym_train_pred) * 100

    row = [gamma, min_child_weight, max_delta_step, reg_lambda,
           reg_alpha, train_r2, test_r2]
    print(row)
    results_list_otherhyper.append(row)

    # Drop references to the model and prediction arrays before the next fit.
    model = None
    ym_pred = None
    ym_train_pred = None

[0, 0, 0, 0, 0, 66.32583976200496, 37.92534171377735]
[0, 0, 0, 0, 0.1, 66.25929018662178, 37.89396674883224]
[0, 0, 0, 0, 1, 65.77513090428084, 37.92659717073056]
[0, 0, 0, 0, 10, 57.76528690574637, 37.959529768840085]
[0, 0, 0, 0.1, 0, 66.14473734107358, 37.91347779740685]
[0, 0, 0, 0.1, 0.1, 66.11109036118917, 37.903010747107345]
[0, 0, 0, 0.1, 1, 65.64327235851457, 37.913586773826104]
[0, 0, 0, 0.1, 10, 57.73111159250488, 37.934744396040244]
[0, 0, 0, 1, 0, 65.2593470125833, 37.972948793923756]
[0, 0, 0, 1, 0.1, 65.2311686285684, 37.98462798746666]
[0, 0, 0, 1, 1, 64.68336611746707, 37.962690044963246]
[0, 0, 0, 1, 10, 57.25202947338671, 37.93972695444805]
[0, 0, 0, 10, 0, 59.587659852857456, 38.049672756904464]
[0, 0, 0, 10, 0.1, 59.56645364193989, 38.03215426078196]
[0, 0, 0, 10, 1, 59.11353450893302, 38.05629920865351]
[0, 0, 0, 10, 10, 54.593747394480815, 37.99005296392986]
[0, 0, 0.1, 0, 0, 66.28456097496328, 37.97191901178252]
[0, 0, 0.1, 0, 0.1, 66.19149802930379, 37.9021953