# Age Difference: XGB Experiment based on 64 Variables 1911k Rows 14 Waves

In [1]:
%pwd

'/home/GPU/esg09-wellbeing/Code'

In [2]:
%cd ..

/home/GPU/esg09-wellbeing


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Packages

In [70]:
import numpy as np
import os 
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import xgboost as xgb

## Load and Make Datasets

In [4]:
# Path of the input parquet file, relative to the current working directory.
_data_dir = "Data"
_data_file = "GallupWB_Ml64var1911k14wave_v1.parquet"
Df_Filename = os.path.join(_data_dir, _data_file)

In [5]:
# Load the full survey table from the parquet file into a DataFrame.
Df = pd.read_parquet(Df_Filename)

In [6]:
# Sanity check: (rows, columns) of the loaded table.
Df.shape

(1911212, 64)

In [7]:
# List the available column names (target and predictors).
Df.columns

Index(['wave', 'INCOME_2', 'Cantril_ladder', 'Health_disable', 'Relative_have',
       'Living_standard_change', 'Enough_food', 'Enough_shelter',
       'Well_rested', 'Respected', 'Smile', 'Interesting_thing', 'Enjoyment',
       'Physical_pain', 'Worry', 'Sadness', 'Stress', 'Anger',
       'City_satisficied', 'Economic_change', 'Goodtime_job', 'Sat_pubtran',
       'Sat_road', 'Sat_edu', 'Sat_qualityair', 'Sat_qualitywater',
       'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend', 'Good_minorities',
       'Good_gayles', 'Good_immigrants', 'Donated', 'Volunteer',
       'Help_stranger', 'Voice_official', 'Local_police', 'Safety_walk',
       'Stolen', 'Assualted', 'Religion_importance', 'Children_respected',
       'Children_learn', 'Women_respected', 'Sat_dealpoor', 'Sat_perserveenv',
       'Freedom_chooselife', 'Conf_military', 'Conf_judicial',
       'Conf_government', 'Conf_financial', 'Conf_honestyelections',
       'Freedom_media', 'Corruption_business', 'Corruption_governm

### Check the Differences between Age Groups

In [16]:
# Cantril-ladder scores of respondents aged 40 or younger.
is_young = Df['Age'].le(40)
young_cantrilladder = Df.loc[is_young, 'Cantril_ladder']

In [17]:
# Average Cantril-ladder score of the young group (Age <= 40).
young_cantrilladder.mean()

5.46157389538526

In [18]:
# Cantril-ladder scores of respondents older than 40 and at most 65.
is_middle = Df['Age'].gt(40) & Df['Age'].le(65)
middle_cantrilladder = Df.loc[is_middle, 'Cantril_ladder']

In [19]:
# Average Cantril-ladder score of the middle group (40 < Age <= 65).
middle_cantrilladder.mean()

5.546090633585152

In [20]:
# Cantril-ladder scores of respondents older than 65.
is_old = Df['Age'].gt(65)
old_cantrilladder = Df.loc[is_old, 'Cantril_ladder']

In [21]:
# Average Cantril-ladder score of the old group (Age > 65).
old_cantrilladder.mean()

5.727577206476798

In [22]:
# Two-sample t-test: young vs. middle Cantril-ladder scores.
# NOTE(review): ttest_ind defaults to equal_var=True (Student's t-test); the
# groups differ a lot in size, so consider equal_var=False (Welch) - confirm.
t_stat, p_value = stats.ttest_ind(young_cantrilladder, middle_cantrilladder)

In [23]:
# Report the most recently computed test statistic and p-value.
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -22.394787205917154, P-value: 4.592108979551353e-111


In [24]:
# Two-sample t-test: young vs. old Cantril-ladder scores.
# NOTE(review): ttest_ind defaults to equal_var=True (Student's t-test); the
# groups differ a lot in size, so consider equal_var=False (Welch) - confirm.
t_stat, p_value = stats.ttest_ind(young_cantrilladder, old_cantrilladder)

In [25]:
# Report the most recently computed test statistic and p-value.
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -46.54255508076877, P-value: 0.0


In [26]:
# Two-sample t-test: middle vs. old Cantril-ladder scores.
# NOTE(review): ttest_ind defaults to equal_var=True (Student's t-test); the
# groups differ a lot in size, so consider equal_var=False (Welch) - confirm.
t_stat, p_value = stats.ttest_ind(middle_cantrilladder, old_cantrilladder)

In [27]:
# Report the most recently computed test statistic and p-value.
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -30.141361019016323, P-value: 1.760592592819179e-199


### Shuffle and Type Conversion

In [28]:
# Shuffle every row with a fixed seed, then renumber the index from zero.
Df = (
    Df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [29]:
# Store the country code as a pandas categorical; the models below are built
# with enable_categorical=True.
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

In [30]:
# True if at least one cell anywhere in Df is missing.
Df.isna().any().any()

False

### Df_young

In [31]:
# Young cohort: all columns for respondents aged 40 or younger.
Df_young = Df[Df['Age'].le(40)]

In [32]:
# (rows, columns) of the young cohort.
Df_young.shape

(1031174, 64)

In [33]:
# Target for the young cohort: the Cantril-ladder column.
yy = Df_young.loc[:, 'Cantril_ladder']

In [34]:
# Predictors for the young cohort: every column except the target.
Xy = Df_young.drop('Cantril_ladder', axis='columns')

In [35]:
# Seeded 90/10 train/test split of the young cohort.
young_parts = train_test_split(Xy, yy, random_state=42, test_size=0.1)
Xy_train, Xy_test, yy_train, yy_test = young_parts

### Df_middle

In [36]:
# Middle cohort: all columns for respondents older than 40 and at most 65.
Df_middle = Df[Df['Age'].gt(40) & Df['Age'].le(65)]

In [37]:
# (rows, columns) of the middle cohort.
Df_middle.shape

(663573, 64)

In [38]:
# Target for the middle cohort: the Cantril-ladder column.
ym = Df_middle.loc[:, 'Cantril_ladder']

In [39]:
# Predictors for the middle cohort: every column except the target.
Xm = Df_middle.drop('Cantril_ladder', axis='columns')

In [40]:
# Seeded 90/10 train/test split of the middle cohort.
middle_parts = train_test_split(Xm, ym, random_state=42, test_size=0.1)
Xm_train, Xm_test, ym_train, ym_test = middle_parts

### Df_old

In [41]:
# Old cohort: all columns for respondents older than 65.
Df_old = Df[Df['Age'].gt(65)]

In [42]:
# (rows, columns) of the old cohort.
Df_old.shape

(216465, 64)

In [43]:
# Target for the old cohort: the Cantril-ladder column.
yo = Df_old.loc[:, 'Cantril_ladder']

In [44]:
# Predictors for the old cohort: every column except the target.
Xo = Df_old.drop('Cantril_ladder', axis='columns')

In [45]:
# Seeded 90/10 train/test split of the old cohort.
old_parts = train_test_split(Xo, yo, random_state=42, test_size=0.1)
Xo_train, Xo_test, yo_train, yo_test = old_parts

## Modelling and Hyperparameter Tuning

### Young Model

**Test: n_estimators=500, learning_rate=0.001, max_depth=10**

In [58]:
# First trial on the young cohort: 500 trees, learning rate 0.001, depth 10,
# trained with the histogram tree method on GPU "cuda:1".
trial_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'device': "cuda:1",
    'n_estimators': 500,
    'learning_rate': 0.001,
    'max_depth': 10,
    'random_state': 42,
    'enable_categorical': True,
}
model = xgb.XGBRegressor(**trial_params)
model.fit(Xy_train, yy_train)

In [59]:
# Predict the held-out young-cohort rows with the model fitted above.
yy_pred = model.predict(Xy_test)

In [60]:
# Test-split R^2, expressed as a percentage.
r2_score(yy_test, yy_pred) * 100

19.725288247454742

In [61]:
# Predict the training rows as well, to compare the fit against the test score.
yy_train_pred = model.predict(Xy_train)

In [62]:
# Training-split R^2, expressed as a percentage.
r2_score(yy_train, yy_train_pred) * 100

20.252878198543478

**Test: n_estimators=100, learning_rate=0.1, max_depth=10**

In [63]:
# Second trial on the young cohort: 100 trees, learning rate 0.1, depth 10,
# otherwise configured like the first trial.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    device="cuda:1",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=10,
    random_state=42,
    enable_categorical=True,
)
model.fit(Xy_train, yy_train)

In [64]:
# Predict the held-out young-cohort rows with the model fitted above.
yy_pred = model.predict(Xy_test)

In [65]:
# Test-split R^2, expressed as a percentage.
r2_score(yy_test, yy_pred) * 100

34.87419236981112

In [66]:
# Predict the training rows as well, to compare the fit against the test score.
yy_train_pred = model.predict(Xy_train)

In [67]:
# Training-split R^2, expressed as a percentage.
r2_score(yy_train, yy_train_pred) * 100

45.26317548464791

### Fine-tune major hyperparameters for young

In [68]:
# Candidate values for the four major hyperparameters searched below.
n_estimators_list = [100 * step for step in range(1, 11)]  # 100, 200, ..., 1000
learning_rate_list = [1e-3, 1e-2, 1e-1]
max_depth_list = list(range(3, 16))  # 3 through 15
subsample_list = [0.6, 0.7, 0.8, 0.9, 1]

In [69]:
# Exhaustive grid search over the four major XGBoost hyperparameters for the
# young cohort. Each combination is fitted on the training split and scored
# (R^2 as a percentage) on both the training and the test split.
# Each row is [n_estimators, learning_rate, max_depth, subsample, train_r2, test_r2]
# and is appended as soon as it is computed, so an interrupted run keeps every
# finished combination in `results_list`.
# NOTE(review): candidates are compared on Xy_test, so the best test_r2 is an
# optimistic estimate; consider a separate validation split taken from Xy_train.
results_list = []
for n_estimators in n_estimators_list:
    for learning_rate in learning_rate_list:
        for max_depth in max_depth_list:
            for subsample in subsample_list:
                model = xgb.XGBRegressor(objective='reg:squarederror',
                                         device='cuda:1', tree_method='hist',
                                         n_estimators=n_estimators,
                                         learning_rate=learning_rate,
                                         max_depth=max_depth,
                                         subsample=subsample,
                                         random_state=42, enable_categorical=True)
                model.fit(Xy_train, yy_train)
                yy_pred = model.predict(Xy_test)
                test_r2 = r2_score(yy_test, yy_pred) * 100
                yy_train_pred = model.predict(Xy_train)
                train_r2 = r2_score(yy_train, yy_train_pred) * 100
                row = [n_estimators, learning_rate, max_depth,
                       subsample, train_r2, test_r2]
                # Record first, then print, so an interrupt cannot lose a finished row.
                results_list.append(row)
                print(row)
                # Drop the references created in this iteration so the booster
                # and prediction arrays can be freed before the next fit.
                # (The original reset ym_pred / ym_train_pred, which this loop
                # never assigns, leaving the yy_* arrays alive.)
                model = None
                yy_pred = None
                yy_train_pred = None

[100, 0.001, 3, 0.6, 4.132094869777458, 4.167488479792669]
[100, 0.001, 3, 0.7, 4.132287675452828, 4.167573199279506]
[100, 0.001, 3, 0.8, 4.133011290335231, 4.168480907964788]
[100, 0.001, 3, 0.9, 4.132782681273561, 4.168118147129807]
[100, 0.001, 3, 1, 4.132646590331079, 4.16844152602498]
[100, 0.001, 4, 0.6, 4.563180763049646, 4.60279632290993]
[100, 0.001, 4, 0.7, 4.562718296890777, 4.60253200286761]
[100, 0.001, 4, 0.8, 4.563044844393572, 4.602967332025143]
[100, 0.001, 4, 0.9, 4.562082893836883, 4.601945485670256]
[100, 0.001, 4, 1, 4.5613158025474725, 4.601819866124046]
[100, 0.001, 5, 0.6, 4.831414497086928, 4.874112495648331]
[100, 0.001, 5, 0.7, 4.8314175612530486, 4.874800298534998]
[100, 0.001, 5, 0.8, 4.8314085019053765, 4.8746523586929325]
[100, 0.001, 5, 0.9, 4.83097256103624, 4.874109476492128]
[100, 0.001, 5, 1, 4.829445267748278, 4.872389050161097]
[100, 0.001, 6, 0.6, 5.063372304731251, 5.101008186180245]
[100, 0.001, 6, 0.7, 5.063244107203701, 5.102159031809428]
[10

KeyboardInterrupt: 

In [33]:
# Turn the collected grid-search rows into a table with one named column per field.
result_columns = [
    'n_estimators',
    'learning_rate',
    'max_depth',
    'subsample',
    'train_r2',
    'test_r2',
]
results_list_df = pd.DataFrame(data=results_list, columns=result_columns)

In [40]:
# Persist the grid-search scores. Create the output folder first so a missing
# 'Results' directory cannot make the write fail after a long search.
os.makedirs('Results', exist_ok=True)
results_list_df.to_parquet(os.path.join('Results', "Hyperparameter_AgeYoungXgboost_ML.parquet"))

In [None]:
# Peek at the first rows of the grid-search table.
results_list_df.head()

In [None]:
# The 20 combinations with the highest test-split R^2.
results_list_df.sort_values(by='test_r2', ascending=False).head(20)

### Fine-tune other minor hyperparameters

In [45]:
# Candidate values for the minor hyperparameters. gamma has its own grid; the
# other four share the same coarse grid, each as an independent list.
_coarse_grid = (0, 0.1, 1, 10)
gamma_list = [0, 0.01, 0.1, 1, 10]
min_child_weight_list = list(_coarse_grid)
max_delta_step_list = list(_coarse_grid)
reg_lambda_list = list(_coarse_grid)
reg_alpha_list = list(_coarse_grid)

### Bayesian Hyperparameter Search (Test)

In [109]:
# Search space for the Bayesian optimisation. The major hyperparameters get
# individual ranges; the regularisation-style ones all share a log-uniform
# range from 0.001 to 10. Key order is the same as before.
param_space = {
    'n_estimators': Integer(100, 1000),
    'learning_rate': Real(0.001, 0.5, prior='log-uniform'),
    'max_depth': Integer(3, 16),
    'subsample': Real(0.5, 1.0),
}
param_space.update({
    shared_name: Real(0.001, 10, prior='log-uniform')
    for shared_name in ('min_child_weight', 'max_delta_step',
                        'reg_lambda', 'reg_alpha', 'gamma')
})

In [110]:
# Base estimator for the search; only the fixed settings are given here, the
# tuned hyperparameters are supplied by the search itself.
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    device='cuda:1',
    tree_method='hist',
    random_state=42,
    enable_categorical=True,
)

In [111]:
# Bayesian search: 50 iterations, each scored by 5-fold cross-validated R^2,
# with up to 10 parallel jobs and verbose progress logging.
search_settings = dict(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=50,
    scoring='r2',
    cv=5,
    n_jobs=10,
    verbose=2,
    random_state=42,
)
bayes_search = BayesSearchCV(**search_settings)

In [None]:
# Run the search on the training split only. Fitting on the full young cohort
# (Xy, yy) would let the held-out rows in Xy_test / yy_test influence the
# hyperparameter choice, so they could no longer serve as an unbiased test set.
bayes_search.fit(Xy_train, yy_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END gamma=2.2364202820542705, learning_rate=0.2421260059210535, max_delta_step=0.01635461931468242, max_depth=15, min_child_weight=2.860959183286342, n_estimators=156, reg_alpha=0.003574655164089067, reg_lambda=0.02596268418373733, subsample=0.8178645509395852; total time=  19.8s
[CV] END gamma=1.7765766649807682, learning_rate=0.002909887432203276, max_delta_step=0.24671088435225733, max_depth=13, min_child_weight=0.12365432532396153, n_estimators=186, reg_al

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END gamma=2.7535764626996952, learning_rate=0.17531948275047832, max_delta_step=0.6539052504695974, max_depth=4, min_child_weight=0.0014529287182387094, n_estimators=878, reg_alpha=0.17750068769434926, reg_lambda=0.0222471531084

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END gamma=0.07575848684009592, learning_rate=0.028585556070468157, max_delta_step=0.001, max_depth=11, min_child_weight=0.17759794734588458, n_estimators=521, reg_alpha=10.0, reg_lambda=9.974644105863872, subsample=0.8449958418844496; total time=  42.8s
[CV] END gamma=0.0076810054430872955, learning_rate=0.013827299326732946, max_delta_step=10.0, max_depth=8, min_child_weight=7.18360390293982, n_estimators=845, reg_alpha=2.6413569102831236, reg_lambda=0.00295707365403543, subsample=0.6250896437325223; total time= 1.3min
[CV] END gamma=0.001, learning_rate=0.01150274201606613, max_delta_step=10.0, max_depth=8, min_child_weight=0.02336

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END gamma=0.001, learning_rate=0.49999999999999994, max_delta_step=0.03585612350420298, max_depth=14, min_child_weight=10.0, n_estimators=100, reg_alpha=10.0, reg_lambda=0.012280776517983167, subsample=1.0; total time=  21.8s
[CV] END gamma=0.001, learning_rate=0.49999999999999994, max_delta_step=0.03585612350420298, max_depth=14, min_child_weight=10.0, n_estimators=100, reg_alpha=10.0, reg_lambda=0.012280776517983167, subsample=1.0; total time=  26.6s
[CV] END gamma=0.001, learning_rate=0.49999999999999994, max_delta_step=0.03585612350420298, max_depth=14, min_child_weight=10.0, n_estimators=100, reg_alpha=10.0, reg_lambda=0.012280776517983167, subsample=1.0; total time=  26.7s
[CV] END gamma=0.001, learning_rate=0.49999999999999994, max_delta_step=0.03585612350420298, max_depth=14, min_child_weight=10.0, n_estimators=100, reg_alpha=10.0, reg_lambda=0.012280776517983167, subsample=1.0; total time=  27.1s
[CV] END gamma=0.

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# Per-candidate cross-validation results collected by the search.
CV_result = bayes_search.cv_results_

In [None]:
# Show the ten best-ranked candidates (rank 1 = best cross-validated score).
cv_table = pd.DataFrame(CV_result)
cv_table.sort_values(by='rank_test_score', ascending=True).head(10)