# Age Difference: XGB Experiment based on 64 Variables 1911k Rows 14 Waves

In [1]:
%pwd

'/mnt/d/OneDrive - Kyushu University/ESG09_Article/Code'

In [2]:
%cd ..

/mnt/d/OneDrive - Kyushu University/ESG09_Article


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [5]:
from autogluon.tabular import TabularDataset, TabularPredictor
from joblib import dump
import os 
import pandas as pd
import random
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import xgboost as xgb

## Load and Make Datasets

In [6]:
Df_Filename = os.path.join("Data", "GallupWB_Ml64var1911k14wave_v1.parquet")

In [7]:
Df = pd.read_parquet(Df_Filename)

In [8]:
Df.shape

(1911212, 64)

In [9]:
Df.columns

Index(['wave', 'INCOME_2', 'Cantril_ladder', 'Health_disable', 'Relative_have',
       'Living_standard_change', 'Enough_food', 'Enough_shelter',
       'Well_rested', 'Respected', 'Smile', 'Interesting_thing', 'Enjoyment',
       'Physical_pain', 'Worry', 'Sadness', 'Stress', 'Anger',
       'City_satisficied', 'Economic_change', 'Goodtime_job', 'Sat_pubtran',
       'Sat_road', 'Sat_edu', 'Sat_qualityair', 'Sat_qualitywater',
       'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend', 'Good_minorities',
       'Good_gayles', 'Good_immigrants', 'Donated', 'Volunteer',
       'Help_stranger', 'Voice_official', 'Local_police', 'Safety_walk',
       'Stolen', 'Assualted', 'Religion_importance', 'Children_respected',
       'Children_learn', 'Women_respected', 'Sat_dealpoor', 'Sat_perserveenv',
       'Freedom_chooselife', 'Conf_military', 'Conf_judicial',
       'Conf_government', 'Conf_financial', 'Conf_honestyelections',
       'Freedom_media', 'Corruption_business', 'Corruption_governm

### Check the Difference between Age Groups

In [10]:
young_cantrilladder = Df.loc[Df['Age']<=40, 'Cantril_ladder']

In [11]:
young_cantrilladder.mean()

5.46157389538526

In [12]:
middle_cantrilladder = Df.loc[(Df['Age']>40)&(Df['Age']<=65), 'Cantril_ladder']

In [13]:
middle_cantrilladder.mean()

5.546090633585152

In [14]:
old_cantrilladder = Df.loc[Df['Age']>65, 'Cantril_ladder']

In [15]:
old_cantrilladder.mean()

5.727577206476798

In [16]:
# Independent two-sample t-test on the Cantril ladder: young (Age <= 40) vs. middle (40 < Age <= 65).
# NOTE(review): scipy's ttest_ind defaults to equal_var=True (pooled-variance Student test);
# the two groups have very different sizes, so consider equal_var=False (Welch) - confirm.
t_stat, p_value = stats.ttest_ind(young_cantrilladder, middle_cantrilladder)

In [17]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -22.394787205917154, P-value: 4.592108979551354e-111


In [18]:
# Independent two-sample t-test on the Cantril ladder: young (Age <= 40) vs. old (Age > 65).
# NOTE(review): scipy's ttest_ind defaults to equal_var=True (pooled-variance Student test);
# the two groups have very different sizes, so consider equal_var=False (Welch) - confirm.
t_stat, p_value = stats.ttest_ind(young_cantrilladder, old_cantrilladder)

In [19]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -46.54255508076877, P-value: 0.0


In [20]:
# Independent two-sample t-test on the Cantril ladder: middle (40 < Age <= 65) vs. old (Age > 65).
# NOTE(review): scipy's ttest_ind defaults to equal_var=True (pooled-variance Student test);
# the two groups have very different sizes, so consider equal_var=False (Welch) - confirm.
t_stat, p_value = stats.ttest_ind(middle_cantrilladder, old_cantrilladder)

In [21]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -30.141361019016323, P-value: 1.760592592819179e-199


### Shuffle Rows and Convert Dtypes

In [22]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [23]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

In [24]:
Df.isna().any().any()

False

### Df_young

In [25]:
Df_young = Df.loc[Df['Age']<=40, :].drop(columns=['Age'])

In [26]:
Df_young.shape

(1031174, 63)

In [27]:
# 90/10 hold-out split of the young cohort with a fixed seed.
# NOTE: the `y_` prefix means "young" (like `m_` / `o_` for the other cohorts); both results
# are full DataFrames that still contain the label column, not target vectors.
y_train, y_test = train_test_split(Df_young, test_size=0.1, random_state=42)

### Df_middle

In [28]:
Df_middle = Df.loc[(Df['Age']>40)&(Df['Age']<=65), :].drop(columns=['Age'])

In [29]:
Df_middle.shape

(663573, 63)

In [30]:
m_train, m_test = train_test_split(Df_middle, test_size=0.1, random_state=42)

### Df_old

In [31]:
Df_old = Df.loc[Df['Age']>65, :].drop(columns=['Age'])

In [32]:
Df_old.shape

(216465, 63)

In [33]:
o_train, o_test = train_test_split(Df_old, test_size=0.1, random_state=42)

## AutoML Test

### Young Model

In [36]:
label = 'Cantril_ladder'

In [37]:
y_predictor = TabularPredictor(label=label, eval_metric='r2', problem_type = 'regression').fit(y_train)

No path specified. Models will be saved in: "AutogluonModels/ag-20240622_104037"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Mar 29 23:14:13 UTC 2024
CPU Count:          10
Memory Avail:       120.26 GB / 125.66 GB (95.7%)
Disk Space Avail:   190.77 GB / 1863.00 GB (10.2%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider sett

[1000]	valid_set's l2: 3.82052	valid_set's r2: 0.332089
[2000]	valid_set's l2: 3.80369	valid_set's r2: 0.335032
[3000]	valid_set's l2: 3.79316	valid_set's r2: 0.336872
[4000]	valid_set's l2: 3.78847	valid_set's r2: 0.337692
[5000]	valid_set's l2: 3.78704	valid_set's r2: 0.337942
[6000]	valid_set's l2: 3.78725	valid_set's r2: 0.337906
[7000]	valid_set's l2: 3.78552	valid_set's r2: 0.338207
[8000]	valid_set's l2: 3.78802	valid_set's r2: 0.337772
[9000]	valid_set's l2: 3.79007	valid_set's r2: 0.337412


	0.3383	 = Validation score   (r2)
	195.89s	 = Training   runtime
	0.66s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 3.77369	valid_set's r2: 0.340277
[2000]	valid_set's l2: 3.7687	valid_set's r2: 0.341149
[3000]	valid_set's l2: 3.76368	valid_set's r2: 0.342026
[4000]	valid_set's l2: 3.76038	valid_set's r2: 0.342603
[5000]	valid_set's l2: 3.76339	valid_set's r2: 0.342076


	0.3427	 = Validation score   (r2)
	116.05s	 = Training   runtime
	0.32s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.2956	 = Validation score   (r2)
	557.06s	 = Training   runtime
	0.14s	 = Validation runtime
Fitting model: CatBoost ...
	0.3397	 = Validation score   (r2)
	834.13s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.2696	 = Validation score   (r2)
	504.99s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
Matplotlib is building the font cache; this may take a moment.
	0.3335	 = Validation score   (r2)
	356.84s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: XGBoost ...
	0.3351	 = Validation score   (r2)
	83.84s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.3096	 = Validation score   (r2)
	569.7s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 3.74666	valid_set's r2: 0.345001
[2000]	valid_set's l2: 3.73805	valid_set's r2: 0.346507


	0.3471	 = Validation score   (r2)
	77.32s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'LightGBMLarge': 0.56, 'LightGBMXT': 0.12, 'XGBoost': 0.12, 'CatBoost': 0.08, 'NeuralNetFastAI': 0.08, 'LightGBM': 0.04}
	0.3484	 = Validation score   (r2)
	0.08s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 3314.43s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 6428.1 rows/s (9281 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240622_104037")


In [38]:
y_predictor.evaluate(y_test, silent=True)

{'r2': 0.35417279016135605,
 'root_mean_squared_error': -1.9230941547753664,
 'mean_squared_error': -3.698291128131181,
 'mean_absolute_error': -1.4517836237657942,
 'pearsonr': 0.5952217488143414,
 'median_absolute_error': -1.1045880317687988}

In [39]:
y_predictor.leaderboard(y_test)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.354173,0.348362,r2,17.521234,1.443814,1664.146551,0.009365,0.000513,0.075659,2,True,12
1,LightGBM,0.352381,0.342718,r2,3.990122,0.315581,116.046508,3.990122,0.315581,116.046508,1,True,4
2,LightGBMLarge,0.352111,0.347091,r2,2.656945,0.209865,77.320442,2.656945,0.209865,77.320442,1,True,11
3,LightGBMXT,0.34786,0.338331,r2,7.70382,0.661076,195.893915,7.70382,0.661076,195.893915,1,True,3
4,CatBoost,0.345274,0.339741,r2,0.33855,0.023228,834.129205,0.33855,0.023228,834.129205,1,True,6
5,XGBoost,0.339657,0.335078,r2,2.206861,0.180854,83.837374,2.206861,0.180854,83.837374,1,True,9
6,NeuralNetFastAI,0.338254,0.333492,r2,0.615572,0.052697,356.843449,0.615572,0.052697,356.843449,1,True,8
7,NeuralNetTorch,0.31065,0.309606,r2,0.278528,0.021147,569.700333,0.278528,0.021147,569.700333,1,True,10
8,RandomForestMSE,0.296504,0.295558,r2,4.513988,0.142281,557.057962,4.513988,0.142281,557.057962,1,True,5
9,ExtraTreesMSE,0.273831,0.269597,r2,4.1012,0.124419,504.989644,4.1012,0.124419,504.989644,1,True,7


In [40]:
# Print name, validation score and hyperparameters of every model trained for the
# young cohort. The info dictionary is fetched once instead of re-invoking
# `y_predictor.info()` three times per loop iteration.
model_info = y_predictor.info()['model_info']
for key, details in model_info.items():
    print(details['name'])
    print(details['val_score'])
    print(details['hyperparameters'])
    print("--------------------")

KNeighborsUnif
0.1200818037315341
{'weights': 'uniform'}
--------------------
KNeighborsDist
0.0676703671566482
{'weights': 'distance'}
--------------------
LightGBMXT
0.33833092946293963
{'learning_rate': 0.05, 'extra_trees': True}
--------------------
LightGBM
0.34271758696507315
{'learning_rate': 0.05}
--------------------
RandomForestMSE
0.29555838165620585
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
CatBoost
0.3397405292008704
{'iterations': 10000, 'learning_rate': 0.05, 'random_seed': 0, 'allow_writing_files': False, 'eval_metric': 'R2'}
--------------------
ExtraTreesMSE
0.269596601021037
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
NeuralNetFastAI
0.33349214047479403
{'layers': None, 'emb_drop': 0.1, 'ps': 0.1, 'bs': 'auto', 'lr': 0.01, 'epochs': 'auto', 'early.stopping.

In [41]:
y_predictor.info()['model_info'].keys()

dict_keys(['KNeighborsUnif', 'KNeighborsDist', 'LightGBMXT', 'LightGBM', 'RandomForestMSE', 'CatBoost', 'ExtraTreesMSE', 'NeuralNetFastAI', 'XGBoost', 'NeuralNetTorch', 'LightGBMLarge', 'WeightedEnsemble_L2'])

In [None]:
y_predictor = None
y_train = None
y_test = None

### Young: Bayesian Hyperparameter Search (3 of 10 CV folds, 1:9 validation/train split)

In [42]:
yy = Df_young['Cantril_ladder']

In [43]:
Xy = Df_young.drop(columns=['Cantril_ladder'])

In [44]:
# Search space handed to BayesSearchCV for the XGBoost regressor.
# Quantities that span several orders of magnitude are sampled on a log scale.
param_space = dict(
    # Ensemble size and step size.
    n_estimators=Integer(100, 5000),
    learning_rate=Real(0.001, 0.1, prior='log-uniform'),
    # Tree shape and row subsampling.
    max_depth=Integer(3, 16),
    subsample=Real(0.5, 1.0),
    min_child_weight=Real(0.001, 10, prior='log-uniform'),
    max_delta_step=Real(0.001, 10, prior='log-uniform'),
    # Regularisation terms.
    reg_lambda=Real(0.001, 10, prior='log-uniform'),
    reg_alpha=Real(0.001, 10, prior='log-uniform'),
    gamma=Real(0.001, 10, prior='log-uniform'),
)

In [45]:
# Base estimator that the Bayesian search clones and re-parameterises.
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    device='cuda',            # requires a CUDA-capable GPU
    tree_method='hist',
    random_state=42,
    enable_categorical=True,  # COUNTRY_ISO3 was converted to a pandas 'category' column
)

In [46]:
class RandomRunNFoldsKFold(KFold):
    """Shuffled K-fold splitter that runs only a random subset of its folds.

    The rows are partitioned exactly as
    ``KFold(n_splits, shuffle=True, random_state=random_state)`` would do,
    but only ``run_splits`` of the resulting folds are yielded. This keeps the
    train/validation proportion of an ``n_splits``-fold CV while paying for
    fewer model fits.

    Parameters
    ----------
    n_splits : int, default=10
        Number of folds the data are partitioned into.
    random_state : int or None, default=None
        Seed used for the KFold shuffling and for drawing which folds to run.
        With ``None`` the drawn folds differ between calls to ``split``.
    run_splits : int, default=3
        Number of folds actually yielded by ``split``.
    **kwargs
        Forwarded unchanged to ``KFold.__init__``.

    Raises
    ------
    ValueError
        If ``run_splits`` is not between 1 and ``n_splits``.
    """

    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        # KFold validates n_splits itself, so call it before comparing against it.
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        if not 1 <= run_splits <= n_splits:
            raise ValueError(
                f"run_splits must be between 1 and n_splits={n_splits}, got {run_splits}."
            )
        self.random_state = random_state
        # Stored under the constructor-argument name as well as the historical
        # attribute name, so existing readers of `actual_splits` keep working.
        self.run_splits = run_splits
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` for the randomly selected folds.

        Folds are yielded in the order in which they were drawn, not in fold
        order. With an integer ``random_state`` the same folds are drawn on
        every call.
        """
        # A private generator draws the same fold positions as seeding the
        # global `random` module would, without resetting the interpreter-wide
        # random state as a side effect of every split() call.
        rng = random.Random(self.random_state)
        chosen = rng.sample(range(self.n_splits), self.actual_splits)
        wanted = set(chosen)

        # Keep only the selected folds in memory instead of all n_splits
        # index arrays.
        all_folds = super().split(X, y, groups)
        kept = {}
        for fold_idx, fold in enumerate(all_folds):
            if fold_idx in wanted:
                kept[fold_idx] = fold

        for fold_idx in chosen:
            train_index, test_index = kept[fold_idx]
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds that ``split`` yields (``run_splits``)."""
        return self.actual_splits

In [47]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [48]:
# Bayesian optimisation over `param_space`: 20 sequential candidates (one per
# iteration), each scored by R^2 on the folds produced by `rkfcv`.
bayes_search = BayesSearchCV(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=20,
    scoring='r2',
    cv=rkfcv,
    n_jobs=1,
    n_points=1,
    verbose=2,
    random_state=42,
    return_train_score=True,  # keep train scores in cv_results_ to inspect over-fitting
)

In [49]:
bayes_search.fit(Xy, yy)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  32.6s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  30.8s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  30.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.236420282054271, learning_rate=0.058429282697611454, max_delta_

In [50]:
CV_result = bayes_search.cv_results_

In [51]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,rank_train_score
8,21.459076,0.223054,0.098571,0.001349,6.636085,0.025191,3.066251,8,0.033439,1136,...,0.353873,0.352299,0.003107,1,0.427777,0.42752,0.427362,0.427553,0.000171,6
18,685.5612,2.023588,1.387342,0.141341,0.242101,0.019534,0.174,13,3.04935,4104,...,0.353463,0.351308,0.003416,2,0.821163,0.820584,0.820593,0.82078,0.000271,4
0,31.113476,0.874491,0.162181,0.066385,0.043693,0.02854,5.388551,7,0.479283,2129,...,0.352348,0.350655,0.003205,3,0.424155,0.423304,0.423278,0.423579,0.000408,7
9,18.084488,0.276208,0.096471,0.001598,0.001034,0.043218,0.92285,6,0.206375,1446,...,0.351273,0.349657,0.003159,4,0.394655,0.39405,0.394083,0.394263,0.000278,9
4,244.502706,1.556659,0.445763,0.005727,1.578388,0.007517,0.127785,12,4.124851,3613,...,0.349255,0.347882,0.002695,5,0.435151,0.43481,0.434683,0.434881,0.000198,5
10,31.134863,0.780979,0.133191,0.014681,10.0,0.1,10.0,3,0.001,4020,...,0.341419,0.340607,0.002872,6,0.352615,0.351648,0.351893,0.352052,0.000411,11
6,226.110818,3.882657,0.3166,0.003008,0.29398,0.035541,0.027296,14,0.234681,3014,...,0.340237,0.33872,0.002487,7,0.391259,0.390446,0.390839,0.390848,0.000332,10
12,27.053499,0.377467,0.12464,0.005364,10.0,0.020375,10.0,3,10.0,3393,...,0.339416,0.338604,0.002779,8,0.347772,0.347076,0.347256,0.347368,0.000295,13
11,332.954356,1.05401,0.465451,0.020862,0.031341,0.04961,0.27949,13,0.004996,1633,...,0.339392,0.337729,0.002541,9,0.941575,0.941276,0.940728,0.941193,0.000351,2
14,22.282337,0.384618,0.131388,0.017242,0.451437,0.015358,0.309126,4,0.181371,2447,...,0.338672,0.337675,0.002917,10,0.34893,0.348307,0.348438,0.348558,0.000268,12


In [52]:
# Persist the fitted search object. Create the output folder first so that a
# missing 'Results' directory cannot make the dump fail after the long search.
os.makedirs('Results', exist_ok=True)
dump(bayes_search, 'Results/BayesSearchAgeYoung20iter.joblib')

['Results/BayesSearchAgeYoung20iter.joblib']

### Middle Model

In [53]:
label = 'Cantril_ladder'

In [54]:
m_predictor = TabularPredictor(label=label, eval_metric='r2', problem_type = 'regression').fit(m_train)

No path specified. Models will be saved in: "AutogluonModels/ag-20240622_140326"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Mar 29 23:14:13 UTC 2024
CPU Count:          10
Memory Avail:       108.41 GB / 125.66 GB (86.3%)
Disk Space Avail:   189.10 GB / 1863.00 GB (10.2%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider sett

[1000]	valid_set's l2: 3.67925	valid_set's r2: 0.387841


	0.3889	 = Validation score   (r2)
	32.63s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 3.62661	valid_set's r2: 0.396599
[2000]	valid_set's l2: 3.61762	valid_set's r2: 0.398095


	0.3983	 = Validation score   (r2)
	46.6s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.3479	 = Validation score   (r2)
	328.9s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: CatBoost ...
	0.3949	 = Validation score   (r2)
	553.62s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.3345	 = Validation score   (r2)
	291.19s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.3869	 = Validation score   (r2)
	234.77s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoost ...
	0.3833	 = Validation score   (r2)
	45.74s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.3584	 = Validation score   (r2)
	266.48s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 3.63698	valid_set's r2: 0.394874
[2000]	valid_set's l2: 3.62002	valid_set's r2: 0.397695
[3000]	valid_set's l2: 3.61329	valid_set's r2: 0.398815
[4000]	valid_set's l2: 3.61009	valid_set's r2: 0.399347
[5000]	valid_set's l2: 3.6186	valid_set's r2: 0.397931


	0.3994	 = Validation score   (r2)
	100.77s	 = Training   runtime
	0.29s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'LightGBMLarge': 0.471, 'CatBoost': 0.235, 'LightGBM': 0.176, 'NeuralNetFastAI': 0.118}
	0.4025	 = Validation score   (r2)
	0.07s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 1915.93s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 12964.4 rows/s (5973 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240622_140326")


In [55]:
m_predictor.evaluate(m_test, silent=True)

{'r2': 0.4065648968667013,
 'root_mean_squared_error': -1.8498671918083323,
 'mean_squared_error': -3.4220086273288453,
 'mean_absolute_error': -1.3907243476647781,
 'pearsonr': 0.6376429378412016,
 'median_absolute_error': -1.0624613761901855}

In [56]:
m_predictor.leaderboard(m_test)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.406565,0.402497,r2,6.335968,0.460723,935.827766,0.010032,0.000476,0.06614,2,True,12
1,LightGBMLarge,0.402963,0.399424,r2,4.163403,0.285875,100.771887,4.163403,0.285875,100.771887,1,True,11
2,LightGBM,0.402133,0.398296,r2,1.501353,0.12282,46.601321,1.501353,0.12282,46.601321,1,True,4
3,CatBoost,0.399752,0.394906,r2,0.261763,0.015495,553.617184,0.261763,0.015495,553.617184,1,True,6
4,LightGBMXT,0.396888,0.388896,r2,1.107345,0.090605,32.634315,1.107345,0.090605,32.634315,1,True,3
5,XGBoost,0.39242,0.383299,r2,1.173206,0.094652,45.739672,1.173206,0.094652,45.739672,1,True,9
6,NeuralNetFastAI,0.389804,0.38688,r2,0.399417,0.036058,234.771234,0.399417,0.036058,234.771234,1,True,8
7,NeuralNetTorch,0.370763,0.358375,r2,0.165226,0.014527,266.480672,0.165226,0.014527,266.480672,1,True,10
8,RandomForestMSE,0.353944,0.347919,r2,3.8606,0.10235,328.904921,3.8606,0.10235,328.904921,1,True,5
9,ExtraTreesMSE,0.340644,0.334484,r2,3.640697,0.101482,291.189616,3.640697,0.101482,291.189616,1,True,7


In [57]:
# Print name, validation score and hyperparameters of every model trained for the
# middle cohort. The info dictionary is fetched once instead of re-invoking
# `m_predictor.info()` three times per loop iteration.
model_info = m_predictor.info()['model_info']
for key, details in model_info.items():
    print(details['name'])
    print(details['val_score'])
    print(details['hyperparameters'])
    print("--------------------")

KNeighborsUnif
0.15104854404497292
{'weights': 'uniform'}
--------------------
KNeighborsDist
0.11062859928441104
{'weights': 'distance'}
--------------------
LightGBMXT
0.3888959572992885
{'learning_rate': 0.05, 'extra_trees': True}
--------------------
LightGBM
0.398296386013673
{'learning_rate': 0.05}
--------------------
RandomForestMSE
0.3479190356052031
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
CatBoost
0.39490625144960345
{'iterations': 10000, 'learning_rate': 0.05, 'random_seed': 0, 'allow_writing_files': False, 'eval_metric': 'R2'}
--------------------
ExtraTreesMSE
0.3344837468473997
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
NeuralNetFastAI
0.38687966797076356
{'layers': None, 'emb_drop': 0.1, 'ps': 0.1, 'bs': 'auto', 'lr': 0.01, 'epochs': 'auto', 'early.stopping.

In [58]:
m_predictor.info()['model_info'].keys()

dict_keys(['KNeighborsUnif', 'KNeighborsDist', 'LightGBMXT', 'LightGBM', 'RandomForestMSE', 'CatBoost', 'ExtraTreesMSE', 'NeuralNetFastAI', 'XGBoost', 'NeuralNetTorch', 'LightGBMLarge', 'WeightedEnsemble_L2'])

In [59]:
m_predictor = None
m_train = None
m_test = None

### Middle: Bayesian Hyperparameter Search (3 of 10 CV folds, 1:9 validation/train split)

In [60]:
ym = Df_middle['Cantril_ladder']

In [61]:
Xm = Df_middle.drop(columns=['Cantril_ladder'])

In [62]:
# Search space handed to BayesSearchCV for the XGBoost regressor.
# Quantities that span several orders of magnitude are sampled on a log scale.
param_space = dict(
    # Ensemble size and step size.
    n_estimators=Integer(100, 5000),
    learning_rate=Real(0.001, 0.1, prior='log-uniform'),
    # Tree shape and row subsampling.
    max_depth=Integer(3, 16),
    subsample=Real(0.5, 1.0),
    min_child_weight=Real(0.001, 10, prior='log-uniform'),
    max_delta_step=Real(0.001, 10, prior='log-uniform'),
    # Regularisation terms.
    reg_lambda=Real(0.001, 10, prior='log-uniform'),
    reg_alpha=Real(0.001, 10, prior='log-uniform'),
    gamma=Real(0.001, 10, prior='log-uniform'),
)

In [63]:
# Base estimator that the Bayesian search clones and re-parameterises.
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    device='cuda',            # requires a CUDA-capable GPU
    tree_method='hist',
    random_state=42,
    enable_categorical=True,  # COUNTRY_ISO3 was converted to a pandas 'category' column
)

In [64]:
class RandomRunNFoldsKFold(KFold):
    """Shuffled K-fold splitter that runs only a random subset of its folds.

    The rows are partitioned exactly as
    ``KFold(n_splits, shuffle=True, random_state=random_state)`` would do,
    but only ``run_splits`` of the resulting folds are yielded. This keeps the
    train/validation proportion of an ``n_splits``-fold CV while paying for
    fewer model fits.

    Parameters
    ----------
    n_splits : int, default=10
        Number of folds the data are partitioned into.
    random_state : int or None, default=None
        Seed used for the KFold shuffling and for drawing which folds to run.
        With ``None`` the drawn folds differ between calls to ``split``.
    run_splits : int, default=3
        Number of folds actually yielded by ``split``.
    **kwargs
        Forwarded unchanged to ``KFold.__init__``.

    Raises
    ------
    ValueError
        If ``run_splits`` is not between 1 and ``n_splits``.
    """

    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        # KFold validates n_splits itself, so call it before comparing against it.
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        if not 1 <= run_splits <= n_splits:
            raise ValueError(
                f"run_splits must be between 1 and n_splits={n_splits}, got {run_splits}."
            )
        self.random_state = random_state
        # Stored under the constructor-argument name as well as the historical
        # attribute name, so existing readers of `actual_splits` keep working.
        self.run_splits = run_splits
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` for the randomly selected folds.

        Folds are yielded in the order in which they were drawn, not in fold
        order. With an integer ``random_state`` the same folds are drawn on
        every call.
        """
        # A private generator draws the same fold positions as seeding the
        # global `random` module would, without resetting the interpreter-wide
        # random state as a side effect of every split() call.
        rng = random.Random(self.random_state)
        chosen = rng.sample(range(self.n_splits), self.actual_splits)
        wanted = set(chosen)

        # Keep only the selected folds in memory instead of all n_splits
        # index arrays.
        all_folds = super().split(X, y, groups)
        kept = {}
        for fold_idx, fold in enumerate(all_folds):
            if fold_idx in wanted:
                kept[fold_idx] = fold

        for fold_idx in chosen:
            train_index, test_index = kept[fold_idx]
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds that ``split`` yields (``run_splits``)."""
        return self.actual_splits

In [65]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [66]:
# Bayesian optimisation over `param_space`: 20 sequential candidates (one per
# iteration), each scored by R^2 on the folds produced by `rkfcv`.
bayes_search = BayesSearchCV(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=20,
    scoring='r2',
    cv=rkfcv,
    n_jobs=1,
    n_points=1,
    verbose=2,
    random_state=42,
    return_train_score=True,  # keep train scores in cv_results_ to inspect over-fitting
)

In [67]:
bayes_search.fit(Xm, ym)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  25.3s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  24.5s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  24.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.2364

In [68]:
CV_result = bayes_search.cv_results_

In [69]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,rank_train_score
16,260.862294,1.711591,0.424207,0.010861,0.059651,0.01615,0.100424,13,1.762463,3304,...,0.409534,0.409484,0.002884,1,0.603081,0.603507,0.602626,0.603071,0.00036,5
8,18.248148,0.258135,0.072841,0.006537,6.636085,0.025191,3.066251,8,0.033439,1136,...,0.408097,0.408494,0.002911,2,0.510752,0.511241,0.511645,0.511213,0.000365,7
18,349.207772,4.081135,0.368426,0.013327,0.305094,0.028841,0.041116,15,0.111621,3456,...,0.407433,0.407721,0.002816,3,0.539866,0.539876,0.540185,0.539976,0.000148,6
17,208.355793,2.414097,0.389715,0.003508,0.001,0.015209,0.180591,11,2.116261,3643,...,0.407909,0.407206,0.003293,4,0.640808,0.641861,0.641575,0.641415,0.000445,2
0,24.621885,0.406539,0.093761,0.003183,0.043693,0.02854,5.388551,7,0.479283,2129,...,0.406219,0.406247,0.002979,5,0.507362,0.508281,0.507866,0.507836,0.000376,8
9,14.480482,0.582251,0.068304,0.001646,0.001034,0.043218,0.92285,6,0.206375,1446,...,0.405235,0.406215,0.003111,6,0.470383,0.47119,0.470495,0.470689,0.000357,11
4,187.321864,2.423789,0.27303,0.003884,1.578388,0.007517,0.127785,12,4.124851,3613,...,0.403955,0.404077,0.002602,7,0.490351,0.490948,0.490809,0.490703,0.000255,9
13,66.883969,0.84949,0.142458,0.004686,4.748083,0.007415,0.237224,10,0.002007,2286,...,0.403889,0.404003,0.002629,8,0.478312,0.478928,0.478735,0.478658,0.000257,10
11,29.329787,0.407253,0.087458,0.004085,0.074479,0.051166,0.257594,9,0.005075,1266,...,0.402534,0.402282,0.003668,9,0.611567,0.611505,0.61115,0.611407,0.000184,4
10,24.689996,0.657948,0.116226,0.010542,10.0,0.1,10.0,3,0.001,3793,...,0.400948,0.402227,0.002759,10,0.414497,0.415115,0.414986,0.414866,0.000266,14


In [70]:
# Persist the fitted search object. Create the output folder first so that a
# missing 'Results' directory cannot make the dump fail after the long search.
os.makedirs('Results', exist_ok=True)
dump(bayes_search, 'Results/BayesSearchAgeMiddle20iter.joblib')

['Results/BayesSearchAgeMiddle20iter.joblib']

### Old Model

In [71]:
label = 'Cantril_ladder'

In [72]:
o_predictor = TabularPredictor(label=label, eval_metric='r2', problem_type = 'regression').fit(o_train)

No path specified. Models will be saved in: "AutogluonModels/ag-20240622_161157"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Mar 29 23:14:13 UTC 2024
CPU Count:          10
Memory Avail:       106.94 GB / 125.66 GB (85.1%)
Disk Space Avail:   187.01 GB / 1863.00 GB (10.0%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider sett

In [73]:
o_predictor.evaluate(o_test, silent=True)

{'r2': 0.4091811565472627,
 'root_mean_squared_error': -1.9389696415833295,
 'mean_squared_error': -3.7596032709817857,
 'mean_absolute_error': -1.470345265104979,
 'pearsonr': 0.6396988015205557,
 'median_absolute_error': -1.1309094429016113}

In [74]:
o_predictor.leaderboard(o_test)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.409181,0.420653,r2,0.501896,0.04932,184.451424,0.006771,0.000418,0.061175,2,True,12
1,LightGBM,0.406333,0.418502,r2,0.084028,0.010414,3.341617,0.084028,0.010414,3.341617,1,True,4
2,CatBoost,0.405035,0.41492,r2,0.069346,0.006182,60.035336,0.069346,0.006182,60.035336,1,True,6
3,LightGBMLarge,0.404382,0.414507,r2,0.12585,0.011309,3.89984,0.12585,0.011309,3.89984,1,True,11
4,LightGBMXT,0.401434,0.414173,r2,0.074586,0.007113,2.581644,0.074586,0.007113,2.581644,1,True,3
5,XGBoost,0.400381,0.406121,r2,0.131844,0.014667,5.422534,0.131844,0.014667,5.422534,1,True,9
6,NeuralNetFastAI,0.394414,0.405077,r2,0.267165,0.025193,118.431652,0.267165,0.025193,118.431652,1,True,8
7,NeuralNetTorch,0.373904,0.385839,r2,0.059115,0.011439,80.581688,0.059115,0.011439,80.581688,1,True,10
8,RandomForestMSE,0.367585,0.374342,r2,3.270016,0.113011,91.906038,3.270016,0.113011,91.906038,1,True,5
9,ExtraTreesMSE,0.356211,0.361113,r2,3.054047,0.107785,79.747095,3.054047,0.107785,79.747095,1,True,7


In [75]:
# Print name, validation score and hyperparameters of every model trained for the
# old cohort. The info dictionary is fetched once instead of re-invoking
# `o_predictor.info()` three times per loop iteration.
model_info = o_predictor.info()['model_info']
for key, details in model_info.items():
    print(details['name'])
    print(details['val_score'])
    print(details['hyperparameters'])
    print("--------------------")

KNeighborsUnif
0.1228731531441769
{'weights': 'uniform'}
--------------------
KNeighborsDist
0.09317201281982823
{'weights': 'distance'}
--------------------
LightGBMXT
0.4141734629709759
{'learning_rate': 0.05, 'extra_trees': True}
--------------------
LightGBM
0.41850232384736175
{'learning_rate': 0.05}
--------------------
RandomForestMSE
0.37434198414076314
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
CatBoost
0.414920284944005
{'iterations': 10000, 'learning_rate': 0.05, 'random_seed': 0, 'allow_writing_files': False, 'eval_metric': 'R2'}
--------------------
ExtraTreesMSE
0.36111280334931484
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
NeuralNetFastAI
0.40507733591091477
{'layers': None, 'emb_drop': 0.1, 'ps': 0.1, 'bs': 'auto', 'lr': 0.01, 'epochs': 'auto', 'early.stopping

In [76]:
o_predictor.info()['model_info'].keys()

dict_keys(['KNeighborsUnif', 'KNeighborsDist', 'LightGBMXT', 'LightGBM', 'RandomForestMSE', 'CatBoost', 'ExtraTreesMSE', 'NeuralNetFastAI', 'XGBoost', 'NeuralNetTorch', 'LightGBMLarge', 'WeightedEnsemble_L2'])

In [77]:
o_predictor = None
o_train = None
o_test = None

### Old: Bayesian Hyperparameter Search (3 of 10 CV folds, 1:9 validation/train split)

In [82]:
yo = Df_old['Cantril_ladder']

In [83]:
Xo = Df_old.drop(columns=['Cantril_ladder'])

In [84]:
# Search space handed to BayesSearchCV for the XGBoost regressor.
# Quantities that span several orders of magnitude are sampled on a log scale.
param_space = dict(
    # Ensemble size and step size.
    n_estimators=Integer(100, 5000),
    learning_rate=Real(0.001, 0.1, prior='log-uniform'),
    # Tree shape and row subsampling.
    max_depth=Integer(3, 16),
    subsample=Real(0.5, 1.0),
    min_child_weight=Real(0.001, 10, prior='log-uniform'),
    max_delta_step=Real(0.001, 10, prior='log-uniform'),
    # Regularisation terms.
    reg_lambda=Real(0.001, 10, prior='log-uniform'),
    reg_alpha=Real(0.001, 10, prior='log-uniform'),
    gamma=Real(0.001, 10, prior='log-uniform'),
)

In [85]:
# Base estimator that the Bayesian search clones and re-parameterises.
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    device='cuda',            # requires a CUDA-capable GPU
    tree_method='hist',
    random_state=42,
    enable_categorical=True,  # COUNTRY_ISO3 was converted to a pandas 'category' column
)

In [86]:
class RandomRunNFoldsKFold(KFold):
    """Shuffled K-fold splitter that evaluates only a random subset of its folds.

    The data are partitioned into ``n_splits`` shuffled folds exactly as
    ``KFold(shuffle=True)`` does, but only ``run_splits`` of those folds are
    yielded by :meth:`split`. Each yielded fold therefore keeps the
    ``1 : (n_splits - 1)`` test/train ratio while the number of model fits
    drops from ``n_splits`` to ``run_splits``.

    Parameters
    ----------
    n_splits : int, default=10
        Number of folds the data are partitioned into.
    random_state : int or None, default=None
        Seed forwarded to ``KFold`` for shuffling and also used to choose
        which folds are run. ``None`` leaves the choice to the current state
        of the module-level ``random`` generator.
    run_splits : int, default=3
        Number of folds actually yielded; must lie in ``[0, n_splits]``.
    **kwargs
        Forwarded unchanged to ``KFold.__init__``.

    Raises
    ------
    ValueError
        If ``run_splits`` is negative or larger than ``n_splits``.
    """

    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        # Fail at construction time with a clear message; otherwise the same
        # ValueError would only surface from random.sample() inside split().
        if not 0 <= run_splits <= n_splits:
            raise ValueError(
                f"run_splits={run_splits} must be between 0 and n_splits={n_splits}."
            )
        self.random_state = random_state
        # Stored under the constructor's parameter name as well, so code that
        # looks attributes up by __init__ argument name can find the value.
        self.run_splits = run_splits
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` for the randomly selected folds.

        All ``n_splits`` folds are generated first and ``actual_splits`` of
        them are then sampled without replacement.
        """
        folds = list(super().split(X, y, groups))
        if self.random_state is None:
            # No seed given: draw from the shared module-level generator.
            rng = random
        else:
            # A private generator seeded with random_state picks the same
            # folds as reseeding the global generator would, but leaves the
            # process-wide `random` state untouched.
            rng = random.Random(self.random_state)
        for train_index, test_index in rng.sample(folds, self.actual_splits):
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds that :meth:`split` yields."""
        return self.actual_splits

In [87]:
# CV splitter for the search: 10 shuffled folds of which 3 are run, seeded with 42.
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [88]:
# Bayesian hyper-parameter search for the XGBoost regressor.
bayes_search = BayesSearchCV(
    # What is tuned and over which ranges.
    estimator=xgb_reg,
    search_spaces=param_space,
    # Search budget: 20 iterations, one candidate point per iteration.
    n_iter=20,
    n_points=1,
    # Evaluation: R^2 on the folds produced by the custom splitter.
    cv=rkfcv,
    scoring='r2',
    return_train_score=True,
    # Execution settings.
    n_jobs=1,
    random_state=42,
    verbose=2,
)

In [89]:
# Run the hyper-parameter search on the feature matrix Xo and target yo.
bayes_search.fit(Xo, yo)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  19.2s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  19.2s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  19.3s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.2364

In [90]:
# Keep the search's cross-validation results (the cv_results_ attribute) for inspection.
CV_result = bayes_search.cv_results_

In [91]:
# Tabulate the search results and show the ten rows with the lowest
# 'rank_test_score' (sorted ascending, which is the default order).
(
    pd.DataFrame(CV_result)
    .sort_values('rank_test_score')
    .head(10)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,rank_train_score
11,34.022171,0.825355,0.056066,0.001606,1.204548,0.030685,0.380578,13,0.002362,540,...,0.407185,0.404628,0.002094,1,0.671441,0.672732,0.672991,0.672388,0.000678,5
12,19.295416,0.385987,0.102948,0.01359,0.013327,0.017682,0.230974,5,0.006523,2963,...,0.405702,0.404047,0.001245,2,0.462536,0.462291,0.462452,0.462426,0.000101,12
4,114.631698,0.919842,0.145908,0.005955,1.578388,0.007517,0.127785,12,4.124851,3613,...,0.405466,0.403964,0.001318,3,0.51697,0.516895,0.516755,0.516873,8.9e-05,10
16,36.734434,0.998943,0.098995,0.016352,0.05975,0.078823,0.017177,10,0.009662,2675,...,0.402766,0.401637,0.000994,4,0.451411,0.451293,0.451099,0.451268,0.000128,14
8,13.326068,0.314739,0.047749,0.002803,6.636085,0.025191,3.066251,8,0.033439,1136,...,0.402558,0.400894,0.001782,5,0.609858,0.609187,0.609968,0.609671,0.000345,8
15,83.710865,0.464596,0.150129,0.012521,0.001804,0.009201,0.723354,10,1.506823,2617,...,0.400922,0.399224,0.001206,6,0.769948,0.768992,0.770053,0.769664,0.000478,4
6,86.718273,1.347586,0.11231,0.004167,0.29398,0.035541,0.027296,14,0.234681,3014,...,0.398283,0.397309,0.000878,7,0.461502,0.461248,0.460855,0.461201,0.000266,13
9,11.583722,0.085463,0.049511,0.003244,0.001034,0.043218,0.92285,6,0.206375,1446,...,0.399324,0.396914,0.001704,8,0.559198,0.558981,0.558576,0.558919,0.000258,9
17,25.098668,0.913748,0.07897,0.011291,4.274584,0.029096,0.032728,14,0.025074,3154,...,0.39485,0.393637,0.001118,9,0.411829,0.411731,0.411391,0.41165,0.000188,15
18,379.988634,0.431316,0.673549,0.086981,1.380333,0.010964,0.206836,14,2.063847,4920,...,0.393636,0.391162,0.001854,10,0.8813,0.88133,0.881162,0.881264,7.3e-05,2


In [92]:
# Persist the fitted search object. The output directory is created first so
# that a missing 'Results' folder cannot make the write fail after the search
# has already finished.
os.makedirs('Results', exist_ok=True)
dump(bayes_search, 'Results/BayesSearchAgeOld20iter.joblib')

['Results/BayesSearchAgeOld20iter.joblib']