# Gender Difference: XGB Experiment based on 64 Variables 1911k Rows 14 Waves

In [2]:
%pwd

'/mnt/d/OneDrive - Kyushu University/ESG09_Article/Code'

In [3]:
%cd ..

/mnt/d/OneDrive - Kyushu University/ESG09_Article


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [4]:
from autogluon.tabular import TabularDataset, TabularPredictor
from joblib import dump
import os 
import pandas as pd
import random
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import xgboost as xgb

## Load and Make Datasets

In [5]:
Df_Filename = os.path.join("Data", "GallupWB_Ml64var1911k14wave_v1.parquet")

In [6]:
Df = pd.read_parquet(Df_Filename)

In [7]:
Df.shape

(1911212, 64)

In [8]:
Df.columns

Index(['wave', 'INCOME_2', 'Cantril_ladder', 'Health_disable', 'Relative_have',
       'Living_standard_change', 'Enough_food', 'Enough_shelter',
       'Well_rested', 'Respected', 'Smile', 'Interesting_thing', 'Enjoyment',
       'Physical_pain', 'Worry', 'Sadness', 'Stress', 'Anger',
       'City_satisficied', 'Economic_change', 'Goodtime_job', 'Sat_pubtran',
       'Sat_road', 'Sat_edu', 'Sat_qualityair', 'Sat_qualitywater',
       'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend', 'Good_minorities',
       'Good_gayles', 'Good_immigrants', 'Donated', 'Volunteer',
       'Help_stranger', 'Voice_official', 'Local_police', 'Safety_walk',
       'Stolen', 'Assualted', 'Religion_importance', 'Children_respected',
       'Children_learn', 'Women_respected', 'Sat_dealpoor', 'Sat_perserveenv',
       'Freedom_chooselife', 'Conf_military', 'Conf_judicial',
       'Conf_government', 'Conf_financial', 'Conf_honestyelections',
       'Freedom_media', 'Corruption_business', 'Corruption_governm

### Check the Difference between Gender

In [9]:
female_cantrilladder = Df.loc[Df['Gender_female']==1, 'Cantril_ladder'].mean()

In [10]:
female_cantrilladder

5.569313150299246

In [11]:
male_cantrilladder = Df.loc[Df['Gender_female']==0, 'Cantril_ladder'].mean()

In [12]:
male_cantrilladder

5.466124824941722

In [13]:
t_stat, p_value = stats.ttest_ind(Df.loc[Df['Gender_female']==1, 'Cantril_ladder'], Df.loc[Df['Gender_female']==0, 'Cantril_ladder'])

In [14]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: 29.49502322170667, P-value: 3.682499213688914e-191


### Shuffle Conversion

In [15]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

### Df_male 

In [16]:
Df_male = Df.loc[Df['Gender_female']==0, :].drop(columns=['Gender_female'])

In [17]:
Df_male.shape

(893988, 63)

In [18]:
ym = Df_male['Cantril_ladder']

In [19]:
Xm = Df_male.drop(columns=['Cantril_ladder'])

In [20]:
Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm, ym, test_size=0.1, random_state=42)

### Df_female 

In [21]:
Df_female = Df.loc[Df['Gender_female']==1, :].drop(columns=['Gender_female'])

In [22]:
Df_female.shape

(1017224, 63)

In [23]:
yf = Df_female['Cantril_ladder']

In [24]:
Xf = Df_female.drop(columns=['Cantril_ladder'])

In [25]:
Xf_train, Xf_test, yf_train, yf_test = train_test_split(Xf, yf, test_size=0.1, random_state=42)

## Model and Tuning Hyper

### Male Model

In [26]:
model = xgb.XGBRegressor(objective='reg:squarederror', device = 'cuda', tree_method='gpu_hist', 
                         n_estimators=500, learning_rate=0.01, max_depth=5, 
                         random_state=42, enable_categorical=True  )
model.fit(Xm_train, ym_train)

Parameters: { "device" } are not used.



In [27]:
ym_pred = model.predict(Xm_test)

In [28]:
r2_score(ym_test, ym_pred) * 100

35.345814761344485

In [29]:
ym_train_pred = model.predict(Xm_train)

In [30]:
r2_score(ym_train, ym_train_pred) * 100

36.214464768212785

In [31]:
n_estimators_list = list(range(100, 1_100, 100))
learning_rate_list = [0.001, 0.01, 0.1]
max_depth_list = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
subsample_list = [0.6, 0.7, 0.8, 0.9, 1]

In [32]:
results_list = []
for n_estimators in n_estimators_list:
    for learning_rate in learning_rate_list:
        for max_depth in max_depth_list:
            for subsample in subsample_list:
                model = xgb.XGBRegressor(objective='reg:squarederror', 
                                         tree_method='gpu_hist',  ## on linux: device = 'cuda', tree_method='hist', 
                                         n_estimators = n_estimators, 
                                         learning_rate = learning_rate, 
                                         max_depth = max_depth, 
                                         subsample = subsample,
                                         random_state=42, enable_categorical=True  )
                model.fit(Xm_train, ym_train)
                ym_pred = model.predict(Xm_test)
                test_r2 = r2_score(ym_test, ym_pred) * 100
                ym_train_pred = model.predict(Xm_train)
                train_r2 = r2_score(ym_train, ym_train_pred) * 100
                row = [n_estimators, learning_rate, max_depth, 
                       subsample, train_r2, test_r2]
                print(row)
                model = None
                ym_pred = None
                ym_train_pred = None
                results_list.append(row)

[100, 0.001, 3, 0.6, -346.9465146057738, -345.6614864532208]
[100, 0.001, 3, 0.7, -346.94511815918577, -345.6588582299794]
[100, 0.001, 3, 0.8, -346.945979315638, -345.66037200174094]
[100, 0.001, 3, 0.9, -346.9450669262052, -345.65906585005706]
[100, 0.001, 3, 1, -346.9437464794587, -345.65778195979783]
[100, 0.001, 4, 0.6, -346.46283379904037, -345.1830119581036]
[100, 0.001, 4, 0.7, -346.45976203634837, -345.18123516374874]
[100, 0.001, 4, 0.8, -346.46218569776295, -345.18408347586444]
[100, 0.001, 4, 0.9, -346.4604923609569, -345.1822236620651]
[100, 0.001, 4, 1, -346.4580370637822, -345.1797350393475]
[100, 0.001, 5, 0.6, -346.128897802993, -344.84157661217193]
[100, 0.001, 5, 0.7, -346.127099842922, -344.83896621101104]
[100, 0.001, 5, 0.8, -346.1288610316, -344.84036834428633]
[100, 0.001, 5, 0.9, -346.12730377965005, -344.8380758381987]
[100, 0.001, 5, 1, -346.12494584822446, -344.834050753722]
[100, 0.001, 6, 0.6, -345.89156683411034, -344.59520775945026]
[100, 0.001, 6, 0.7, 

In [33]:
results_list_df = pd.DataFrame(results_list, columns = ['n_estimators', 'learning_rate', 'max_depth', 'subsample', 'train_r2', 'test_r2'])

In [40]:
results_list_df.to_parquet(os.path.join('Results', "Hyperparameter_GenderMaleXgboost_ML.parquet"))

In [41]:
results_list_df.head()

Unnamed: 0,n_estimators,learning_rate,max_depth,subsample,train_r2,test_r2
0,100,0.001,3,0.6,-346.946515,-345.661486
1,100,0.001,3,0.7,-346.945118,-345.658858
2,100,0.001,3,0.8,-346.945979,-345.660372
3,100,0.001,3,0.9,-346.945067,-345.659066
4,100,0.001,3,1.0,-346.943746,-345.657782


In [43]:
results_list_df.sort_values(by='test_r2', ascending=False).head(20)

Unnamed: 0,n_estimators,learning_rate,max_depth,subsample,train_r2,test_r2
1868,1000,0.01,12,0.9,65.259347,37.972949
1865,1000,0.01,12,0.6,63.688727,37.949387
1862,1000,0.01,11,0.8,56.286785,37.944039
1867,1000,0.01,12,0.8,64.856366,37.932941
1673,900,0.01,12,0.9,63.580487,37.917952
1872,1000,0.01,13,0.8,74.39485,37.917832
1866,1000,0.01,12,0.7,64.327114,37.917203
1670,900,0.01,12,0.6,62.041453,37.906973
1873,1000,0.01,13,0.9,74.956253,37.905652
1677,900,0.01,13,0.8,72.50999,37.900075


**Fine tune other hyperparameter**

In [45]:
gamma_list = [0, 0.01, 0.1, 1, 10]
min_child_weight_list = [0, 0.1, 1, 10]
max_delta_step_list = [0, 0.1, 1, 10]
reg_lambda_list = [0, 0.1, 1, 10]
reg_alpha_list = [0, 0.1, 1, 10]

In [46]:
# 1000	0.01	12	0.9	65.259347	37.972949
results_list_otherhyper = []
for gamma in gamma_list:
    for min_child_weight in min_child_weight_list:
        for max_delta_step in max_delta_step_list:
            for reg_lambda in reg_lambda_list:
                for reg_alpha in reg_alpha_list:
                    model = xgb.XGBRegressor(objective='reg:squarederror',  tree_method='gpu_hist',  ## on linux: device = 'cuda', tree_method='hist', 
                                             n_estimators = 1000,  learning_rate = 0.01, max_depth = 12,   subsample = 0.9,
                                             gamma = gamma, min_child_weight = min_child_weight, max_delta_step = max_delta_step,
                                             reg_lambda = reg_lambda, reg_alpha = reg_alpha, 
                                             random_state=42, enable_categorical=True  )
                    model.fit(Xm_train, ym_train)
                    ym_pred = model.predict(Xm_test)
                    test_r2 = r2_score(ym_test, ym_pred) * 100
                    ym_train_pred = model.predict(Xm_train)
                    train_r2 = r2_score(ym_train, ym_train_pred) * 100
                    row = [gamma, min_child_weight, max_delta_step, reg_lambda, 
                           reg_alpha, train_r2, test_r2]
                    print(row)
                    model = None
                    ym_pred = None
                    ym_train_pred = None
                    results_list_otherhyper.append(row)

[0, 0, 0, 0, 0, 66.32583976200496, 37.92534171377735]
[0, 0, 0, 0, 0.1, 66.25929018662178, 37.89396674883224]
[0, 0, 0, 0, 1, 65.77513090428084, 37.92659717073056]
[0, 0, 0, 0, 10, 57.76528690574637, 37.959529768840085]
[0, 0, 0, 0.1, 0, 66.14473734107358, 37.91347779740685]
[0, 0, 0, 0.1, 0.1, 66.11109036118917, 37.903010747107345]
[0, 0, 0, 0.1, 1, 65.64327235851457, 37.913586773826104]
[0, 0, 0, 0.1, 10, 57.73111159250488, 37.934744396040244]
[0, 0, 0, 1, 0, 65.2593470125833, 37.972948793923756]
[0, 0, 0, 1, 0.1, 65.2311686285684, 37.98462798746666]
[0, 0, 0, 1, 1, 64.68336611746707, 37.962690044963246]
[0, 0, 0, 1, 10, 57.25202947338671, 37.93972695444805]
[0, 0, 0, 10, 0, 59.587659852857456, 38.049672756904464]
[0, 0, 0, 10, 0.1, 59.56645364193989, 38.03215426078196]
[0, 0, 0, 10, 1, 59.11353450893302, 38.05629920865351]
[0, 0, 0, 10, 10, 54.593747394480815, 37.99005296392986]
[0, 0, 0.1, 0, 0, 66.28456097496328, 37.97191901178252]
[0, 0, 0.1, 0, 0.1, 66.19149802930379, 37.9021953

KeyboardInterrupt: 

## AutoML Model

### load and make dataset

In [5]:
Df_Filename = os.path.join("Data", "GallupWB_Ml64var1911k14wave_v1.parquet")

In [6]:
Df = pd.read_parquet(Df_Filename)

In [7]:
Df.shape

(1911212, 64)

In [8]:
Df.columns

Index(['wave', 'INCOME_2', 'Cantril_ladder', 'Health_disable', 'Relative_have',
       'Living_standard_change', 'Enough_food', 'Enough_shelter',
       'Well_rested', 'Respected', 'Smile', 'Interesting_thing', 'Enjoyment',
       'Physical_pain', 'Worry', 'Sadness', 'Stress', 'Anger',
       'City_satisficied', 'Economic_change', 'Goodtime_job', 'Sat_pubtran',
       'Sat_road', 'Sat_edu', 'Sat_qualityair', 'Sat_qualitywater',
       'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend', 'Good_minorities',
       'Good_gayles', 'Good_immigrants', 'Donated', 'Volunteer',
       'Help_stranger', 'Voice_official', 'Local_police', 'Safety_walk',
       'Stolen', 'Assualted', 'Religion_importance', 'Children_respected',
       'Children_learn', 'Women_respected', 'Sat_dealpoor', 'Sat_perserveenv',
       'Freedom_chooselife', 'Conf_military', 'Conf_judicial',
       'Conf_government', 'Conf_financial', 'Conf_honestyelections',
       'Freedom_media', 'Corruption_business', 'Corruption_governm

### Shuffle Conversion

In [9]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

### Df_male 

In [11]:
Df_male = Df.loc[Df['Gender_female']==0, :].drop(columns=['Gender_female'])

In [12]:
Df_male.shape

(893988, 63)

In [13]:
m_train, m_test = train_test_split(Df_male, test_size=0.1, random_state=42)

### Df_female 

In [14]:
Df_female = Df.loc[Df['Gender_female']==1, :].drop(columns=['Gender_female'])

In [15]:
Df_female.shape

(1017224, 63)

In [16]:
f_train, f_test = train_test_split(Df_female, test_size=0.1, random_state=42)

### Df_total

In [60]:
Df_total = Df.drop(columns=['Gender_female'])

In [61]:
Df_total.shape

(1911212, 63)

In [62]:
tot_train, tot_test = train_test_split(Df_total, test_size=0.1, random_state=42)

### Male Model Automl

In [16]:
label = 'Cantril_ladder'

In [18]:
m_predictor = TabularPredictor(label=label, eval_metric='r2', problem_type = 'regression').fit(m_train)

No path specified. Models will be saved in: "AutogluonModels\ag-20240620_085533"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       39.63 GB / 63.92 GB (62.0%)
Disk Space Avail:   387.29 GB / 1863.00 GB (20.8%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider setting `time_limit` to ensure

[1000]	valid_set's l2: 3.60994	valid_set's r2: 0.366948
[2000]	valid_set's l2: 3.59733	valid_set's r2: 0.36916
[3000]	valid_set's l2: 3.5941	valid_set's r2: 0.369728


	0.3698	 = Validation score   (r2)
	84.57s	 = Training   runtime
	0.23s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 3.58657	valid_set's r2: 0.371047
[2000]	valid_set's l2: 3.57131	valid_set's r2: 0.373723
[3000]	valid_set's l2: 3.5662	valid_set's r2: 0.37462
[4000]	valid_set's l2: 3.56599	valid_set's r2: 0.374656


	0.3749	 = Validation score   (r2)
	80.89s	 = Training   runtime
	0.23s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.3198	 = Validation score   (r2)
	654.47s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: CatBoost ...
	0.3701	 = Validation score   (r2)
	1393.41s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.3018	 = Validation score   (r2)
	678.81s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.3607	 = Validation score   (r2)
	695.09s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: XGBoost ...
	0.3615	 = Validation score   (r2)
	90.76s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.3412	 = Validation score   (r2)
	944.11s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 3.58005	valid_set's r2: 0.37219
[2000]	valid_set's l2: 3.56735	valid_set's r2: 0.374418
[3000]	valid_set's l2: 3.5576	valid_set's r2: 0.376127
[4000]	valid_set's l2: 3.55515	valid_set's r2: 0.376557
[5000]	valid_set's l2: 3.55113	valid_set's r2: 0.377263
[6000]	valid_set's l2: 3.55279	valid_set's r2: 0.376972
[7000]	valid_set's l2: 3.55482	valid_set's r2: 0.376616


	0.3778	 = Validation score   (r2)
	184.1s	 = Training   runtime
	0.71s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'LightGBMLarge': 0.565, 'CatBoost': 0.13, 'NeuralNetFastAI': 0.13, 'LightGBM': 0.087, 'LightGBMXT': 0.043, 'XGBoost': 0.043}
	0.3801	 = Validation score   (r2)
	0.11s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 4826.51s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 5367.6 rows/s (8046 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20240620_085533")


In [19]:
m_predictor.evaluate(m_test, silent=True)

{'r2': 0.37985965786928644,
 'root_mean_squared_error': -1.8875276982572986,
 'mean_squared_error': -3.5627608116884955,
 'mean_absolute_error': -1.4191708827073135,
 'pearsonr': 0.6163880049514061,
 'median_absolute_error': -1.0703115463256836}

In [21]:
m_predictor.leaderboard(m_test)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.37986,0.380059,r2,17.00253,1.498989,2528.928082,0.013963,0.000997,0.105746,2,True,12
1,LightGBMLarge,0.377575,0.377785,r2,8.260907,0.705113,184.09568,8.260907,0.705113,184.09568,1,True,11
2,LightGBM,0.374333,0.374873,r2,2.504303,0.228388,80.891675,2.504303,0.228388,80.891675,1,True,4
3,CatBoost,0.37061,0.370082,r2,0.32513,0.024934,1393.411067,0.32513,0.024934,1393.411067,1,True,6
4,LightGBMXT,0.369363,0.369777,r2,2.458425,0.225397,84.572939,2.458425,0.225397,84.572939,1,True,3
5,XGBoost,0.360725,0.36147,r2,2.299849,0.203456,90.760282,2.299849,0.203456,90.760282,1,True,9
6,NeuralNetFastAI,0.359402,0.360715,r2,1.139952,0.110704,695.090694,1.139952,0.110704,695.090694,1,True,8
7,NeuralNetTorch,0.337016,0.34116,r2,0.380982,0.044879,944.110167,0.380982,0.044879,944.110167,1,True,10
8,RandomForestMSE,0.322827,0.319784,r2,1.279578,0.099752,654.471886,1.279578,0.099752,654.471886,1,True,5
9,ExtraTreesMSE,0.305731,0.301751,r2,1.261626,0.089735,678.811251,1.261626,0.089735,678.811251,1,True,7


In [37]:
for key in m_predictor.info()['model_info'].keys():
    print(m_predictor.info()['model_info'][key]['name'])
    print(m_predictor.info()['model_info'][key]['val_score'])
    print(m_predictor.info()['model_info'][key]['hyperparameters'])
    print("--------------------")

KNeighborsUnif
0.10674743655818908
{'weights': 'uniform'}
KNeighborsDist
0.09836589452010058
{'weights': 'distance'}
LightGBMXT
0.36977675120272957
{'learning_rate': 0.05, 'extra_trees': True}
LightGBM
0.37487347606403787
{'learning_rate': 0.05}
RandomForestMSE
0.31978355567125105
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
CatBoost
0.3700815953597658
{'iterations': 10000, 'learning_rate': 0.05, 'random_seed': 0, 'allow_writing_files': False, 'eval_metric': 'R2'}
ExtraTreesMSE
0.3017510354484697
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
NeuralNetFastAI
0.3607149606986212
{'layers': None, 'emb_drop': 0.1, 'ps': 0.1, 'bs': 'auto', 'lr': 0.01, 'epochs': 'auto', 'early.stopping.min_delta': 0.0001, 'early.stopping.patience': 20, 'smoothing': 0.0}
XGBoost
0.3614697252337269
{'n_estimators': 10000, 'learning_rate': 0.1, 'n_

In [34]:
m_predictor.info()['model_info'].keys()

dict_keys(['KNeighborsUnif', 'KNeighborsDist', 'LightGBMXT', 'LightGBM', 'RandomForestMSE', 'CatBoost', 'ExtraTreesMSE', 'NeuralNetFastAI', 'XGBoost', 'NeuralNetTorch', 'LightGBMLarge', 'WeightedEnsemble_L2'])

### Female Model Automl

In [40]:
label = 'Cantril_ladder'

In [41]:
f_predictor = TabularPredictor(label=label, eval_metric='r2', problem_type = 'regression').fit(f_train)

No path specified. Models will be saved in: "AutogluonModels\ag-20240622_044707"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       27.59 GB / 63.92 GB (43.2%)
Disk Space Avail:   382.93 GB / 1863.00 GB (20.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider setting `time_limit` to ensure

[1000]	valid_set's l2: 3.91447	valid_set's r2: 0.345927
[2000]	valid_set's l2: 3.89653	valid_set's r2: 0.348925
[3000]	valid_set's l2: 3.89209	valid_set's r2: 0.349666
[4000]	valid_set's l2: 3.88664	valid_set's r2: 0.350577
[5000]	valid_set's l2: 3.88519	valid_set's r2: 0.35082


	0.3509	 = Validation score   (r2)
	120.09s	 = Training   runtime
	0.37s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 3.86297	valid_set's r2: 0.354532
[2000]	valid_set's l2: 3.84891	valid_set's r2: 0.356882
[3000]	valid_set's l2: 3.8357	valid_set's r2: 0.359088
[4000]	valid_set's l2: 3.83213	valid_set's r2: 0.359685


	0.36	 = Validation score   (r2)
	97.58s	 = Training   runtime
	0.28s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.2988	 = Validation score   (r2)
	756.89s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: CatBoost ...
	0.3523	 = Validation score   (r2)
	1804.59s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.2816	 = Validation score   (r2)
	783.19s	 = Training   runtime
	0.13s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.3447	 = Validation score   (r2)
	838.59s	 = Training   runtime
	0.14s	 = Validation runtime
Fitting model: XGBoost ...
	0.3422	 = Validation score   (r2)
	120.55s	 = Training   runtime
	0.24s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.312	 = Validation score   (r2)
	831.28s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 3.85603	valid_set's r2: 0.355692
[2000]	valid_set's l2: 3.84228	valid_set's r2: 0.35799
[3000]	valid_set's l2: 3.83587	valid_set's r2: 0.35906
[4000]	valid_set's l2: 3.8318	valid_set's r2: 0.359741
[5000]	valid_set's l2: 3.82476	valid_set's r2: 0.360918
[6000]	valid_set's l2: 3.82516	valid_set's r2: 0.360849
[7000]	valid_set's l2: 3.82338	valid_set's r2: 0.361147
[8000]	valid_set's l2: 3.82308	valid_set's r2: 0.361198
[9000]	valid_set's l2: 3.81855	valid_set's r2: 0.361955
[10000]	valid_set's l2: 3.8214	valid_set's r2: 0.361478


	0.3621	 = Validation score   (r2)
	262.08s	 = Training   runtime
	1.57s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'LightGBMLarge': 0.565, 'LightGBM': 0.217, 'CatBoost': 0.087, 'NeuralNetFastAI': 0.087, 'XGBoost': 0.043}
	0.3643	 = Validation score   (r2)
	0.13s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 5651.57s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 4031.4 rows/s (9156 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20240622_044707")


In [42]:
f_predictor.evaluate(f_test, silent=True)

{'r2': 0.3783355193450282,
 'root_mean_squared_error': -1.9125450070802117,
 'mean_squared_error': -3.657828404107447,
 'mean_absolute_error': -1.4442252192035814,
 'pearsonr': 0.6151210286298121,
 'median_absolute_error': -1.1051397323608398}

In [43]:
f_predictor.leaderboard(f_test)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.378336,0.364257,r2,26.614578,2.271193,3123.508393,0.019055,0.004003,0.130675,2,True,12
1,LightGBMLarge,0.375048,0.362079,r2,19.234764,1.573077,262.075801,19.234764,1.573077,262.075801,1,True,11
2,LightGBM,0.373795,0.359975,r2,3.007708,0.278254,97.575058,3.007708,0.278254,97.575058,1,True,4
3,CatBoost,0.369958,0.352286,r2,0.363998,0.034878,1804.592079,0.363998,0.034878,1804.592079,1,True,6
4,LightGBMXT,0.368,0.350906,r2,3.834374,0.374997,120.087334,3.834374,0.374997,120.087334,1,True,3
5,XGBoost,0.360878,0.342171,r2,2.647361,0.241354,120.545991,2.647361,0.241354,120.545991,1,True,9
6,NeuralNetFastAI,0.36056,0.344707,r2,1.341692,0.139627,838.588788,1.341692,0.139627,838.588788,1,True,8
7,NeuralNetTorch,0.334469,0.312045,r2,0.453094,0.056,831.277538,0.453094,0.056,831.277538,1,True,10
8,RandomForestMSE,0.32093,0.298834,r2,1.421332,0.126661,756.887929,1.421332,0.126661,756.887929,1,True,5
9,ExtraTreesMSE,0.305435,0.281606,r2,1.35704,0.128318,783.194993,1.35704,0.128318,783.194993,1,True,7


In [44]:
for key in f_predictor.info()['model_info'].keys():
    print(f_predictor.info()['model_info'][key]['name'])
    print(f_predictor.info()['model_info'][key]['val_score'])
    print(f_predictor.info()['model_info'][key]['hyperparameters'])
    print("--------------------")

KNeighborsUnif
0.07576371756514011
{'weights': 'uniform'}
--------------------
KNeighborsDist
0.06688228984684463
{'weights': 'distance'}
--------------------
LightGBMXT
0.35090615434957884
{'learning_rate': 0.05, 'extra_trees': True}
--------------------
LightGBM
0.3599748869516278
{'learning_rate': 0.05}
--------------------
RandomForestMSE
0.29883418329992983
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
CatBoost
0.35228552771083355
{'iterations': 10000, 'learning_rate': 0.05, 'random_seed': 0, 'allow_writing_files': False, 'eval_metric': 'R2'}
--------------------
ExtraTreesMSE
0.28160619930391073
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
NeuralNetFastAI
0.3447065259566804
{'layers': None, 'emb_drop': 0.1, 'ps': 0.1, 'bs': 'auto', 'lr': 0.01, 'epochs': 'auto', 'early.stoppi

In [45]:
f_predictor.info()['model_info'].keys()

dict_keys(['KNeighborsUnif', 'KNeighborsDist', 'LightGBMXT', 'LightGBM', 'RandomForestMSE', 'CatBoost', 'ExtraTreesMSE', 'NeuralNetFastAI', 'XGBoost', 'NeuralNetTorch', 'LightGBMLarge', 'WeightedEnsemble_L2'])

### Total Model Automl

In [63]:
label = 'Cantril_ladder'

In [64]:
tot_predictor = TabularPredictor(label=label, eval_metric='r2', problem_type = 'regression').fit(tot_train)

No path specified. Models will be saved in: "AutogluonModels\ag-20240622_100019"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       33.16 GB / 63.92 GB (51.9%)
Disk Space Avail:   378.58 GB / 1863.00 GB (20.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider setting `time_limit` to ensure

[1000]	valid_set's l2: 3.65619	valid_set's r2: 0.37068
[2000]	valid_set's l2: 3.64103	valid_set's r2: 0.373289
[3000]	valid_set's l2: 3.63435	valid_set's r2: 0.374439
[4000]	valid_set's l2: 3.63203	valid_set's r2: 0.374838
[5000]	valid_set's l2: 3.62859	valid_set's r2: 0.375429
[6000]	valid_set's l2: 3.62476	valid_set's r2: 0.376089
[7000]	valid_set's l2: 3.62176	valid_set's r2: 0.376605
[8000]	valid_set's l2: 3.61963	valid_set's r2: 0.376972
[9000]	valid_set's l2: 3.617	valid_set's r2: 0.377425
[10000]	valid_set's l2: 3.61817	valid_set's r2: 0.377223


	0.3775	 = Validation score   (r2)
	379.97s	 = Training   runtime
	2.16s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 3.61909	valid_set's r2: 0.377065
[2000]	valid_set's l2: 3.60307	valid_set's r2: 0.379823
[3000]	valid_set's l2: 3.59438	valid_set's r2: 0.381319
[4000]	valid_set's l2: 3.59062	valid_set's r2: 0.381966
[5000]	valid_set's l2: 3.58934	valid_set's r2: 0.382185
[6000]	valid_set's l2: 3.59205	valid_set's r2: 0.38172


	0.3822	 = Validation score   (r2)
	255.5s	 = Training   runtime
	0.74s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.3238	 = Validation score   (r2)
	1610.51s	 = Training   runtime
	0.23s	 = Validation runtime
Fitting model: CatBoost ...
	0.3773	 = Validation score   (r2)
	2871.93s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.3008	 = Validation score   (r2)
	1637.45s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.3698	 = Validation score   (r2)
	1632.71s	 = Training   runtime
	0.3s	 = Validation runtime
Fitting model: XGBoost ...
	0.3712	 = Validation score   (r2)
	660.16s	 = Training   runtime
	0.95s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.3388	 = Validation score   (r2)
	1774.93s	 = Training   runtime
	0.12s	 = Validation runtime
Fitting model: LightGBMLarge ...


[1000]	valid_set's l2: 3.60874	valid_set's r2: 0.378847
[2000]	valid_set's l2: 3.58925	valid_set's r2: 0.382201
[3000]	valid_set's l2: 3.58361	valid_set's r2: 0.383173
[4000]	valid_set's l2: 3.58034	valid_set's r2: 0.383735
[5000]	valid_set's l2: 3.57671	valid_set's r2: 0.38436
[6000]	valid_set's l2: 3.57591	valid_set's r2: 0.384498
[7000]	valid_set's l2: 3.57427	valid_set's r2: 0.38478
[8000]	valid_set's l2: 3.57411	valid_set's r2: 0.384807
[9000]	valid_set's l2: 3.57234	valid_set's r2: 0.385113
[10000]	valid_set's l2: 3.57124	valid_set's r2: 0.385302


	0.3853	 = Validation score   (r2)
	489.24s	 = Training   runtime
	3.4s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'LightGBMLarge': 0.579, 'XGBoost': 0.158, 'LightGBMXT': 0.105, 'CatBoost': 0.105, 'LightGBM': 0.053}
	0.3872	 = Validation score   (r2)
	0.36s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 11357.75s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 2354.2 rows/s (17201 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20240622_100019")


In [65]:
tot_predictor.evaluate(tot_test, silent=True)

{'r2': 0.38501674074137604,
 'root_mean_squared_error': -1.8942772956847196,
 'mean_squared_error': -3.5882864729466144,
 'mean_absolute_error': -1.4274289587112077,
 'pearsonr': 0.620528077624859,
 'median_absolute_error': -1.0853021144866943}

In [66]:
tot_predictor.leaderboard(tot_test)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.385017,0.387172,r2,82.825503,7.306402,4657.163372,0.042888,0.000997,0.361036,2,True,12
1,LightGBMLarge,0.383519,0.385321,r2,38.184882,3.39589,489.237224,38.184882,3.39589,489.237224,1,True,11
2,LightGBM,0.380917,0.382211,r2,8.140232,0.738026,255.500744,8.140232,0.738026,255.500744,1,True,4
3,LightGBMXT,0.376238,0.377466,r2,25.051006,2.158228,379.971878,25.051006,2.158228,379.971878,1,True,3
4,CatBoost,0.375133,0.377256,r2,0.745007,0.061835,2871.927806,0.745007,0.061835,2871.927806,1,True,6
5,NeuralNetFastAI,0.367542,0.369805,r2,2.86434,0.298447,1632.714047,2.86434,0.298447,1632.714047,1,True,8
6,XGBoost,0.367486,0.371168,r2,10.661488,0.951427,660.164684,10.661488,0.951427,660.164684,1,True,9
7,NeuralNetTorch,0.335611,0.338786,r2,0.922534,0.117685,1774.92734,0.922534,0.117685,1774.92734,1,True,10
8,RandomForestMSE,0.323549,0.323766,r2,2.209092,0.2254,1610.507456,2.209092,0.2254,1610.507456,1,True,5
9,ExtraTreesMSE,0.305487,0.300769,r2,2.217072,0.207634,1637.451252,2.217072,0.207634,1637.451252,1,True,7


In [67]:
for key in tot_predictor.info()['model_info'].keys():
    print(tot_predictor.info()['model_info'][key]['name'])
    print(tot_predictor.info()['model_info'][key]['val_score'])
    print(tot_predictor.info()['model_info'][key]['hyperparameters'])
    print("--------------------")

KNeighborsUnif
0.12063454577130428
{'weights': 'uniform'}
--------------------
KNeighborsDist
0.11296728046076498
{'weights': 'distance'}
--------------------
LightGBMXT
0.37746580101498184
{'learning_rate': 0.05, 'extra_trees': True}
--------------------
LightGBM
0.3822106333090852
{'learning_rate': 0.05}
--------------------
RandomForestMSE
0.323766199424125
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
CatBoost
0.37725578766480183
{'iterations': 10000, 'learning_rate': 0.05, 'random_seed': 0, 'allow_writing_files': False, 'eval_metric': 'R2'}
--------------------
ExtraTreesMSE
0.300769241213262
{'n_estimators': 300, 'max_leaf_nodes': 15000, 'n_jobs': -1, 'random_state': 0, 'bootstrap': True, 'criterion': 'squared_error'}
--------------------
NeuralNetFastAI
0.3698052783735828
{'layers': None, 'emb_drop': 0.1, 'ps': 0.1, 'bs': 'auto', 'lr': 0.01, 'epochs': 'auto', 'early.stopping.m

In [68]:
tot_predictor.info()['model_info'].keys()

dict_keys(['KNeighborsUnif', 'KNeighborsDist', 'LightGBMXT', 'LightGBM', 'RandomForestMSE', 'CatBoost', 'ExtraTreesMSE', 'NeuralNetFastAI', 'XGBoost', 'NeuralNetTorch', 'LightGBMLarge', 'WeightedEnsemble_L2'])

### Male Bayes Search Hyperparameter (Test 3 times 1:9 CV)

In [17]:
Df_male = Df.loc[Df['Gender_female']==0, :].drop(columns=['Gender_female'])

In [18]:
Df_male.shape

(893988, 63)

In [19]:
ym = Df_male['Cantril_ladder']

In [20]:
Xm = Df_male.drop(columns=['Cantril_ladder'])

In [24]:
param_space = {
    'n_estimators': Integer(100, 5000),
    'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
    'max_depth': Integer(3, 16),
    'subsample': Real(0.5, 1.0),
    'min_child_weight': Real(0.001, 10, prior='log-uniform'),
    'max_delta_step': Real(0.001, 10, prior='log-uniform'),
    'reg_lambda': Real(0.001, 10, prior='log-uniform'),
    'reg_alpha': Real(0.001, 10, prior='log-uniform'),
    'gamma': Real(0.001, 10, prior='log-uniform')
}

In [25]:
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',  device = 'cuda', tree_method='hist', random_state=42, enable_categorical=True)

In [26]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [27]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [28]:
bayes_search = BayesSearchCV(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=20,
    scoring='r2',
    cv=rkfcv,
    n_jobs = 1,
    n_points = 1,
    verbose=2,
    random_state=42,
    return_train_score = True
)

In [29]:
bayes_search.fit(Xm, ym)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  54.0s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  55.0s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  55.3s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.2364202820542705, learning_rate=0.05842928269761146, max_delta_

In [30]:
CV_result = bayes_search.cv_results_

In [31]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,rank_train_score
18,915.875857,2.901791,1.923655,0.031759,0.028791,0.004088,0.002016,14,0.004088,2294,...,0.383305,0.382702,0.002255,1,0.823415,0.823697,0.823143,0.823419,0.000226,4
4,476.081896,2.304479,1.680881,0.044041,1.578388,0.007517,0.127785,12,4.124851,3613,...,0.382509,0.381362,0.002622,2,0.810835,0.810657,0.810352,0.810615,0.000199,5
8,36.588404,0.595389,0.393787,0.007221,6.636085,0.025191,3.066251,8,0.033439,1136,...,0.380065,0.379927,0.002348,3,0.468452,0.469005,0.468326,0.468594,0.000295,9
13,147.527401,0.127367,1.067792,0.03275,0.908555,0.003435,0.004117,9,0.142306,3490,...,0.378967,0.379007,0.001994,4,0.44423,0.444804,0.444579,0.444538,0.000236,11
0,54.363598,0.542193,0.485002,0.002064,0.043693,0.02854,5.388551,7,0.479283,2129,...,0.378829,0.37835,0.002202,5,0.46332,0.464309,0.463652,0.46376,0.000411,10
9,34.237441,0.255892,0.319937,0.005256,0.001034,0.043218,0.92285,6,0.206375,1446,...,0.378396,0.378142,0.001952,6,0.432896,0.433102,0.433064,0.433021,9e-05,12
14,51.433625,0.202022,0.455894,0.005982,3.290547,0.010448,0.001,7,0.016015,1865,...,0.377042,0.377254,0.002009,7,0.41475,0.415279,0.415052,0.415027,0.000217,14
16,261.108505,0.911239,2.089321,0.020977,4.218054,0.012483,2.234886,10,0.009746,4918,...,0.378433,0.377127,0.002924,8,0.697146,0.697547,0.697329,0.697341,0.000164,8
12,79.60116,0.306946,0.567698,0.001968,0.416111,0.014061,10.0,5,1.440815,4027,...,0.375806,0.376043,0.002011,9,0.400381,0.400857,0.400674,0.400637,0.000196,17
5,37.633581,0.965993,0.341443,0.014591,0.863201,0.075638,0.004513,5,1.592256,1926,...,0.3761,0.375648,0.001976,10,0.425161,0.42539,0.425042,0.425198,0.000144,13


In [34]:
dump(bayes_search, 'Results/BayesSearchMale20iter.joblib')

['Results/BayesSearchMale20iter.joblib']

**50 iteration** Fail **30 iter** Fail

In [37]:
bayes_search = BayesSearchCV(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=30,
    scoring='r2',
    cv=rkfcv,
    n_jobs = 1,
    n_points = 1,
    verbose=2,
    random_state=42,
    return_train_score = True
)

In [38]:
bayes_search.fit(Xm, ym)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  55.0s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  55.2s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  55.6s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.2364

XGBoostError: bad allocation

In [39]:
CV_result = bayes_search.cv_results_

AttributeError: 'BayesSearchCV' object has no attribute 'cv_results_'

In [None]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).head(10)

### Female Bayes Search Hyperparameter (Test 3 times 1:9 CV)

In [46]:
Df_female = Df.loc[Df['Gender_female']==1, :].drop(columns=['Gender_female'])

In [47]:
Df_female.shape

(1017224, 63)

In [48]:
yf = Df_female['Cantril_ladder']

In [49]:
Xf = Df_male.drop(columns=['Cantril_ladder'])

In [50]:
param_space = {
    'n_estimators': Integer(100, 5000),
    'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
    'max_depth': Integer(3, 16),
    'subsample': Real(0.5, 1.0),
    'min_child_weight': Real(0.001, 10, prior='log-uniform'),
    'max_delta_step': Real(0.001, 10, prior='log-uniform'),
    'reg_lambda': Real(0.001, 10, prior='log-uniform'),
    'reg_alpha': Real(0.001, 10, prior='log-uniform'),
    'gamma': Real(0.001, 10, prior='log-uniform')
}

In [51]:
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',  device = 'cuda', tree_method='hist', random_state=42, enable_categorical=True)

In [52]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [53]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [54]:
bayes_search = BayesSearchCV(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=20,
    scoring='r2',
    cv=rkfcv,
    n_jobs = 1,
    n_points = 1,
    verbose=2,
    random_state=42,
    return_train_score = True
)

In [55]:
bayes_search.fit(Xm, ym)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  54.5s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  54.8s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  57.7s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.2364

In [56]:
CV_result = bayes_search.cv_results_

In [57]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,rank_train_score
18,933.649213,2.62907,1.921356,0.01077,0.028791,0.004088,0.002016,14,0.004088,2294,...,0.383305,0.382702,0.002255,1,0.823415,0.823697,0.823143,0.823419,0.000226,4
4,474.694017,0.915397,1.700797,0.018734,1.578388,0.007517,0.127785,12,4.124851,3613,...,0.382509,0.381362,0.002622,2,0.810835,0.810657,0.810352,0.810615,0.000199,5
8,37.851165,0.906164,0.374681,0.016726,6.636085,0.025191,3.066251,8,0.033439,1136,...,0.380065,0.379927,0.002348,3,0.468452,0.469005,0.468326,0.468594,0.000295,9
13,152.196619,0.589458,1.130162,0.036664,0.908555,0.003435,0.004117,9,0.142306,3490,...,0.378967,0.379007,0.001994,4,0.44423,0.444804,0.444579,0.444538,0.000236,11
0,55.309947,1.438525,0.463303,0.012328,0.043693,0.02854,5.388551,7,0.479283,2129,...,0.378829,0.37835,0.002202,5,0.46332,0.464309,0.463652,0.46376,0.000411,10
9,34.677713,0.189877,0.353655,0.018579,0.001034,0.043218,0.92285,6,0.206375,1446,...,0.378396,0.378142,0.001952,6,0.432896,0.433102,0.433064,0.433021,9e-05,12
14,53.489178,0.098205,0.481433,0.002112,3.290547,0.010448,0.001,7,0.016015,1865,...,0.377042,0.377254,0.002009,7,0.41475,0.415279,0.415052,0.415027,0.000217,14
16,268.34166,0.506936,2.157323,0.017032,4.218054,0.012483,2.234886,10,0.009746,4918,...,0.378433,0.377127,0.002924,8,0.697146,0.697547,0.697329,0.697341,0.000164,8
12,82.282364,0.25107,0.597585,0.016277,0.416111,0.014061,10.0,5,1.440815,4027,...,0.375806,0.376043,0.002011,9,0.400381,0.400857,0.400674,0.400637,0.000196,17
5,37.813525,0.853724,0.329904,0.007958,0.863201,0.075638,0.004513,5,1.592256,1926,...,0.3761,0.375648,0.001976,10,0.425161,0.42539,0.425042,0.425198,0.000144,13


In [58]:
dump(bayes_search, 'Results/BayesSearchFemale20iter.joblib')

['Results/BayesSearchFemale20iter.joblib']

### Total Bayes Search Hyperparameter (Test 3 times 1:9 CV)

In [17]:
Df_tot = Df.drop(columns=['Gender_female'])

In [18]:
Df_tot.shape

(1911212, 63)

In [19]:
ytot = Df_tot['Cantril_ladder']

In [20]:
Xtot = Df_tot.drop(columns=['Cantril_ladder'])

In [21]:
param_space = {
    'n_estimators': Integer(100, 5000),
    'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
    'max_depth': Integer(3, 16),
    'subsample': Real(0.5, 1.0),
    'min_child_weight': Real(0.001, 10, prior='log-uniform'),
    'max_delta_step': Real(0.001, 10, prior='log-uniform'),
    'reg_lambda': Real(0.001, 10, prior='log-uniform'),
    'reg_alpha': Real(0.001, 10, prior='log-uniform'),
    'gamma': Real(0.001, 10, prior='log-uniform')
}

In [22]:
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',  device = 'cuda', tree_method='hist', random_state=42, enable_categorical=True)

In [23]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [24]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [25]:
bayes_search = BayesSearchCV(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=20,
    scoring='r2',
    cv=rkfcv,
    n_jobs = 1,
    n_points = 1,
    verbose=2,
    random_state=42,
    return_train_score = True
)

In [26]:
bayes_search.fit(Xtot, ytot)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  41.0s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  40.2s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  39.9s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.236420282054271, learning_rate=0.058429282697611454, max_delta_

In [27]:
CV_result = bayes_search.cv_results_

In [28]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,rank_train_score
14,100.323376,0.319266,0.26452,0.010973,0.208946,0.015473,0.307049,10,0.50944,2238,...,0.384951,0.38443,0.000371,1,0.48148,0.481094,0.480763,0.481112,0.000293,6
11,60.396241,0.453436,0.187127,0.005192,0.0691,0.049845,0.264728,10,0.007026,1336,...,0.383382,0.383037,0.000272,2,0.544261,0.544125,0.54423,0.544205,5.8e-05,5
15,260.57258,1.856786,0.402321,0.012247,9.634201,0.030317,0.332646,16,0.055926,5000,...,0.382414,0.382107,0.000257,3,0.661779,0.661167,0.661234,0.661393,0.000274,4
8,25.987353,0.497602,0.135749,0.012324,6.636085,0.025191,3.066251,8,0.033439,1136,...,0.382237,0.382024,0.000249,4,0.427085,0.426976,0.426965,0.427009,5.4e-05,8
0,40.202627,0.444004,0.18526,0.007005,0.043693,0.02854,5.388551,7,0.479283,2129,...,0.381724,0.381427,0.000214,5,0.423763,0.423801,0.423813,0.423792,2.1e-05,9
9,24.07482,1.175925,0.134434,0.002516,0.001034,0.043218,0.92285,6,0.206375,1446,...,0.379627,0.379425,0.000173,6,0.404939,0.405013,0.404884,0.404946,5.3e-05,12
4,291.605956,4.376956,0.661518,0.129076,1.578388,0.007517,0.127785,12,4.124851,3613,...,0.377443,0.377192,0.000351,7,0.437603,0.437413,0.437335,0.43745,0.000112,7
17,11.10636,0.230397,0.096048,0.003125,0.001,0.1,2.30394,5,10.0,723,...,0.374338,0.374392,0.000241,8,0.386169,0.386285,0.386301,0.386252,5.9e-05,13
13,127.320524,1.172474,0.231998,0.002386,10.0,0.043998,10.0,16,0.001,3182,...,0.374287,0.374128,0.000628,9,0.704154,0.703672,0.703715,0.703847,0.000218,3
18,782.055686,2.388796,1.44272,0.252742,1.297212,0.034273,0.200991,14,0.31033,3633,...,0.372136,0.371791,0.00027,10,0.911679,0.911771,0.911615,0.911688,6.4e-05,1


In [29]:
dump(bayes_search, 'Results/BayesSearchTotal20iter.joblib')

['Results/BayesSearchTotal20iter.joblib']