# XGB Experiment based on 59 Variables 1513k Rows 14 Waves

In [1]:
# Show the kernel's current working directory (the data path built later is relative).
%pwd

'/mnt/d/OneDrive - Kyushu University/ESG09_Article/Code'

In [2]:
# Move the kernel's working directory up one level; presumably so the relative
# "Data" path used when loading the dataset resolves — TODO confirm.
# NOTE(review): not idempotent — re-running this cell moves up another level.
%cd ..

/mnt/d/OneDrive - Kyushu University/ESG09_Article


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [15]:
import itertools
import os

import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Load and Make Datasets

In [4]:
# Location of the input parquet file, relative to the current working directory.
Data_Dir = "Data"
Df_Filename = os.path.join(Data_Dir, "GallupWB_Ml60var1513k14wave_v1.parquet")

In [5]:
# Read the parquet file at Df_Filename into a DataFrame.
Df = pd.read_parquet(Df_Filename)

In [6]:
# Dimensions of the loaded frame as (rows, columns).
Df.shape

(1513459, 59)

### Train and Test df

In [7]:
# Shuffle all rows (frac=1) with a fixed seed, then renumber the index from 0.
# Df is reassigned here, so re-running this cell reshuffles the already
# shuffled frame.
Df = (
    Df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [8]:
# Convert the country code column to a pandas categorical dtype; the XGBoost
# models below are constructed with enable_categorical=True.
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

In [9]:
# Preview the first rows of the shuffled frame.
Df.head()

Unnamed: 0,wave,INCOME_2,Cantril_ladder,Health_disable,Relative_have,Living_standard_change,Enough_food,Enough_shelter,Well_rested,Respected,...,Corruption_business,Corruption_government,Performance_leadership,Age,Marital_status,Employment,Children_under15,Feeling_income,Income_level,COUNTRY_ISO3
0,8,4667.622123,7.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,32.0,2.0,6.0,1.0,1.0,2.0,BRA
1,16,78065.925112,9.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,26.0,8.0,1.0,1.0,1.0,2.0,FIN
2,6,1270.193354,5.0,1.0,0.0,-0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2.0,65.0,2.0,6.0,0.0,2.0,2.0,PSE
3,6,716.86074,6.0,2.0,1.0,-0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,49.0,8.0,6.0,0.0,2.0,1.0,MEX
4,9,4531.278959,5.0,2.0,1.0,-1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,2.0,74.0,2.0,2.0,1.0,3.0,4.0,ZWE


In [10]:
# Regression target: the Cantril_ladder column.
y = Df['Cantril_ladder']

In [11]:
# Feature matrix: every column of Df except the target.
X = Df.drop(columns=['Cantril_ladder'])

In [12]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(1513459, 58)

In [13]:
# Hold out 10% of the rows as the test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

### Basic XGBoost Experiment

In [16]:
# Baseline regressor: squared-error objective, histogram tree method on the
# CUDA device, with categorical-dtype columns enabled.
basic_params = dict(
    objective='reg:squarederror',
    device='cuda',
    tree_method='hist',
    n_estimators=500,
    learning_rate=0.01,
    max_depth=5,
    random_state=42,
    enable_categorical=True,
)
model = xgb.XGBRegressor(**basic_params)
model.fit(X_train, y_train)

In [17]:
# Predict on the held-out test rows.
# NOTE(review): the warning printed below mentions a device mismatch; it looks
# like this is because the model was built with device='cuda' while X_test is
# a pandas DataFrame — confirm and, if desired, set the booster device before
# predicting as the warning suggests.
y_pred = model.predict(X_test)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [18]:
# Test-set R^2 expressed as a percentage.
100 * r2_score(y_true=y_test, y_pred=y_pred)

36.219469666588466

In [19]:
# Predict on the training rows so the train score can be compared with the test score.
y_train_pred = model.predict(X_train)

In [20]:
# Training-set R^2 expressed as a percentage.
100 * r2_score(y_true=y_train, y_pred=y_train_pred)

36.15957389149862

### Fine-tune Hyperparameters

In [32]:
# Candidate values for each hyperparameter explored by the grid search below.

# 100, 200, ..., 1000 boosting rounds.
n_estimators_list = [100 * step for step in range(1, 11)]
learning_rate_list = [
    0.001,
    0.005,
    0.01,
    0.05,
    0.1,
]
# Tree depths 3 through 16 inclusive.
max_depth_list = list(range(3, 17))
gamma_list = [
    0,
    0.01,
    0.1,
    1,
    10,
]
min_child_weight_list = [
    0,
    0.1,
    1,
    10,
]
max_delta_step_list = [
    0,
    0.1,
    1,
    10,
]
subsample_list = [
    0.6,
    0.7,
    0.8,
    0.9,
    1,
]
reg_lambda_list = [
    0,
    0.1,
    1,
    10,
]
reg_alpha_list = [
    0,
    0.1,
    1,
    10,
]

In [None]:
# Exhaustive grid search over nine XGBoost hyperparameters.
#
# For every combination a fresh regressor is fitted on the training split and
# scored (R^2 as a percentage) on both the training and the test split. Each
# result is printed and appended to `results_list` as
#   [n_estimators, learning_rate, max_depth, gamma, min_child_weight,
#    max_delta_step, subsample, reg_lambda, reg_alpha, train_r2, test_r2]
#
# itertools.product replaces the nine nested loops; it yields combinations in
# the same order (the right-most list, reg_alpha, varies fastest).
#
# NOTE(review): with the candidate lists defined above the full grid is
# 10 * 5 * 14 * 5 * 4 * 4 * 5 * 4 * 4 = 4,480,000 model fits; consider a
# smaller grid or a random search before running this to completion.
# NOTE(review): combinations are compared on the test split, so the best
# test_r2 found here is an optimistic estimate; a separate validation split
# would avoid that.
results_list = []
search_space = itertools.product(
    n_estimators_list,
    learning_rate_list,
    max_depth_list,
    gamma_list,
    min_child_weight_list,
    max_delta_step_list,
    subsample_list,
    reg_lambda_list,
    reg_alpha_list,
)
for (n_estimators, learning_rate, max_depth, gamma, min_child_weight,
     max_delta_step, subsample, reg_lambda, reg_alpha) in search_space:
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        device='cuda',
        tree_method='hist',
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        gamma=gamma,
        min_child_weight=min_child_weight,
        max_delta_step=max_delta_step,
        subsample=subsample,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        random_state=42,
        enable_categorical=True,
    )
    model.fit(X_train, y_train)

    # Score on the held-out rows first, then on the training rows.
    y_pred = model.predict(X_test)
    test_r2 = r2_score(y_test, y_pred) * 100
    y_train_pred = model.predict(X_train)
    train_r2 = r2_score(y_train, y_train_pred) * 100

    row = [n_estimators, learning_rate, max_depth, gamma,
           min_child_weight, max_delta_step,
           subsample, reg_lambda, reg_alpha, train_r2, test_r2]
    print(row)
    results_list.append(row)

[100, 0.001, 3, 0, 0, 0, 0.6, 0, 0, 4.552818990600793, 4.585544624371051]
[100, 0.001, 3, 0, 0, 0, 0.6, 0, 0.1, 4.552818939648118, 4.585544570107814]
[100, 0.001, 3, 0, 0, 0, 0.6, 0, 1, 4.552791439779924, 4.585516760860953]
[100, 0.001, 3, 0, 0, 0, 0.6, 0, 10, 4.55251499266256, 4.585311073678044]
[100, 0.001, 3, 0, 0, 0, 0.6, 0.1, 0, 4.552815412220945, 4.585540990634618]
[100, 0.001, 3, 0, 0, 0, 0.6, 0.1, 0.1, 4.5528126736016254, 4.585538256398092]
[100, 0.001, 3, 0, 0, 0, 0.6, 0.1, 1, 4.552785085210386, 4.585510312512908]
[100, 0.001, 3, 0, 0, 0, 0.6, 0.1, 10, 4.55251491675911, 4.585310986902158]
[100, 0.001, 3, 0, 0, 0, 0.6, 1, 0, 4.552780894267572, 4.585506139647499]
[100, 0.001, 3, 0, 0, 0, 0.6, 1, 0.1, 4.552779574858279, 4.5855047862344955]
[100, 0.001, 3, 0, 0, 0, 0.6, 1, 1, 4.55275063667483, 4.585475508572367]
[100, 0.001, 3, 0, 0, 0, 0.6, 1, 10, 4.552482060633823, 4.585277719948511]
[100, 0.001, 3, 0, 0, 0, 0.6, 10, 0, 4.552454528587912, 4.58520440641601]
[100, 0.001, 3, 0, 0, 