# XGB Experiment based on 59 Variables 1513k Rows 14 Waves

In [1]:
%pwd

'/mnt/d/OneDrive - Kyushu University/ESG09_Article/Code'

In [2]:
%cd ..

/mnt/d/OneDrive - Kyushu University/ESG09_Article


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [15]:
import os 
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

## Load and Make Datasets

In [4]:
Df_Filename = os.path.join("Data", "GallupWB_Ml60var1513k14wave_v1.parquet")

In [5]:
Df = pd.read_parquet(Df_Filename)

In [6]:
Df.shape

(1513459, 59)

### Train and Test df

In [7]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

In [9]:
Df.head()

Unnamed: 0,wave,INCOME_2,Cantril_ladder,Health_disable,Relative_have,Living_standard_change,Enough_food,Enough_shelter,Well_rested,Respected,...,Corruption_business,Corruption_government,Performance_leadership,Age,Marital_status,Employment,Children_under15,Feeling_income,Income_level,COUNTRY_ISO3
0,8,4667.622123,7.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,32.0,2.0,6.0,1.0,1.0,2.0,BRA
1,16,78065.925112,9.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,26.0,8.0,1.0,1.0,1.0,2.0,FIN
2,6,1270.193354,5.0,1.0,0.0,-0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2.0,65.0,2.0,6.0,0.0,2.0,2.0,PSE
3,6,716.86074,6.0,2.0,1.0,-0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,49.0,8.0,6.0,0.0,2.0,1.0,MEX
4,9,4531.278959,5.0,2.0,1.0,-1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,2.0,74.0,2.0,2.0,1.0,3.0,4.0,ZWE


In [10]:
y = Df['Cantril_ladder']

In [11]:
X = Df.drop(columns=['Cantril_ladder'])

In [12]:
X.shape

(1513459, 58)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

### Basic XGBoost Experiment

In [16]:
model = xgb.XGBRegressor(objective='reg:squarederror', device = 'cuda', tree_method='hist', 
                         n_estimators=500, learning_rate=0.01, max_depth=5, 
                         random_state=42, enable_categorical=True  )
model.fit(X_train, y_train)

In [17]:
y_pred = model.predict(X_test)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [18]:
r2_score(y_test, y_pred) * 100

36.219469666588466

In [19]:
y_train_pred = model.predict(X_train)

In [20]:
r2_score(y_train, y_train_pred) * 100

36.15957389149862

### Fine-tune Hyperparameters

In [34]:
n_estimators_list = list(range(100, 1_100, 100))
learning_rate_list = [0.001, 0.01, 0.1]
max_depth_list = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
subsample_list = [0.6, 0.7, 0.8, 0.9, 1]

In [None]:
results_list = []
for n_estimators in n_estimators_list:
    for learning_rate in learning_rate_list:
        for max_depth in max_depth_list:
            for subsample in subsample_list:
                model = xgb.XGBRegressor(objective='reg:squarederror', device = 'cuda', 
                                         tree_method='hist', 
                                         n_estimators = n_estimators, 
                                         learning_rate = learning_rate, 
                                         max_depth = max_depth, 
                                         subsample = subsample,
                                         random_state=42, enable_categorical=True  )
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                test_r2 = r2_score(y_test, y_pred) * 100
                y_train_pred = model.predict(X_train)
                train_r2 = r2_score(y_train, y_train_pred) * 100
                row = [n_estimators, learning_rate, max_depth, 
                       subsample, train_r2, test_r2]
                print(row)
                results_list.append(row)

[100, 0.001, 3, 0.6, 4.552780894267572, 4.585506139647499]
[100, 0.001, 3, 0.7, 4.552345050830386, 4.585403975188296]
[100, 0.001, 3, 0.8, 4.552244880971479, 4.585143477170073]
[100, 0.001, 3, 0.9, 4.552076629669044, 4.585113208002689]
[100, 0.001, 3, 1, 4.552142862607811, 4.585187661219614]
[100, 0.001, 4, 0.6, 5.0396425101144615, 5.072341567016947]
[100, 0.001, 4, 0.7, 5.038764421262377, 5.07196068498671]
[100, 0.001, 4, 0.8, 5.038555963338808, 5.071368154877498]
[100, 0.001, 4, 0.9, 5.038813119907881, 5.072371428632783]
[100, 0.001, 4, 1, 5.038810404226279, 5.072805393094837]
[100, 0.001, 5, 0.6, 5.388574211079011, 5.42482473734649]
[100, 0.001, 5, 0.7, 5.387523883135614, 5.424109644773544]
[100, 0.001, 5, 0.8, 5.38734652766959, 5.423723587382967]
[100, 0.001, 5, 0.9, 5.386330998587596, 5.422666301296086]
[100, 0.001, 5, 1, 5.385859323809738, 5.422514344758433]
[100, 0.001, 6, 0.6, 5.635371458429028, 5.671025298228982]
[100, 0.001, 6, 0.7, 5.634862084992431, 5.670938161101102]
[100,

In [None]:
gamma_list = [0, 0.01, 0.1, 1, 10]
min_child_weight_list = [0, 0.1, 1, 10]
max_delta_step_list = [0, 0.1, 1, 10]
reg_lambda_list = [0, 0.1, 1, 10]
reg_alpha_list = [0, 0.1, 1, 10]