# XGB Experiment based on 73 Variables 798k Rows 6 Waves

In [1]:
%pwd

'/mnt/f/ESG09_Project/Code'

In [2]:
%cd ..

/mnt/f/ESG09_Project


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [3]:
import os 
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

## Load and Make Datasets

In [4]:
Df_Filename = os.path.join("Data", "GallupWB_Zhang74var798k6wave_v1.parquet")

In [5]:
Df = pd.read_parquet(Df_Filename)

In [6]:
Df.shape

(798604, 74)

### Train and Test df

In [7]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

In [9]:
Df.head()

Unnamed: 0,wave,INCOME_2,Cantril_ladder,Health_disable,Relative_have,Life_satisfaction,Living_standard_trend,Enough_food,Enough_shelter,Well_rested,...,Children_under15,Feeling_income,Born_here,Home_handline,Mobile_phone,Internet_access_available,Used_internet_recently,Phone_internet_access,COUNTRY_ISO3,Income_level
0,13,5278.624306,3.0,1.0,1.0,0.0,-0.0,1.0,0.0,0.0,...,1.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,MEX,2.0
1,14,0.0,5.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,MKD,1.0
2,14,5345.822529,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,NIC,3.0
3,17,10127.21585,5.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,EGY,5.0
4,16,7106.753699,6.0,0.0,0.0,0.0,-0.0,1.0,0.0,0.0,...,0.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,PHL,4.0


In [10]:
y = Df['Cantril_ladder']

In [11]:
X = Df.drop(columns=['Cantril_ladder'])

In [12]:
X.shape

(798604, 73)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

### Basic XGBoost Experiment

In [14]:
model = xgb.XGBRegressor(objective='reg:squarederror', device = 'cuda', tree_method='hist', 
                         n_estimators=500, learning_rate=0.01, max_depth=5, 
                         random_state=42, enable_categorical=True  )
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [16]:
r2_score(y_test, y_pred) * 100

34.265907623539285

In [17]:
y_train_pred = model.predict(X_train)

In [18]:
r2_score(y_train, y_train_pred) * 100

35.71465709957001

In [21]:
n_estimators_list = list(range(100, 1_100, 100))
learning_rate_list = [0.01, 0.1]
max_depth_list = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
subsample_list = [0.6, 0.7, 0.8, 0.9, 1]

In [22]:
results_list = []
for n_estimators in n_estimators_list:
    for learning_rate in learning_rate_list:
        for max_depth in max_depth_list:
            for subsample in subsample_list:
                model = xgb.XGBRegressor(objective='reg:squarederror', 
                                         device = 'cuda', 
                                         tree_method='hist', 
                                         n_estimators = n_estimators, 
                                         learning_rate = learning_rate, 
                                         max_depth = max_depth, 
                                         subsample = subsample,
                                         random_state=42, enable_categorical=True  )
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                test_r2 = r2_score(y_test, y_pred) * 100
                y_train_pred = model.predict(X_train)
                train_r2 = r2_score(y_train, y_train_pred) * 100
                row = [n_estimators, learning_rate, max_depth, 
                       subsample, train_r2, test_r2]
                print(row)
                model = None
                y_pred = None
                y_train_pred = None
                results_list.append(row)

[100, 0.01, 3, 0.6, 24.49648248402093, 23.989265851622044]
[100, 0.01, 3, 0.7, 24.483892046125135, 23.977718965256724]
[100, 0.01, 3, 0.8, 24.486413987268605, 23.979166459376643]
[100, 0.01, 3, 0.9, 24.47569759385463, 23.965499603010354]
[100, 0.01, 3, 1, 24.473981771740394, 23.96280104563302]
[100, 0.01, 4, 0.6, 25.998247221365332, 25.457922853273864]
[100, 0.01, 4, 0.7, 25.993031541774968, 25.45204739456367]
[100, 0.01, 4, 0.8, 25.980950713169815, 25.438480774510275]
[100, 0.01, 4, 0.9, 25.97912660455818, 25.438354714179223]
[100, 0.01, 4, 1, 25.973481692414214, 25.434894813048835]
[100, 0.01, 5, 0.6, 27.084091258922417, 26.493435432309486]
[100, 0.01, 5, 0.7, 27.077255414857714, 26.489596714300568]
[100, 0.01, 5, 0.8, 27.065648969929413, 26.474591235180213]
[100, 0.01, 5, 0.9, 27.062767647221065, 26.46890993587774]
[100, 0.01, 5, 1, 27.056810025743015, 26.467700982675023]
[100, 0.01, 6, 0.6, 27.917520566570108, 27.239953493194125]
[100, 0.01, 6, 0.7, 27.91468316948582, 27.2356089354

KeyboardInterrupt: 