# Gender Difference: XGB Experiment based on 59 Variables 1513k Rows 14 Waves

In [1]:
%pwd

'/Users/shibo/Desktop/GallupWellBeingGroup/Code'

In [2]:
%cd ..

/Users/shibo/Desktop/GallupWellBeingGroup


## Import Packagee

In [3]:
import os 
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

## Load and Make Datasets

In [6]:
Df_Filename = os.path.join("Data", "GallupWB_WasedataEnergy_1.967Mvar_v1.parquet")

In [7]:
Df = pd.read_parquet(Df_Filename)

In [6]:
Df.shape

(1911212, 64)

In [7]:
Df.columns

Index(['wave', 'INCOME_2', 'Cantril_ladder', 'Health_disable', 'Relative_have',
       'Living_standard_change', 'Enough_food', 'Enough_shelter',
       'Well_rested', 'Respected', 'Smile', 'Interesting_thing', 'Enjoyment',
       'Physical_pain', 'Worry', 'Sadness', 'Stress', 'Anger',
       'City_satisficied', 'Economic_change', 'Goodtime_job', 'Sat_pubtran',
       'Sat_road', 'Sat_edu', 'Sat_qualityair', 'Sat_qualitywater',
       'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend', 'Good_minorities',
       'Good_gayles', 'Good_immigrants', 'Donated', 'Volunteer',
       'Help_stranger', 'Voice_official', 'Local_police', 'Safety_walk',
       'Stolen', 'Assualted', 'Religion_importance', 'Children_respected',
       'Children_learn', 'Women_respected', 'Sat_dealpoor', 'Sat_perserveenv',
       'Freedom_chooselife', 'Conf_military', 'Conf_judicial',
       'Conf_government', 'Conf_financial', 'Conf_honestyelections',
       'Freedom_media', 'Corruption_business', 'Corruption_governm

### Check the Difference between Gender

In [8]:
female_cantrilladder = Df.loc[Df['Gender_female']==1, 'Cantril_ladder'].mean()

In [9]:
female_cantrilladder

5.569313150299246

In [10]:
male_cantrilladder = Df.loc[Df['Gender_female']==0, 'Cantril_ladder'].mean()

In [11]:
male_cantrilladder

5.466124824941722

In [12]:
t_stat, p_value = stats.ttest_ind(Df.loc[Df['Gender_female']==1, 'Cantril_ladder'], Df.loc[Df['Gender_female']==0, 'Cantril_ladder'])

In [13]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: 29.49502322170667, P-value: 3.682499213688914e-191


### Shuffle Conversion

In [14]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [15]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

### Df_male 

In [16]:
Df_male = Df.loc[Df['Gender_female']==0, :].drop(columns=['Gender_female'])

In [17]:
Df_male.shape

(893988, 63)

In [18]:
ym = Df_male['Cantril_ladder']

In [19]:
Xm = Df_male.drop(columns=['Cantril_ladder'])

In [20]:
Xm_train, Xm_test, ym_train, ym_test = train_test_split(Xm, ym, test_size=0.1, random_state=42)

### Df_female 

In [21]:
Df_female = Df.loc[Df['Gender_female']==1, :].drop(columns=['Gender_female'])

In [22]:
Df_female.shape

(1017224, 63)

In [23]:
yf = Df_female['Cantril_ladder']

In [24]:
Xf = Df_female.drop(columns=['Cantril_ladder'])

In [25]:
Xf_train, Xf_test, yf_train, yf_test = train_test_split(Xf, yf, test_size=0.1, random_state=42)

## Model and Tuning Hyper

### Male Model

In [26]:
model = xgb.XGBRegressor(objective='reg:squarederror', device = 'cuda', tree_method='gpu_hist', 
                         n_estimators=500, learning_rate=0.01, max_depth=5, 
                         random_state=42, enable_categorical=True  )
model.fit(Xm_train, ym_train)

Parameters: { "device" } are not used.



In [27]:
ym_pred = model.predict(Xm_test)

In [28]:
r2_score(ym_test, ym_pred) * 100

35.345814761344485

In [29]:
ym_train_pred = model.predict(Xm_train)

In [30]:
r2_score(ym_train, ym_train_pred) * 100

36.214464768212785

In [8]:
n_estimators_list = list(range(100, 1_100, 100))
learning_rate_list = [0.001, 0.01, 0.1]
max_depth_list = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
subsample_list = [0.6, 0.7, 0.8, 0.9, 1]

In [9]:
results_list = []
for n_estimators in n_estimators_list:
    for learning_rate in learning_rate_list:
        for max_depth in max_depth_list:
            for subsample in subsample_list:
                model = xgb.XGBRegressor(objective='reg:squarederror', 
                                         tree_method='gpu_hist',  ## on linux: device = 'cuda', tree_method='hist', 
                                         n_estimators = n_estimators, 
                                         learning_rate = learning_rate, 
                                         max_depth = max_depth, 
                                         subsample = subsample,
                                         random_state=42, enable_categorical=True  )
                model.fit(Xm_train, ym_train)
                ym_pred = model.predict(Xm_test)
                test_r2 = r2_score(ym_test, ym_pred) * 100
                ym_train_pred = model.predict(Xm_train)
                train_r2 = r2_score(ym_train, ym_train_pred) * 100
                row = [n_estimators, learning_rate, max_depth, 
                       subsample, train_r2, test_r2]
                print(row)
                model = None
                ym_pred = None
                ym_train_pred = None
                results_list.append(row)

NameError: name 'Xm_train' is not defined

In [33]:
results_list_df = pd.DataFrame(results_list, columns = ['n_estimators', 'learning_rate', 'max_depth', 'subsample', 'train_r2', 'test_r2'])

In [40]:
results_list_df.to_parquet(os.path.join('Results', "Hyperparameter_GenderMaleXgboost_ML.parquet"))

In [41]:
results_list_df.head()

Unnamed: 0,n_estimators,learning_rate,max_depth,subsample,train_r2,test_r2
0,100,0.001,3,0.6,-346.946515,-345.661486
1,100,0.001,3,0.7,-346.945118,-345.658858
2,100,0.001,3,0.8,-346.945979,-345.660372
3,100,0.001,3,0.9,-346.945067,-345.659066
4,100,0.001,3,1.0,-346.943746,-345.657782


In [43]:
results_list_df.sort_values(by='test_r2', ascending=False).head(20)

Unnamed: 0,n_estimators,learning_rate,max_depth,subsample,train_r2,test_r2
1868,1000,0.01,12,0.9,65.259347,37.972949
1865,1000,0.01,12,0.6,63.688727,37.949387
1862,1000,0.01,11,0.8,56.286785,37.944039
1867,1000,0.01,12,0.8,64.856366,37.932941
1673,900,0.01,12,0.9,63.580487,37.917952
1872,1000,0.01,13,0.8,74.39485,37.917832
1866,1000,0.01,12,0.7,64.327114,37.917203
1670,900,0.01,12,0.6,62.041453,37.906973
1873,1000,0.01,13,0.9,74.956253,37.905652
1677,900,0.01,13,0.8,72.50999,37.900075


**Fine tune other hyperparameter**

In [45]:
gamma_list = [0, 0.01, 0.1, 1, 10]
min_child_weight_list = [0, 0.1, 1, 10]
max_delta_step_list = [0, 0.1, 1, 10]
reg_lambda_list = [0, 0.1, 1, 10]
reg_alpha_list = [0, 0.1, 1, 10]

In [None]:
# 1000	0.01	12	0.9	65.259347	37.972949
results_list_otherhyper = []
for gamma in gamma_list:
    for min_child_weight in min_child_weight_list:
        for max_delta_step in max_delta_step_list:
            for reg_lambda in reg_lambda_list:
                for reg_alpha in reg_alpha_list:
                    model = xgb.XGBRegressor(objective='reg:squarederror',  tree_method='gpu_hist',  ## on linux: device = 'cuda', tree_method='hist', 
                                             n_estimators = 1000,  learning_rate = 0.01, max_depth = 12,   subsample = 0.9,
                                             gamma = gamma, min_child_weight = min_child_weight, max_delta_step = max_delta_step,
                                             reg_lambda = reg_lambda, reg_alpha = reg_alpha, 
                                             random_state=42, enable_categorical=True  )
                    model.fit(Xm_train, ym_train)
                    ym_pred = model.predict(Xm_test)
                    test_r2 = r2_score(ym_test, ym_pred) * 100
                    ym_train_pred = model.predict(Xm_train)
                    train_r2 = r2_score(ym_train, ym_train_pred) * 100
                    row = [gamma, min_child_weight, max_delta_step, reg_lambda, 
                           reg_alpha, train_r2, test_r2]
                    print(row)
                    model = None
                    ym_pred = None
                    ym_train_pred = None
                    results_list_otherhyper.append(row)

[0, 0, 0, 0, 0, 66.32583976200496, 37.92534171377735]
[0, 0, 0, 0, 0.1, 66.25929018662178, 37.89396674883224]
[0, 0, 0, 0, 1, 65.77513090428084, 37.92659717073056]
[0, 0, 0, 0, 10, 57.76528690574637, 37.959529768840085]
[0, 0, 0, 0.1, 0, 66.14473734107358, 37.91347779740685]
[0, 0, 0, 0.1, 0.1, 66.11109036118917, 37.903010747107345]
[0, 0, 0, 0.1, 1, 65.64327235851457, 37.913586773826104]
[0, 0, 0, 0.1, 10, 57.73111159250488, 37.934744396040244]
[0, 0, 0, 1, 0, 65.2593470125833, 37.972948793923756]
[0, 0, 0, 1, 0.1, 65.2311686285684, 37.98462798746666]
[0, 0, 0, 1, 1, 64.68336611746707, 37.962690044963246]
[0, 0, 0, 1, 10, 57.25202947338671, 37.93972695444805]
[0, 0, 0, 10, 0, 59.587659852857456, 38.049672756904464]
[0, 0, 0, 10, 0.1, 59.56645364193989, 38.03215426078196]
[0, 0, 0, 10, 1, 59.11353450893302, 38.05629920865351]
[0, 0, 0, 10, 10, 54.593747394480815, 37.99005296392986]
[0, 0, 0.1, 0, 0, 66.28456097496328, 37.97191901178252]
[0, 0, 0.1, 0, 0.1, 66.19149802930379, 37.9021953