# Gender Difference: XGB Experiment based on 64 Variables 1911k Rows 14 Waves

In [1]:
%pwd

'/mnt/d/OneDrive - Kyushu University/ESG09_Article/Code'

In [2]:
%cd ..

/mnt/d/OneDrive - Kyushu University/ESG09_Article


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [37]:
from autogluon.tabular import TabularDataset, TabularPredictor
import catboost as cb
from joblib import dump, load
import lightgbm
import numpy as np
import os 
import pandas as pd
import random
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import xgboost as xgb

## Load and Make Datasets

In [4]:
Df_Filename = os.path.join("Data", "GallupWB_Ml64var1911k14wave_v1.parquet")

In [5]:
Df = pd.read_parquet(Df_Filename)

In [6]:
Df.shape

(1911212, 64)

In [7]:
Df.columns

Index(['wave', 'INCOME_2', 'Cantril_ladder', 'Health_disable', 'Relative_have',
       'Living_standard_change', 'Enough_food', 'Enough_shelter',
       'Well_rested', 'Respected', 'Smile', 'Interesting_thing', 'Enjoyment',
       'Physical_pain', 'Worry', 'Sadness', 'Stress', 'Anger',
       'City_satisficied', 'Economic_change', 'Goodtime_job', 'Sat_pubtran',
       'Sat_road', 'Sat_edu', 'Sat_qualityair', 'Sat_qualitywater',
       'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend', 'Good_minorities',
       'Good_gayles', 'Good_immigrants', 'Donated', 'Volunteer',
       'Help_stranger', 'Voice_official', 'Local_police', 'Safety_walk',
       'Stolen', 'Assualted', 'Religion_importance', 'Children_respected',
       'Children_learn', 'Women_respected', 'Sat_dealpoor', 'Sat_perserveenv',
       'Freedom_chooselife', 'Conf_military', 'Conf_judicial',
       'Conf_government', 'Conf_financial', 'Conf_honestyelections',
       'Freedom_media', 'Corruption_business', 'Corruption_governm

### Check the Difference between Gender

In [8]:
female_cantrilladder = Df.loc[Df['Gender_female']==1, 'Cantril_ladder'].mean()

In [9]:
female_cantrilladder

5.569313150299246

In [10]:
male_cantrilladder = Df.loc[Df['Gender_female']==0, 'Cantril_ladder'].mean()

In [11]:
male_cantrilladder

5.466124824941722

In [12]:
t_stat, p_value = stats.ttest_ind(Df.loc[Df['Gender_female']==1, 'Cantril_ladder'], Df.loc[Df['Gender_female']==0, 'Cantril_ladder'])

In [13]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: 29.49502322170667, P-value: 3.682499213688914e-191


### Shuffle Conversion and One hot

In [14]:
Df = Df.sample(frac=1, random_state=42)

In [15]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

In [16]:
one_hot_encoded = pd.get_dummies(Df['COUNTRY_ISO3'], prefix='Country')

In [17]:
one_hot_encoded.head()

Unnamed: 0,Country_AFG,Country_AGO,Country_ALB,Country_ARE,Country_ARG,Country_ARM,Country_AUS,Country_AUT,Country_AZE,Country_BDI,...,Country_VEN,Country_VNM,Country_XKX,Country_XNC,Country_XNK,Country_XSR,Country_YEM,Country_ZAF,Country_ZMB,Country_ZWE
1469152,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1562723,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
708695,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1828455,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
968859,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
Df_countryOneHot = pd.concat([Df, one_hot_encoded], axis=1)

In [19]:
Df_countryOneHot.shape

(1911212, 228)

### Df_male 

In [20]:
Df_male = Df_countryOneHot.loc[Df_countryOneHot['Gender_female']==0, :].drop(columns=['Gender_female'])

In [21]:
Df_male.shape

(893988, 227)

In [30]:
ym = Df_male['Cantril_ladder']

In [31]:
Xm = Df_male.drop(columns=['Cantril_ladder', 'COUNTRY_ISO3'])

### Df_female 

In [24]:
Df_female = Df_countryOneHot.loc[Df_countryOneHot['Gender_female']==1, :].drop(columns=['Gender_female'])

In [25]:
Df_female.shape

(1017224, 227)

In [32]:
yf = Df_female['Cantril_ladder']

In [33]:
Xf = Df_female.drop(columns=['Cantril_ladder', 'COUNTRY_ISO3'])

## 10-CV Linear Regression 

### Male

In [28]:
model = LinearRegression()

kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [35]:
r2_scores = []
for train_index, test_index in kf.split(Xm):
    X_train, X_test = Xm.iloc[train_index], Xm.iloc[test_index]
    y_train, y_test = ym.iloc[train_index], ym.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate mean squared error
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    
    # Print fold results
    print(f'Fold R2: {r2}')

Fold R2: 0.3366196716538916
Fold R2: 0.33983354247426745
Fold R2: 0.3351244459305015
Fold R2: 0.3066222521577294
Fold R2: 0.3390786238266855
Fold R2: 0.3361867146834654
Fold R2: 0.3403555574213536
Fold R2: 0.3376248708129216
Fold R2: 0.3349398551917866
Fold R2: 0.33995646621172426


In [38]:
mean_r2 = np.mean(r2_scores)
std_r2 = np.std(r2_scores)

print(f'Mean R2: {mean_r2}')
print(f'Standard Deviation of R2: {std_r2}')

Mean R2: 0.3346342000364327
Standard Deviation of R2: 0.00952966952845712


### Female

In [39]:
model = LinearRegression()

kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [40]:
r2_scores = []
for train_index, test_index in kf.split(Xf):
    X_train, X_test = Xf.iloc[train_index], Xf.iloc[test_index]
    y_train, y_test = yf.iloc[train_index], yf.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate mean squared error
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    
    # Print fold results
    print(f'Fold R2: {r2}')

Fold R2: 0.33914432371364467
Fold R2: 0.34145427946987184
Fold R2: 0.3383248333311869
Fold R2: 0.33621082266128
Fold R2: 0.33662643410717596
Fold R2: 0.33889501442513814
Fold R2: 0.3412368306636001
Fold R2: 0.3402954933188045
Fold R2: 0.30219012240707455
Fold R2: 0.34050579527389246


In [41]:
mean_r2 = np.mean(r2_scores)
std_r2 = np.std(r2_scores)

print(f'Mean R2: {mean_r2}')
print(f'Standard Deviation of R2: {std_r2}')

Mean R2: 0.33548839493716687
Standard Deviation of R2: 0.011227319516799804
