In [1]:
%pwd

'/Users/shibo/Desktop/GallupWellBeingGroup/Code'

In [2]:
cd ..

/Users/shibo/Desktop/GallupWellBeingGroup


# Import Pacakge

In [3]:
import os 
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Load and make dataset

In [4]:
#Df_Filename = os.path.join("Data", "GallupWB_WasedataEnergy_1.967Mvar_v1.parquet")

In [5]:
parquet_file_path = 'Data/GallupWB_WasedataEnergy_1.967Mvar_v1.parquet'

In [6]:
Df = pd.read_parquet(parquet_file_path, engine='pyarrow')

In [8]:
Df = pd.read_parquet(parquet_file_path)

In [10]:
Df.shape

(1967478, 66)

In [11]:
Df.columns

Index(['Disability', 'Food', 'Shelter', 'Relatives', 'Satisfy_life',
       'Wellrested', 'Treated_respect', 'Smile_Laugh', 'Learn_interesting',
       'Enjoyment', 'Physical_pain', 'Worry', 'Sadness', 'Stress', 'Anger',
       'Satisfied_city', 'Move_city', 'Recommend_city', 'Find_job',
       'Public_transportation', 'Roads_highways', 'Education', 'Air_quality',
       'Water_quality', 'Health_care', 'Good_housing', 'Friends_make',
       'Racial_minority', 'Gay_les', 'Immigrants', 'Donation', 'Voluntary',
       'Help_stranger', 'Voice_opinion', 'Confidence', 'Safety', 'Steal',
       'Assualted', 'Children_respected', 'Children_learn', 'Women_respected',
       'Preserve_environment', 'Freedom_life', 'Military', 'Judicial_system',
       'Government', 'Banks', 'election_honesty', 'Corruption_business',
       'Corruption_government', 'Gender', 'Leadership_country',
       'Marital_status', 'Acess_internet', 'Use_internet', 'Phone_internet',
       'Life_standard', 'Economic_conditi

## Check difference between 

## Shuffle Conversion

In [12]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

In [16]:
ye = Df['Economic_condition']

In [17]:
Xe = Df.drop(columns=['Wellbeing_ladder'])

In [18]:
Xe_train, Xe_test, ye_train, ye_test = train_test_split(Xe, ye, test_size=0.1, random_state=42)

# Model and Tuning Hyper

## Economic Model

In [21]:
model = xgb.XGBRegressor(objective='reg:squarederror', 
                         n_estimators=500, learning_rate=0.01, max_depth=5, 
                         random_state=42, enable_categorical=True  )
model.fit(Xe_train, ye_train)

In [22]:
ye_pred = model.predict(Xe_test)

In [23]:
r2_score(ye_test, ye_pred) * 100

99.99568251984368

In [24]:
ye_train_pred = model.predict(Xe_train)

In [25]:
r2_score(ye_train, ye_train_pred) * 100

99.99568259004388

In [33]:
n_estimators_list = list(range(50, 300, 50))
learning_rate_list = [0.05, 0.1]
max_depth_list = [5,10,15]
subsample_list = [0.6, 0.7, 0.8, 0.9, 1]

In [34]:
results_list = []
for n_estimators in n_estimators_list:
    for learning_rate in learning_rate_list:
        for max_depth in max_depth_list:
            for subsample in subsample_list:
                model = xgb.XGBRegressor(objective='reg:squarederror', 
                                         n_estimators = n_estimators,
                                         learning_rate = learning_rate, 
                                         max_depth = max_depth, 
                                         subsample = subsample,
                                         random_state=42, enable_categorical=True  )
                model.fit(Xe_train, ye_train)
                ye_pred = model.predict(Xe_test)
                test_r2 = r2_score(ye_test, ye_pred) * 100
                ye_train_pred = model.predict(Xe_train)
                train_r2 = r2_score(ye_train, ye_train_pred) * 100
                row = [n_estimators, learning_rate, max_depth, 
                       subsample, train_r2, test_r2]
                print(row)
                model = None
                ye_pred = None
                ye_train_pred = None
                results_list.append(row)

[50, 0.05, 5, 0.6, 99.4079357100152, 99.40792611208336]
[50, 0.05, 5, 0.7, 99.40793749453941, 99.40792790509471]
[50, 0.05, 5, 0.8, 99.40793927905838, 99.40792969810079]
[50, 0.05, 5, 0.9, 99.40793927905838, 99.40792969810079]
[50, 0.05, 5, 1, 99.40794022132275, 99.40793063565667]
[50, 0.05, 10, 0.6, 99.4079357100152, 99.40792611208336]
[50, 0.05, 10, 0.7, 99.40793749453941, 99.40792790509471]
[50, 0.05, 10, 0.8, 99.40793927905838, 99.40792969810079]
[50, 0.05, 10, 0.9, 99.40793927905838, 99.40792969810079]
[50, 0.05, 10, 1, 99.40794022132275, 99.40793063565667]
[50, 0.05, 15, 0.6, 99.4079357100152, 99.40792611208336]
[50, 0.05, 15, 0.7, 99.40793749453941, 99.40792790509471]
[50, 0.05, 15, 0.8, 99.40793927905838, 99.40792969810079]
[50, 0.05, 15, 0.9, 99.40793927905838, 99.40792969810079]
[50, 0.05, 15, 1, 99.40794022132275, 99.40793063565667]
[50, 0.1, 5, 0.6, 99.99734373387524, 99.99734369148592]
[50, 0.1, 5, 0.7, 99.99734373387524, 99.99734369148592]
[50, 0.1, 5, 0.8, 99.99734373387