# Age Difference: XGB Experiment based on 59 Variables 1513k Rows 14 Waves

In [1]:
%pwd

'/home/GPU/esg09-wellbeing/Code'

In [2]:
%cd ..

/home/GPU/esg09-wellbeing


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [3]:
from autogluon.tabular import TabularDataset, TabularPredictor
import numpy as np
import os 
import pandas as pd
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Load and Make Datasets

In [4]:
Df_Filename = os.path.join("Data", "GallupWB_Ml64var1911k14wave_v1.parquet")

In [5]:
Df = pd.read_parquet(Df_Filename)

In [6]:
Df.shape

(1911212, 64)

In [7]:
Df.columns

Index(['wave', 'INCOME_2', 'Cantril_ladder', 'Health_disable', 'Relative_have',
       'Living_standard_change', 'Enough_food', 'Enough_shelter',
       'Well_rested', 'Respected', 'Smile', 'Interesting_thing', 'Enjoyment',
       'Physical_pain', 'Worry', 'Sadness', 'Stress', 'Anger',
       'City_satisficied', 'Economic_change', 'Goodtime_job', 'Sat_pubtran',
       'Sat_road', 'Sat_edu', 'Sat_qualityair', 'Sat_qualitywater',
       'Sat_healthcare', 'Sat_affhouse', 'Sat_oppofriend', 'Good_minorities',
       'Good_gayles', 'Good_immigrants', 'Donated', 'Volunteer',
       'Help_stranger', 'Voice_official', 'Local_police', 'Safety_walk',
       'Stolen', 'Assualted', 'Religion_importance', 'Children_respected',
       'Children_learn', 'Women_respected', 'Sat_dealpoor', 'Sat_perserveenv',
       'Freedom_chooselife', 'Conf_military', 'Conf_judicial',
       'Conf_government', 'Conf_financial', 'Conf_honestyelections',
       'Freedom_media', 'Corruption_business', 'Corruption_governm

### Check the Difference between Age Group

In [8]:
young_cantrilladder = Df.loc[Df['Age']<=40, 'Cantril_ladder']

In [9]:
young_cantrilladder.mean()

5.46157389538526

In [10]:
middle_cantrilladder = Df.loc[(Df['Age']>40)&(Df['Age']<=65), 'Cantril_ladder']

In [11]:
middle_cantrilladder.mean()

5.546090633585152

In [12]:
old_cantrilladder = Df.loc[Df['Age']>65, 'Cantril_ladder']

In [13]:
old_cantrilladder.mean()

5.727577206476798

In [14]:
t_stat, p_value = stats.ttest_ind(young_cantrilladder, middle_cantrilladder)

In [15]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -22.394787205917154, P-value: 4.592108979551354e-111


In [16]:
t_stat, p_value = stats.ttest_ind(young_cantrilladder, old_cantrilladder)

In [17]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -46.54255508076877, P-value: 0.0


In [18]:
t_stat, p_value = stats.ttest_ind(middle_cantrilladder, old_cantrilladder)

In [19]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -30.141361019016323, P-value: 1.760592592819179e-199


### Shuffle Conversion

In [20]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [21]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

In [22]:
Df.isna().any().any()

False

### Df_young

In [46]:
Df_young = Df.loc[Df['Age']<=40, :]

In [47]:
Df_young.shape

(1031174, 64)

In [48]:
y_train, y_test = train_test_split(Df_young, test_size=0.1, random_state=42)

### Df_middle

In [49]:
Df_middle = Df.loc[(Df['Age']>40)&(Df['Age']<=65), :]

In [50]:
Df_middle.shape

(663573, 64)

In [51]:
m_train, m_test = train_test_split(Df_middle, test_size=0.1, random_state=42)

### Df_old

In [52]:
Df_old = Df.loc[Df['Age']>65, :]

In [53]:
Df_old.shape

(216465, 64)

In [54]:
o_train, o_test = train_test_split(Df_old, test_size=0.1, random_state=42)

## AutoML Test

### Young Model

In [55]:
label = 'Cantril_ladder'

In [58]:
predictor = TabularPredictor(label=label, eval_metric='r2', problem_type = 'regression').fit(y_train)

No path specified. Models will be saved in: "AutogluonModels/ag-20240620_070505"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #68~20.04.1-Ubuntu SMP Wed May 1 14:35:27 UTC 2024
CPU Count:          16
Memory Avail:       29.58 GB / 102.16 GB (29.0%)
Disk Space Avail:   357.18 GB / 484.40 GB (73.7%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	

[1000]	valid_set's l2: 3.80205	valid_set's r2: 0.335319
[2000]	valid_set's l2: 3.78628	valid_set's r2: 0.338074
[3000]	valid_set's l2: 3.784	valid_set's r2: 0.338473
[4000]	valid_set's l2: 3.78007	valid_set's r2: 0.339161


	0.3395	 = Validation score   (r2)
	145.35s	 = Training   runtime
	0.55s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l2: 3.76747	valid_set's r2: 0.341363
[2000]	valid_set's l2: 3.75413	valid_set's r2: 0.343695
[3000]	valid_set's l2: 3.75279	valid_set's r2: 0.343931
[4000]	valid_set's l2: 3.74434	valid_set's r2: 0.345408
[5000]	valid_set's l2: 3.74885	valid_set's r2: 0.344619


	0.3454	 = Validation score   (r2)
	126.8s	 = Training   runtime
	0.61s	 = Validation runtime
Fitting model: RandomForestMSE ...


KeyboardInterrupt: 