# IU Difference: XGB Experiment based on 73 Variables 525k Rows 6 Waves

In [1]:
%pwd

'/mnt/f/ESG09_Project/Code'

In [2]:
%cd ..

/mnt/f/ESG09_Project


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Import Package

In [3]:
import os 
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

## Load and Make Datasets

In [4]:
Df_Filename = os.path.join("Data", "GallupWB_Zhang73var798k6wave_v1.parquet")

In [5]:
Df = pd.read_parquet(Df_Filename)

In [6]:
Df.shape

(798604, 73)

In [7]:
Df.head()

Unnamed: 0,wave,INCOME_2,Cantril_ladder,Health_disable,Relative_have,Life_satisfaction,Living_standard_trend,Enough_food,Enough_shelter,Well_rested,...,Employment,Children_under15,Feeling_income,Born_here,Home_handline,Mobile_phone,Internet_access_available,Used_internet_recently,Phone_internet_access,COUNTRY_ISO3
1709734,12,592300.098717,5.0,0.0,1.0,0.0,-1.0,1.0,0.0,0.0,...,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,USA
1709735,12,106614.017769,9.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,6.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,USA
1709736,12,21322.803554,6.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,6.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,USA
1709737,12,56860.809477,5.0,0.0,1.0,1.0,-0.0,0.0,0.0,1.0,...,6.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,USA
1709738,12,33844.027641,7.0,1.0,1.0,1.0,-0.0,0.0,0.0,0.0,...,6.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,USA


In [8]:
Df.columns

Index(['wave', 'INCOME_2', 'Cantril_ladder', 'Health_disable', 'Relative_have',
       'Life_satisfaction', 'Living_standard_trend', 'Enough_food',
       'Enough_shelter', 'Well_rested', 'Respected', 'Smile',
       'Interesting_thing', 'Enjoyment', 'Physical_pain', 'Worry', 'Sadness',
       'Stress', 'Anger', 'City_satisfaction', 'Recommended_live_place',
       'Economic_rating', 'Economic_trend', 'Local_job_outlook',
       'Satisfied_pubtran', 'Satisfied_road', 'Satisfied_edu',
       'Satisfied_qualityair', 'Satisfied_qualitywater',
       'Satisfied_healthcare', 'Satisfied_affhouse', 'Satisfied_oppofriend',
       'Good_minorities', 'Good_homo', 'Good_immigrants', 'Donated',
       'Volunteer', 'Help_stranger', 'Voice_official',
       'Local_police_confidence', 'Safety_walk', 'Stolen', 'Assualted',
       'Religion_importance', 'Children_respected', 'Children_learn',
       'Women_respected', 'Satisfied_dealpoor', 'Satisfied_Env_preservation',
       'Satisfied_Personalfreedom

In [9]:
print(Df['Internet_access_available'].value_counts())

Internet_access_available
1.0    539754
0.0    258850
Name: count, dtype: int64


In [10]:
print(Df['Phone_internet_access'].value_counts())

Phone_internet_access
1.0    527859
0.0    270745
Name: count, dtype: int64


In [11]:
print(Df['Used_internet_recently'].value_counts())

Used_internet_recently
1.0    496245
0.0    302359
Name: count, dtype: int64


### Check the Difference with Internet Use

In [12]:
InternetAvailable_CantrilLadder= Df.loc[Df['Internet_access_available']==1, 'Cantril_ladder'].mean()

In [13]:
InternetAvailable_CantrilLadder

6.051217776987294

In [14]:
InternetUnavailable_CantrilLadder = Df.loc[Df['Internet_access_available']==0, 'Cantril_ladder'].mean()

In [15]:
InternetUnavailable_CantrilLadder

4.582418389028395

In [16]:
t_stat, p_value = stats.ttest_ind(Df.loc[Df['Internet_access_available']==1, 'Cantril_ladder'], Df.loc[Df['Internet_access_available']==0, 'Cantril_ladder'])

In [17]:
print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: 250.03071115917842, P-value: 0.0


### Shuffle Conversion

In [18]:
Df = Df.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
Df['COUNTRY_ISO3'] = Df['COUNTRY_ISO3'].astype('category')

### Df_Internet_Unavailable 

In [20]:
Df_Internet_Unavailable = Df.loc[Df['Internet_access_available']==0, :].drop(columns=['Internet_access_available'])

In [21]:
Df_Internet_Unavailable.shape

(258850, 72)

In [22]:
yu = Df_Internet_Unavailable['Cantril_ladder']

In [23]:
Xu = Df_Internet_Unavailable.drop(columns=['Cantril_ladder'])

In [24]:
Xu_train, Xu_test, yu_train, yu_test = train_test_split(Xu, yu, test_size=0.1, random_state=42)

### Df_Internet_Available 

In [25]:
Df_Internet_available = Df.loc[Df['Internet_access_available']==1, :].drop(columns=['Internet_access_available'])

In [26]:
Df_Internet_available.shape

(539754, 72)

In [27]:
yf = Df_Internet_available['Cantril_ladder']

In [28]:
Xf = Df_Internet_available.drop(columns=['Cantril_ladder'])

In [29]:
Xf_train, Xf_test, yf_train, yf_test = train_test_split(Xf, yf, test_size=0.1, random_state=42)

## Model and Tuning Hyper

### Internet_Unavailable Model

In [30]:
model = xgb.XGBRegressor(objective='reg:squarederror', device = 'cuda', tree_method='hist', 
                         n_estimators=500, learning_rate=0.01, max_depth=5, 
                         random_state=42, enable_categorical=True  )
model.fit(Xu_train, yu_train)

In [31]:
yu_pred = model.predict(Xu_test)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [32]:
r2_score(yu_test, yu_pred) * 100

24.694201570678242

In [33]:
yu_train_pred = model.predict(Xu_train)

In [34]:
r2_score(yu_train, yu_train_pred) * 100

26.234098068598012

In [35]:
n_estimators_list = list(range(100, 1_100, 100))
learning_rate_list = [0.01, 0.1]
max_depth_list = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
subsample_list = [0.6, 0.7, 0.8, 0.9, 1]

In [36]:
results_list = []
for n_estimators in n_estimators_list:
    for learning_rate in learning_rate_list:
        for max_depth in max_depth_list:
            for subsample in subsample_list:
                model = xgb.XGBRegressor(objective='reg:squarederror', 
                                         device = 'cuda', 
                                         tree_method='hist', 
                                         n_estimators = n_estimators, 
                                         learning_rate = learning_rate, 
                                         max_depth = max_depth, 
                                         subsample = subsample,
                                         random_state=42, enable_categorical=True  )
                model.fit(Xu_train, yu_train)
                yu_pred = model.predict(Xu_test)
                test_r2 = r2_score(yu_test, yu_pred) * 100
                yu_train_pred = model.predict(Xu_train)
                train_r2 = r2_score(yu_train, yu_train_pred) * 100
                row = [n_estimators, learning_rate, max_depth, 
                       subsample, train_r2, test_r2]
                print(row)
                model = None
                yu_pred = None
                yu_train_pred = None
                results_list.append(row)

[100, 0.001, 3, 0.6, 3.0190284721963945, 3.0101815156643563]
[100, 0.001, 3, 0.7, 3.017643985831331, 3.009531804548937]
[100, 0.001, 3, 0.8, 3.0174171719857257, 3.0097451834003586]
[100, 0.001, 3, 0.9, 3.016314383055052, 3.0091296372122933]
[100, 0.001, 3, 1, 3.01718758522016, 3.0124385865247683]
[100, 0.001, 4, 0.6, 3.3461078956993573, 3.34266034546713]
[100, 0.001, 4, 0.7, 3.345974864808421, 3.3451115633840844]
[100, 0.001, 4, 0.8, 3.3460281637477385, 3.3443391781159515]
[100, 0.001, 4, 0.9, 3.346210800223126, 3.34552339728994]
[100, 0.001, 4, 1, 3.3496344854055726, 3.3508616509783207]
[100, 0.001, 5, 0.6, 3.607053849877251, 3.5919522471569265]
[100, 0.001, 5, 0.7, 3.6060503053686133, 3.5908731982820186]
[100, 0.001, 5, 0.8, 3.6047742854344644, 3.588457339194606]
[100, 0.001, 5, 0.9, 3.603775557376321, 3.588234722135164]
[100, 0.001, 5, 1, 3.6002606192121034, 3.586613937558847]
[100, 0.001, 6, 0.6, 3.8085608198594634, 3.7673350139293738]
[100, 0.001, 6, 0.7, 3.8084971846183313, 3.766

KeyboardInterrupt: 