# Random Forest Test with Well-being

In [1]:
%pwd

'/Users/shibo/Desktop/GallupWellBeingGroup/Code'

In [2]:
%cd ..

/Users/shibo/Desktop/GallupWellBeingGroup


## Import Package

In [3]:
import os 
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [4]:
!pip install xgboost



## Fine Wash

### Load Dataset

In [5]:
RawDataset_Filename = os.path.join("Data", "GallupWB_WashedRawData17Wave_v1.parquet")

In [6]:
RawDataset = pd.read_parquet(RawDataset_Filename)

In [7]:
RawDataset.shape

(2594089, 90)

In [8]:
nan_counts = RawDataset.isna().sum()
for line in zip(RawDataset.columns, nan_counts):
    print(line)

('wave', 0)
('COUNTRY_ISO3', 0)
('INCOME_1', 420409)
('INCOME_2', 423416)
('INCOME_3', 420411)
('INCOME_4', 423418)
('INCOME_5', 419801)
('INCOME_6', 2594089)
('INCOME_7', 419841)
('WP16', 14604)
('WP18', 14603)
('WP23', 123903)
('WP27', 151900)
('WP30', 129145)
('WP31', 142727)
('WP40', 52441)
('WP43', 49598)
('WP60', 86371)
('WP61', 67376)
('WP63', 104399)
('WP65', 69390)
('WP67', 74033)
('WP68', 85721)
('WP69', 82693)
('WP70', 83895)
('WP71', 184749)
('WP74', 100720)
('WP83', 185474)
('WP85', 310161)
('WP86', 240808)
('WP19472', 1719188)
('WP88', 262622)
('WP89', 171021)
('WP91', 198859)
('WP92', 200973)
('WP93', 145248)
('WP94', 202990)
('WP95', 201667)
('WP96', 2594089)
('WP97', 238617)
('WP98', 231858)
('WP10248', 710990)
('WP103', 452880)
('WP105', 545368)
('WP106', 262574)
('WP108', 189759)
('WP109', 187253)
('WP110', 193819)
('WP111', 675056)
('WP112', 419460)
('WP113', 206175)
('WP117', 268482)
('WP118', 786324)
('WP119', 291689)
('WP129', 177618)
('WP130', 156114)
('WP9050',

### wave

In [9]:
RawDataset['wave'].value_counts().sort_index()

wave
1.0     129972
2.0     104831
3.0     120443
3.1       3568
3.2       4141
3.3       2592
3.4       1758
4.0      99664
4.1      18074
4.2      19164
4.3       1147
5.0     112702
5.1      18141
5.2      20013
6.0     130614
6.1      23134
6.2      26258
6.3       7224
6.4       7157
6.5       1077
7.1     151720
7.2      49465
7.3      12879
7.4      16019
8.1     138673
9.1     154205
9.2      36143
10.1    148232
11.1    148724
11.2      2000
12.1    154166
12.9      1000
13.1    118922
13.2     33952
14.1    176253
15.1    121207
15.5      6350
15.9      1917
16.1    126902
17.1    143686
Name: count, dtype: int64

In [10]:
RawDataset['wave'] = RawDataset['wave'].astype(int)

In [11]:
RawDataset['wave'].value_counts().sort_index()

wave
1     129972
2     104831
3     132502
4     138049
5     150856
6     195464
7     230083
8     138673
9     190348
10    148232
11    150724
12    155166
13    152874
14    176253
15    129474
16    126902
17    143686
Name: count, dtype: int64

### check elements

In [19]:
RawDataset.shape

(2594089, 90)

In [13]:
print(RawDataset['WP67'].head())

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: WP67, dtype: float64


### Filter rows

In [16]:
Droped_datasetdf = RawDataset.drop(columns=['WP27','WP85','WP86','WP103','WP105','WP106','WP111','WP117','WP118','WP119',
                     'WP131','WP137','WP138','WP139','WP141','WP144','WP10251','WP145','WP146','WP150'
                     ,'WP151','WP153','WP155','WP156','WP13125','WP1230','WP1233','WP4657',
                     'WP9042','WP9048','WP17625'])

In [17]:
Droped_datasetdf.shape

(2594089, 59)

In [28]:
print(Droped_datasetdf['WP129'].head(50))

0     1.0
1     2.0
2     1.0
3     1.0
4     2.0
5     2.0
6     1.0
7     2.0
8     2.0
9     1.0
10    1.0
11    2.0
12    1.0
13    1.0
14    2.0
15    2.0
16    2.0
17    1.0
18    1.0
19    2.0
20    1.0
21    1.0
22    2.0
23    1.0
24    1.0
25    1.0
26    1.0
27    1.0
28    1.0
29    1.0
30    1.0
31    1.0
32    2.0
33    1.0
34    2.0
35    2.0
36    1.0
37    1.0
38    3.0
39    1.0
40    2.0
41    2.0
42    1.0
43    1.0
44    2.0
45    2.0
46    1.0
47    1.0
48    1.0
49    1.0
Name: WP129, dtype: float64


In [31]:
columns_to_update = ['WP23', 'WP30', 'WP40', 'WP43', 'WP60', 'WP61', 'WP63', 'WP65', 'WP67', 'WP68', 'WP69', 'WP70',
                     'WP71', 'WP74', 'WP83', 'WP89', 'WP91', 'WP92', 'WP93', 'WP94', 'WP95', 'WP97', 'WP98', 'WP10248',
                     'WP108', 'WP109', 'WP110', 'WP112', 'WP113', 'WP129', 'WP130', 'WP9050', 'WP132', 'WP134', 'WP17626',
                     'WP16056', 'WP15862', 'WP19544']
Droped_datasetdf[columns_to_update] = Droped_datasetdf[columns_to_update].applymap(lambda x: 2 if x not in [1, 2] else x)

  Droped_datasetdf[columns_to_update] = Droped_datasetdf[columns_to_update].applymap(lambda x: 2 if x not in [1, 2] else x)


In [32]:
print(Droped_datasetdf['WP129'].head(50))

0     1.0
1     2.0
2     1.0
3     1.0
4     2.0
5     2.0
6     1.0
7     2.0
8     2.0
9     1.0
10    1.0
11    2.0
12    1.0
13    1.0
14    2.0
15    2.0
16    2.0
17    1.0
18    1.0
19    2.0
20    1.0
21    1.0
22    2.0
23    1.0
24    1.0
25    1.0
26    1.0
27    1.0
28    1.0
29    1.0
30    1.0
31    1.0
32    2.0
33    1.0
34    2.0
35    2.0
36    1.0
37    1.0
38    2.0
39    1.0
40    2.0
41    2.0
42    1.0
43    1.0
44    2.0
45    2.0
46    1.0
47    1.0
48    1.0
49    1.0
Name: WP129, dtype: float64


In [33]:
print(Droped_datasetdf['WP31'].head(50))

0     2.0
1     2.0
2     1.0
3     2.0
4     1.0
5     1.0
6     1.0
7     2.0
8     1.0
9     2.0
10    1.0
11    3.0
12    1.0
13    1.0
14    1.0
15    2.0
16    1.0
17    1.0
18    2.0
19    2.0
20    1.0
21    1.0
22    1.0
23    1.0
24    1.0
25    1.0
26    1.0
27    3.0
28    3.0
29    3.0
30    3.0
31    2.0
32    1.0
33    1.0
34    1.0
35    1.0
36    1.0
37    2.0
38    2.0
39    1.0
40    3.0
41    2.0
42    3.0
43    3.0
44    1.0
45    3.0
46    2.0
47    3.0
48    1.0
49    3.0
Name: WP31, dtype: float64


In [34]:
columns_to_update = ['WP31','WP88']
Droped_datasetdf[columns_to_update] = Droped_datasetdf[columns_to_update].applymap(lambda x: 2 if x not in [1, 2, 3] else x)

  Droped_datasetdf[columns_to_update] = Droped_datasetdf[columns_to_update].applymap(lambda x: 2 if x not in [1, 2, 3] else x)


In [36]:
Droped_datasetdf = Droped_datasetdf[~((Droped_datasetdf['WP16'].isin([98, 99])) | (Droped_datasetdf['WP18'].isin([98, 99])))]

# Reset index (optional)
Droped_datasetdf.reset_index(drop=True, inplace=True)

In [37]:
Droped_datasetdf.shape

(2366009, 59)