# Endogenous Switch model based on Bayes search

In [1]:
%pwd

'/home/GPU/GallupWellBeingGroup/Code'

In [2]:
cd ..

/home/GPU/GallupWellBeingGroup


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


# Import Package

In [3]:
from joblib import dump
import os 
import pandas as pd
import xgboost as xgb
import random
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

In [4]:
import shap
from xgboost import plot_importance
from xgboost import plot_tree
import matplotlib.pyplot as plt

In [5]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load Dataset

In [6]:
Df =  pd.read_parquet('Data/GallupWB_WashedDataEnergy1x967mPhrase2_v1.parquet')

# Categorize the Dataset based on "Ecnomic_trend" and create Df0, Df1, Df2

In [7]:
Df0 = Df[Df['Economic_trend'] == 0].drop(columns=['Economic_trend'])
Df1 = Df[Df['Economic_trend'] == 1].drop(columns=['Economic_trend'])
Df2 = Df[Df['Economic_trend'] == 2].drop(columns=['Economic_trend'])
# Display the new DataFrames
print("Df0:")
print(Df0.shape)

Df0:
(603860, 233)


In [8]:
Df0.head()

Unnamed: 0,Disability,Food,Shelter,Relatives,Satisfy_life,Wellrested,Treated_respect,Smile_Laugh,Learn_interesting,Enjoyment,...,COUNTRY_ISO3_XNC,COUNTRY_ISO3_XNK,COUNTRY_ISO3_XSR,COUNTRY_ISO3_YEM,COUNTRY_ISO3_ZAF,COUNTRY_ISO3_ZMB,COUNTRY_ISO3_ZWE,Predicted_Economic_trend_0,Predicted_Economic_trend_1,Predicted_Economic_trend_2
3,1.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.803382,0.067895,0.128723
7,2.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.667939,0.13481,0.197251
16,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609756,0.304902,0.085342
21,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.780491,0.046231,0.173278
22,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.749487,0.122699,0.127814


# Bayes Search for Df0

In [9]:
ye0 = Df0['Wellbeing_ladder']

In [10]:
Xe0 = Df0.drop(columns=['Wellbeing_ladder'])

In [11]:
param_space = {
    'n_estimators': Integer(100, 5000),
    'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
    'max_depth': Integer(3, 16),
    'subsample': Real(0.5, 1.0),
    'min_child_weight': Real(0.001, 10, prior='log-uniform'),
    'max_delta_step': Real(0.001, 10, prior='log-uniform'),
    'reg_lambda': Real(0.001, 10, prior='log-uniform'),
    'reg_alpha': Real(0.001, 10, prior='log-uniform'),
    'gamma': Real(0.001, 10, prior='log-uniform')
}

In [12]:
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',tree_method='hist', device='cuda', random_state=42)

In [13]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [14]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [15]:
bayes_search = BayesSearchCV(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=20,
    scoring='r2',
    cv=rkfcv,
    n_jobs = 1,
    n_points = 1,
    verbose=2,
    random_state=42,
    return_train_score = True
)

In [16]:
bayes_search.fit(Xe0, ye0)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  40.2s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  40.4s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  40.9s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.2364202820542705, learning_rate=0.05842928269761146, max_delta_

In [17]:
bayes_search.best_params_

OrderedDict([('gamma', 0.04369339947510315),
             ('learning_rate', 0.02853983686604182),
             ('max_delta_step', 5.388550972627239),
             ('max_depth', 7),
             ('min_child_weight', 0.47928274405969296),
             ('n_estimators', 2129),
             ('reg_alpha', 0.025335258486348353),
             ('reg_lambda', 0.9078559343576645),
             ('subsample', 0.6522316555182531)])

In [18]:
pd.DataFrame(bayes_search.cv_results_).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,rank_train_score
0,39.902474,0.27246,0.600542,0.009008,0.043693,0.02854,5.388551,7,0.479283,2129,...,0.511417,0.509198,0.003707,1,0.58094,0.581714,0.58072,0.581125,0.000426,6
1,11.966768,0.050383,0.307745,0.002343,2.23642,0.058429,0.016355,15,2.860959,405,...,0.155273,0.154052,0.001483,18,0.154866,0.155103,0.154803,0.154924,0.000129,18
2,53.473223,0.460114,0.765726,0.003565,0.060163,0.068777,0.002627,9,0.005647,2323,...,0.16425,0.163021,0.001589,17,0.163921,0.164192,0.163919,0.16401,0.000128,17
3,22.893806,0.199127,0.396862,0.040765,1.776577,0.002207,0.246711,13,0.123654,568,...,0.127836,0.126813,0.0012,19,0.128788,0.128867,0.128762,0.128806,4.5e-05,19
4,137.879909,0.376213,1.535642,0.012809,1.578388,0.007517,0.127785,12,4.124851,3613,...,0.493026,0.491224,0.003203,10,0.516519,0.517424,0.517068,0.517004,0.000372,11
5,37.103274,0.054448,0.405122,0.000448,0.863201,0.075638,0.004513,5,1.592256,1926,...,0.232977,0.231446,0.002408,16,0.231831,0.232325,0.231916,0.232024,0.000216,16
6,154.173422,0.537067,1.553699,0.018607,0.29398,0.035541,0.027296,14,0.234681,3014,...,0.492381,0.490839,0.002842,11,0.523078,0.523753,0.523605,0.523479,0.00029,10
7,27.188135,0.145147,0.372589,0.05543,0.149146,0.069186,0.096798,14,0.019781,351,...,0.472999,0.47117,0.003274,12,0.501443,0.501782,0.50149,0.501571,0.00015,12
8,25.691641,0.012489,0.430604,0.018478,6.636085,0.025191,3.066251,8,0.033439,1136,...,0.508151,0.506023,0.003605,5,0.561875,0.563388,0.562435,0.562566,0.000625,7
9,24.83974,0.040408,0.418777,0.026354,0.001034,0.043218,0.92285,6,0.206375,1446,...,0.510229,0.50784,0.003897,3,0.537851,0.539363,0.53847,0.538562,0.000621,9


In [19]:
dump(bayes_search, "Results/Shi_BayesDf0SearchingResults.joblib")

['Results/Shi_BayesDf0SearchingResults.joblib']

In [20]:
Xe0_train, Xe0_test, ye0_train, ye0_test = train_test_split(Xe0, ye0, test_size=0.1, random_state=42)

## Model0

In [21]:
model0 = xgb.XGBRegressor(objective='reg:squarederror',device='cuda',
                         tree_method='hist', random_state=42, **bayes_search.best_params_)

# Bayes Search for Df1

In [22]:
ye1 = Df1['Wellbeing_ladder']

In [23]:
Xe1 = Df1.drop(columns=['Wellbeing_ladder'])

In [24]:
param_space = {
    'n_estimators': Integer(100, 5000),
    'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
    'max_depth': Integer(3, 16),
    'subsample': Real(0.5, 1.0),
    'min_child_weight': Real(0.001, 10, prior='log-uniform'),
    'max_delta_step': Real(0.001, 10, prior='log-uniform'),
    'reg_lambda': Real(0.001, 10, prior='log-uniform'),
    'reg_alpha': Real(0.001, 10, prior='log-uniform'),
    'gamma': Real(0.001, 10, prior='log-uniform')
}

In [25]:
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',tree_method='hist', device='cuda', random_state=42)

In [26]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [27]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [28]:
bayes_search1 = BayesSearchCV(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=20,
    scoring='r2',
    cv=rkfcv,
    n_jobs = 1,
    n_points = 1,
    verbose=2,
    random_state=42,
    return_train_score = True
)

In [29]:
bayes_search1.fit(Xe1, ye1)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  47.7s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  47.7s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  47.8s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.2364

In [30]:
bayes_search1.best_params_

OrderedDict([('gamma', 0.04369339947510315),
             ('learning_rate', 0.02853983686604182),
             ('max_delta_step', 5.388550972627239),
             ('max_depth', 7),
             ('min_child_weight', 0.47928274405969296),
             ('n_estimators', 2129),
             ('reg_alpha', 0.025335258486348353),
             ('reg_lambda', 0.9078559343576645),
             ('subsample', 0.6522316555182531)])

In [31]:
dump(bayes_search1, "Results/Shi_BayesDf1SearchingResults.joblib")

['Results/Shi_BayesDf1SearchingResults.joblib']

In [32]:
pd.DataFrame(bayes_search1.cv_results_).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,rank_train_score
0,47.017265,0.043902,0.682261,0.001821,0.043693,0.02854,5.388551,7,0.479283,2129,...,0.509732,0.51173,0.001446,1,0.574024,0.574464,0.574717,0.574402,0.000287,7
1,13.636927,0.034564,0.37228,0.014865,2.23642,0.058429,0.016355,15,2.860959,405,...,0.153342,0.154189,0.001052,18,0.154702,0.154536,0.154727,0.154655,8.5e-05,18
2,61.380765,0.031627,0.900627,0.02291,0.060163,0.068777,0.002627,9,0.005647,2323,...,0.162048,0.163061,0.001171,17,0.163694,0.163486,0.163769,0.16365,0.000119,17
3,25.989825,0.066531,0.456129,0.03014,1.776577,0.002207,0.246711,13,0.123654,568,...,0.126392,0.127098,0.000836,19,0.128678,0.128447,0.128704,0.12861,0.000116,19
4,162.003794,5.109526,1.807755,0.025009,1.578388,0.007517,0.127785,12,4.124851,3613,...,0.490995,0.493948,0.002105,11,0.517557,0.517695,0.517864,0.517706,0.000126,12
5,45.975754,0.049133,0.532965,0.056854,0.863201,0.075638,0.004513,5,1.592256,1926,...,0.22985,0.231415,0.001661,16,0.231609,0.231375,0.231754,0.231579,0.000156,16
6,184.573178,0.328115,1.797515,0.038823,0.29398,0.035541,0.027296,14,0.234681,3014,...,0.490731,0.49304,0.001658,12,0.52349,0.523629,0.524089,0.523736,0.000256,11
7,32.068976,0.133475,0.442346,0.02788,0.149146,0.069186,0.096798,14,0.019781,351,...,0.470479,0.47268,0.001596,14,0.501264,0.501195,0.501937,0.501466,0.000335,14
8,31.137819,0.032246,0.48107,0.001509,6.636085,0.025191,3.066251,8,0.033439,1136,...,0.506019,0.508294,0.001637,3,0.557561,0.55742,0.557591,0.557524,7.4e-05,8
9,30.655661,0.023961,0.492385,0.043769,0.001034,0.043218,0.92285,6,0.206375,1446,...,0.507919,0.509947,0.001434,2,0.537642,0.537415,0.53793,0.537662,0.000211,10


In [33]:
Xe1_train, Xe1_test, ye1_train, ye1_test = train_test_split(Xe1, ye1, test_size=0.1, random_state=42)

## Model1

In [34]:
model1 = xgb.XGBRegressor(objective='reg:squarederror',device='cuda',
                         tree_method='hist', random_state=42, **bayes_search1.best_params_)

# Bayes Search for Df2

In [35]:
ye2 = Df2['Wellbeing_ladder']

In [36]:
Xe2 = Df2.drop(columns=['Wellbeing_ladder'])

In [37]:
param_space = {
    'n_estimators': Integer(100, 5000),
    'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
    'max_depth': Integer(3, 16),
    'subsample': Real(0.5, 1.0),
    'min_child_weight': Real(0.001, 10, prior='log-uniform'),
    'max_delta_step': Real(0.001, 10, prior='log-uniform'),
    'reg_lambda': Real(0.001, 10, prior='log-uniform'),
    'reg_alpha': Real(0.001, 10, prior='log-uniform'),
    'gamma': Real(0.001, 10, prior='log-uniform')
}

In [38]:
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',tree_method='hist', device='cuda', random_state=42)

In [39]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [40]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [41]:
bayes_search2 = BayesSearchCV(
    estimator=xgb_reg,
    search_spaces=param_space,
    n_iter=20,
    scoring='r2',
    cv=rkfcv,
    n_jobs = 1,
    n_points = 1,
    verbose=2,
    random_state=42,
    return_train_score = True
)

In [42]:
bayes_search2.fit(Xe2, ye2)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  48.0s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  48.1s
[CV] END gamma=0.04369339947510315, learning_rate=0.02853983686604182, max_delta_step=5.388550972627239, max_depth=7, min_child_weight=0.47928274405969296, n_estimators=2129, reg_alpha=0.025335258486348353, reg_lambda=0.9078559343576645, subsample=0.6522316555182531; total time=  48.0s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END gamma=2.2364

In [43]:
bayes_search2.best_params_

OrderedDict([('gamma', 0.04369339947510315),
             ('learning_rate', 0.02853983686604182),
             ('max_delta_step', 5.388550972627239),
             ('max_depth', 7),
             ('min_child_weight', 0.47928274405969296),
             ('n_estimators', 2129),
             ('reg_alpha', 0.025335258486348353),
             ('reg_lambda', 0.9078559343576645),
             ('subsample', 0.6522316555182531)])

In [44]:
pd.DataFrame(bayes_search2.cv_results_).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_delta_step,param_max_depth,param_min_child_weight,param_n_estimators,...,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,rank_train_score
0,47.344979,0.047129,0.665636,0.003868,0.043693,0.02854,5.388551,7,0.479283,2129,...,0.546173,0.548988,0.004098,1,0.606427,0.60545,0.605908,0.605929,0.000399,3
1,13.596739,0.033852,0.331768,0.002681,2.23642,0.058429,0.016355,15,2.860959,405,...,0.165463,0.165462,0.000813,18,0.164907,0.164734,0.164909,0.16485,8.2e-05,18
2,62.604503,0.096282,0.835756,0.020741,0.060163,0.068777,0.002627,9,0.005647,2323,...,0.174686,0.174818,0.000819,17,0.174569,0.174343,0.174551,0.174488,0.000103,17
3,26.036002,0.223071,0.395481,0.008002,1.776577,0.002207,0.246711,13,0.123654,568,...,0.135864,0.135932,0.000643,19,0.13671,0.136589,0.136842,0.136714,0.000103,19
4,155.307454,0.582984,1.683688,0.051125,1.578388,0.007517,0.127785,12,4.124851,3613,...,0.527904,0.529516,0.003587,10,0.548847,0.547725,0.548841,0.548471,0.000528,11
5,42.811791,0.135513,0.456067,0.035727,0.863201,0.075638,0.004513,5,1.592256,1926,...,0.247272,0.247962,0.001349,16,0.246865,0.246508,0.246915,0.246762,0.000181,16
6,171.334087,0.875077,1.648315,0.01432,0.29398,0.035541,0.027296,14,0.234681,3014,...,0.527602,0.529164,0.003868,11,0.554483,0.55394,0.554599,0.554341,0.000288,10
7,29.730603,0.015195,0.370676,0.020434,0.149146,0.069186,0.096798,14,0.019781,351,...,0.506771,0.508348,0.003878,14,0.531846,0.530787,0.532168,0.5316,0.00059,12
8,29.169127,0.06541,0.447858,0.004474,6.636085,0.025191,3.066251,8,0.033439,1136,...,0.542783,0.545244,0.003718,6,0.590792,0.590261,0.591174,0.590742,0.000375,6
9,28.594151,0.008431,0.477271,0.070494,0.001034,0.043218,0.92285,6,0.206375,1446,...,0.544514,0.546569,0.003689,4,0.570497,0.569644,0.570341,0.570161,0.000371,8


In [45]:
dump(bayes_search2, "Results/Shi_BayesDf2SearchingResults.joblib")

['Results/Shi_BayesDf2SearchingResults.joblib']

In [46]:
Xe2_train, Xe2_test, ye2_train, ye2_test = train_test_split(Xe2, ye2, test_size=0.1, random_state=42)

## Model2

In [47]:
model2 = xgb.XGBRegressor(objective='reg:squarederror',device='cuda',
                         tree_method='hist', random_state=42, **bayes_search2.best_params_)

# Swithing Endogeous Treatment Effect Model 

In [48]:
import pandas as pd
import numpy as np

In [49]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [50]:
range_limit = 8192
num_samples = 10
sampled_integers = random.sample(range(range_limit), num_samples)
print(sampled_integers)

[4012, 3657, 2286, 1679, 1424, 6912, 520, 488, 1535, 3582]


## Model0 with Economic_trend [0,1,2]

In [51]:
X = Xe0
y = ye0
model = model0

In [52]:
e01_prediction_list = [np.array(ye1.index), np.array(ye1.to_list())]
e02_prediction_list = [np.array(ye2.index), np.array(ye2.to_list())]
fold_predictions_array_list = []
r2_00_scores_list = []
r2_01_scores_list = []
r2_02_scores_list = []

for i, random_integer in enumerate(sampled_integers):
    kf = KFold(n_splits=10, shuffle=True, random_state=random_integer)
    
    r2_00_scores = []
    r2_01_scores = []
    r2_02_scores = []
    fold_predictions = []
    # Perform cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate the model
        r2 = r2_score(y_test, y_pred)
        r2_00_scores.append(r2)
        this_folder = np.array([y_test.index, y_test, y_pred]).T
        fold_predictions.append(this_folder)
    
        # Middle Predict
        ye1_pred = model.predict(Xe1)
        e01_prediction_list.append(ye1_pred)
        r2_01 = r2_score(ye1, ye1_pred)
        r2_01_scores.append(r2_01)
        
        # Old Predict
        ye2_pred = model.predict(Xe2)
        e02_prediction_list.append(ye2_pred)
        r2_02 = r2_score(ye2, ye2_pred)
        r2_02_scores.append(r2_02)
    
    print(f"Mean R2: {np.mean(r2_00_scores)}")
    print(f"Standard Deviation of R2: {np.std(r2_00_scores)}")
    print(f"R2 list: {r2_00_scores}")
    print(f"R2 01 list: {r2_01_scores}")
    print(f"R2 02 list: {r2_02_scores}")
    
    fold_predictions_array = pd.DataFrame(np.concatenate(fold_predictions, axis = 0), columns = ['index', 'Real_y', f'Predict_y_{i}'])
    fold_predictions_array_list.append(fold_predictions_array)
    r2_00_scores_list.append(r2_00_scores)
    r2_01_scores_list.append(r2_01_scores)
    r2_02_scores_list.append(r2_02_scores)

Mean R2: 0.5104120904284548
Standard Deviation of R2: 0.0027998158815621707
R2 list: [0.5050219573593897, 0.5120500573015384, 0.5096636141446459, 0.5134430008691047, 0.5115350244223469, 0.5143555245297254, 0.5084485041285627, 0.5129945270208887, 0.5077190613727284, 0.5088896331356187]
R2 01 list: [0.5037646057917082, 0.5025954083151186, 0.5044261508392144, 0.503297227073947, 0.5029224096554616, 0.5018121648265004, 0.5035727593063895, 0.5028758836537578, 0.5029879323573276, 0.504450445138046]
R2 02 list: [0.5332915888967252, 0.5343974441155614, 0.5327157898958053, 0.5332465746023899, 0.5332555664231823, 0.5324769705887803, 0.5328771993339416, 0.533159805624531, 0.5326753073007136, 0.5335248407699811]
Mean R2: 0.510122898332615
Standard Deviation of R2: 0.0031065337066429844
R2 list: [0.5068033945868744, 0.503517371356089, 0.5109448697414958, 0.5121110102442188, 0.5150083978681206, 0.5115611228772847, 0.5131060065366477, 0.5090144816903452, 0.5101169485458061, 0.5090453798792667]
R2 01 l

In [53]:
r2_list = [0.5182905675155851, 0.5120921442145996, 0.5057149530487006, 0.5089416919377515, 0.508028119134966, 0.5058644406450278, 0.511705256581354, 0.5132420058444158, 0.507633405748055, 0.5094244931896856]

In [54]:
np.mean(r2_list)

0.510093707786014

In [55]:
np.max(r2_list)

0.5182905675155851

In [56]:
np.min(r2_list)

0.5057149530487006

In [57]:
np.std(r2_list)

0.003644805243808523

In [58]:
r2_01_list = [0.5032420104400579, 0.5030233046634306, 0.5022971768390335, 0.5031435650324045, 0.5037272227982905, 0.5039508204693044, 0.5032676991651541, 0.5025730211569808, 0.503408627149236, 0.5038321320592603]

In [59]:
np.mean(r2_01_list)

0.5032465579773152

In [60]:
np.max(r2_01_list)

0.5039508204693044

In [61]:
np.min(r2_01_list)

0.5022971768390335

In [62]:
np.std(r2_01_list)

0.0005012066801759871

In [63]:
r2_02_list= [0.5317268106985963, 0.5335252141156523, 0.533104829594083, 0.5323820954734506, 0.5334301852340642, 0.533570660452567, 0.5336798610922686, 0.5326426589638997, 0.533935925838277, 0.5335403819849798]

In [64]:
np.mean(r2_02_list)

0.5331538623447838

In [65]:
np.max(r2_02_list)

0.533935925838277

In [66]:
np.min(r2_02_list)

0.5317268106985963

In [67]:
np.std(r2_02_list)

0.0006573816375461936

### Prediction of Economic_trend[1] in Model0

In [68]:
column_list = ['index', 'Real_y']
for num in list(range(0, 10*10)):
    column_list.append(f'Predict_y_{num}')
e01_preidiction_df = pd.DataFrame(np.array(e01_prediction_list).T, columns = column_list)

In [69]:
e01_preidiction_df.head()

Unnamed: 0,index,Real_y,Predict_y_0,Predict_y_1,Predict_y_2,Predict_y_3,Predict_y_4,Predict_y_5,Predict_y_6,Predict_y_7,...,Predict_y_90,Predict_y_91,Predict_y_92,Predict_y_93,Predict_y_94,Predict_y_95,Predict_y_96,Predict_y_97,Predict_y_98,Predict_y_99
0,0.0,8.0,7.733458,7.637095,7.610961,7.619158,7.742453,7.768381,7.612696,7.763814,...,7.652297,7.689312,7.588758,7.637125,7.660337,7.719583,7.612795,7.808457,7.585389,7.640364
1,2.0,5.0,4.209073,4.323153,4.253851,4.165401,4.30041,4.387646,4.309906,4.296983,...,4.283239,4.259446,4.270911,4.279523,4.296661,4.296088,4.281374,4.269156,4.21062,4.342552
2,4.0,3.0,2.720949,2.438397,2.589288,2.342106,2.673263,2.775207,2.606603,2.364591,...,2.277939,2.694591,2.654095,2.44341,2.579838,2.597667,2.717162,2.581131,2.409947,2.177977
3,5.0,8.0,7.09077,7.034881,7.028901,7.102423,7.136701,6.976285,7.05597,7.105423,...,7.104449,7.134211,7.12173,7.082099,7.039291,6.959576,7.011361,7.062193,7.036166,7.069852
4,8.0,0.0,4.298094,4.361493,4.355768,4.253955,4.284957,4.151066,4.322045,4.321821,...,4.208465,4.31415,4.408392,4.207411,4.225244,4.368878,4.172615,4.197502,4.302244,4.131462


In [70]:
e01_preidiction_df['Predict_y'] = e01_preidiction_df.iloc[:,2:103].mean(axis = 1)

In [71]:
e01_preidiction_df['Predict_y'].head()

0    7.662318
1    4.280170
2    2.526887
3    7.063875
4    4.236615
Name: Predict_y, dtype: float64

In [72]:
e01_preidiction_df = e01_preidiction_df[['index', 'Real_y', 'Predict_y']]

In [73]:
e01_preidiction_df['index'] = e01_preidiction_df['index'].astype(int)

In [74]:
e01_preidiction_df.head(10)

Unnamed: 0,index,Real_y,Predict_y
0,0,8.0,7.662318
1,2,5.0,4.28017
2,4,3.0,2.526887
3,5,8.0,7.063875
4,8,0.0,4.236615
5,10,3.0,2.650037
6,13,5.0,3.75955
7,19,7.0,6.717612
8,26,10.0,6.201753
9,28,10.0,6.323516


In [75]:
e01_preidiction_df.to_parquet(os.path.join('Results', 'PredictionWB_Xgbe01Prediction_v1.parquet'))

### Prediction of Economic_trend[2] in Model0

In [76]:
column_list = ['index', 'Real_y']
for num in list(range(0, 10*10)):
    column_list.append(f'Predict_y_{num}')
e02_preidiction_df = pd.DataFrame(np.array(e02_prediction_list).T, columns = column_list)

In [77]:
e02_preidiction_df.head()

Unnamed: 0,index,Real_y,Predict_y_0,Predict_y_1,Predict_y_2,Predict_y_3,Predict_y_4,Predict_y_5,Predict_y_6,Predict_y_7,...,Predict_y_90,Predict_y_91,Predict_y_92,Predict_y_93,Predict_y_94,Predict_y_95,Predict_y_96,Predict_y_97,Predict_y_98,Predict_y_99
0,1.0,6.0,3.973378,4.360759,4.126728,4.173982,4.786405,4.0619,4.253829,4.363434,...,4.32401,4.434385,4.600573,4.294843,4.553823,4.354361,4.554909,3.979714,4.16371,4.618255
1,6.0,0.0,5.687775,6.466213,6.684648,6.605783,5.672853,6.159799,6.760798,6.37061,...,6.735417,5.821207,6.857515,6.250395,6.223205,5.741577,6.423774,6.039367,6.626668,6.042326
2,9.0,3.0,5.637237,5.570837,5.612037,5.642939,5.611874,5.569945,5.598694,5.604544,...,5.642735,5.482384,5.576438,5.630508,5.483751,5.54433,5.571022,5.496877,5.560979,5.624852
3,11.0,8.0,7.537441,7.587066,7.459524,7.634824,7.54912,7.616135,7.602658,7.613686,...,7.609141,7.63545,7.567234,7.557968,7.45507,7.548029,7.427916,7.655639,7.53323,7.595583
4,12.0,2.0,3.064691,2.541044,2.547522,2.437281,2.702076,2.823817,2.759375,2.832314,...,2.719951,2.801459,2.432382,2.706222,2.58489,2.878531,2.749481,2.548491,2.9672,2.498165


In [78]:
e02_preidiction_df['Predict_y'] = e02_preidiction_df.iloc[:,2:103].mean(axis = 1)

In [79]:
e02_preidiction_df['Predict_y'].head()

0    4.306457
1    6.233777
2    5.578202
3    7.571265
4    2.718994
Name: Predict_y, dtype: float64

In [80]:
e02_preidiction_df = e02_preidiction_df[['index', 'Real_y', 'Predict_y']]

In [81]:
e02_preidiction_df['index'] = e02_preidiction_df['index'].astype(int)

In [82]:
e02_preidiction_df.head(10)

Unnamed: 0,index,Real_y,Predict_y
0,1,6.0,4.306457
1,6,0.0,6.233777
2,9,3.0,5.578202
3,11,8.0,7.571265
4,12,2.0,2.718994
5,14,5.0,4.935384
6,15,6.0,5.197368
7,17,5.0,4.548435
8,18,6.0,5.116991
9,20,6.0,6.317519


In [83]:
e02_preidiction_df.to_parquet(os.path.join('Results', 'PredictionWB_Xgbe02Prediction_v1.parquet'))

### Prediction of Economic_trend[0] in Model0

In [139]:
merged_e00_prediction_df = None
for e00_prediction_df in fold_predictions_array_list:
    e00_prediction_df['index'] = e00_prediction_df['index'].astype(int)
    e00_prediction_df['Real_y'] = e00_prediction_df['Real_y'].astype(int)
    print(e00_prediction_df.columns)
    if merged_e00_prediction_df is None:
        merged_e00_prediction_df = e00_prediction_df
    else:
        merged_e00_prediction_df = pd.merge(merged_e00_prediction_df, e00_prediction_df, on=['index', 'Real_y'])

Index(['index', 'Real_y', 'Predict_y_0'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_1'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_2'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_3'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_4'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_5'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_6'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_7'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_8'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_9'], dtype='object')


In [140]:
merged_e00_prediction_df.head()

Unnamed: 0,index,Real_y,Predict_y_0,Predict_y_1,Predict_y_2,Predict_y_3,Predict_y_4,Predict_y_5,Predict_y_6,Predict_y_7,Predict_y_8,Predict_y_9
0,52,8,6.719201,6.790845,6.655471,6.505177,6.499429,6.336513,6.504231,6.537787,6.408659,6.583901
1,68,10,8.510676,8.548958,8.625995,8.564353,8.627156,8.548543,8.582077,8.633958,8.605536,8.489386
2,79,9,6.060488,6.33794,5.94681,6.1347,5.82226,6.260591,6.125841,5.847194,6.569089,6.06434
3,113,7,8.884588,9.28273,9.159409,9.068896,8.433082,9.017807,8.935734,9.035445,9.086436,9.140272
4,133,5,5.562832,5.518826,5.570529,5.569708,5.608385,5.65639,5.539241,5.587884,5.592958,5.525921


In [141]:
merged_e00_prediction_df['Predict_y'] = merged_e00_prediction_df.iloc[:,2:13].mean(axis = 1)

In [142]:
merged_e00_prediction_df = merged_e00_prediction_df[['index', 'Real_y', 'Predict_y']]

In [143]:
merged_e00_prediction_df.head(10)

Unnamed: 0,index,Real_y,Predict_y
0,52,8,6.554121
1,68,10,8.573664
2,79,9,6.116925
3,113,7,9.00444
4,133,5,5.573267
5,142,7,7.116758
6,165,3,3.801812
7,211,6,5.870588
8,220,5,6.672411
9,246,6,6.029741


In [144]:
merged_e00_prediction_df.to_parquet(os.path.join('Results', 'PredictionWB_Xgbe00Prediction_v1.parquet'))

## Model1 with Economic_trend [0,1,2]

In [84]:
X = Xe1
y = ye1
model = model1

In [102]:
e10_prediction_list = [np.array(ye0.index), np.array(ye0.to_list())]
e12_prediction_list = [np.array(ye2.index), np.array(ye2.to_list())]
fold_predictions_array_list = []
r2_10_scores_list = []
r2_11_scores_list = []
r2_12_scores_list = []

for i, random_integer in enumerate(sampled_integers):
    kf = KFold(n_splits=10, shuffle=True, random_state=random_integer)
    
    r2_10_scores = []
    r2_11_scores = []
    r2_12_scores = []
    fold_predictions = []
    # Perform cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate the model
        r2 = r2_score(y_test, y_pred)
        r2_11_scores.append(r2)
        this_folder = np.array([y_test.index, y_test, y_pred]).T
        fold_predictions.append(this_folder)
    
        # Middle Predict
        ye0_pred = model.predict(Xe0)
        e10_prediction_list.append(ye0_pred)
        r2_10 = r2_score(ye0, ye0_pred)
        r2_10_scores.append(r2_10)
        
        # Old Predict
        ye2_pred = model.predict(Xe2)
        e12_prediction_list.append(ye2_pred)
        r2_12 = r2_score(ye2, ye2_pred)
        r2_12_scores.append(r2_12)
    
    print(f"Mean R2: {np.mean(r2_11_scores)}")
    print(f"Standard Deviation of R2: {np.std(r2_11_scores)}")
    print(f"R2 11 list: {r2_11_scores}")
    print(f"R2 10 list: {r2_10_scores}")
    print(f"R2 12 list: {r2_12_scores}")
    
    fold_predictions_array = pd.DataFrame(np.concatenate(fold_predictions, axis = 0), columns = ['index', 'Real_y', f'Predict_y_{i}'])
    fold_predictions_array_list.append(fold_predictions_array)
    r2_11_scores_list.append(r2_11_scores)
    r2_10_scores_list.append(r2_10_scores)
    r2_12_scores_list.append(r2_12_scores)

Mean R2: 0.5136546559578287
Standard Deviation of R2: 0.0017544033141594184
R2 11 list: [0.5147900117635522, 0.512462131130444, 0.5145025528944349, 0.5145586117054568, 0.511919865355217, 0.5121556744913051, 0.5133260615647803, 0.5166297070890831, 0.5106819206642433, 0.5155200229197696]
R2 10 list: [0.5029774919831275, 0.503256743716952, 0.5036642324468106, 0.5031416132687523, 0.5028448352528183, 0.5032739928137, 0.5034366571447206, 0.5029739867218969, 0.5024642074803525, 0.5030005889936882]
R2 12 list: [0.5350417409201628, 0.5346106379021331, 0.5354073159054503, 0.535387273762256, 0.5343920787827531, 0.5351026376259675, 0.5349436358890625, 0.5342769589872611, 0.5350089934150313, 0.5342112814513759]
Mean R2: 0.5136583361966656
Standard Deviation of R2: 0.004638912396230322
R2 11 list: [0.5107957855219667, 0.5191760655926954, 0.5053564535307661, 0.5142669680622145, 0.5171194002522121, 0.5151255660974603, 0.5056698015631025, 0.5177539118755374, 0.5141545878240807, 0.5171648216466198]
R2 1

### Prediction of Economic_trend[0] in Model1

In [103]:
column_list = ['index', 'Real_y']
for num in list(range(0, 10*10)):
    column_list.append(f'Predict_y_{num}')
e10_preidiction_df = pd.DataFrame(np.array(e10_prediction_list).T, columns = column_list)

In [104]:
e10_preidiction_df.head()

Unnamed: 0,index,Real_y,Predict_y_0,Predict_y_1,Predict_y_2,Predict_y_3,Predict_y_4,Predict_y_5,Predict_y_6,Predict_y_7,...,Predict_y_90,Predict_y_91,Predict_y_92,Predict_y_93,Predict_y_94,Predict_y_95,Predict_y_96,Predict_y_97,Predict_y_98,Predict_y_99
0,3.0,8.0,6.218585,6.149304,6.202435,6.154196,6.234852,6.205513,6.19794,6.071349,...,6.098459,6.111287,6.131614,6.148021,6.057348,6.099515,6.169745,6.129697,6.082693,6.073318
1,7.0,5.0,4.074072,4.120693,4.13361,4.112233,4.050375,4.140577,4.03831,4.076021,...,4.097259,4.064558,4.095336,4.008698,4.1087,4.103643,4.128755,4.082883,4.085472,4.089364
2,16.0,5.0,5.851572,5.714752,5.81968,5.855279,5.857523,5.813941,5.851313,5.826214,...,5.789261,5.953941,5.820731,5.730507,5.831708,5.82276,5.611298,5.807508,5.724917,5.717439
3,21.0,7.0,6.685571,6.640362,6.57188,6.756708,6.668977,6.627417,6.596254,6.517241,...,6.61103,6.632261,6.673443,6.542055,6.660082,6.750741,6.760259,6.496795,6.676645,6.790314
4,22.0,6.0,5.319095,5.325548,5.580487,4.965728,5.088061,5.008641,4.893945,5.619192,...,5.672854,5.191718,5.47364,5.226614,5.365746,4.997601,5.432347,4.68408,5.096208,5.515173


In [105]:
e10_preidiction_df['Predict_y'] = e10_preidiction_df.iloc[:,2:103].mean(axis = 1)

In [106]:
e10_preidiction_df['Predict_y'].head()

0    6.137184
1    4.091756
2    5.817187
3    6.664819
4    5.271010
Name: Predict_y, dtype: float64

In [107]:
e10_preidiction_df = e10_preidiction_df[['index', 'Real_y', 'Predict_y']]

In [108]:
e10_preidiction_df['index'] = e10_preidiction_df['index'].astype(int)

In [109]:
e10_preidiction_df.head(10)

Unnamed: 0,index,Real_y,Predict_y
0,3,8.0,6.137184
1,7,5.0,4.091756
2,16,5.0,5.817187
3,21,7.0,6.664819
4,22,6.0,5.27101
5,24,3.0,4.280764
6,25,1.0,1.666073
7,27,8.0,7.450934
8,30,4.0,4.881411
9,31,3.0,4.734066


In [110]:
e10_preidiction_df.to_parquet(os.path.join('Results', 'PredictionWB_Xgbe10Prediction_v1.parquet'))

### Prediction of Economic_trend[1] in Model1

In [145]:
merged_e11_prediction_df = None
for e11_prediction_df in fold_predictions_array_list:
    e11_prediction_df['index'] = e11_prediction_df['index'].astype(int)
    e11_prediction_df['Real_y'] = e11_prediction_df['Real_y'].astype(int)
    print(e11_prediction_df.columns)
    if merged_e11_prediction_df is None:
        merged_e11_prediction_df = e11_prediction_df
    else:
        merged_e11_prediction_df = pd.merge(merged_e11_prediction_df, e11_prediction_df, on=['index', 'Real_y'])

Index(['index', 'Real_y', 'Predict_y_0'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_1'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_2'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_3'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_4'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_5'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_6'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_7'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_8'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_9'], dtype='object')


In [146]:
merged_e11_prediction_df.head()

Unnamed: 0,index,Real_y,Predict_y_0,Predict_y_1,Predict_y_2,Predict_y_3,Predict_y_4,Predict_y_5,Predict_y_6,Predict_y_7,Predict_y_8,Predict_y_9
0,52,8,6.719201,6.790845,6.655471,6.505177,6.499429,6.336513,6.504231,6.537787,6.408659,6.583901
1,68,10,8.510676,8.548958,8.625995,8.564353,8.627156,8.548543,8.582077,8.633958,8.605536,8.489386
2,79,9,6.060488,6.33794,5.94681,6.1347,5.82226,6.260591,6.125841,5.847194,6.569089,6.06434
3,113,7,8.884588,9.28273,9.159409,9.068896,8.433082,9.017807,8.935734,9.035445,9.086436,9.140272
4,133,5,5.562832,5.518826,5.570529,5.569708,5.608385,5.65639,5.539241,5.587884,5.592958,5.525921


In [147]:
merged_e11_prediction_df['Predict_y'] = merged_e11_prediction_df.iloc[:,2:13].mean(axis = 1)

In [148]:
merged_e11_prediction_df = merged_e11_prediction_df[['index', 'Real_y', 'Predict_y']]

In [149]:
merged_e11_prediction_df.head(10)

Unnamed: 0,index,Real_y,Predict_y
0,52,8,6.554121
1,68,10,8.573664
2,79,9,6.116925
3,113,7,9.00444
4,133,5,5.573267
5,142,7,7.116758
6,165,3,3.801812
7,211,6,5.870588
8,220,5,6.672411
9,246,6,6.029741


In [151]:
merged_e11_prediction_df.to_parquet(os.path.join('Results', 'PredictionWB_Xgbe11Prediction_v1.parquet'))

### Prediction of Economic_trend[2] in Model1

In [111]:
column_list = ['index', 'Real_y']
for num in list(range(0, 10*10)):
    column_list.append(f'Predict_y_{num}')
e12_preidiction_df = pd.DataFrame(np.array(e12_prediction_list).T, columns = column_list)

In [112]:
e12_preidiction_df.head()

Unnamed: 0,index,Real_y,Predict_y_0,Predict_y_1,Predict_y_2,Predict_y_3,Predict_y_4,Predict_y_5,Predict_y_6,Predict_y_7,...,Predict_y_90,Predict_y_91,Predict_y_92,Predict_y_93,Predict_y_94,Predict_y_95,Predict_y_96,Predict_y_97,Predict_y_98,Predict_y_99
0,1.0,6.0,4.928741,4.695695,4.549246,4.531234,4.755167,5.045456,4.613992,4.870732,...,4.813287,4.709946,4.958558,4.501342,4.911014,4.513288,4.877309,4.762616,4.850844,5.188836
1,6.0,0.0,6.118148,5.95335,6.034355,5.34992,5.625271,5.620802,5.940335,5.967506,...,6.310944,5.6628,5.536166,6.320897,5.536838,5.680333,5.96903,5.723166,5.317257,6.159493
2,9.0,3.0,5.598516,5.667996,5.645422,5.635797,5.641865,5.609624,5.602597,5.633823,...,5.684844,5.622425,5.607913,5.613178,5.689178,5.675573,5.622633,5.65013,5.606568,5.61807
3,11.0,8.0,7.62763,7.660066,7.509213,7.67346,7.684585,7.689083,7.538706,7.61286,...,7.72405,7.615992,7.524024,7.658561,7.652386,7.713908,7.691465,7.75648,7.678843,7.613339
4,12.0,2.0,2.289318,2.373051,2.393275,2.198524,2.385531,2.251633,2.539943,2.414562,...,2.267049,2.425932,2.33539,2.320165,2.523997,2.364529,2.329187,2.408729,2.283999,2.427171


In [113]:
e12_preidiction_df['Predict_y'] = e12_preidiction_df.iloc[:,2:103].mean(axis = 1)

In [114]:
e12_preidiction_df['Predict_y'].head()

0    4.756882
1    5.902259
2    5.641819
3    7.649858
4    2.335526
Name: Predict_y, dtype: float64

In [115]:
e12_preidiction_df = e12_preidiction_df[['index', 'Real_y', 'Predict_y']]

In [116]:
e12_preidiction_df['index'] = e12_preidiction_df['index'].astype(int)

In [117]:
e12_preidiction_df.head(10)

Unnamed: 0,index,Real_y,Predict_y
0,1,6.0,4.756882
1,6,0.0,5.902259
2,9,3.0,5.641819
3,11,8.0,7.649858
4,12,2.0,2.335526
5,14,5.0,4.876256
6,15,6.0,5.005667
7,17,5.0,4.521591
8,18,6.0,5.007941
9,20,6.0,6.082828


In [118]:
e12_preidiction_df.to_parquet(os.path.join('Results', 'PredictionWB_Xgbe12Prediction_v1.parquet'))

## Model2 with Economic_trend [0,1,2]

In [119]:
X = Xe2
y = ye2
model = model2

In [120]:
e20_prediction_list = [np.array(ye0.index), np.array(ye0.to_list())]
e21_prediction_list = [np.array(ye1.index), np.array(ye1.to_list())]
fold_predictions_array_list = []
r2_20_scores_list = []
r2_21_scores_list = []
r2_22_scores_list = []

for i, random_integer in enumerate(sampled_integers):
    kf = KFold(n_splits=10, shuffle=True, random_state=random_integer)
    
    r2_20_scores = []
    r2_21_scores = []
    r2_22_scores = []
    fold_predictions = []
    # Perform cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate the model
        r2 = r2_score(y_test, y_pred)
        r2_22_scores.append(r2)
        this_folder = np.array([y_test.index, y_test, y_pred]).T
        fold_predictions.append(this_folder)
    
        # Middle Predict
        ye0_pred = model.predict(Xe0)
        e20_prediction_list.append(ye0_pred)
        r2_20 = r2_score(ye0, ye0_pred)
        r2_20_scores.append(r2_20)
        
        # Old Predict
        ye1_pred = model.predict(Xe1)
        e21_prediction_list.append(ye1_pred)
        r2_21 = r2_score(ye1, ye1_pred)
        r2_21_scores.append(r2_21)
    
    print(f"Mean R2: {np.mean(r2_22_scores)}")
    print(f"Standard Deviation of R2: {np.std(r2_22_scores)}")
    print(f"R2 22 list: {r2_22_scores}")
    print(f"R2 20 list: {r2_20_scores}")
    print(f"R2 21 list: {r2_21_scores}")
    
    fold_predictions_array = pd.DataFrame(np.concatenate(fold_predictions, axis = 0), columns = ['index', 'Real_y', f'Predict_y_{i}'])
    fold_predictions_array_list.append(fold_predictions_array)
    r2_22_scores_list.append(r2_22_scores)
    r2_20_scores_list.append(r2_20_scores)
    r2_21_scores_list.append(r2_21_scores)

Mean R2: 0.5462564917122326
Standard Deviation of R2: 0.0038187034870296385
R2 22 list: [0.5454605938367054, 0.544330943212321, 0.5510251249409283, 0.5487559760427937, 0.5522167672020785, 0.5489074131626182, 0.5427756546642669, 0.5388187550296788, 0.5451162736466915, 0.545157415384243]
R2 20 list: [0.5020923790165804, 0.5018747342701084, 0.5014218969191582, 0.502522472425184, 0.5020402215677512, 0.5024276251979609, 0.5020724718571006, 0.5017806651108547, 0.5020633889968427, 0.5018094109448328]
R2 21 list: [0.5049915762600956, 0.5048503244775021, 0.5049380129569414, 0.5050286640067154, 0.5050426235014622, 0.5045070629717505, 0.5051811888053337, 0.5043094632221555, 0.5045103498361005, 0.5057739164782863]
Mean R2: 0.5462224357676173
Standard Deviation of R2: 0.0053568991750411
R2 22 list: [0.543730171633511, 0.5392811360032421, 0.5430637926249116, 0.5423947015307793, 0.5525685943107754, 0.5507748245843868, 0.5551660926101226, 0.5458258591089068, 0.5502872470362968, 0.5391319382332399]
R2 

### Prediction of Economic_trend[0] in Model2

In [121]:
column_list = ['index', 'Real_y']
for num in list(range(0, 10*10)):
    column_list.append(f'Predict_y_{num}')
e20_preidiction_df = pd.DataFrame(np.array(e20_prediction_list).T, columns = column_list)

In [122]:
e20_preidiction_df.head()

Unnamed: 0,index,Real_y,Predict_y_0,Predict_y_1,Predict_y_2,Predict_y_3,Predict_y_4,Predict_y_5,Predict_y_6,Predict_y_7,...,Predict_y_90,Predict_y_91,Predict_y_92,Predict_y_93,Predict_y_94,Predict_y_95,Predict_y_96,Predict_y_97,Predict_y_98,Predict_y_99
0,3.0,8.0,6.135424,6.161144,6.161076,6.070591,6.090657,6.085197,6.052475,6.083082,...,6.033886,6.01496,6.170728,6.071132,6.02945,6.060162,6.032672,6.069654,6.01851,6.023045
1,7.0,5.0,4.127985,4.158195,4.160506,4.139277,4.15852,4.177382,4.187157,4.170249,...,4.177217,4.196023,4.181591,4.153108,4.146312,4.157417,4.103353,4.178782,4.152169,4.122608
2,16.0,5.0,5.536427,5.801324,5.869442,5.703802,5.870805,5.818956,5.72265,5.650614,...,5.738383,5.640924,5.810185,5.665906,5.777282,5.805254,5.548165,5.799558,5.742721,5.630338
3,21.0,7.0,6.953834,6.855992,6.892343,6.901298,6.886818,7.000603,6.874156,6.930952,...,6.850206,6.944954,6.950739,6.864229,6.855556,6.870425,6.863046,6.907989,6.87409,6.896475
4,22.0,6.0,5.938115,5.596141,5.800996,5.98432,5.482324,5.39073,5.810451,5.531911,...,5.873082,5.958148,5.958562,5.672003,5.95117,5.537444,5.686342,5.734625,5.779057,5.882915


In [123]:
e20_preidiction_df['Predict_y'] = e20_preidiction_df.iloc[:,2:103].mean(axis = 1)

In [124]:
e20_preidiction_df['Predict_y'].head()

0    6.063761
1    4.158498
2    5.708244
3    6.897411
4    5.746517
Name: Predict_y, dtype: float64

In [125]:
e20_preidiction_df = e20_preidiction_df[['index', 'Real_y', 'Predict_y']]

In [126]:
e20_preidiction_df['index'] = e20_preidiction_df['index'].astype(int)

In [127]:
e20_preidiction_df.head(10)

Unnamed: 0,index,Real_y,Predict_y
0,3,8.0,6.063761
1,7,5.0,4.158498
2,16,5.0,5.708244
3,21,7.0,6.897411
4,22,6.0,5.746517
5,24,3.0,4.264168
6,25,1.0,1.66577
7,27,8.0,7.475246
8,30,4.0,5.097367
9,31,3.0,4.826726


In [128]:
e20_preidiction_df.to_parquet(os.path.join('Results', 'PredictionWB_Xgbe20Prediction_v1.parquet'))

### Prediction of Economic_trend[1] in Model2

In [129]:
column_list = ['index', 'Real_y']
for num in list(range(0, 10*10)):
    column_list.append(f'Predict_y_{num}')
e21_preidiction_df = pd.DataFrame(np.array(e21_prediction_list).T, columns = column_list)

In [130]:
e21_preidiction_df.head()

Unnamed: 0,index,Real_y,Predict_y_0,Predict_y_1,Predict_y_2,Predict_y_3,Predict_y_4,Predict_y_5,Predict_y_6,Predict_y_7,...,Predict_y_90,Predict_y_91,Predict_y_92,Predict_y_93,Predict_y_94,Predict_y_95,Predict_y_96,Predict_y_97,Predict_y_98,Predict_y_99
0,0.0,8.0,7.379648,7.513511,7.514323,7.413648,7.448703,7.602998,7.42696,7.616939,...,7.654312,7.419815,7.454679,7.423604,7.474126,7.589067,7.504085,7.549699,7.340617,7.437551
1,2.0,5.0,4.440041,4.423221,4.391146,4.456661,4.41488,4.403286,4.38001,4.424778,...,4.443176,4.381472,4.37845,4.414473,4.389928,4.387125,4.312559,4.368249,4.459893,4.390383
2,4.0,3.0,3.401336,3.620744,3.898046,3.536945,3.877126,3.819113,3.761889,3.489345,...,3.386984,3.747248,3.948794,3.486504,3.819567,3.521098,3.646301,3.099253,3.510649,3.600528
3,5.0,8.0,7.091882,7.16221,7.241999,7.146213,7.127692,7.287336,7.254677,7.272304,...,7.206868,7.241435,7.154188,7.288816,7.241266,7.237863,7.300048,7.259327,7.175571,7.178135
4,8.0,0.0,3.973896,4.250819,3.943414,4.186592,4.150531,3.981866,4.140488,3.993928,...,4.095928,4.031445,4.122805,4.278037,3.973735,4.019135,4.025806,4.155595,3.998407,4.100913


In [131]:
e21_preidiction_df['Predict_y'] = e21_preidiction_df.iloc[:,2:103].mean(axis = 1)

In [132]:
e21_preidiction_df['Predict_y'].head()

0    7.471190
1    4.403076
2    3.699449
3    7.217769
4    4.075401
Name: Predict_y, dtype: float64

In [133]:
e21_preidiction_df = e21_preidiction_df[['index', 'Real_y', 'Predict_y']]

In [134]:
e21_preidiction_df['index'] = e21_preidiction_df['index'].astype(int)

In [135]:
e21_preidiction_df.head(10)

Unnamed: 0,index,Real_y,Predict_y
0,0,8.0,7.47119
1,2,5.0,4.403076
2,4,3.0,3.699449
3,5,8.0,7.217769
4,8,0.0,4.075401
5,10,3.0,2.685922
6,13,5.0,3.811568
7,19,7.0,6.431727
8,26,10.0,6.887267
9,28,10.0,6.027976


In [136]:
e21_preidiction_df.to_parquet(os.path.join('Results', 'PredictionWB_Xgbe21Prediction_v1.parquet'))

### Prediction of Economic_trend[2] in Model2

In [152]:
merged_e22_prediction_df = None
for e22_prediction_df in fold_predictions_array_list:
    e22_prediction_df['index'] = e22_prediction_df['index'].astype(int)
    e22_prediction_df['Real_y'] = e22_prediction_df['Real_y'].astype(int)
    print(e22_prediction_df.columns)
    if merged_e22_prediction_df is None:
        merged_e22_prediction_df = e22_prediction_df
    else:
        merged_e22_prediction_df = pd.merge(merged_e22_prediction_df, e22_prediction_df, on=['index', 'Real_y'])

Index(['index', 'Real_y', 'Predict_y_0'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_1'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_2'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_3'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_4'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_5'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_6'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_7'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_8'], dtype='object')
Index(['index', 'Real_y', 'Predict_y_9'], dtype='object')


In [153]:
merged_e22_prediction_df.head()

Unnamed: 0,index,Real_y,Predict_y_0,Predict_y_1,Predict_y_2,Predict_y_3,Predict_y_4,Predict_y_5,Predict_y_6,Predict_y_7,Predict_y_8,Predict_y_9
0,52,8,6.719201,6.790845,6.655471,6.505177,6.499429,6.336513,6.504231,6.537787,6.408659,6.583901
1,68,10,8.510676,8.548958,8.625995,8.564353,8.627156,8.548543,8.582077,8.633958,8.605536,8.489386
2,79,9,6.060488,6.33794,5.94681,6.1347,5.82226,6.260591,6.125841,5.847194,6.569089,6.06434
3,113,7,8.884588,9.28273,9.159409,9.068896,8.433082,9.017807,8.935734,9.035445,9.086436,9.140272
4,133,5,5.562832,5.518826,5.570529,5.569708,5.608385,5.65639,5.539241,5.587884,5.592958,5.525921


In [154]:
merged_e22_prediction_df['Predict_y'] = merged_e22_prediction_df.iloc[:,2:13].mean(axis = 1)

In [155]:
merged_e22_prediction_df = merged_e22_prediction_df[['index', 'Real_y', 'Predict_y']]

In [156]:
merged_e22_prediction_df.head(10)

Unnamed: 0,index,Real_y,Predict_y
0,52,8,6.554121
1,68,10,8.573664
2,79,9,6.116925
3,113,7,9.00444
4,133,5,5.573267
5,142,7,7.116758
6,165,3,3.801812
7,211,6,5.870588
8,220,5,6.672411
9,246,6,6.029741


In [157]:
merged_e22_prediction_df.to_parquet(os.path.join('Results', 'PredictionWB_Xgbe22Prediction_v1.parquet'))