# Hyperparameter Search

In [1]:
%pwd

'D:\\15_Article\\PyCode_v241111'

In [2]:
# Move up from PyCode_v241111 to the project root so that the relative
# "01_Data/..." paths used by the cells below resolve correctly.
%cd ..

D:\15_Article


In [4]:
!dir

 Volume in drive D has no label.
 Volume Serial Number is 0E99-E4FB

 Directory of D:\15_Article

11/12/2024  11:04 AM    <DIR>          .
11/12/2024  11:04 AM    <DIR>          ..
11/11/2024  05:55 PM               364 .gitignore
11/12/2024  11:04 AM    <DIR>          .virtual_documents
11/12/2024  10:40 AM    <DIR>          01_Data
11/11/2024  05:55 PM    <DIR>          02_RCode
11/11/2024  05:55 PM               218 03_RStudio.Rproj
11/11/2024  05:55 PM    <DIR>          04_Figure
11/11/2024  05:55 PM    <DIR>          05_Manuscript
11/12/2024  11:02 AM    <DIR>          06_PyCode
11/11/2024  05:55 PM    <DIR>          08_ShScript
11/11/2024  05:55 PM            11,558 LICENSE
11/12/2024  11:02 AM    <DIR>          PyCode_v241111
11/11/2024  05:55 PM             4,122 README.md
               4 File(s)         16,262 bytes
              10 Dir(s)  372,871,139,328 bytes free


## Import Packages

In [24]:
from glob import glob
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from skopt import BayesSearchCV
from skopt.space import Real, Integer

### Function

In [6]:
### X and y
def getXandY(Output_Vari):
    """Load features and target for one output variable and split them.

    Looks for ``01_Data/*_y_<Output_Vari>*.csv`` and
    ``01_Data/*_X_<Output_Vari>*.csv`` relative to the current working
    directory and reads each with its first column as the index. Only the
    first column of the y file is used as the target.

    Parameters
    ----------
    Output_Vari : str
        Name fragment that identifies the target variable in the file names.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Result of ``train_test_split`` with ``test_size=0.1`` and
        ``random_state=1``; X parts are DataFrames, y parts are NumPy arrays.

    Raises
    ------
    FileNotFoundError
        If no y file or no X file matches the expected pattern.
    """

    def _first_match(kind):
        # Resolve the CSV for ``kind`` ("X" or "y"), failing with a clear message.
        pattern = "01_Data/*_" + kind + "_" + Output_Vari + "*.csv"
        # glob() returns matches in arbitrary order; sorting makes the pick
        # deterministic when more than one file matches.
        matches = sorted(glob(pattern))
        if not matches:
            raise FileNotFoundError(
                "No " + kind + " file matches pattern '" + pattern + "'"
            )
        return matches[0]

    y = pd.read_csv(_first_match("y"), index_col=0)
    y = y.iloc[:, 0].to_numpy()
    X = pd.read_csv(_first_match("X"), index_col=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=1
    )
    return X_train, X_test, y_train, y_test

## Experiment for Happiness

### Load Data

In [7]:
Output_Vari = "Happinessoverall"

In [8]:
X_train, X_test, y_train, y_test = getXandY(Output_Vari)

In [9]:
X_train.head()

Unnamed: 0,year,lat,lon,female,age,high_stress,low_stress,easy_to_relax,good_for_living,live_environment_satefy,...,housewife,retired,unemployed,college_no_diploma,bachelor,master,phd,income_indiv,NDVI,NTL
329671,2016,37.183964,138.256897,1,51,4,5,3,4,3,...,0,0,0,0,0,0,0,3.5,29.553186,4.315041
24633,2015,37.23909,140.353348,1,26,5,5,3,3,4,...,0,0,0,1,0,0,0,1.0,40.441441,2.005019
279845,2016,35.528252,140.185822,0,67,2,1,4,4,3,...,0,0,0,0,1,0,0,1.0,53.940829,7.728996
269295,2016,33.566044,130.342041,1,35,1,3,4,4,3,...,1,0,0,1,0,0,0,1.0,30.014896,17.481237
232515,2015,33.619823,130.515259,1,32,4,4,2,4,3,...,1,0,0,0,1,0,0,2.5,55.156878,5.77994


In [10]:
X_train.columns

Index(['year', 'lat', 'lon', 'female', 'age', 'high_stress', 'low_stress',
       'easy_to_relax', 'good_for_living', 'live_environment_satefy',
       'community_attachment', 'income', 'self_reported_health', 'student',
       'worker', 'company_owner', 'government_officer', 'self_employed',
       'professional', 'housewife', 'retired', 'unemployed',
       'college_no_diploma', 'bachelor', 'master', 'phd', 'income_indiv',
       'NDVI', 'NTL'],
      dtype='object')

In [11]:
X_train.shape

(344855, 29)

In [12]:
X_test.shape

(38318, 29)

In [13]:
# Recombine the train and test partitions into a single frame for the search.
# NOTE(review): the cross-validated search below therefore also sees the rows
# that getXandY held out; if X_test is reused for a final evaluation elsewhere,
# the search should be fitted on X_train / y_train only — confirm intent.
X = pd.concat([X_train, X_test])

In [14]:
X.shape

(383173, 29)

In [15]:
# Concatenate targets in the same train-then-test order used for X so that
# rows and targets stay aligned.
y = np.concatenate([y_train, y_test])

In [16]:
y.shape

(383173,)

### Run Test Model

In [26]:
# Hyperparameter ranges explored by the search (one skopt dimension per
# RandomForestRegressor argument).
param_space = dict(
    n_estimators=Integer(100, 5000),
    max_depth=Integer(3, 16),
    max_samples=Real(0.5, 1.0),
    min_samples_split=Integer(2, 32),
)

In [28]:
# Base estimator cloned by the search for every candidate. The fixed
# random_state makes each forest's randomness repeatable, so re-running the
# search yields the same scores.
rf_reg = RandomForestRegressor(n_jobs=4, random_state=42)

In [29]:
class RandomRunNFoldsKFold(KFold):
    """Shuffled K-fold splitter that yields only a random subset of its folds.

    The data are partitioned into ``n_splits`` shuffled folds by ``KFold``,
    but ``split`` yields just ``run_splits`` of them, chosen at random. Each
    evaluated fold keeps the train/test proportions of the full
    ``n_splits``-fold scheme while fewer models are fitted.

    Parameters
    ----------
    n_splits : int, default=10
        Number of folds the data are partitioned into.
    random_state : int or None, default=None
        Seed used both for the shuffling done by ``KFold`` and for choosing
        which folds are yielded. With ``None`` the fold choice draws from the
        global ``random`` module state.
    run_splits : int, default=3
        Number of folds actually yielded; must satisfy
        ``1 <= run_splits <= n_splits``.
    **kwargs
        Forwarded unchanged to ``KFold.__init__``.

    Raises
    ------
    ValueError
        If ``run_splits`` is outside ``[1, n_splits]``.
    """

    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        # Fail at construction time rather than deep inside a long search:
        # sampling more folds than exist would otherwise only raise in split().
        if not 1 <= run_splits <= n_splits:
            raise ValueError(
                "run_splits must be between 1 and n_splits (%r); got %r"
                % (n_splits, run_splits)
            )
        self.random_state = random_state  # read again by split()
        # Stored under the constructor argument's own name so attribute-based
        # introspection of the init parameters finds it.
        self.run_splits = run_splits
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` for the randomly selected folds."""
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            # A private generator gives the same selection as seeding the
            # global one, without resetting the process-wide `random` state.
            rng = random.Random(self.random_state)
        else:
            rng = random
        yield from rng.sample(folds, self.actual_splits)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds that ``split`` yields."""
        return self.actual_splits

In [30]:
# Partition into 10 shuffled folds but evaluate only 3 of them per candidate.
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [31]:
# Bayesian search over param_space: 50 candidates, proposed one at a time,
# each scored by R^2 on the folds produced by rkfcv.
bayes_search = BayesSearchCV(
    # what is tuned and over which ranges
    estimator=rf_reg,
    search_spaces=param_space,
    # how candidates are evaluated
    cv=rkfcv,
    scoring='r2',
    return_train_score=True,
    # search budget and parallelism
    n_iter=50,
    n_points=1,
    n_jobs=1,
    # reproducibility and logging
    random_state=42,
    verbose=2,
)

In [None]:
# Long-running cell: per the logged output, a single fold fit takes roughly
# 10 to 50 minutes depending on the sampled hyperparameters.
bayes_search.fit(X, y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END max_depth=8, max_samples=0.8638628715886625, min_samples_split=30, n_estimators=1647; total time=10.3min
[CV] END max_depth=8, max_samples=0.8638628715886625, min_samples_split=30, n_estimators=1647; total time=10.2min
[CV] END max_depth=8, max_samples=0.8638628715886625, min_samples_split=30, n_estimators=1647; total time=10.3min
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END max_depth=14, max_samples=0.9416576386904312, min_samples_split=11, n_estimators=4761; total time=50.4min
[CV] END max_depth=14, max_samples=0.9416576386904312, min_samples_split=11, n_estimators=4761; total time=49.7min
[CV] END max_depth=14, max_samples=0.9416576386904312, min_samples_split=11, n_estimators=4761; total time=49.6min
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END max_depth=9, max_samples=0.9593612608346885, min_samples_split=5, n_estimators=2223; total time=16.3min
