# Hyperparameter Search

In [60]:
%pwd

'/home/rapids/notebooks/DP15/03_RStudio'

In [2]:
%cd ..

/home/rapids/notebooks/DP15/03_RStudio


In [3]:
!dir

01_Data		  04_Figure	 08_ShScript	 gadm36_JPN_1_sp.rds
02_RCode	  05_Manuscript  LICENSE
03_RStudio.Rproj  06_PyCode	 PyCode_v241111
03_Results	  07_PyResults	 README.md


## Import Package 

In [30]:
from glob import glob
from joblib import dump, load
import numpy as np
import pandas as pd
import random
from cuml.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, accuracy_score
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV

In [54]:
import warnings

warnings.filterwarnings(
    "ignore", 
    message="To use pickling first train using float32 data to fit the estimator"
)

### Function

In [7]:
### X and y
def getXandY(Output_Vari):
    y_list = glob("01_Data/*_y_" + Output_Vari + "*.csv")
    y = pd.read_csv(y_list[0], index_col=0)
    y = y.iloc[:,0].to_numpy()
    X_list = glob("01_Data/*_X_" + Output_Vari + "*.csv")
    X = pd.read_csv(X_list[0], index_col=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, 
                                                        random_state=1)
    return X_train, X_test, y_train, y_test

## Experiment for Happiness

### Load Data

In [8]:
Output_Vari = "Happinessoverall"

In [9]:
X_train, X_test, y_train, y_test = getXandY(Output_Vari)

In [10]:
X_train.head()

Unnamed: 0,year,lat,lon,female,age,high_stress,low_stress,easy_to_relax,good_for_living,live_environment_satefy,...,housewife,retired,unemployed,college_no_diploma,bachelor,master,phd,income_indiv,NDVI,NTL
329671,2016,37.183964,138.256897,1,51,4,5,3,4,3,...,0,0,0,0,0,0,0,3.5,29.553186,4.315041
24633,2015,37.23909,140.353348,1,26,5,5,3,3,4,...,0,0,0,1,0,0,0,1.0,40.441441,2.005019
279845,2016,35.528252,140.185822,0,67,2,1,4,4,3,...,0,0,0,0,1,0,0,1.0,53.940829,7.728996
269295,2016,33.566044,130.342041,1,35,1,3,4,4,3,...,1,0,0,1,0,0,0,1.0,30.014896,17.481237
232515,2015,33.619823,130.515259,1,32,4,4,2,4,3,...,1,0,0,0,1,0,0,2.5,55.156878,5.77994


In [11]:
X_train.describe()

Unnamed: 0,year,lat,lon,female,age,high_stress,low_stress,easy_to_relax,good_for_living,live_environment_satefy,...,housewife,retired,unemployed,college_no_diploma,bachelor,master,phd,income_indiv,NDVI,NTL
count,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,...,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0
mean,2015.65654,35.708914,137.51452,0.352612,48.967114,3.172557,3.606846,3.293581,3.826623,2.953085,...,0.086381,0.06441,0.046147,0.209624,0.437245,0.047295,0.013539,4.686452,36.247119,18.599653
std,0.771443,2.131133,3.052344,0.477784,11.731917,1.166244,1.060261,1.020122,0.897251,0.761716,...,0.280927,0.245482,0.209804,0.407041,0.496047,0.21227,0.115567,3.903289,13.664925,15.865111
min,2015.0,24.301767,123.762947,0.0,17.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.860561,0.013894
25%,2015.0,34.747284,135.545242,0.0,41.0,2.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,26.247235,6.048134
50%,2015.0,35.477516,138.984985,0.0,49.0,3.0,4.0,4.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,34.654246,14.545732
75%,2016.0,35.809013,139.715393,1.0,57.0,4.0,4.0,4.0,4.0,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.5,44.238307,26.747059
max,2017.0,45.511021,145.74234,1.0,101.0,5.0,5.0,5.0,5.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,30.0,87.709065,88.380999


In [12]:
X_train.columns

Index(['year', 'lat', 'lon', 'female', 'age', 'high_stress', 'low_stress',
       'easy_to_relax', 'good_for_living', 'live_environment_satefy',
       'community_attachment', 'income', 'self_reported_health', 'student',
       'worker', 'company_owner', 'government_officer', 'self_employed',
       'professional', 'housewife', 'retired', 'unemployed',
       'college_no_diploma', 'bachelor', 'master', 'phd', 'income_indiv',
       'NDVI', 'NTL'],
      dtype='object')

In [13]:
X_train.shape

(344855, 29)

In [14]:
X_test.shape

(38318, 29)

In [15]:
X = pd.concat([X_train, X_test])

In [16]:
X.shape

(383173, 29)

In [17]:
y = np.concatenate([y_train, y_test])

In [18]:
y.shape

(383173,)

### Run Test Model

In [22]:
rf_reg =RandomForestRegressor()

In [23]:
param_grid = {
    "n_estimators": list(range(100, 5_000, 100)),
    "max_depth": list(range(3, 16, 1)),
    "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_samples_split": list(range(2, 33, 1)),
    "criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"]
}

In [24]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [25]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [49]:
def acc(y_true, y_pred):
    # Apply the transformations
    y_pred = np.where(y_pred < 1.5, 1,  # If value < 1.5, set to 1
             np.where(y_pred < 2.5, 2,  # If 1.5 <= value < 2.5, set to 2
             np.where(y_pred < 3.5, 3,  # If 2.5 <= value < 3.5, set to 3
             np.where(y_pred < 4.5, 4, 5))))  # If 3.5 <= value < 4.5, set to 4, else set to 5
    return accuracy_score(y_true, y_pred)

custom_scoring = make_scorer(acc, greater_is_better=True)  

In [55]:
random_search = RandomizedSearchCV(
    estimator=rf_reg,
    param_distributions=param_grid,
    n_iter=500,  # Number of parameter settings to sample
    scoring=custom_scoring,
    cv=rkfcv,  # 3-fold cross-validation
    random_state=42,
    verbose=2,
    return_train_score = True
)

In [56]:
# Fit the model
random_search.fit(X, y)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits
[CV] END criterion=absolute_error, max_depth=3, max_features=0.7, min_samples_split=10, n_estimators=4700; total time= 1.0min
[CV] END criterion=absolute_error, max_depth=3, max_features=0.7, min_samples_split=10, n_estimators=4700; total time= 1.1min
[CV] END criterion=absolute_error, max_depth=3, max_features=0.7, min_samples_split=10, n_estimators=4700; total time= 1.1min
[CV] END criterion=absolute_error, max_depth=6, max_features=0.5, min_samples_split=23, n_estimators=1500; total time=  19.2s
[CV] END criterion=absolute_error, max_depth=6, max_features=0.5, min_samples_split=23, n_estimators=1500; total time=  19.2s
[CV] END criterion=absolute_error, max_depth=6, max_features=0.5, min_samples_split=23, n_estimators=1500; total time=  19.3s
[CV] END criterion=absolute_error, max_depth=4, max_features=0.7, min_samples_split=28, n_estimators=2500; total time=  33.0s
[CV] END criterion=absolute_error, max_depth=4, max_fe

In [57]:
CV_result = random_search.cv_results_

In [58]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).loc[:,:].head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_max_features,param_max_depth,param_criterion,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
177,71.859899,0.16711,1.44401,0.039034,3800,6,0.5,14,absolute_error,"{'n_estimators': 3800, 'min_samples_split': 6,...",...,0.591758,0.594932,0.592538,0.001727,1,0.622778,0.622369,0.622019,0.622389,0.00031
128,20.869755,0.061599,0.440616,0.015508,1100,4,0.5,14,poisson,"{'n_estimators': 1100, 'min_samples_split': 4,...",...,0.591758,0.594645,0.592407,0.001628,2,0.623381,0.622847,0.622666,0.622965,0.000304
37,91.578374,0.19737,3.094015,0.327313,4600,7,0.5,15,friedman_mse,"{'n_estimators': 4600, 'min_samples_split': 7,...",...,0.591785,0.594723,0.592399,0.001703,3,0.634713,0.63449,0.633896,0.634366,0.000345
33,81.71752,0.234454,4.862012,1.750793,4100,8,0.5,15,poisson,"{'n_estimators': 4100, 'min_samples_split': 8,...",...,0.591706,0.594671,0.592346,0.001698,4,0.633951,0.6338,0.63318,0.633643,0.000333
269,30.023355,0.039948,0.740669,0.027877,1300,11,0.7,15,squared_error,"{'n_estimators': 1300, 'min_samples_split': 11...",...,0.59155,0.594671,0.592312,0.001703,5,0.634203,0.633565,0.63316,0.633643,0.000429
419,24.135726,0.05449,0.425687,0.041546,1300,25,0.5,14,friedman_mse,"{'n_estimators': 1300, 'min_samples_split': 25...",...,0.591732,0.59454,0.592286,0.001662,6,0.616076,0.615717,0.61539,0.615728,0.00028
210,84.849406,0.139508,3.495589,0.479751,4200,5,0.5,15,squared_error,"{'n_estimators': 4200, 'min_samples_split': 5,...",...,0.591967,0.594462,0.592286,0.001662,7,0.63589,0.63596,0.635155,0.635668,0.000364
201,64.880704,0.092034,1.444097,0.170095,3400,9,0.5,14,friedman_mse,"{'n_estimators': 3400, 'min_samples_split': 9,...",...,0.591497,0.594697,0.592277,0.001747,8,0.621597,0.621131,0.620758,0.621162,0.000344
287,81.111482,0.334119,2.796319,0.395679,4100,9,0.5,15,squared_error,"{'n_estimators': 4100, 'min_samples_split': 9,...",...,0.591576,0.59441,0.592277,0.001537,9,0.633173,0.633014,0.632525,0.632904,0.000276
242,81.289248,0.201151,3.338135,1.711393,4100,16,0.5,15,absolute_error,"{'n_estimators': 4100, 'min_samples_split': 16...",...,0.591289,0.594984,0.592268,0.001945,10,0.628714,0.628574,0.627964,0.628417,0.000326


In [63]:
!ls 

01_Data		  04_Figure	 08_ShScript	 gadm36_JPN_1_sp.rds
02_RCode	  05_Manuscript  LICENSE
03_RStudio.Rproj  06_PyCode	 PyCode_v241111
03_Results	  07_PyResults	 README.md


In [64]:
dump(random_search, '03_Results/RandomSearch500.joblib')

['03_Results/RandomSearch500.joblib']

In [65]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).to_csv('03_Results/RandomSearch500.csv')

### Test Perfromance

In [66]:
random_search.best_params_

{'n_estimators': 3800,
 'min_samples_split': 6,
 'max_features': 0.5,
 'max_depth': 14,
 'criterion': 'absolute_error'}

In [69]:
rf_reg_final =RandomForestRegressor(n_estimators = 3800, min_samples_split = 6, max_features = 0.5, max_depth = 14, split_criterion=2)
rf_reg_final.fit(X_train, y_train)

In [70]:
# Predictions
y_pred = rf_reg_final.predict(X_test)

In [71]:
y_pred

402158    3.716265
110495    2.779432
470583    3.767394
176356    3.929041
411706    3.242872
            ...   
436304    3.896008
312634    4.075689
212674    4.171364
413227    3.301546
348847    3.911431
Length: 38318, dtype: float64

In [72]:
y_pred_categorical = np.where(y_pred < 1.5, 1,
                       np.where(y_pred < 2.5, 2,
                         np.where(y_pred < 3.5, 3,
                           np.where(y_pred < 4.5, 4, 5))))

y_pred_categorical

array([4, 3, 4, ..., 4, 3, 4])

In [73]:
accuracy_score(y_test, y_pred_categorical)

0.5899838196148024