# Hyperparameter Search

In [1]:
%pwd

'/home/rapids/notebooks/DP15/03_RStudio/PyCode_v241111'

In [2]:
%cd ..

/home/rapids/notebooks/DP15/03_RStudio


## Import Package 

In [3]:
from glob import glob
from joblib import dump, load
import numpy as np
import pandas as pd
import random
from cuml.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, accuracy_score
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV

In [4]:
import warnings

warnings.filterwarnings(
    "ignore", 
    message="To use pickling first train using float32 data to fit the estimator"
)

### Function

In [5]:
### X and y
def getXandY(Output_Vari):
    y_list = glob("01_Data/*_y_" + Output_Vari + "*.csv")
    y = pd.read_csv(y_list[0], index_col=0)
    y = y.iloc[:,0].to_numpy()
    X_list = glob("01_Data/*_X_" + Output_Vari + "*.csv")
    X = pd.read_csv(X_list[0], index_col=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, 
                                                        random_state=1)
    return X_train, X_test, y_train, y_test

## Experiment for Happiness

### Load Data

In [6]:
Output_Vari = "Happinessoverall"

In [7]:
X_train, X_test, y_train, y_test = getXandY(Output_Vari)

In [8]:
X_train.head()

Unnamed: 0,year,lat,lon,female,age,high_stress,low_stress,easy_to_relax,good_for_living,live_environment_satefy,...,housewife,retired,unemployed,college_no_diploma,bachelor,master,phd,income_indiv,NDVI,NTL
329671,2016,37.183964,138.256897,1,51,4,5,3,4,3,...,0,0,0,0,0,0,0,3.5,29.553186,4.315041
24633,2015,37.23909,140.353348,1,26,5,5,3,3,4,...,0,0,0,1,0,0,0,1.0,40.441441,2.005019
279845,2016,35.528252,140.185822,0,67,2,1,4,4,3,...,0,0,0,0,1,0,0,1.0,53.940829,7.728996
269295,2016,33.566044,130.342041,1,35,1,3,4,4,3,...,1,0,0,1,0,0,0,1.0,30.014896,17.481237
232515,2015,33.619823,130.515259,1,32,4,4,2,4,3,...,1,0,0,0,1,0,0,2.5,55.156878,5.77994


In [9]:
X_train.describe()

Unnamed: 0,year,lat,lon,female,age,high_stress,low_stress,easy_to_relax,good_for_living,live_environment_satefy,...,housewife,retired,unemployed,college_no_diploma,bachelor,master,phd,income_indiv,NDVI,NTL
count,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,...,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0
mean,2015.65654,35.708914,137.51452,0.352612,48.967114,3.172557,3.606846,3.293581,3.826623,2.953085,...,0.086381,0.06441,0.046147,0.209624,0.437245,0.047295,0.013539,4.686452,36.247119,18.599653
std,0.771443,2.131133,3.052344,0.477784,11.731917,1.166244,1.060261,1.020122,0.897251,0.761716,...,0.280927,0.245482,0.209804,0.407041,0.496047,0.21227,0.115567,3.903289,13.664925,15.865111
min,2015.0,24.301767,123.762947,0.0,17.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.860561,0.013894
25%,2015.0,34.747284,135.545242,0.0,41.0,2.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,26.247235,6.048134
50%,2015.0,35.477516,138.984985,0.0,49.0,3.0,4.0,4.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,34.654246,14.545732
75%,2016.0,35.809013,139.715393,1.0,57.0,4.0,4.0,4.0,4.0,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.5,44.238307,26.747059
max,2017.0,45.511021,145.74234,1.0,101.0,5.0,5.0,5.0,5.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,30.0,87.709065,88.380999


In [10]:
X_train.columns

Index(['year', 'lat', 'lon', 'female', 'age', 'high_stress', 'low_stress',
       'easy_to_relax', 'good_for_living', 'live_environment_satefy',
       'community_attachment', 'income', 'self_reported_health', 'student',
       'worker', 'company_owner', 'government_officer', 'self_employed',
       'professional', 'housewife', 'retired', 'unemployed',
       'college_no_diploma', 'bachelor', 'master', 'phd', 'income_indiv',
       'NDVI', 'NTL'],
      dtype='object')

In [11]:
X_train.shape

(344855, 29)

In [12]:
X_test.shape

(38318, 29)

In [13]:
X = pd.concat([X_train, X_test])

In [14]:
X.shape

(383173, 29)

In [15]:
y = np.concatenate([y_train, y_test])

In [16]:
y.shape

(383173,)

### Run Test Model

In [17]:
rf_reg =RandomForestRegressor()

In [18]:
param_grid = {
    "n_estimators": list(range(100, 5_000, 100)),
    "max_depth": list(range(3, 16, 1)),
    "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_samples_split": list(range(2, 33, 1))
}

In [19]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [20]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [21]:
def acc(y_true, y_pred):
    # Apply the transformations
    y_pred = np.where(y_pred < 1.5, 1,  # If value < 1.5, set to 1
             np.where(y_pred < 2.5, 2,  # If 1.5 <= value < 2.5, set to 2
             np.where(y_pred < 3.5, 3,  # If 2.5 <= value < 3.5, set to 3
             np.where(y_pred < 4.5, 4, 5))))  # If 3.5 <= value < 4.5, set to 4, else set to 5
    return r2_score(y_true, y_pred)

custom_scoring = make_scorer(acc, greater_is_better=True)  

In [22]:
random_search = RandomizedSearchCV(
    estimator=rf_reg,
    param_distributions=param_grid,
    n_iter=500,  # Number of parameter settings to sample
    scoring="r2",
    cv=rkfcv,  # 3-fold cross-validation
    random_state=42,
    verbose=2,
    return_train_score = True
)

In [23]:
# Fit the model
random_search.fit(X, y)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits
[CV] END max_depth=4, max_features=0.9, min_samples_split=14, n_estimators=1800; total time=  24.5s
[CV] END max_depth=4, max_features=0.9, min_samples_split=14, n_estimators=1800; total time=  24.4s
[CV] END max_depth=4, max_features=0.9, min_samples_split=14, n_estimators=1800; total time=  24.3s
[CV] END max_depth=3, max_features=0.5, min_samples_split=19, n_estimators=2800; total time=  36.5s
[CV] END max_depth=3, max_features=0.5, min_samples_split=19, n_estimators=2800; total time=  36.5s
[CV] END max_depth=3, max_features=0.5, min_samples_split=19, n_estimators=2800; total time=  36.4s
[CV] END max_depth=14, max_features=0.7, min_samples_split=10, n_estimators=1100; total time=  25.1s
[CV] END max_depth=14, max_features=0.7, min_samples_split=10, n_estimators=1100; total time=  25.0s
[CV] END max_depth=14, max_features=0.7, min_samples_split=10, n_estimators=1100; total time=  24.7s
[CV] END max_depth=15, max_featur

In [24]:
CV_result = random_search.cv_results_

In [25]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).loc[:,:].head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
192,84.24925,0.117433,3.073676,0.058919,4200,5,0.5,15,"{'n_estimators': 4200, 'min_samples_split': 5,...",0.381855,0.376849,0.377891,0.378865,0.002156,1,0.530876,0.531389,0.531834,0.531366,0.000391
269,81.224165,0.187555,2.514277,0.157051,4100,9,0.5,15,"{'n_estimators': 4100, 'min_samples_split': 9,...",0.381796,0.376744,0.377908,0.378816,0.00216,2,0.52224,0.522722,0.523203,0.522722,0.000393
280,48.73341,0.176644,1.298556,0.036629,2500,16,0.5,15,"{'n_estimators': 2500, 'min_samples_split': 16...",0.381746,0.376729,0.377852,0.378775,0.00215,3,0.508435,0.508923,0.50927,0.508876,0.000343
287,70.551384,0.368111,2.773879,1.135713,3600,12,0.5,15,"{'n_estimators': 3600, 'min_samples_split': 12...",0.381743,0.376678,0.377834,0.378752,0.002168,4,0.515982,0.516421,0.516815,0.516406,0.00034
242,92.576799,0.236132,3.417491,1.023577,4700,13,0.5,15,"{'n_estimators': 4700, 'min_samples_split': 13...",0.381766,0.376641,0.377837,0.378748,0.002189,5,0.513923,0.514368,0.514742,0.514344,0.000335
64,30.888651,0.191571,0.677869,0.032771,1600,25,0.5,15,"{'n_estimators': 1600, 'min_samples_split': 25...",0.381675,0.376705,0.3777,0.378693,0.002147,6,0.495143,0.495529,0.495975,0.495549,0.00034
484,67.865067,0.086978,1.628088,0.056866,3500,27,0.5,15,"{'n_estimators': 3500, 'min_samples_split': 27...",0.381747,0.376573,0.377727,0.378682,0.002218,7,0.492629,0.493039,0.493418,0.493029,0.000322
170,88.136682,0.132403,1.897918,0.029637,4600,28,0.5,15,"{'n_estimators': 4600, 'min_samples_split': 28...",0.381682,0.376497,0.377797,0.378658,0.002203,8,0.491394,0.491835,0.492159,0.491796,0.000314
3,36.919311,0.175881,1.884031,0.57602,1900,20,0.5,15,"{'n_estimators': 1900, 'min_samples_split': 20...",0.381587,0.376613,0.377768,0.378656,0.002125,9,0.502057,0.502433,0.502835,0.502441,0.000318
415,8.078737,0.018578,0.646255,0.060412,400,7,0.5,15,"{'n_estimators': 400, 'min_samples_split': 7, ...",0.381228,0.376729,0.377831,0.378596,0.001915,10,0.526743,0.527212,0.527804,0.527253,0.000434


In [26]:
dump(random_search, '03_Results/RandomSearch500.joblib')

['03_Results/RandomSearch500.joblib']

In [27]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).to_csv('03_Results/RandomSearch500.csv')

### Test Perfromance

In [28]:
random_search.best_params_

{'n_estimators': 4200,
 'min_samples_split': 5,
 'max_features': 0.5,
 'max_depth': 15}

In [30]:
rf_reg_final =RandomForestRegressor(**random_search.best_params_)
rf_reg_final.fit(X_train, y_train)

In [31]:
# Predictions
y_pred = rf_reg_final.predict(X_test)

In [32]:
y_pred

402158    3.720683
110495    2.777780
470583    3.744300
176356    3.935515
411706    3.225935
            ...   
436304    3.899265
312634    4.090048
212674    4.177930
413227    3.312201
348847    3.911872
Length: 38318, dtype: float64

In [33]:
r2_score(y_test, y_pred)

0.3745802736267626