# Hyperparameter Search

In V1 more hyperparameters are considered

In [1]:
%pwd

'/home/rapids/notebooks/DP15/PyCode_v241111'

In [2]:
%cd ..

/home/rapids/notebooks/DP15


## Import Package 

In [3]:
import cudf
from glob import glob
from joblib import dump, load
import numpy as np
import pandas as pd
import random
from cuml.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, accuracy_score
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV

In [4]:
import warnings

warnings.filterwarnings(
    "ignore", 
    message="To use pickling first train using float32 data to fit the estimator"
)

### Function

In [5]:
### X and y
def getXandY(Output_Vari):
    y_list = glob("01_Data/*_y_" + Output_Vari + "*.csv")
    y = pd.read_csv(y_list[0], index_col=0)
    y = y.iloc[:,0].to_numpy()
    X_list = glob("01_Data/*_X_" + Output_Vari + "*.csv")
    X = pd.read_csv(X_list[0], index_col=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, 
                                                        random_state=1)
    return X_train, X_test, y_train, y_test

## Experiment for Happiness

### Load Data

In [6]:
Output_Vari = "Happinessoverall"

In [7]:
X_train, X_test, y_train, y_test = getXandY(Output_Vari)

In [8]:
X_train.head()

Unnamed: 0,year,lat,lon,female,age,high_stress,low_stress,easy_to_relax,good_for_living,live_environment_satefy,...,housewife,retired,unemployed,college_no_diploma,bachelor,master,phd,income_indiv,NDVI,NTL
329671,2016,37.183964,138.256897,1,51,4,5,3,4,3,...,0,0,0,0,0,0,0,3.5,29.553186,4.315041
24633,2015,37.23909,140.353348,1,26,5,5,3,3,4,...,0,0,0,1,0,0,0,1.0,40.441441,2.005019
279845,2016,35.528252,140.185822,0,67,2,1,4,4,3,...,0,0,0,0,1,0,0,1.0,53.940829,7.728996
269295,2016,33.566044,130.342041,1,35,1,3,4,4,3,...,1,0,0,1,0,0,0,1.0,30.014896,17.481237
232515,2015,33.619823,130.515259,1,32,4,4,2,4,3,...,1,0,0,0,1,0,0,2.5,55.156878,5.77994


In [9]:
X_train.describe()

Unnamed: 0,year,lat,lon,female,age,high_stress,low_stress,easy_to_relax,good_for_living,live_environment_satefy,...,housewife,retired,unemployed,college_no_diploma,bachelor,master,phd,income_indiv,NDVI,NTL
count,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,...,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0
mean,2015.65654,35.708914,137.51452,0.352612,48.967114,3.172557,3.606846,3.293581,3.826623,2.953085,...,0.086381,0.06441,0.046147,0.209624,0.437245,0.047295,0.013539,4.686452,36.247119,18.599653
std,0.771443,2.131133,3.052344,0.477784,11.731917,1.166244,1.060261,1.020122,0.897251,0.761716,...,0.280927,0.245482,0.209804,0.407041,0.496047,0.21227,0.115567,3.903289,13.664925,15.865111
min,2015.0,24.301767,123.762947,0.0,17.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.860561,0.013894
25%,2015.0,34.747284,135.545242,0.0,41.0,2.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,26.247235,6.048134
50%,2015.0,35.477516,138.984985,0.0,49.0,3.0,4.0,4.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,34.654246,14.545732
75%,2016.0,35.809013,139.715393,1.0,57.0,4.0,4.0,4.0,4.0,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.5,44.238307,26.747059
max,2017.0,45.511021,145.74234,1.0,101.0,5.0,5.0,5.0,5.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,30.0,87.709065,88.380999


In [10]:
X_train.columns

Index(['year', 'lat', 'lon', 'female', 'age', 'high_stress', 'low_stress',
       'easy_to_relax', 'good_for_living', 'live_environment_satefy',
       'community_attachment', 'income', 'self_reported_health', 'student',
       'worker', 'company_owner', 'government_officer', 'self_employed',
       'professional', 'housewife', 'retired', 'unemployed',
       'college_no_diploma', 'bachelor', 'master', 'phd', 'income_indiv',
       'NDVI', 'NTL'],
      dtype='object')

In [11]:
X_train.shape

(344855, 29)

In [12]:
X_test.shape

(38318, 29)

In [13]:
X = pd.concat([X_train, X_test])

In [14]:
X.shape

(383173, 29)

In [15]:
y = np.concatenate([y_train, y_test])

In [16]:
y.shape

(383173,)

### Test Setting

In [17]:
cX_train = cudf.from_pandas(X_train)
cX_test = cudf.from_pandas(X_test)
cy_train = cudf.Series(y_train)
cy_test = cudf.Series(y_test)

In [21]:
%%time
# Train the model
model = RandomForestRegressor(n_estimators = 3800, min_samples_split = 6, max_features = 0.5, max_depth = 14, split_criterion=2)
model.fit(cX_train, cy_train)

CPU times: user 1min 50s, sys: 1min 27s, total: 3min 18s
Wall time: 1min 12s


In [18]:
%%time
# Train the model
model = RandomForestRegressor(n_estimators = 3800, min_samples_split = 6, max_features = 0.5, max_depth = 14, split_criterion=2,
                            max_batch_size = 4096 * 8)
model.fit(cX_train, cy_train)

CPU times: user 2min 39s, sys: 2min 15s, total: 4min 54s
Wall time: 2min 21s


In [19]:
%%time
# Train the model
model = RandomForestRegressor(n_estimators = 3800, min_samples_split = 6, max_features = 0.5, max_depth = 14, split_criterion=2,
                            max_batch_size = 1024)
model.fit(cX_train, cy_train)

CPU times: user 1min 43s, sys: 1min 28s, total: 3min 11s
Wall time: 1min 8s


In [20]:
%%time
# Train the model
model = RandomForestRegressor(n_estimators = 3800, min_samples_split = 6, max_features = 0.5, max_depth = 14, split_criterion=2,
                            max_batch_size = 128)
model.fit(cX_train, cy_train)

CPU times: user 2min 9s, sys: 2min 32s, total: 4min 41s
Wall time: 1min 32s


### Run Test Model

In [17]:
rf_reg =RandomForestRegressor(n_bins=256)

In [18]:
param_grid = {
    "n_estimators": list(range(100, 5_000, 100)),
    "max_depth": list(range(3, 16, 1)),
    "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_samples_split": list(range(2, 33, 1)),
    "max_samples": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

In [19]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [20]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [21]:
random_search = RandomizedSearchCV(
    estimator=rf_reg,
    param_distributions=param_grid,
    n_iter=500,  # Number of parameter settings to sample
    scoring="r2",
    cv=rkfcv,  # 3-fold cross-validation
    random_state=42,
    verbose=2,
    return_train_score = True
)

In [22]:
# Fit the model
random_search.fit(X, y)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits
[CV] END max_depth=5, max_features=0.6, max_samples=0.7, min_samples_split=10, n_estimators=4700; total time=  55.6s
[CV] END max_depth=5, max_features=0.6, max_samples=0.7, min_samples_split=10, n_estimators=4700; total time=  57.9s
[CV] END max_depth=5, max_features=0.6, max_samples=0.7, min_samples_split=10, n_estimators=4700; total time=  57.9s
[CV] END max_depth=15, max_features=0.6, max_samples=0.8, min_samples_split=28, n_estimators=300; total time=   6.8s
[CV] END max_depth=15, max_features=0.6, max_samples=0.8, min_samples_split=28, n_estimators=300; total time=   6.5s
[CV] END max_depth=15, max_features=0.6, max_samples=0.8, min_samples_split=28, n_estimators=300; total time=   6.6s
[CV] END max_depth=5, max_features=0.7, max_samples=0.7, min_samples_split=28, n_estimators=2500; total time=  31.5s
[CV] END max_depth=5, max_features=0.7, max_samples=0.7, min_samples_split=28, n_estimators=2500; total time=  31.7s


In [23]:
CV_result = random_search.cv_results_

In [28]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).loc[:,:].head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_max_samples,param_max_features,param_max_depth,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
289,72.928708,0.23034,3.997285,2.179266,4400,5,0.6,0.5,15,"{'n_estimators': 4400, 'min_samples_split': 5,...",...,0.377399,0.37843,0.379543,0.002341,1,0.50918,0.509849,0.510104,0.509711,0.00039
264,37.709139,0.025469,0.71155,0.029886,2600,26,0.5,0.5,15,"{'n_estimators': 2600, 'min_samples_split': 26...",...,0.37714,0.378211,0.379315,0.00236,2,0.463445,0.463933,0.464155,0.463844,0.000297
184,54.838359,0.059016,2.900787,1.431442,2800,4,0.8,0.5,15,"{'n_estimators': 2800, 'min_samples_split': 4,...",...,0.377249,0.378241,0.379278,0.002206,3,0.523517,0.524215,0.5244,0.524044,0.00038
249,35.024758,0.02619,0.705355,0.017505,2400,21,0.5,0.5,15,"{'n_estimators': 2400, 'min_samples_split': 21...",...,0.377071,0.378201,0.379275,0.002363,4,0.469455,0.470052,0.470255,0.469921,0.000339
241,9.661677,0.02214,0.21779,0.005726,600,13,0.6,0.5,15,"{'n_estimators': 600, 'min_samples_split': 13,...",...,0.377023,0.378177,0.379217,0.002335,5,0.490932,0.491294,0.491583,0.49127,0.000266


In [25]:
dump(random_search, '03_Results/RandomSearch500_5hyper.joblib')

['03_Results/RandomSearch500_5hyper.joblib']

In [26]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).to_csv('03_Results/RandomSearch500_hyper.csv')

### Test Perfromance

In [27]:
random_search.best_params_

{'n_estimators': 4400,
 'min_samples_split': 5,
 'max_samples': 0.6,
 'max_features': 0.5,
 'max_depth': 15}

In [29]:
rf_reg_final =RandomForestRegressor(**random_search.best_params_)
rf_reg_final.fit(X_train, y_train)

In [30]:
# Predictions
y_pred = rf_reg_final.predict(X_test)

In [31]:
y_pred

402158    3.723556
110495    2.785340
470583    3.757319
176356    3.932825
411706    3.231786
            ...   
436304    3.894872
312634    4.071615
212674    4.173123
413227    3.315579
348847    3.916306
Length: 38318, dtype: float64

In [32]:
r2_score(y_test, y_pred)

0.37506618059537367