# Hyperparameter Search

In V2 more hyperparameters are considered and larger range

In [1]:
%pwd

'/home/rapids/notebooks/DP15/PyCode_v241111'

In [2]:
%cd ..

/home/rapids/notebooks/DP15


## Import Package 

In [3]:
import cudf
from glob import glob
from joblib import dump, load
import numpy as np
import pandas as pd
import random
from cuml.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, accuracy_score
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV

In [4]:
import warnings

warnings.filterwarnings(
    "ignore", 
    message="To use pickling first train using float32 data to fit the estimator"
)

### Function

In [5]:
### X and y
def getXandY(Output_Vari):
    y_list = glob("01_Data/*_y_" + Output_Vari + "*.csv")
    y = pd.read_csv(y_list[0], index_col=0)
    y = y.iloc[:,0].to_numpy()
    X_list = glob("01_Data/*_X_" + Output_Vari + "*.csv")
    X = pd.read_csv(X_list[0], index_col=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, 
                                                        random_state=1)
    return X_train, X_test, y_train, y_test

## Experiment for Happiness

### Load Data

In [6]:
Output_Vari = "Happinessoverall"

In [7]:
X_train, X_test, y_train, y_test = getXandY(Output_Vari)

In [8]:
X_train.head()

Unnamed: 0,year,lat,lon,female,age,high_stress,low_stress,easy_to_relax,good_for_living,live_environment_satefy,...,housewife,retired,unemployed,college_no_diploma,bachelor,master,phd,income_indiv,NDVI,NTL
329671,2016,37.183964,138.256897,1,51,4,5,3,4,3,...,0,0,0,0,0,0,0,3.5,29.553186,4.315041
24633,2015,37.23909,140.353348,1,26,5,5,3,3,4,...,0,0,0,1,0,0,0,1.0,40.441441,2.005019
279845,2016,35.528252,140.185822,0,67,2,1,4,4,3,...,0,0,0,0,1,0,0,1.0,53.940829,7.728996
269295,2016,33.566044,130.342041,1,35,1,3,4,4,3,...,1,0,0,1,0,0,0,1.0,30.014896,17.481237
232515,2015,33.619823,130.515259,1,32,4,4,2,4,3,...,1,0,0,0,1,0,0,2.5,55.156878,5.77994


In [9]:
X_train.describe()

Unnamed: 0,year,lat,lon,female,age,high_stress,low_stress,easy_to_relax,good_for_living,live_environment_satefy,...,housewife,retired,unemployed,college_no_diploma,bachelor,master,phd,income_indiv,NDVI,NTL
count,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,...,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0,344855.0
mean,2015.65654,35.708914,137.51452,0.352612,48.967114,3.172557,3.606846,3.293581,3.826623,2.953085,...,0.086381,0.06441,0.046147,0.209624,0.437245,0.047295,0.013539,4.686452,36.247119,18.599653
std,0.771443,2.131133,3.052344,0.477784,11.731917,1.166244,1.060261,1.020122,0.897251,0.761716,...,0.280927,0.245482,0.209804,0.407041,0.496047,0.21227,0.115567,3.903289,13.664925,15.865111
min,2015.0,24.301767,123.762947,0.0,17.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.860561,0.013894
25%,2015.0,34.747284,135.545242,0.0,41.0,2.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,26.247235,6.048134
50%,2015.0,35.477516,138.984985,0.0,49.0,3.0,4.0,4.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,34.654246,14.545732
75%,2016.0,35.809013,139.715393,1.0,57.0,4.0,4.0,4.0,4.0,3.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.5,44.238307,26.747059
max,2017.0,45.511021,145.74234,1.0,101.0,5.0,5.0,5.0,5.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,30.0,87.709065,88.380999


In [10]:
X_train.columns

Index(['year', 'lat', 'lon', 'female', 'age', 'high_stress', 'low_stress',
       'easy_to_relax', 'good_for_living', 'live_environment_satefy',
       'community_attachment', 'income', 'self_reported_health', 'student',
       'worker', 'company_owner', 'government_officer', 'self_employed',
       'professional', 'housewife', 'retired', 'unemployed',
       'college_no_diploma', 'bachelor', 'master', 'phd', 'income_indiv',
       'NDVI', 'NTL'],
      dtype='object')

In [11]:
X_train.shape

(344855, 29)

In [12]:
X_test.shape

(38318, 29)

In [13]:
X = pd.concat([X_train, X_test])

In [14]:
X.shape

(383173, 29)

In [15]:
y = np.concatenate([y_train, y_test])

In [16]:
y.shape

(383173,)

### Run Test Model

In [17]:
rf_reg =RandomForestRegressor(n_bins=256)

In [18]:
param_grid = {
    "n_estimators": list(range(1_000, 5_100, 100)),
    "max_depth": [2, 4, 8, 16],
    "max_features": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_samples_split": [2, 4, 8, 16, 32],
    "max_samples": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

In [19]:
class RandomRunNFoldsKFold(KFold):
    def __init__(self, n_splits=10, random_state=None, run_splits=3, **kwargs):
        super().__init__(n_splits=n_splits, shuffle=True, random_state=random_state, **kwargs)
        self.random_state = random_state
        self.actual_splits = run_splits  # Number of actual splits to use

    def split(self, X, y=None, groups=None):
        folds = list(super().split(X, y, groups))
        if self.random_state is not None:
            random.seed(self.random_state)
        selected_folds = random.sample(folds, self.actual_splits)
        for train_index, test_index in selected_folds:
            yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.actual_splits

In [20]:
rkfcv = RandomRunNFoldsKFold(n_splits=10, run_splits=3, random_state=42)

In [21]:
random_search = RandomizedSearchCV(
    estimator=rf_reg,
    param_distributions=param_grid,
    n_iter=500,  # Number of parameter settings to sample
    scoring="r2",
    cv=rkfcv,  # 3-fold cross-validation
    random_state=42,
    verbose=2,
    return_train_score = False
)

In [22]:
# Fit the model
random_search.fit(X, y)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits
[CV] END max_depth=2, max_features=0.6, max_samples=0.8, min_samples_split=2, n_estimators=1800; total time=  26.3s
[CV] END max_depth=2, max_features=0.6, max_samples=0.8, min_samples_split=2, n_estimators=1800; total time=  26.1s
[CV] END max_depth=2, max_features=0.6, max_samples=0.8, min_samples_split=2, n_estimators=1800; total time=  25.9s
[CV] END max_depth=2, max_features=0.5, max_samples=0.5, min_samples_split=32, n_estimators=1400; total time=  14.9s
[CV] END max_depth=2, max_features=0.5, max_samples=0.5, min_samples_split=32, n_estimators=1400; total time=  14.9s
[CV] END max_depth=2, max_features=0.5, max_samples=0.5, min_samples_split=32, n_estimators=1400; total time=  15.2s
[CV] END max_depth=8, max_features=0.4, max_samples=0.6, min_samples_split=32, n_estimators=2100; total time=  24.3s
[CV] END max_depth=8, max_features=0.4, max_samples=0.6, min_samples_split=32, n_estimators=2100; total time=  24.8s
[CV

In [23]:
CV_result = random_search.cv_results_

In [24]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).loc[:,:].head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
261,17.740023,0.091343,0.751616,0.017319,1300,4,0.4,0.4,16,"{'n_estimators': 1300, 'min_samples_split': 4,...",0.38412,0.378337,0.379244,0.380567,0.002539,1
161,67.328209,0.170901,2.549933,0.061923,3900,8,0.7,0.4,16,"{'n_estimators': 3900, 'min_samples_split': 8,...",0.383849,0.378421,0.379295,0.380521,0.00238,2
50,56.401347,0.135879,2.525426,0.246218,3900,8,0.5,0.4,16,"{'n_estimators': 3900, 'min_samples_split': 8,...",0.383926,0.378315,0.379255,0.380498,0.002454,3
387,69.821267,0.271292,2.954576,0.263909,4400,8,0.6,0.4,16,"{'n_estimators': 4400, 'min_samples_split': 8,...",0.383901,0.378257,0.379219,0.380459,0.002465,4
193,20.603827,0.318864,0.760166,0.034357,1300,8,0.6,0.4,16,"{'n_estimators': 1300, 'min_samples_split': 8,...",0.383941,0.378192,0.379205,0.380446,0.002506,5


In [25]:
dump(random_search, '03_Results/RandomSearch500_5hyper_v2.joblib')

['03_Results/RandomSearch500_5hyper_v2.joblib']

In [26]:
pd.DataFrame(CV_result).sort_values(by='rank_test_score', ascending=True).to_csv('03_Results/RandomSearch500_hyper_v2.csv')

### Test Perfromance

In [27]:
random_search.best_params_

{'n_estimators': 1300,
 'min_samples_split': 4,
 'max_samples': 0.4,
 'max_features': 0.4,
 'max_depth': 16}

In [28]:
rf_reg_final =RandomForestRegressor(**random_search.best_params_)
rf_reg_final.fit(X_train, y_train)

In [29]:
# Predictions
y_pred = rf_reg_final.predict(X_test)

In [30]:
y_pred

402158    3.717990
110495    2.785189
470583    3.737490
176356    3.944020
411706    3.241077
            ...   
436304    3.903354
312634    4.057661
212674    4.177610
413227    3.302152
348847    3.918535
Length: 38318, dtype: float64

In [31]:
r2_score(y_test, y_pred)

0.37640637959522694