#  Tuning parameters for RandomForest model

In [21]:
import numpy as np
import random

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

## 1. GridSearchCV 

In [22]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def random_forest_grid_search(X, y):
    """
    Thực hiện GridSearchCV để tìm siêu tham số tốt nhất cho RandomForestClassifier.

    Parameters:
        X: ndarray hoặc DataFrame - dữ liệu đặc trưng
        y: ndarray hoặc Series - nhãn

    Returns:
        grid_search: đối tượng GridSearchCV sau khi huấn luyện
    """
    # Khởi tạo mô hình
    model = RandomForestClassifier()

    # Lưới siêu tham số
    param_grid = {
        'max_depth': [None, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5],
        'bootstrap': [True, False],
    }

    # Khởi tạo GridSearchCV
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=3,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1
    )

    # Theo dõi thời gian
    start_time = time.time()
    print("🌲 Starting GridSearchCV for Random Forest...")

    # Thực hiện tìm kiếm
    grid_search.fit(X, y)

    # Tính toán thời gian
    elapsed_time = time.time() - start_time

    # Kết quả
    print("✅ GridSearchCV completed!")
    print(f"⏱ Elapsed time: {elapsed_time:.2f} seconds")
    print("🏆 Best params: ", grid_search.best_params_)
    print("📈 Best score: ", grid_search.best_score_)

    return grid_search


import os
import json

def save_rf_best_params_to_json(grid_search, filename_base, directory="Best_Hyperparameter"):
    """
    Lưu các siêu tham số tốt nhất của RandomForestClassifier từ GridSearchCV vào file JSON.

    Parameters:
        grid_search: đối tượng GridSearchCV sau khi fit().
        filename_base: tên file không có đuôi .json, ví dụ: 'best_parameter_RandomForest'
        directory: thư mục lưu file (mặc định: 'Best_Hyperparameter')
    """
    # Trích xuất tham số tốt nhất
    best_params = grid_search.best_params_
    data = {
        "bootstrap": best_params.get('bootstrap'),
        "max_depth": best_params.get('max_depth'),
        "min_samples_leaf": best_params.get('min_samples_leaf'),
        "min_samples_split": best_params.get('min_samples_split')
    }

    # Tạo thư mục nếu chưa tồn tại
    os.makedirs(directory, exist_ok=True)

    # Tạo tên file và đường dẫn đầy đủ
    filename = f"{filename_base}.json"
    file_path = os.path.join(directory, filename)

    # Ghi dữ liệu vào file JSON
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)

    print(f"✅ File JSON đã được tạo tại: {file_path}")



## 2. Load data

### 2.1.1 Load data FULL FEATURES

In [7]:
import pandas as pd

data_train = pd.read_csv('../../data/processed/train_LabelEncoder_noSubject.csv')
data_test = pd.read_csv('../../data/processed/test_LabelEncoder_noSubject.csv')
data_train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity,Activity_code
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,STANDING,2
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,STANDING,2
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,STANDING,2
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,STANDING,2
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,STANDING,2


### 2.1.2 Create X and y variables

In [8]:
X_train_full_features = data_train.drop(['Activity', 'Activity_code'], axis=1)
y_train_full_features = np.array(data_train['Activity_code'])

### 2.1.3 Grid Search

In [9]:
grid_search_full_features = random_forest_grid_search(X_train_full_features, y_train_full_features)

# Truy cập kết quả tốt nhất
best_model = grid_search_full_features.best_estimator_

🌲 Starting GridSearchCV for Random Forest...
Fitting 3 folds for each of 54 candidates, totalling 162 fits
✅ GridSearchCV completed!
⏱ Elapsed time: 278.64 seconds
🏆 Best params:  {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 5, 'min_samples_split': 10}
📈 Best score:  0.920702587032365


### 2.1.4 Save the best model

In [10]:
save_rf_best_params_to_json(grid_search_full_features, "BestParameter_RandomForest_full_features")

✅ File JSON đã được tạo tại: Best_Hyperparameter\BestParameter_RandomForest_full_features.json


### 2.2.1 Load data REDUCED FEATURES

In [11]:
import pandas as pd

data_train = pd.read_csv('../../data/processed/train_reduced_Correlation.csv')
data_train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Z,tBodyAcc-entropy()-X,tBodyAcc-entropy()-Y,tBodyAcc-entropy()-Z,"tBodyAcc-arCoeff()-X,1","tBodyAcc-arCoeff()-X,2",...,fBodyBodyGyroJerkMag-min(),fBodyBodyGyroJerkMag-maxInds,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)",Activity_code,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.913526,-0.407747,-0.679338,-0.602122,0.929294,-0.853011,...,-0.991048,-1.0,-0.074323,-0.298676,-0.112754,0.0304,-0.464761,-0.018446,2,STANDING
1,0.278419,-0.016411,-0.12352,-0.998245,-0.960322,-0.714892,-0.50093,-0.570979,0.611627,-0.329549,...,-0.99444,-1.0,0.158075,-0.595051,0.053477,-0.007435,-0.732626,0.703511,2,STANDING
2,0.279653,-0.019467,-0.113462,-0.99538,-0.978944,-0.592235,-0.485821,-0.570979,0.273025,-0.086309,...,-0.995866,-0.555556,0.414503,-0.390748,-0.118559,0.177899,0.100699,0.808529,2,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.990675,-0.627446,-0.85093,-0.911872,0.061436,0.07484,...,-0.995732,-0.936508,0.404573,-0.11729,-0.036788,-0.012892,0.640011,-0.485366,2,STANDING
4,0.276629,-0.01657,-0.115362,-0.998139,-0.990482,-0.786553,-0.559477,-0.761434,0.313276,-0.131208,...,-0.997418,-0.936508,0.087753,-0.351471,0.12332,0.122542,0.693578,-0.615971,2,STANDING


### 2.2.2 Create X and y variables

In [12]:
X_train_reduced_feature = data_train.drop(['Activity', 'Activity_code'], axis=1)
y_train_reduced_feature = np.array(data_train['Activity_code'])

### 2.2.3 Grid Search

In [13]:
grid_search_reduced_feature = random_forest_grid_search(X_train_reduced_feature, y_train_reduced_feature)

# Truy cập kết quả tốt nhất
best_model_reduced_feature = grid_search_reduced_feature.best_estimator_

🌲 Starting GridSearchCV for Random Forest...
Fitting 3 folds for each of 54 candidates, totalling 162 fits
✅ GridSearchCV completed!
⏱ Elapsed time: 179.49 seconds
🏆 Best params:  {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 5, 'min_samples_split': 2}
📈 Best score:  0.9204326430694677


### 2.1.4 Save the best model

In [17]:
save_rf_best_params_to_json(grid_search_reduced_feature, "BestParameter_RandomForest_reduced_features")

✅ File JSON đã được tạo tại: Best_Hyperparameter\BestParameter_RandomForest_reduced_features.json


### 2.3.1 Load data PCA

In [23]:
import pandas as pd

data_train_PCA = pd.read_csv('../../data/processed/train_PCA.csv')
data_train_PCA.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC94,PC95,PC96,PC97,PC98,PC99,PC100,PC101,PC102,Activity_code
0,-16.138544,2.152024,3.14478,-0.272464,6.798938,-4.249394,2.937159,-4.905413,-0.775515,-3.627737,...,-0.165438,1.318898,-0.161542,0.427557,1.33735,2.258507,-1.681535,-1.209325,-1.175727,2
1,-15.296194,1.387144,-0.682221,2.813677,4.26617,-2.055663,0.011205,-1.845985,0.492546,-0.180175,...,-0.446051,0.68364,-0.62482,0.347661,-0.752347,0.156624,-1.347392,0.149474,-0.730615,2
2,-15.137019,2.473351,-1.756641,3.717974,4.181557,-1.357518,0.072947,-1.388188,0.539754,-0.671712,...,0.194423,-0.488807,-0.138448,0.140349,0.436761,0.28099,0.138031,0.662263,-0.227418,2
3,-15.350884,3.915681,-1.790322,2.567521,3.20584,-0.942944,0.530736,-1.832185,1.071517,-1.496989,...,-0.160942,-0.416531,-1.29398,0.486988,0.716588,-0.07136,0.128008,-0.139504,-0.90924,2
4,-15.544814,4.598737,-2.188582,2.897578,3.08015,-1.061458,-1.048591,-0.816933,0.600063,1.118875,...,-0.218499,-0.03621,0.18111,-0.123867,-0.64082,0.704967,0.321897,-0.053266,-0.145543,2


### 2.3.2 Create X and y variables

In [24]:
X_train_PCA = data_train_PCA.drop(['Activity_code'], axis=1)
y_train_PCA = np.array(data_train_PCA['Activity_code'])

### 2.3.3 Grid Search

In [25]:
grid_search_PCA = random_forest_grid_search(X_train_PCA, y_train_PCA)

# Truy cập kết quả tốt nhất
best_model_PCA = grid_search_PCA.best_estimator_

🌲 Starting GridSearchCV for Random Forest...
Fitting 3 folds for each of 54 candidates, totalling 162 fits
✅ GridSearchCV completed!
⏱ Elapsed time: 173.85 seconds
🏆 Best params:  {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2}
📈 Best score:  0.8635754391515889


### 2.3.4 Save the best model PCA 

In [26]:
save_rf_best_params_to_json(grid_search_PCA, "BestParameter_RandomForest_PCA")

✅ File JSON đã được tạo tại: Best_Hyperparameter\BestParameter_RandomForest_PCA.json
