# Train mô hình Random Forest

## 1. Import các thư viện

In [1]:
import numpy as np
import random

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [9]:
import os
import json
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


## 2. Load hyperparameters từ JSON

In [6]:
# Paths đến 3 file JSON
param_files = {
    'LabelEncoder_noSubject':   '../hyperparameter_tuning/Best_Hyperparameter/BestParameter_RandomForest_full_features.json',
    'PCA':    '../hyperparameter_tuning/Best_Hyperparameter/BestParameter_RandomForest_PCA.json',
    'reduced_Correlation':'../hyperparameter_tuning/Best_Hyperparameter/BestParameter_RandomForest_reduced_features.json',
}

best_params = {}
for name, path in param_files.items():
    with open(path, 'r', encoding='utf-8') as f:
        best_params[name] = json.load(f)

## 3. Load dữ liệu train/test

In [7]:
data_sets = {}
for name in ['LabelEncoder_noSubject', 'PCA', 'reduced_Correlation']:
    df_train = pd.read_csv(f'../../data/processed/train_{name}.csv')
    df_test  = pd.read_csv(f'../../data/processed/test_{name}.csv')

    try:
        X_train = df_train.drop(['Activity', 'Activity_code'], axis=1)
        X_test  = df_test.drop(['Activity', 'Activity_code'], axis=1)
    except KeyError:
        X_train = df_train.drop('Activity_code', axis=1)
        X_test  = df_test.drop('Activity_code', axis=1)

    y_train = df_train['Activity_code']
    y_test  = df_test['Activity_code']

    data_sets[name] = (X_train, y_train, X_test, y_test)


## 4. Huấn luyện và lưu model

In [8]:
name_model = ['full', 'pca', 'reduced']
for (name, params), model_name in zip(best_params.items(), name_model):
    X_train, y_train, _, _ = data_sets[name]

    model = RandomForestClassifier(
        bootstrap=params['bootstrap'],
        max_depth=params['max_depth'],
        min_samples_leaf=params['min_samples_leaf'],
        min_samples_split=params['min_samples_split']
    )
    model.fit(X_train, y_train)

    joblib.dump(model, f'../../models/rf_{model_name}.joblib')
    print(f"› Saved rf_{model_name}.joblib")


› Saved rf_full.joblib
› Saved rf_pca.joblib
› Saved rf_reduced.joblib


## 5. Đánh giá trên tập test

In [11]:
for nameModel, name in zip(name_model, ['LabelEncoder_noSubject', 'PCA', 'reduced_Correlation']):
    _, _, X_test, y_test = data_sets[name]
    model = joblib.load(f'../../models/rf_{nameModel}.joblib')

    y_pred = model.predict(X_test)
    print(f"--- RF on {nameModel} features ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))


--- RF on full features ---
Accuracy: 0.9273837801153716
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       537
           1       0.93      0.91      0.92       491
           2       0.92      0.93      0.92       532
           3       0.88      0.97      0.92       496
           4       0.97      0.84      0.90       420
           5       0.88      0.90      0.89       471

    accuracy                           0.93      2947
   macro avg       0.93      0.92      0.93      2947
weighted avg       0.93      0.93      0.93      2947

--- RF on pca features ---
Accuracy: 0.8836104513064132
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       537
           1       0.86      0.77      0.81       491
           2       0.82      0.91      0.86       532
           3       0.85      0.96      0.90       496
           4       0.90      0.75      0.82       420
           5       

## 6. Tính thời gian inference trên 1 mẫu (Latency)

In [13]:
# ─── Cell: Latency Measurement Function ───────────────────────────────────────
import time
import numpy as np

def measure_latency_highres_ms(model, X, warmup=10, repeats=1):
    """
    Đo latency per–sample với độ phân giải cao, trả về kết quả bằng ms.

    Thực hiện `warmup` lần predict đầu để làm nóng (cache) model.
    Với mỗi mẫu, lặp `repeats` lần và lấy trung bình để giảm nhiễu.
    """
    # Giữ X ở dạng DataFrame để tránh warning về feature_names
    if isinstance(X, np.ndarray):
        X_df = pd.DataFrame(X, columns=model.feature_names_in_)
    else:
        X_df = X.copy()

    # Warm-up
    for _ in range(warmup):
        _ = model.predict(X_df.iloc[[0]])

    times_ns = []
    for i in range(len(X_df)):
        sample = X_df.iloc[[i]]
        total_ns = 0
        for _ in range(repeats):
            t0 = time.perf_counter_ns()
            _ = model.predict(sample)
            total_ns += time.perf_counter_ns() - t0
        times_ns.append(total_ns / repeats)

    times_ns = np.array(times_ns)
    # chuyển nanô-giây -> mili-giây
    times_ms = times_ns / 1e6

    return {
        'mean_ms': times_ms.mean(),
        'p50_ms':  np.percentile(times_ms, 50),
        'p95_ms':  np.percentile(times_ms, 95)
    }
# ─── Cell: Gọi đo latency cho từng model trên mỗi feature set ────────────────
for data_name, model_name in zip(['LabelEncoder_noSubject', 'PCA', 'reduced_Correlation'], ['full', 'pca', 'reduced']):
    # load model nếu cần, hoặc dùng `model` vừa train
    model = joblib.load(f'../../models/rf_{model_name}.joblib')
    X_test = data_sets[data_name][2]  # (X_train, y_train, X_test, y_test)

    lat_ms = measure_latency_highres_ms(model, X_test, warmup=10, repeats=3)
    print(f"--- Latency of Random Forest on {model_name} features ---")
    print(f"Mean   : {lat_ms['mean_ms']:.3f} ms")
    print(f"P50    : {lat_ms['p50_ms']:.3f} ms")
    print(f"P95    : {lat_ms['p95_ms']:.3f} ms")


--- Latency of Random Forest on full features ---
Mean   : 4.173 ms
P50    : 4.028 ms
P95    : 4.965 ms
--- Latency of Random Forest on pca features ---
Mean   : 2.982 ms
P50    : 2.914 ms
P95    : 3.451 ms
--- Latency of Random Forest on reduced features ---
Mean   : 3.232 ms
P50    : 3.165 ms
P95    : 3.676 ms
