# Train mô hình Logistic Regression

## 1. Import các thư viện

In [19]:
import numpy as np
import random

# Single seed shared by both global random number generators used in this notebook.
SEED = 42
# Seed NumPy's global RNG.
np.random.seed(SEED)
# Seed Python's built-in `random` module as well.
random.seed(SEED)

In [20]:
import json
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

## 2. Load hyperparameters từ JSON

In [21]:
# Paths to the three best-hyperparameter JSON files, keyed by dataset variant
param_files = {
    'LabelEncoder_noSubject':   '../hyperparameter_tuning/Best_Hyperparameter/BestParameter_LogisticRegression_full_features.json',
    'PCA':    '../hyperparameter_tuning/Best_Hyperparameter/BestParameter_LogisticRegression_PCA.json',
    'reduced_Correlation':'../hyperparameter_tuning/Best_Hyperparameter/BestParameter_LogisticRegression_reduced_features.json',
}


def _read_json(json_path):
    """Return the parsed content of the UTF-8 encoded JSON file at *json_path*."""
    with open(json_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Dataset variant -> parsed JSON content of its hyperparameter file
best_params = {
    variant: _read_json(json_path)
    for variant, json_path in param_files.items()
}


## 3. Load dữ liệu train/test

In [22]:
# Build `data_sets`: dataset variant -> (X_train, y_train, X_test, y_test),
# read from the processed train/test CSV files.
data_sets = {}
for name in ['LabelEncoder_noSubject', 'PCA', 'reduced_Correlation']:
    df_train = pd.read_csv(f'../../data/processed/train_{name}.csv')
    df_test  = pd.read_csv(f'../../data/processed/test_{name}.csv')

    # Label columns must never end up in the feature matrix. 'Activity' is not
    # present in every file, so drop whatever subset exists in each frame
    # independently. The previous try/except KeyError fallback re-dropped only
    # 'Activity_code' for BOTH frames as soon as one of them lacked 'Activity',
    # which could leave 'Activity' inside the other frame's features.
    label_cols = ['Activity', 'Activity_code']
    X_train = df_train.drop(columns=label_cols, errors='ignore')
    X_test  = df_test.drop(columns=label_cols, errors='ignore')

    # The target column is mandatory: a missing 'Activity_code' still raises
    # KeyError here, as it did before.
    y_train = df_train['Activity_code']
    y_test  = df_test['Activity_code']

    data_sets[name] = (X_train, y_train, X_test, y_test)


## 4. Huấn luyện và lưu model

In [27]:
# Short model names used in the saved file names. Kept as a notebook-level
# variable because the evaluation cell iterates over it as well.
name_model = ['full', 'pca', 'reduced']
# Dataset variants, listed in the same order as `name_model`.
dataset_names = ['LabelEncoder_noSubject', 'PCA', 'reduced_Correlation']

for name, model_name in zip(dataset_names, name_model):
    # Look the hyperparameters up by dataset key. Zipping over
    # best_params.items() only paired the right params with the right model
    # name as long as the dict insertion order happened to match `name_model`.
    params = best_params[name]
    X_train, y_train, _, _ = data_sets[name]

    # Only the four tuned settings are forwarded; everything else keeps the
    # scikit-learn defaults.
    model = LogisticRegression(
        C=params['C'],
        penalty=params['penalty'],
        solver=params['solver'],
        max_iter=params['max_iter']
    )
    model.fit(X_train, y_train)

    # Persist the fitted model
    joblib.dump(model, f'../../models/logreg_{model_name}.joblib')
    print(f"› Saved logreg_{model_name}.joblib")


› Saved logreg_full.joblib
› Saved logreg_pca.joblib
› Saved logreg_reduced.joblib


## 5. Đánh giá trên tập test

In [29]:
# Score every persisted model on the test split of its matching dataset variant
eval_pairs = zip(name_model, ['LabelEncoder_noSubject', 'PCA', 'reduced_Correlation'])
for nameModel, name in eval_pairs:
    # data_sets[name] is (X_train, y_train, X_test, y_test); only the test half is needed
    X_test, y_test = data_sets[name][2:]

    # Reload from disk so the evaluated object is the saved artifact
    model = joblib.load(f'../../models/logreg_{nameModel}.joblib')
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"--- Results on {nameModel} features ---")
    print("Accuracy:", accuracy)
    print(report)


--- Results on full features ---
Accuracy: 0.9616559212758737
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       537
           1       0.97      0.87      0.92       491
           2       0.90      0.98      0.93       532
           3       0.95      1.00      0.97       496
           4       1.00      0.97      0.98       420
           5       0.97      0.95      0.96       471

    accuracy                           0.96      2947
   macro avg       0.96      0.96      0.96      2947
weighted avg       0.96      0.96      0.96      2947

--- Results on pca features ---
Accuracy: 0.9317950458092976
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       537
           1       0.92      0.86      0.89       491
           2       0.88      0.93      0.91       532
           3       0.93      0.98      0.95       496
           4       0.94      0.90      0.92       420
         

## 6. Tính thời gian inference trên 1 mẫu (Latency)

In [45]:
# ─── Cell: Latency Measurement Function ───────────────────────────────────────
import time
import numpy as np

def measure_latency_highres_ms(model, X, warmup=10, repeats=1):
    """
    Measure per-sample prediction latency with a nanosecond clock, in ms.

    `model.predict` is first called `warmup` times on the first row as a
    warm-up. Then every row of `X` is predicted on its own `repeats` times,
    and the average of those repeats is that row's latency.

    Parameters
    ----------
    model : object exposing `predict(X)`
        If `X` is not a DataFrame and the model has a `feature_names_in_`
        attribute, those names are used as column labels.
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Samples to time. Must contain at least one row.
    warmup : int, default 10
        Number of untimed predictions made before measuring.
    repeats : int, default 1
        Number of timed predictions per sample (averaged). Must be >= 1.

    Returns
    -------
    dict
        Keys 'mean_ms', 'p50_ms' and 'p95_ms': mean, median and 95th
        percentile of the per-sample latencies, in milliseconds.

    Raises
    ------
    ValueError
        If `repeats` is smaller than 1 or `X` has no rows.
    """
    if repeats < 1:
        raise ValueError("repeats must be >= 1")

    # Feed the model DataFrames to avoid the feature-names warning. The input
    # is never mutated, so an existing DataFrame is used as-is (no copy).
    if isinstance(X, pd.DataFrame):
        X_df = X
    else:
        # Models fitted without column names have no `feature_names_in_`;
        # fall back to pandas' default column labels in that case.
        X_df = pd.DataFrame(X, columns=getattr(model, 'feature_names_in_', None))

    n_samples = len(X_df)
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")

    # Warm-up
    first_row = X_df.iloc[[0]]
    for _ in range(warmup):
        model.predict(first_row)

    times_ns = np.empty(n_samples, dtype=float)
    for i in range(n_samples):
        # Slice outside the timed region so only predict() is measured
        sample = X_df.iloc[[i]]
        total_ns = 0
        for _ in range(repeats):
            t0 = time.perf_counter_ns()
            model.predict(sample)
            total_ns += time.perf_counter_ns() - t0
        times_ns[i] = total_ns / repeats

    # nanoseconds -> milliseconds
    times_ms = times_ns / 1e6
    p50_ms, p95_ms = np.percentile(times_ms, [50, 95])

    return {
        'mean_ms': times_ms.mean(),
        'p50_ms':  p50_ms,
        'p95_ms':  p95_ms
    }
# ─── Cell: Run the latency measurement for each model on its feature set ─────
latency_targets = (
    ('LabelEncoder_noSubject', 'full'),
    ('PCA', 'pca'),
    ('reduced_Correlation', 'reduced'),
)
for data_name, model_name in latency_targets:
    # Always reload the persisted model rather than reusing an in-memory one
    model = joblib.load(f'../../models/logreg_{model_name}.joblib')
    # Tuple layout: (X_train, y_train, X_test, y_test)
    _, _, X_test, _ = data_sets[data_name]

    lat_ms = measure_latency_highres_ms(model, X_test, warmup=10, repeats=3)
    print(
        f"--- Latency of LogisticRegression on {model_name} features ---",
        f"Mean   : {lat_ms['mean_ms']:.3f} ms",
        f"P50    : {lat_ms['p50_ms']:.3f} ms",
        f"P95    : {lat_ms['p95_ms']:.3f} ms",
        sep="\n",
    )


--- Latency of LogisticRegression on full features ---
Mean   : 1.917 ms
P50    : 1.872 ms
P95    : 2.330 ms
--- Latency of LogisticRegression on pca features ---
Mean   : 0.650 ms
P50    : 0.627 ms
P95    : 0.772 ms
--- Latency of LogisticRegression on reduced features ---
Mean   : 0.936 ms
P50    : 0.903 ms
P95    : 1.118 ms
