# Train the Random Forest model

In [2]:
import numpy as np
import random

# Single seed applied to both NumPy's and Python's global random number
# generators so that random draws repeat from run to run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [3]:
import os
import json
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd


## Prepare Data

In [4]:
def prepareData(dir_train, dir_test):
    """Read the train/test CSV files and separate features from labels.

    Args:
        dir_train: Path of the training CSV file.
        dir_test: Path of the test CSV file.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The feature frames
        hold every column except ``Activity`` and ``Activity_code``; the
        label series are the ``Activity_code`` columns.
    """
    target_column = 'Activity_code'
    non_feature_columns = ['Activity', target_column]

    # Both files are read before any column handling, train first.
    train_frame, test_frame = (pd.read_csv(path) for path in (dir_train, dir_test))

    X_train = train_frame.drop(columns=non_feature_columns)
    y_train = train_frame[target_column]

    X_test = test_frame.drop(columns=non_feature_columns)
    y_test = test_frame[target_column]

    return X_train, X_test, y_train, y_test


## Load Best Hyperparameters

In [5]:
def load_parameters_from_json(json_path):
    """Return the parsed content of the JSON file at *json_path*.

    Args:
        json_path: Location of a UTF-8 encoded JSON file.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        FileNotFoundError: If nothing exists at *json_path*.
    """
    if os.path.exists(json_path):
        with open(json_path, mode='r', encoding='utf-8') as handle:
            return json.load(handle)
    raise FileNotFoundError(f"File {json_path} không tồn tại!")


## Create the model training function

In [6]:
def train_random_forest(X_train, y_train, X_test, y_test, params, random_state=None):
    """Fit a RandomForestClassifier from stored hyperparameters and print its test accuracy.

    Args:
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Features the fitted model predicts on for the accuracy report.
        y_test: True labels compared against the predictions.
        params: Mapping with the keys ``bootstrap``, ``max_depth``,
            ``min_samples_leaf`` and ``min_samples_split``. ``bootstrap`` may
            be a real boolean or a string such as ``"True"`` / ``"false"``.
        random_state: Optional seed forwarded to the classifier. The default
            ``None`` leaves the classifier's own default in place.

    Returns:
        The fitted ``RandomForestClassifier``.

    Raises:
        KeyError: If one of the required keys is missing from *params*.
    """
    raw_bootstrap = params['bootstrap']
    if isinstance(raw_bootstrap, str):
        # Case/whitespace-insensitive, so "true" is not silently read as False.
        bootstrap = raw_bootstrap.strip().lower() == 'true'
    else:
        # A JSON boolean arrives as a Python bool; comparing it with the
        # string 'True' would always be False and disable bootstrapping.
        bootstrap = bool(raw_bootstrap)

    model = RandomForestClassifier(
        bootstrap=bootstrap,
        max_depth=params['max_depth'],
        min_samples_leaf=params['min_samples_leaf'],
        min_samples_split=params['min_samples_split'],
        random_state=random_state,
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    return model

## Save Model

In [7]:
def save_model(model, model_path):
    """Write *model* to *model_path* with joblib and print a confirmation.

    Args:
        model: Object handed to ``joblib.dump``.
        model_path: Destination handed to ``joblib.dump``.
    """
    destination = model_path
    joblib.dump(model, destination)
    print(f"Mô hình đã được lưu tại: {destination}")

## Train model

In [8]:
# Relative paths of the train/test CSV files and of the JSON file holding
# the hyperparameters.
dir_train = "../../data/interim/trainFinal.csv"
dir_test = "../../data/interim/testFinal.csv"
dir_param = "../hyperparameter_tuning/Best_Hyperparameter/best_parameter_RandomForest.json"
params = load_parameters_from_json(dir_param)
X_train, X_test, y_train, y_test = prepareData(dir_train, dir_test)
# Replace the feature DataFrames with their underlying NumPy arrays;
# the label series are left untouched.
X_train = X_train.values
X_test = X_test.values

In [9]:
# Fit on the full feature set (the accuracy is printed by the training
# function), then write the fitted model to disk.
model = train_random_forest(X_train, y_train, X_test, y_test, params)
save_model(model, "../../models/RandomForest_model.pkl")

Accuracy: 0.9297590770274856
Mô hình đã được lưu tại: ../../models/RandomForest_model.pkl


## Train model with PCA

In [10]:
# NOTE: dir_train / dir_test are rebound here, so after this cell runs they
# no longer point at the non-PCA CSV files.
dir_train = "../../data/interim/train_PCA.csv"
dir_test = "../../data/interim/test_PCA.csv"

data_train_PCA = pd.read_csv(dir_train)
data_test_PCA = pd.read_csv(dir_test)

# The whole PCA frames are used as features, and the labels and
# hyperparameters come from the earlier (non-PCA) cell.
# NOTE(review): this assumes the PCA CSVs contain feature columns only and
# keep the same row order as the original files — TODO confirm.
# The fitted model_PCA is not saved in this cell.
model_PCA = train_random_forest(data_train_PCA, y_train, data_test_PCA, y_test, params)

Accuracy: 0.8812351543942993
