In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Đọc dữ liệu
df = pd.read_csv("../dataset/ECL.csv")
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

# Điền khuyết bằng nội suy theo thời gian
df = df.interpolate(method='time')

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data, index=df.index, columns=df.columns)

# Chọn 50 đặc trưng tương quan cao nhất với MT_001
correlations = df.corr()['MT_001'].abs().sort_values(ascending=False)
top_features = correlations.drop('MT_001').head(50).index.tolist()
selected_columns = top_features + ['MT_001']
reduced_df = scaled_df[selected_columns].astype(np.float32)

# Tạo dữ liệu chuỗi thời gian supervised
def create_supervised_data(data, target_column='MT_001', window_size=24):
    X, y = [], []
    for i in range(window_size, len(data)):
        X.append(data.iloc[i-window_size:i].values)
        y.append(data.iloc[i][target_column])
    return np.array(X), np.array(y)

X, y = create_supervised_data(reduced_df, window_size=24)

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (26280, 24, 51)
Shape of y: (26280,)


In [9]:
from torch import nn
import torch

In [7]:
# Tạo lớp LSTM Feature Extractor
class LSTMFeatureExtractor(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2):
        super(LSTMFeatureExtractor, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        out, _ = self.lstm(x)
        return out[:, -1, :]  # Lấy đặc trưng timestep cuối


In [10]:
# Trích xuất đặc trưng bằng LSTM (không cần fully-connected)
lstm_extractor = LSTMFeatureExtractor(input_size=X.shape[2])
lstm_extractor.eval()

# Chạy forward để lấy đặc trưng
with torch.no_grad():
    features = lstm_extractor(torch.tensor(X, dtype=torch.float32)).numpy()


In [11]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Chia train/test
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)

# Huấn luyện XGBoost
xgb_model = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.05)
xgb_model.fit(X_train, y_train)

# Dự đoán và đánh giá
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"MSE: {mse:.4f}, MAE: {mae:.4f}")


MSE: 0.2821, MAE: 0.3612


In [12]:
def create_multistep_supervised_data(data, target_column='MT_001', window_size=24, horizon=12):
    X, y = [], []
    target_series = data[target_column].values
    for i in range(window_size, len(data) - horizon):
        X.append(data.iloc[i-window_size:i].values)
        y.append(target_series[i:i+horizon])
    return np.array(X), np.array(y)

# Tạo dữ liệu cho từng horizon
X_12, y_12 = create_multistep_supervised_data(reduced_df, window_size=24, horizon=12)
X_24, y_24 = create_multistep_supervised_data(reduced_df, window_size=24, horizon=24)
X_36, y_36 = create_multistep_supervised_data(reduced_df, window_size=24, horizon=36)
X_48, y_48 = create_multistep_supervised_data(reduced_df, window_size=24, horizon=48)


In [13]:
def extract_lstm_features(X, input_size, model=None):
    if model is None:
        model = LSTMFeatureExtractor(input_size)
    model.eval()
    with torch.no_grad():
        return model(torch.tensor(X, dtype=torch.float32)).numpy()

# Giả sử input_size = X_12.shape[2]
features_12 = extract_lstm_features(X_12, input_size=X_12.shape[2])
features_24 = extract_lstm_features(X_24, input_size=X_24.shape[2])
features_36 = extract_lstm_features(X_36, input_size=X_36.shape[2])
features_48 = extract_lstm_features(X_48, input_size=X_48.shape[2])


In [14]:
def train_xgboost_multioutput(X, y):
    from sklearn.multioutput import MultiOutputRegressor
    from xgboost import XGBRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error, mean_absolute_error

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.05))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    return model, mse, mae


In [15]:
model_12, mse_12, mae_12 = train_xgboost_multioutput(features_12, y_12)
model_24, mse_24, mae_24 = train_xgboost_multioutput(features_24, y_24)
model_36, mse_36, mae_36 = train_xgboost_multioutput(features_36, y_36)
model_48, mse_48, mae_48 = train_xgboost_multioutput(features_48, y_48)

print(f"Horizon 12 - MSE: {mse_12:.4f}, MAE: {mae_12:.4f}")
print(f"Horizon 24 - MSE: {mse_24:.4f}, MAE: {mae_24:.4f}")
print(f"Horizon 36 - MSE: {mse_36:.4f}, MAE: {mae_36:.4f}")
print(f"Horizon 48 - MSE: {mse_48:.4f}, MAE: {mae_48:.4f}")


Horizon 12 - MSE: 0.3534, MAE: 0.4071
Horizon 24 - MSE: 0.3462, MAE: 0.4001
Horizon 36 - MSE: 0.3716, MAE: 0.4145
Horizon 48 - MSE: 0.3664, MAE: 0.4068
