In [6]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [7]:
mongo_client = MongoClient("mongodb://localhost:27017/")
mongo_db = mongo_client["traffic"]
features_coll = mongo_db["traffic_features"]

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"S·ª≠ d·ª•ng thi·∫øt b·ªã: {device}")

S·ª≠ d·ª•ng thi·∫øt b·ªã: cuda


In [9]:
required_columns = [
    'count_point_id', 'year', 'month', 'hour', 'day_of_week', 'is_weekend',
    'road_type', 'region_name', 'local_authority_name', 'link_length_km',
    'pedal_cycles', 'two_wheeled_motor_vehicles', 'cars_and_taxis',
    'buses_and_coaches', 'lgvs', 'hgvs_2_rigid_axle', 'all_hgvs',
    'all_motor_vehicles' # Target
]


In [10]:
# T·∫°o projection dictionary: {t√™n_tr∆∞·ªùng: 1} ƒë·ªÉ bao g·ªìm
projection = {col: 1 for col in required_columns}
projection['_id'] = 0  # Th∆∞·ªùng th√¨ kh√¥ng c·∫ßn _id, lo·∫°i b·ªè n√≥ s·∫Ω gi·∫£m t·∫£i

print("ƒêang t·∫£i d·ªØ li·ªáu t·ª´ MongoDB v·ªõi projection...")
# Th√™m projection v√†o c√¢u l·ªánh find()
cursor = features_coll.find({}, projection)
df = pd.DataFrame(list(cursor))
print(f"T·∫£i th√†nh c√¥ng {len(df)} b·∫£n ghi.")

ƒêang t·∫£i d·ªØ li·ªáu t·ª´ MongoDB v·ªõi projection...
T·∫£i th√†nh c√¥ng 4253812 b·∫£n ghi.


In [None]:
# ƒê·ªãnh nghƒ©a c√°c c·ªôt features v√† target
features = [
    "hour", "month", "is_weekend",
    "pedal_cycles", "two_wheeled_motor_vehicles",
    "cars_and_taxis", "buses_and_coaches", "lgvs",
    "hgvs_2_rigid_axle", "all_hgvs"
]
categorical_cols = ["day_of_week", "road_type", "region_name", "local_authority_name"]
target = "all_motor_vehicles"

In [None]:
# --- B∆Ø·ªöC L√ÄM S·∫†CH QUAN TR·ªåNG NH·∫§T ---
# Chuy·ªÉn ƒë·ªïi c√°c c·ªôt s·ªë v√† x·ª≠ l√Ω gi√° tr·ªã "b·∫©n"
numeric_cols_to_clean = features + [target, 'count_point_id', 'year', 'link_length_km']
for col in numeric_cols_to_clean:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

In [None]:
# Ki·ªÉm tra v√† x·ª≠ l√Ω NaN
print("S·ªë l∆∞·ª£ng gi√° tr·ªã NaN tr∆∞·ªõc khi x·ª≠ l√Ω:")
print(df.isnull().sum())
df.fillna(0, inplace=True)
print("\nS·ªë l∆∞·ª£ng gi√° tr·ªã NaN sau khi x·ª≠ l√Ω: Ho√†n t·∫•t.")

# X·ª≠ l√Ω c√°c c·ªôt ph√¢n lo·∫°i
for col in categorical_cols:
    if col in df.columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
        features.append(col) # Th√™m c·ªôt ƒë√£ m√£ h√≥a v√†o danh s√°ch features

print("Danh s√°ch features cu·ªëi c√πng ƒë·ªÉ hu·∫•n luy·ªán:")
print(features)

S·ªë l∆∞·ª£ng gi√° tr·ªã NaN tr∆∞·ªõc khi x·ª≠ l√Ω:
count_point_id                      0
year                                0
hour                                0
region_name                         0
local_authority_name                0
road_type                           0
link_length_km                2317688
pedal_cycles                        0
two_wheeled_motor_vehicles          0
cars_and_taxis                      1
buses_and_coaches                   2
lgvs                                0
hgvs_2_rigid_axle                   4
all_hgvs                            9
all_motor_vehicles                 12
day_of_week                         0
is_weekend                          0
month                               0
dtype: int64

S·ªë l∆∞·ª£ng gi√° tr·ªã NaN sau khi x·ª≠ l√Ω: Ho√†n t·∫•t.
Danh s√°ch features cu·ªëi c√πng ƒë·ªÉ hu·∫•n luy·ªán:
['hour', 'month', 'is_weekend', 'pedal_cycles', 'two_wheeled_motor_vehicles', 'cars_and_taxis', 'buses_and_coaches', 'lgvs', 'hgvs_2_ri

In [None]:
# 3Ô∏è‚É£ GIAI ƒêO·∫†N 1: HU·∫§N LUY·ªÜN LSTM (PYTORCH) ƒê·ªÇ T·∫†O FEATURE
# =========================================================
print("\nB·∫Øt ƒë·∫ßu giai ƒëo·∫°n 1: Hu·∫•n luy·ªán LSTM tr√™n t·ª´ng chu·ªói th·ªùi gian...")

# --- ƒê·ªãnh nghƒ©a Class m√¥ h√¨nh LSTM ---
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out



B·∫Øt ƒë·∫ßu giai ƒëo·∫°n 1: Hu·∫•n luy·ªán LSTM tr√™n t·ª´ng chu·ªói th·ªùi gian...


In [None]:
# --- B·∫Øt ƒë·∫ßu v√≤ng l·∫∑p hu·∫•n luy·ªán ---
df = df.sort_values(by=['count_point_id', 'year', 'month', 'hour']).reset_index(drop=True)
unique_point_ids = df['count_point_id'].unique()
all_lstm_results = []
n_steps = 5

In [None]:
for point_id in tqdm(unique_point_ids, desc="Hu·∫•n luy·ªán LSTM (PyTorch)"):
    df_point = df[df['count_point_id'] == point_id].copy()
    
    if len(df_point) < n_steps + 15: # C·∫ßn ƒë·ªß d·ªØ li·ªáu ƒë·ªÉ t·∫°o chu·ªói v√† hu·∫•n luy·ªán
        continue

    # =========================================================
    # B∆Ø·ªöC 1: CHU·∫®N B·ªä D·ªÆ LI·ªÜU CHO M√î H√åNH
    # =========================================================
    data_to_scale = df_point[features + [target]]
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data_to_scale)

    # T·∫°o chu·ªói X, y
    X_lstm, y_lstm = [], []
    for i in range(len(scaled_data) - n_steps):
        X_lstm.append(scaled_data[i:i+n_steps, :-1])
        y_lstm.append(scaled_data[i+n_steps, -1])
    
    if len(X_lstm) == 0: 
        continue
    
    # Chuy·ªÉn d·ªØ li·ªáu sang PyTorch Tensors
    X_tensor = torch.FloatTensor(np.array(X_lstm)).to(device)
    y_tensor = torch.FloatTensor(np.array(y_lstm)).unsqueeze(1).to(device)

    # T·∫°o DataLoader
    dataset = TensorDataset(X_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
    
    # =========================================================
    # B∆Ø·ªöC 2: KH·ªûI T·∫†O V√Ä HU·∫§N LUY·ªÜN M√î H√åNH
    # =========================================================
    model_lstm = LSTMModel(input_size=X_tensor.shape[2], hidden_size=32, num_layers=1, output_size=1).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model_lstm.parameters(), lr=0.001)
    
    num_epochs = 5
    for epoch in range(num_epochs):
        for inputs, labels in dataloader:
            outputs = model_lstm(inputs)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # =========================================================
    # B∆Ø·ªöC 3: D·ª∞ B√ÅO V√Ä X·ª¨ L√ù K·∫æT QU·∫¢
    # =========================================================
    model_lstm.eval() # Chuy·ªÉn sang ch·∫ø ƒë·ªô ƒë√°nh gi√°
    with torch.no_grad():
        # BI·∫æN `predicted_scaled` ƒê∆Ø·ª¢C T·∫†O RA ·ªû ƒê√ÇY!
        predicted_scaled = model_lstm(X_tensor).cpu().numpy()
    
    # Kh√¥i ph·ª•c gi√° tr·ªã g·ªëc (B√¢y gi·ªù code n√†y s·∫Ω ho·∫°t ƒë·ªông)
    inverse_helper = np.zeros((len(predicted_scaled), len(features) + 1))
    inverse_helper[:, -1] = predicted_scaled.flatten()
    predicted_volume_lstm = scaler.inverse_transform(inverse_helper)[:, -1]

    # L∆∞u k·∫øt qu·∫£
    df_point_result = df_point.iloc[n_steps:].copy()
    df_point_result['lstm_predicted_volume'] = predicted_volume_lstm
    all_lstm_results.append(df_point_result)


# --- K·∫æT TH√öC V√íNG L·∫∂P ---
# Gh√©p t·∫•t c·∫£ k·∫øt qu·∫£ t·ª´ LSTM l·∫°i
df_stacked = pd.concat(all_lstm_results, ignore_index=True)
print(f"\nHo√†n t·∫•t giai ƒëo·∫°n 1. ƒê√£ t·∫°o ƒë∆∞·ª£c {len(df_stacked)} b·∫£n ghi v·ªõi feature t·ª´ LSTM.")

Hu·∫•n luy·ªán LSTM (PyTorch): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 41647/41647 [20:10<00:00, 34.40it/s]



Ho√†n t·∫•t giai ƒëo·∫°n 1. ƒê√£ t·∫°o ƒë∆∞·ª£c 4043687 b·∫£n ghi v·ªõi feature t·ª´ LSTM.


In [None]:
# 4Ô∏è‚É£ GIAI ƒêO·∫†N 2: HU·∫§N LUY·ªÜN XGBOOST V·ªöI FEATURE T·ª™ LSTM
# =========================================================
from sklearn.model_selection import train_test_split

print("\nB·∫Øt ƒë·∫ßu giai ƒëo·∫°n 2: Hu·∫•n luy·ªán XGBoost v·ªõi feature ƒë∆∞·ª£c l√†m gi√†u...")

# --- Chu·∫©n b·ªã d·ªØ li·ªáu cho XGBoost ---
# Feature m·ªõi c·ªßa ch√∫ng ta l√† 'lstm_predicted_volume'
features_for_xgb = features + ['lstm_predicted_volume']
target_for_xgb = target

# ƒê·∫£m b·∫£o kh√¥ng c√≥ gi√° tr·ªã NaN/inf n√†o c√≤n s√≥t l·∫°i
df_stacked.replace([np.inf, -np.inf], np.nan, inplace=True)
df_stacked.fillna(0, inplace=True)

# L·∫•y d·ªØ li·ªáu X v√† y cu·ªëi c√πng
X = df_stacked[features_for_xgb]
y = df_stacked[target_for_xgb]

# --- Chia d·ªØ li·ªáu th√†nh t·∫≠p hu·∫•n luy·ªán v√† t·∫≠p ki·ªÉm tra ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"K√≠ch th∆∞·ªõc t·∫≠p hu·∫•n luy·ªán XGBoost: {X_train.shape}")
print(f"K√≠ch th∆∞·ªõc t·∫≠p ki·ªÉm tra XGBoost: {X_test.shape}")


B·∫Øt ƒë·∫ßu giai ƒëo·∫°n 2: Hu·∫•n luy·ªán XGBoost v·ªõi feature ƒë∆∞·ª£c l√†m gi√†u...
K√≠ch th∆∞·ªõc t·∫≠p hu·∫•n luy·ªán XGBoost: (3234949, 15)
K√≠ch th∆∞·ªõc t·∫≠p ki·ªÉm tra XGBoost: (808738, 15)


In [None]:
# --- Kh·ªüi t·∫°o v√† Hu·∫•n luy·ªán m√¥ h√¨nh XGBoost ---
model_xgb_stacked = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,         # TƒÉng s·ªë l∆∞·ª£ng c√¢y ƒë·ªÉ m√¥ h√¨nh h·ªçc s√¢u h∆°n
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,                 # S·ª≠ d·ª•ng t·∫•t c·∫£ c√°c CPU cores
    # Th√™m tham s·ªë tree_method ƒë·ªÉ s·ª≠ d·ª•ng GPU n·∫øu c√≥
    tree_method='gpu_hist' if device.type == 'cuda' else 'auto'
)


In [None]:
print("\nƒêang hu·∫•n luy·ªán XGBoost...")
# Hu·∫•n luy·ªán m√¥ h√¨nh v·ªõi early stopping
model_xgb_stacked.fit(X_train, y_train,
                    eval_set=[(X_test, y_test)], # D·ª´ng n·∫øu sau 50 v√≤ng m√† hi·ªáu su·∫•t kh√¥ng c·∫£i thi·ªán
                    verbose=100) # In ra k·∫øt qu·∫£ sau m·ªói 100 v√≤ng



ƒêang hu·∫•n luy·ªán XGBoost...



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation_0-rmse:795.61793
[100]	validation_0-rmse:38.90523
[200]	validation_0-rmse:36.35525
[300]	validation_0-rmse:35.46785
[400]	validation_0-rmse:35.02281
[500]	validation_0-rmse:34.80072
[600]	validation_0-rmse:34.66774
[700]	validation_0-rmse:34.59657
[800]	validation_0-rmse:34.55204
[900]	validation_0-rmse:34.56449
[999]	validation_0-rmse:34.55129


In [None]:
# 5Ô∏è‚É£ ƒê√ÅNH GI√Å M√î H√åNH K·∫æT H·ª¢P V√Ä D·ª∞ ƒêO√ÅN
# =========================================================
print("\nƒê√°nh gi√° hi·ªáu su·∫•t m√¥ h√¨nh k·∫øt h·ª£p v√† th·ª±c hi·ªán d·ª± ƒëo√°n...")

# D·ª± b√°o tr√™n t·∫≠p ki·ªÉm tra
y_pred_stacked = model_xgb_stacked.predict(X_test)

# T√≠nh to√°n c√°c ch·ªâ s·ªë hi·ªáu su·∫•t
mse = mean_squared_error(y_test, y_pred_stacked)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_stacked)

print("\n--- K·∫æT QU·∫¢ ƒê√ÅNH GI√Å M√î H√åNH CU·ªêI C√ôNG ---")
print(f"  - Mean Squared Error (MSE): {mse:.2f}")
print(f"  - Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"  - R-squared (R¬≤): {r2:.4f}")
print("---------------------------------------------")


ƒê√°nh gi√° hi·ªáu su·∫•t m√¥ h√¨nh k·∫øt h·ª£p v√† th·ª±c hi·ªán d·ª± ƒëo√°n...



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)



--- K·∫æT QU·∫¢ ƒê√ÅNH GI√Å M√î H√åNH CU·ªêI C√ôNG ---
  - Mean Squared Error (MSE): 1193.79
  - Root Mean Squared Error (RMSE): 34.55
  - R-squared (R¬≤): 0.9983
---------------------------------------------


In [None]:
# 6Ô∏è‚É£ PH√ÇN T√çCH K·∫æT QU·∫¢ D·ª∞ ƒêO√ÅN V√Ä PH√ÇN LO·∫†I √ôN T·∫ÆC
# =========================================================
print("\nPh√¢n t√≠ch k·∫øt qu·∫£ d·ª± ƒëo√°n tr√™n t·∫≠p ki·ªÉm tra...")

# T·∫°o m·ªôt DataFrame k·∫øt qu·∫£ ƒë·ªÉ d·ªÖ ph√¢n t√≠ch
df_final_result = X_test.copy()
df_final_result['actual_volume'] = y_test
df_final_result['predicted_volume'] = y_pred_stacked
df_final_result['predicted_volume'] = df_final_result['predicted_volume'].apply(lambda x: max(0, x)) # ƒê·∫£m b·∫£o kh√¥ng √¢m
df_final_result['error'] = df_final_result['actual_volume'] - df_final_result['predicted_volume']

# Th√™m l·∫°i th√¥ng tin 'lstm_predicted_volume' ƒë·ªÉ so s√°nh
df_final_result['lstm_predicted_volume'] = df_stacked.loc[df_final_result.index, 'lstm_predicted_volume']


Ph√¢n t√≠ch k·∫øt qu·∫£ d·ª± ƒëo√°n tr√™n t·∫≠p ki·ªÉm tra...


In [None]:
# ƒê·ªãnh nghƒ©a quy t·∫Øc ph√¢n lo·∫°i √πn t·∫Øc
# B·∫°n n√™n ƒëi·ªÅu ch·ªânh c√°c ng∆∞·ª°ng n√†y d·ª±a tr√™n df['all_motor_vehicles'].describe()
def classify_congestion(volume):
    if volume < 50:
        return "1 - R·∫•t th√¥ng tho√°ng"
    elif volume < 150:
        return "2 - Th√¥ng tho√°ng"
    elif volume < 300:
        return "3 - ƒê√¥ng ƒë√∫c"
    else:
        return "4 - √ôn t·∫Øc"

df_final_result['congestion_level'] = df_final_result['predicted_volume'].apply(classify_congestion)

print("\n5 d√≤ng k·∫øt qu·∫£ d·ª± b√°o cu·ªëi c√πng:")
print(df_final_result[[
    'actual_volume', 
    'lstm_predicted_volume', # D·ª± b√°o c·ªßa LSTM
    'predicted_volume',      # D·ª± b√°o cu·ªëi c√πng c·ªßa XGBoost
    'error',                 # Sai s·ªë
    'congestion_level'
]].head())


5 d√≤ng k·∫øt qu·∫£ d·ª± b√°o cu·ªëi c√πng:
         actual_volume  lstm_predicted_volume  predicted_volume     error  \
1118089          704.0             779.353698        700.983582  3.016418   
3318591           56.0              47.042337         55.524445  0.475555   
2366743           92.0              81.109572         93.991364 -1.991364   
572103           167.0             161.211343        166.162811  0.837189   
3160545           15.0              24.794047         15.284058 -0.284058   

             congestion_level  
1118089            4 - √ôn t·∫Øc  
3318591      2 - Th√¥ng tho√°ng  
2366743      2 - Th√¥ng tho√°ng  
572103           3 - ƒê√¥ng ƒë√∫c  
3160545  1 - R·∫•t th√¥ng tho√°ng  


In [None]:
import os
os.makedirs("saved_models/lstm", exist_ok=True)

for point_id in unique_point_ids:
    model_path = f"saved_models/lstm/model_lstm_{point_id}.pth"
    torch.save(model_lstm.state_dict(), model_path)


In [None]:
torch.save(model_lstm.state_dict(), "model_lstm.pth")
print("‚úÖ ƒê√£ l∆∞u m√¥ h√¨nh LSTM th√†nh c√¥ng.")


‚úÖ ƒê√£ l∆∞u m√¥ h√¨nh LSTM th√†nh c√¥ng.


In [None]:
import joblib
joblib.dump(scaler, "scaler.pkl")
print("‚úÖ ƒê√£ l∆∞u scaler th√†nh c√¥ng.")


‚úÖ ƒê√£ l∆∞u scaler th√†nh c√¥ng.


In [None]:
import joblib
joblib.dump(model_xgb_stacked, "model_xgb_stacked.pkl")
print("‚úÖ ƒê√£ l∆∞u m√¥ h√¨nh XGBoost (pickle) th√†nh c√¥ng.")


‚úÖ ƒê√£ l∆∞u m√¥ h√¨nh XGBoost (pickle) th√†nh c√¥ng.


In [None]:
# T·∫°o th∆∞ m·ª•c n·∫øu ch∆∞a c√≥
os.makedirs("model", exist_ok=True)

# L∆∞u ra file JSON
model_xgb_stacked.save_model("model/model_xgb_stacked.json")
print("‚úÖ Model XGBoost ƒë√£ ƒë∆∞·ª£c l∆∞u th√†nh c√¥ng d∆∞·ªõi d·∫°ng JSON.")



    E.g. tree_method = "hist", device = "cuda"

  self.get_booster().save_model(fname)


‚úÖ Model XGBoost ƒë√£ ƒë∆∞·ª£c l∆∞u th√†nh c√¥ng d∆∞·ªõi d·∫°ng JSON.


In [None]:
# Khi train
input_size = X_tensor.shape[2]   # N·∫øu X_train c√≥ d·∫°ng (samples, time_steps, features)
model_lstm_loaded = LSTMModel(input_size, hidden_size=32, num_layers=1, output_size=1)

In [None]:

model_lstm_loaded.load_state_dict(torch.load("model_lstm.pth", map_location=device))
model_lstm_loaded.to(device)
model_lstm_loaded.eval()
print("‚úÖ ƒê√£ t·∫£i l·∫°i m√¥ h√¨nh LSTM th√†nh c√¥ng.")


‚úÖ ƒê√£ t·∫£i l·∫°i m√¥ h√¨nh LSTM th√†nh c√¥ng.


  model_lstm_loaded.load_state_dict(torch.load("model_lstm.pth", map_location=device))


In [None]:
import torch
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# üîπ L·∫•y 5 gi√° tr·ªã volume cu·ªëi c√πng trong d·ªØ li·ªáu th·ª±c t·∫ø
last_sequence = df["all_motor_vehicles"].values[-n_steps:]

scaler_volume = MinMaxScaler()
scaler_volume.fit(df[["all_motor_vehicles"]])

last_sequence = df["all_motor_vehicles"].values[-n_steps:]
scaled_last_seq = scaler_volume.transform(df[["all_motor_vehicles"]].iloc[-n_steps:])


In [None]:
print(df.columns.tolist())


['count_point_id', 'year', 'hour', 'region_name', 'local_authority_name', 'road_type', 'link_length_km', 'pedal_cycles', 'two_wheeled_motor_vehicles', 'cars_and_taxis', 'buses_and_coaches', 'lgvs', 'hgvs_2_rigid_axle', 'all_hgvs', 'all_motor_vehicles', 'day_of_week', 'is_weekend', 'month']


In [None]:
# 7Ô∏è‚É£ X√ÇY D·ª∞NG CH·ª®C NƒÇNG D·ª∞ B√ÅO TH·ª∞C T·∫æ
# =========================================================
print("\n" + "="*50)
print("X√ÇY D·ª∞NG CH·ª®C NƒÇNG D·ª∞ B√ÅO TH·ª∞C T·∫æ")
print("="*50)

# --- A. L∆∞u tr·ªØ c√°c ƒë·ªëi t∆∞·ª£ng c·∫ßn thi·∫øt cho vi·ªác d·ª± b√°o ---
# 1. L∆∞u l·∫°i c√°c LabelEncoder ƒë√£ fit
label_encoders = {}
for col in categorical_cols:
    if col in df.columns:
        le = LabelEncoder()
        le.fit(df[col].astype(str)) # Fit tr√™n to√†n b·ªô d·ªØ li·ªáu g·ªëc
        label_encoders[col] = le
        
print("ƒê√£ l∆∞u c√°c LabelEncoder.")


X√ÇY D·ª∞NG CH·ª®C NƒÇNG D·ª∞ B√ÅO TH·ª∞C T·∫æ
ƒê√£ l∆∞u c√°c LabelEncoder.


In [None]:
def predict_next_day(count_point_id, df, features, model_lstm, model_xgb_stacked, scaler, device, n_steps=5):
    """
    D·ª± b√°o l∆∞u l∆∞·ª£ng xe cho 24 gi·ªù ti·∫øp theo t·∫°i m·ªôt ƒëi·ªÉm ƒë·∫øm c·ª• th·ªÉ (count_point_id).

    Args:
        count_point_id (int): ID c·ªßa ƒëi·ªÉm ƒë·∫øm
        df (pd.DataFrame): d·ªØ li·ªáu g·ªëc ƒë√£ hu·∫•n luy·ªán
        features (list): danh s√°ch c·ªôt features
        model_lstm (torch.nn.Module): m√¥ h√¨nh LSTM ƒë√£ hu·∫•n luy·ªán
        model_xgb_stacked: m√¥ h√¨nh XGBoost ƒë√£ hu·∫•n luy·ªán
        scaler: MinMaxScaler ƒë√£ fit tr√™n d·ªØ li·ªáu
        device: torch.device
        n_steps (int): s·ªë b∆∞·ªõc chu·ªói LSTM

    Returns:
        pd.DataFrame: d·ª± b√°o 24 gi·ªù ti·∫øp theo
    """
    df_point = df[df['count_point_id'] == count_point_id].sort_values(by=['year','month','hour']).reset_index(drop=True)
    
    if len(df_point) < n_steps:
        raise ValueError(f"Kh√¥ng ƒë·ªß d·ªØ li·ªáu cho count_point_id={count_point_id} ƒë·ªÉ t·∫°o chu·ªói LSTM.")

    results = []
    
    for hour in range(24):
        # L·∫•y n_steps cu·ªëi ƒë·ªÉ LSTM d·ª± b√°o
        last_seq = df_point[features].iloc[-n_steps:].replace([np.inf, -np.inf], 0).fillna(0).astype(float).values
        last_seq_tensor = torch.FloatTensor(last_seq).unsqueeze(0).to(device)
        
        # D·ª± b√°o LSTM
        model_lstm.eval()
        with torch.no_grad():
            lstm_pred_scaled = model_lstm(last_seq_tensor).cpu().numpy().flatten()
        
        # Kh√¥i ph·ª•c gi√° tr·ªã g·ªëc
        inverse_helper = np.zeros((len(lstm_pred_scaled), len(features)+1))
        inverse_helper[:, -1] = lstm_pred_scaled
        lstm_pred_volume = scaler.inverse_transform(inverse_helper)[:, -1][0]
        
        # Chu·∫©n b·ªã row cho XGBoost
        last_row = df_point[features].iloc[[-1]].copy()
        last_row['lstm_predicted_volume'] = lstm_pred_volume
        last_row = last_row.replace([np.inf, -np.inf], 0).fillna(0).astype(float)
        X_xgb = last_row[features + ['lstm_predicted_volume']].values.reshape(1, -1)
        
        # D·ª± b√°o b·∫±ng XGBoost
        xgb_pred = model_xgb_stacked.predict(X_xgb)[0]
        xgb_pred = max(0, xgb_pred)  # ƒë·∫£m b·∫£o >= 0

        # Ph√¢n lo·∫°i m·ª©c ƒë·ªô √πn t·∫Øc
        if xgb_pred < 50:
            congestion = "1 - R·∫•t th√¥ng tho√°ng"
        elif xgb_pred < 150:
            congestion = "2 - Th√¥ng tho√°ng"
        elif xgb_pred < 300:
            congestion = "3 - ƒê√¥ng ƒë√∫c"
        else:
            congestion = "4 - √ôn t·∫Øc"
        
        # L∆∞u k·∫øt qu·∫£
        results.append({
            "hour": hour,
            "lstm_predicted_volume": lstm_pred_volume,
            "predicted_volume": xgb_pred,
            "congestion_level": congestion
        })
        
        # C·∫≠p nh·∫≠t df_point v·ªõi gi√° tr·ªã d·ª± b√°o ƒë·ªÉ d√πng cho gi·ªù ti·∫øp theo
        next_row = last_row.copy()
        next_row['hour'] = hour
        next_row['all_motor_vehicles'] = xgb_pred
        df_point = pd.concat([df_point, next_row], ignore_index=True)
    
    return pd.DataFrame(results)


In [None]:
# Kh·ªüi t·∫°o XGBRegressor r·ªóng
model_xgb_stacked_loaded = xgb.XGBRegressor()

# Load model t·ª´ JSON
model_xgb_stacked_loaded.load_model("model/model_xgb_stacked.json")

# N·∫øu mu·ªën d√πng GPU khi predict
model_xgb_stacked_loaded.tree_method = 'gpu_hist'

print("‚úÖ Model XGBoost ƒë√£ ƒë∆∞·ª£c load th√†nh c√¥ng t·ª´ JSON.")


‚úÖ Model XGBoost ƒë√£ ƒë∆∞·ª£c load th√†nh c√¥ng t·ª´ JSON.


In [None]:
# üîπ D·ª± b√°o 24 gi·ªù ti·∫øp theo
result_df = predict_next_day(
    count_point_id=54,
    df=df,
    features=features,
    model_lstm=model_lstm,                  # LSTM ƒë√£ load ho·∫∑c train s·∫µn
    model_xgb_stacked=model_xgb_stacked_loaded,  # D√πng model ƒë√£ load
    scaler=scaler,
    device=device,
    n_steps=10
)

  last_seq = df_point[features].iloc[-n_steps:].replace([np.inf, -np.inf], 0).fillna(0).astype(float).values
  last_row = last_row.replace([np.inf, -np.inf], 0).fillna(0).astype(float)
  last_seq = df_point[features].iloc[-n_steps:].replace([np.inf, -np.inf], 0).fillna(0).astype(float).values
  last_row = last_row.replace([np.inf, -np.inf], 0).fillna(0).astype(float)
  last_seq = df_point[features].iloc[-n_steps:].replace([np.inf, -np.inf], 0).fillna(0).astype(float).values
  last_row = last_row.replace([np.inf, -np.inf], 0).fillna(0).astype(float)
  last_seq = df_point[features].iloc[-n_steps:].replace([np.inf, -np.inf], 0).fillna(0).astype(float).values
  last_row = last_row.replace([np.inf, -np.inf], 0).fillna(0).astype(float)
  last_seq = df_point[features].iloc[-n_steps:].replace([np.inf, -np.inf], 0).fillna(0).astype(float).values
  last_row = last_row.replace([np.inf, -np.inf], 0).fillna(0).astype(float)
  last_seq = df_point[features].iloc[-n_steps:].replace([np.inf, -np.inf], 

In [None]:
print(result_df)

    hour  lstm_predicted_volume  predicted_volume      congestion_level
0      0              72.024397         17.236866  1 - R·∫•t th√¥ng tho√°ng
1      1               5.527166         15.644845  1 - R·∫•t th√¥ng tho√°ng
2      2               5.119615         15.644845  1 - R·∫•t th√¥ng tho√°ng
3      3               2.792055         15.644845  1 - R·∫•t th√¥ng tho√°ng
4      4              10.234891         15.644845  1 - R·∫•t th√¥ng tho√°ng
5      5              15.908782         15.644845  1 - R·∫•t th√¥ng tho√°ng
6      6              16.679357         15.644845  1 - R·∫•t th√¥ng tho√°ng
7      7              23.087625         15.422552  1 - R·∫•t th√¥ng tho√°ng
8      8              32.009406         15.702405  1 - R·∫•t th√¥ng tho√°ng
9      9              42.645827         15.896573  1 - R·∫•t th√¥ng tho√°ng
10    10              54.042291         16.096846  1 - R·∫•t th√¥ng tho√°ng
11    11              54.828234         16.096846  1 - R·∫•t th√¥ng tho√°ng
12    12        

In [11]:
print(df.columns.tolist())

['count_point_id', 'year', 'hour', 'region_name', 'local_authority_name', 'road_type', 'link_length_km', 'pedal_cycles', 'two_wheeled_motor_vehicles', 'cars_and_taxis', 'buses_and_coaches', 'lgvs', 'hgvs_2_rigid_axle', 'all_hgvs', 'all_motor_vehicles', 'day_of_week', 'is_weekend', 'month']
