# 1. Import Libraries & Setup

In [3]:
# TCN Forecasting Pipeline
# Mục tiêu: sử dụng bộ dữ liệu đã được feature engineering để huấn luyện mô hình TCN dự báo doanh số
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn.utils import weight_norm
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pathlib import Path

# 2. Load Data

In [4]:
train_data = pd.read_parquet('data/model/train_fe.parquet')
test_data = pd.read_parquet('data/model/test_fe.parquet')

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830972 entries, 0 to 830971
Data columns (total 28 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Store                    830972 non-null  int64  
 1   DayOfWeek                830972 non-null  int32  
 2   Sales                    830972 non-null  float64
 3   Customers                830972 non-null  int64  
 4   Open                     830972 non-null  int64  
 5   Promo                    830972 non-null  int64  
 6   StateHoliday             830972 non-null  object 
 7   SchoolHoliday            830972 non-null  int64  
 8   StoreType                830972 non-null  object 
 9   Assortment               830972 non-null  object 
 10  CompetitionDistance      830972 non-null  float64
 11  Promo2                   830972 non-null  int64  
 12  CompetitionMissingFlag   830972 non-null  int64  
 13  LogSales                 830972 non-null  float64
 14  Year

In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113 entries, 0 to 1112
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Store                    1113 non-null   int64  
 1   DayOfWeek                1113 non-null   int32  
 2   Open                     1113 non-null   int64  
 3   Promo                    1113 non-null   int64  
 4   StateHoliday             1113 non-null   object 
 5   SchoolHoliday            1113 non-null   int64  
 6   StoreType                1113 non-null   object 
 7   Assortment               1113 non-null   object 
 8   CompetitionDistance      1113 non-null   float64
 9   Promo2                   1113 non-null   int64  
 10  CompetitionMissingFlag   1113 non-null   int64  
 11  LogSales                 0 non-null      float64
 12  Year                     1113 non-null   int32  
 13  Month                    1113 non-null   int32  
 14  Day                     

# 3. Data Preprocessing & Feature Engineering

In [7]:
TARGET_COL = "Sales"
ID_COL = "Store"
DATE_PARTS = ["Year", "Month", "Day"]
CAT_COLS = ["StateHoliday", "StoreType", "Assortment"]
LEAK_COLS = {TARGET_COL, "Customers", "LogSales", "Date"}
NA_ZERO_COLS = [
    "CompetitionMonthsActive",
    "Promo2WeeksActive",
    "Lag_1",
    "Lag_7",
    "Rolling_Mean_7",
    "Rolling_Std_7",
]

# Đảm bảo các cột tùy chọn tồn tại
for col in [TARGET_COL, "Customers", "LogSales"]:
    if col not in test_data.columns:
        test_data[col] = np.nan

# Tạo cột Date từ Year/Month/Day
for df in [train_data, test_data]:
    date_frame = df[DATE_PARTS].rename(columns={"Year": "year", "Month": "month", "Day": "day"})
    df["Date"] = pd.to_datetime(date_frame)

# Gộp train và test để xử lý đồng nhất
train_data["dataset"] = "train"
test_data["dataset"] = "test"
combined = pd.concat([train_data, test_data], ignore_index=True)

# Xử lý categorical và missing values
for col in CAT_COLS:
    combined[col] = combined[col].astype(str)

for col in NA_ZERO_COLS:
    if col in combined.columns:
        combined[col] = combined[col].fillna(0)

# One-hot encoding
combined = pd.get_dummies(combined, columns=CAT_COLS, drop_first=True)
combined = combined.sort_values([ID_COL, "Date"]).reset_index(drop=True)

# Tách lại train và test
train_data = combined[combined["dataset"] == "train"].drop(columns=["dataset"]).reset_index(drop=True)
test_data = combined[combined["dataset"] == "test"].drop(columns=["dataset"]).reset_index(drop=True)

FEATURE_COLS = [col for col in train_data.columns if col not in LEAK_COLS and col != ID_COL]

print(f"Train shape (post-encoding): {train_data.shape}")
print(f"Test shape  (post-encoding): {test_data.shape}")
print(f"Feature count: {len(FEATURE_COLS)}")

Train shape (post-encoding): (830972, 34)
Test shape  (post-encoding): (1113, 34)
Feature count: 29


# 4. Train/Validation Split & Scaling

In [8]:
VAL_WEEKS = 6
seq_horizon = pd.Timedelta(weeks=VAL_WEEKS)
split_date = train_data["Date"].max() - seq_horizon

train_main = train_data[train_data["Date"] < split_date].copy()
val_main = train_data[train_data["Date"] >= split_date].copy()

# Fit scalers trên train set
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

train_main[FEATURE_COLS] = feature_scaler.fit_transform(train_main[FEATURE_COLS])
val_main[FEATURE_COLS] = feature_scaler.transform(val_main[FEATURE_COLS])
test_scaled = test_data.copy()
test_scaled[FEATURE_COLS] = feature_scaler.transform(test_scaled[FEATURE_COLS])

train_main[[TARGET_COL]] = target_scaler.fit_transform(train_main[[TARGET_COL]])
val_main[[TARGET_COL]] = target_scaler.transform(val_main[[TARGET_COL]])

print(f"Split date: {split_date.date()}")
print(f"Train rows: {len(train_main):,} | Val rows: {len(val_main):,}")

Split date: 2015-06-05
Train rows: 789,557 | Val rows: 41,415


# 5. Build Sequences

In [9]:
SEQ_LEN = 30  # days
BATCH_SIZE = 256

def build_sequences(df: pd.DataFrame, feature_cols, target_col):
    """Tạo sequences cho time series forecasting"""
    sequences, targets = [], []
    for _, group in df.groupby(ID_COL):
        group = group.sort_values("Date")
        values = group[feature_cols + [target_col]].to_numpy()
        if len(values) <= SEQ_LEN:
            continue
        for start in range(len(values) - SEQ_LEN):
            seq_x = values[start:start + SEQ_LEN, :-1]
            seq_y = values[start + SEQ_LEN, -1]
            sequences.append(seq_x)
            targets.append(seq_y)
    return np.array(sequences, dtype=np.float32), np.array(targets, dtype=np.float32)

X_train, y_train = build_sequences(train_main, FEATURE_COLS, TARGET_COL)
X_val, y_val = build_sequences(val_main, FEATURE_COLS, TARGET_COL)

print(f"Train sequences: {X_train.shape}")
print(f"Val sequences  : {X_val.shape}")

Train sequences: (756107, 30, 29)
Val sequences  : (7977, 30, 29)


# 6. Dataset & DataLoader

In [10]:
class SequenceDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

train_ds = SequenceDataset(X_train, y_train)
val_ds = SequenceDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

print(f"Batches -> train: {len(train_loader)}, val: {len(val_loader)}")

Batches -> train: 2954, val: 32


# 7. TCN Architecture

In [11]:
class Chomp1d(nn.Module):
    """Loại bỏ padding thừa ở cuối để đảm bảo causal convolution"""
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()


class TemporalBlock(nn.Module):
    """Temporal Block là thành phần cơ bản của TCN"""
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super(TemporalBlock, self).__init__()

        # Dilated causal convolution
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                self.conv2, self.chomp2, self.relu2, self.dropout2)

        # Residual connection
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()
        self.init_weights()

    def init_weights(self):
        self.conv1.weight.data.normal_(0, 0.01)
        self.conv2.weight.data.normal_(0, 0.01)
        if self.downsample is not None:
            self.downsample.weight.data.normal_(0, 0.01)

    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)


class TemporalConvNet(nn.Module):
    """TCN Network gồm nhiều Temporal Blocks xếp chồng"""
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super(TemporalConvNet, self).__init__()
        layers = []
        num_levels = len(num_channels)

        for i in range(num_levels):
            dilation_size = 2 ** i  # Exponentially increasing dilation
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1,
                                    dilation=dilation_size,
                                    padding=(kernel_size-1) * dilation_size,
                                    dropout=dropout)]

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)


class SalesTCN(nn.Module):
    """TCN model cho Sales forecasting"""
    def __init__(self, input_dim, num_channels, kernel_size=3, dropout=0.2):
        super(SalesTCN, self).__init__()

        # TCN expects (batch, channels, seq_len)
        # Input sẽ là (batch, seq_len, input_dim)
        # Cần transpose

        self.tcn = TemporalConvNet(input_dim, num_channels, kernel_size, dropout)
        self.linear = nn.Linear(num_channels[-1], 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x shape: (batch, seq_len, input_dim)
        # Transpose để phù hợp với Conv1d: (batch, input_dim, seq_len)
        x = x.transpose(1, 2)

        # TCN forward
        y = self.tcn(x)

        # Lấy output của timestep cuối cùng
        y = y[:, :, -1]

        # Fully connected layer
        out = self.linear(self.dropout(y))
        return out.squeeze(-1)

# 8. Model Configuration & Initialization

In [12]:
INPUT_DIM = len(FEATURE_COLS)
NUM_CHANNELS = [64, 64, 128, 128]  # Số channels cho mỗi temporal block
KERNEL_SIZE = 3
DROPOUT = 0.2
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SalesTCN(INPUT_DIM, NUM_CHANNELS, KERNEL_SIZE, DROPOUT).to(DEVICE)
print(f"\nDevice: {DEVICE}")
print(f"\nModel Architecture:")
print(model)

# Đếm số parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")


Device: cpu

Model Architecture:
SalesTCN(
  (tcn): TemporalConvNet(
    (network): Sequential(
      (0): TemporalBlock(
        (conv1): Conv1d(29, 64, kernel_size=(3,), stride=(1,), padding=(2,))
        (chomp1): Chomp1d()
        (relu1): ReLU()
        (dropout1): Dropout(p=0.2, inplace=False)
        (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(2,))
        (chomp2): Chomp1d()
        (relu2): ReLU()
        (dropout2): Dropout(p=0.2, inplace=False)
        (net): Sequential(
          (0): Conv1d(29, 64, kernel_size=(3,), stride=(1,), padding=(2,))
          (1): Chomp1d()
          (2): ReLU()
          (3): Dropout(p=0.2, inplace=False)
          (4): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(2,))
          (5): Chomp1d()
          (6): ReLU()
          (7): Dropout(p=0.2, inplace=False)
        )
        (downsample): Conv1d(29, 64, kernel_size=(1,), stride=(1,))
        (relu): ReLU()
      )
      (1): TemporalBlock(
        (conv1): Conv1d



# 9. Training Setup

In [None]:
EPOCHS = 10
LR = 1e-3
PATIENCE = 5

criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)

# Learning rate scheduler (optional but recommended)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3, verbose=True
)

def run_epoch(loader, train_mode=True):
    """Chạy một epoch training hoặc validation"""
    epoch_loss, epoch_mae = 0.0, 0.0
    steps = 0

    if train_mode:
        model.train()
    else:
        model.eval()

    for batch in loader:
        features, targets = [b.to(DEVICE) for b in batch]

        if train_mode:
            optimizer.zero_grad()

        with torch.set_grad_enabled(train_mode):
            preds = model(features)
            loss = criterion(preds, targets)
            mae = torch.mean(torch.abs(preds - targets))

            if train_mode:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

        epoch_loss += loss.item()
        epoch_mae += mae.item()
        steps += 1

    return epoch_loss / steps, epoch_mae / steps





# 10. Training Loop

In [14]:
print("\n" + "="*70)
print("TRAINING TCN MODEL")
print("="*70)

best_loss = float("inf")
patience_counter = 0
history = []

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae = run_epoch(train_loader, train_mode=True)
    val_loss, val_mae = run_epoch(val_loader, train_mode=False)

    history.append({
        "epoch": epoch,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "val_mae": val_mae
    })

    print(f"Epoch {epoch:02d} | Train MSE {train_loss:.4f} | Val MSE {val_loss:.4f} | Val MAE {val_mae:.4f}")

    # Learning rate scheduling
    scheduler.step(val_loss)

    # Early stopping & model checkpoint
    if val_loss < best_loss:
        best_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "tcn_best.pt")
        print(f"  → New best model saved (val_loss: {val_loss:.4f})")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f"\nEarly stopping triggered after {epoch} epochs")
            break

print("\n" + "="*70)
print("TRAINING COMPLETED")
print("="*70)


TRAINING TCN MODEL
Epoch 01 | Train MSE 0.1353 | Val MSE 0.0684 | Val MAE 0.1907
  → New best model saved (val_loss: 0.0684)
Epoch 02 | Train MSE 0.0920 | Val MSE 0.0746 | Val MAE 0.2014
Epoch 03 | Train MSE 0.0836 | Val MSE 0.0658 | Val MAE 0.1898
  → New best model saved (val_loss: 0.0658)
Epoch 04 | Train MSE 0.0785 | Val MSE 0.0675 | Val MAE 0.1951
Epoch 05 | Train MSE 0.0756 | Val MSE 0.0732 | Val MAE 0.1995
Epoch 06 | Train MSE 0.0733 | Val MSE 0.0733 | Val MAE 0.2004
Epoch 07 | Train MSE 0.0713 | Val MSE 0.0674 | Val MAE 0.1934
Epoch 08 | Train MSE 0.0667 | Val MSE 0.0659 | Val MAE 0.1922

Early stopping triggered after 8 epochs

TRAINING COMPLETED


# 11. Evaluation Metrics

In [15]:
def calculate_mape(y_true, y_pred):
    """Mean Absolute Percentage Error"""
    non_zero_mask = y_true != 0
    if np.sum(non_zero_mask) == 0:
        return np.nan
    y_true_filtered = y_true[non_zero_mask]
    y_pred_filtered = y_pred[non_zero_mask]
    return np.mean(np.abs((y_true_filtered - y_pred_filtered) / y_true_filtered)) * 100

def calculate_rmspe(y_true, y_pred):
    """Root Mean Squared Percentage Error"""
    non_zero_mask = y_true != 0
    if np.sum(non_zero_mask) == 0:
        return np.nan
    y_true_filtered = y_true[non_zero_mask]
    y_pred_filtered = y_pred[non_zero_mask]
    return np.sqrt(np.mean(((y_true_filtered - y_pred_filtered) / y_true_filtered)**2)) * 100

def calculate_smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error"""
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    ratio = np.where(denominator == 0, 0, np.abs(y_pred - y_true) / denominator)
    return np.mean(ratio) * 100

def calculate_mase(y_true, y_pred, naive_forecast_error):
    """Mean Absolute Scaled Error"""
    if np.isnan(naive_forecast_error) or naive_forecast_error == 0:
        return np.nan
    mae_val = mean_absolute_error(y_true, y_pred)
    return mae_val / naive_forecast_error

# 12. Final Validation Evaluation

In [16]:
print("\n" + "="*70)
print("VALIDATION EVALUATION")
print("="*70)

# Load best model
best_path = "tcn_best.pt"
if os.path.exists(best_path):
    model.load_state_dict(torch.load(best_path, map_location=DEVICE))
    print("✓ Loaded best model weights")

model.eval()

# Generate predictions
all_preds, all_targets = [], []
with torch.no_grad():
    for features, targets in val_loader:
        features = features.to(DEVICE)
        preds = model(features).cpu().numpy()
        all_preds.append(preds)
        all_targets.append(targets.numpy())

val_preds_scaled = np.concatenate(all_preds)
val_targets_scaled = np.concatenate(all_targets)

# Inverse transform to original scale
val_preds = target_scaler.inverse_transform(val_preds_scaled.reshape(-1, 1)).ravel()
val_targets = target_scaler.inverse_transform(val_targets_scaled.reshape(-1, 1)).ravel()

# Calculate metrics
mae = mean_absolute_error(val_targets, val_preds)
rmse = np.sqrt(mean_squared_error(val_targets, val_preds))
mape = calculate_mape(val_targets, val_preds)
rmspe = calculate_rmspe(val_targets, val_preds)
smape = calculate_smape(val_targets, val_preds)

# MASE calculation
original_train_sales = train_data[train_data["Date"] < split_date][TARGET_COL].values
if len(original_train_sales) > 1:
    naive_forecast_error = np.mean(np.abs(original_train_sales[1:] - original_train_sales[:-1]))
    mase = calculate_mase(val_targets, val_preds, naive_forecast_error)
else:
    mase = np.nan

# Print results
print(f"\nValidation Metrics:")
print(f"  MAE  : {mae:,.2f}")
print(f"  RMSE : {rmse:,.2f}")
print(f"  MAPE : {mape:,.2f}%")
print(f"  RMSPE: {rmspe:,.2f}%")
print(f"  sMAPE: {smape:,.2f}%")
print(f"  MASE : {mase:,.2f}")


VALIDATION EVALUATION
✓ Loaded best model weights

Validation Metrics:
  MAE  : 583.85
  RMSE : 790.36
  MAPE : 8.38%
  RMSPE: 12.71%
  sMAPE: 8.12%
  MASE : 0.42


# 13. Test Set Inference

In [18]:
print("\n" + "="*70)
print("TEST SET INFERENCE")
print("="*70)

def build_test_sequences(test_scaled_df, historical_df, feature_cols, seq_len):
    """Tạo sequences cho test set từ historical data"""
    sequences, store_ids, forecast_dates = [], [], []

    for index, row in test_scaled_df.iterrows():
        store_id = row[ID_COL]
        forecast_date = row["Date"]
        end_date_for_sequence = forecast_date - pd.Timedelta(days=1)

        store_history = historical_df[
            (historical_df[ID_COL] == store_id) &
            (historical_df["Date"] <= end_date_for_sequence)
        ].sort_values("Date")

        if len(store_history) < seq_len:
            continue

        seq_features = store_history[feature_cols].tail(seq_len).to_numpy()
        sequences.append(seq_features.astype(np.float32))
        store_ids.append(store_id)
        forecast_dates.append(forecast_date)

    return np.array(sequences), store_ids, forecast_dates

# Prepare full historical data
full_scaled_historical_df = train_data.copy()
full_scaled_historical_df[FEATURE_COLS] = feature_scaler.transform(train_data[FEATURE_COLS])
full_scaled_historical_df = full_scaled_historical_df.sort_values([ID_COL, "Date"]).reset_index(drop=True)

# Build test sequences
X_test_seq, test_store_ids, forecast_dates = build_test_sequences(
    test_scaled,
    full_scaled_historical_df,
    FEATURE_COLS,
    SEQ_LEN
)
print(f"Test sequences: {X_test_seq.shape}")

# Create test loader
test_ds = SequenceDataset(X_test_seq)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

# Generate predictions
model.eval()
test_preds_scaled = []
with torch.no_grad():
    for batch in test_loader:
        preds = model(batch.to(DEVICE)).cpu().numpy()
        test_preds_scaled.append(preds)

test_preds = np.concatenate(test_preds_scaled)
test_preds = target_scaler.inverse_transform(test_preds.reshape(-1, 1)).ravel()

# Create submission file
submission = pd.DataFrame({
    "Store": test_store_ids,
    "ForecastDate": forecast_dates,
    "PredictedSales": test_preds
})

output_path = "tcn_predictions.csv"
submission.to_csv(output_path, index=False)
print(f"\n✓ Predictions saved to: {output_path}")
print(f"  Submission shape: {submission.shape}")
print(f"\nSample predictions:")
print(submission.head(10))

print("\n" + "="*70)
print("PIPELINE COMPLETED SUCCESSFULLY")
print("="*70)


TEST SET INFERENCE
Test sequences: (1113, 30, 29)

✓ Predictions saved to: tcn_predictions.csv
  Submission shape: (1113, 3)

Sample predictions:
   Store ForecastDate  PredictedSales
0      1   2015-07-31     3869.942871
1      2   2015-07-31     2509.787354
2      3   2015-07-31     3952.994141
3      4   2015-07-31     9365.440430
4      5   2015-07-31     2189.773193
5      6   2015-07-31     2566.241943
6      7   2015-07-31     6223.457031
7      8   2015-07-31     3686.457275
8      9   2015-07-31     5929.711914
9     10   2015-07-31     4481.862793

PIPELINE COMPLETED SUCCESSFULLY
