# Evaluation Strategies of Time-Series Models


In [None]:
import numpy as np
import pandas as pd
import altair as alt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import xgboost as xgb

Data for Demonstration: Cleaned data with daily sales after preprocessing from the previous chapter

In [None]:
# Location of the cleaned daily-sales CSV (a Google Drive download link).
url = 'https://drive.google.com/uc?id=1M1ryHCBP55fhv8wZPWURun4MgNQfN6zb'

# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(url)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(739, 4)

In [None]:
# Inspect 10 randomly drawn rows; no random_state is passed, so the selection differs between runs.
df.sample(10)

Unnamed: 0,invoice_date,total_transaction,total_quantity,total_sales
280,2010-09-07,50,14220,24343.21
339,2010-11-05,124,0,29664.75
178,2010-05-28,95,11495,16937.27
302,2010-09-29,121,19981,35256.98
6,2009-12-07,110,15499,38926.74
60,2010-01-30,0,0,0.0
28,2009-12-29,0,0,0.0
575,2011-06-29,62,8983,17271.73
675,2011-10-07,95,25657,39820.47
275,2010-09-02,84,14805,24092.22


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,total_transaction,total_quantity,total_sales
count,739.0,739.0,739.0
mean,66.451962,11846.346414,24662.88
std,43.475393,10239.964118,531826.1
min,0.0,0.0,-8049547.0
25%,41.0,5345.0,8948.495
50%,68.0,10697.0,19694.92
75%,94.0,16619.0,29656.37
max,207.0,119118.0,11833820.0


In [None]:
# List the column labels of the frame.
df.columns

Index(['invoice_date', 'total_transaction', 'total_quantity', 'total_sales'], dtype='object')

## Evaluation Strategies


### Train-Test Split for Time Series

  The train-test split for time-series data ensures that models are evaluated on unseen future data, respecting the temporal order inherent in time series. Unlike traditional machine learning, where data can be randomly shuffled, time-series data requires a chronological split to mimic real-world forecasting scenarios. In real-world applications, the test size is often set based on specific problem requirements, such as forecasting the next 30 days of sales, rather than a fixed percentage split (e.g., 80-20).


#### Implementation
The test set will consist of the last 30 days of sales data, with the remaining data used for training.


In [None]:
# Parse the date strings and make them the index so the frame can be sliced
# chronologically; only the quantity column is kept from here on.
df['invoice_date'] = pd.to_datetime(df['invoice_date'])
df.set_index('invoice_date', inplace=True)
df = df[['total_quantity']]

# Ensure continuous date range by filling missing dates with zero quantities.
# The target index carries the name 'invoice_date' so that reset_index() in the
# plotting cells still produces a column with that name.
# NOTE: reindex raises on duplicate index labels, so this assumes at most one
# row per calendar day.
all_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq='D', name='invoice_date')
df = df.reindex(all_dates, fill_value=0)

In [None]:
# Display the first few rows
# NOTE: the printed label says "daily_quantity", but the frame being shown is `df`.
print("First few rows of daily_quantity:")
df.head()

First few rows of daily_quantity:


Unnamed: 0_level_0,total_quantity
invoice_date,Unnamed: 1_level_1
2009-12-01,20736
2009-12-02,25657
2009-12-03,44557
2009-12-04,19550
2009-12-05,4636


In [None]:
# Step 2: Chronological Train-Test Split
# The cut-off lies 30 days before the last date in the index. Rows dated after
# the cut-off form the test set; rows up to and including it form the
# training set.
holdout_span = pd.Timedelta(days=30)
cutoff_date = df.index.max() - holdout_span

in_training = df.index <= cutoff_date
in_test = df.index > cutoff_date
train_data = df[in_training]
test_data = df[in_test]

# Report the cut-off date and the size of each partition
print(f"Cut-off date: {cutoff_date}")
print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

Cut-off date: 2011-11-09 00:00:00
Training data shape: (709, 1)
Test data shape: (30, 1)


In [None]:
# Write both partitions to CSV files in the working directory so that
# future chapters can reload them
for csv_name, partition in (('train_data.csv', train_data), ('test_data.csv', test_data)):
    partition.to_csv(csv_name)

In [None]:
# Step 3: Visualize the Train-Test Split with Altair
# Turn the date index into a regular column and label every row with the
# partition it belongs to (rows after the cut-off are test data).
plot_data = df.reset_index()
plot_data['Dataset'] = [
    'Test Data' if day > cutoff_date else 'Training Data'
    for day in plot_data['invoice_date']
]

# Colour mapping: training data in blue, test data in orange
dataset_colors = alt.Scale(domain=['Training Data', 'Test Data'], range=['blue', 'orange'])

# Line plot of the daily quantity, coloured by partition
line = (
    alt.Chart(plot_data)
    .mark_line()
    .encode(
        x=alt.X('invoice_date:T', title='Date'),
        y=alt.Y('total_quantity:Q', title='Total Quantity Sold'),
        color=alt.Color('Dataset:N', scale=dataset_colors),
    )
    .properties(
        title='Train-Test Split for Daily Sales Time Series',
        width=800,
        height=400,
    )
)

# Red dashed vertical rule marking the cut-off date
cutoff_frame = pd.DataFrame({'invoice_date': [cutoff_date]})
cutoff_line = (
    alt.Chart(cutoff_frame)
    .mark_rule(color='red', strokeDash=[5, 5])
    .encode(x='invoice_date:T')
)

# Layer the rule over the line plot, enable grid lines and pan/zoom
chart = (line + cutoff_line).configure_axis(grid=True).interactive()

# Display or save the chart
chart.display()
# Optionally, save the chart if display doesn't work in your environment
# chart.save('train_test_split.html')

> This chart illustrates the train-test split of daily sales data over a 30-day forecast horizon, with training data in blue and test data in orange, separated by a red dashed line at the cutoff date.

#### Introducing a 7-Day Gap

To simulate a **real-world scenario** such as **delayed data availability**, we introduce a **7-day gap** between the training and test sets.

- This means the **test set starts 7 days after the training set ends**: the training set ends on the cut-off date, the test set begins 7 days later, and the 6 days in between are used by neither set.


In [None]:
# Introduce a 7-day gap
# The test window opens gap_days after the cut-off date, while training still
# ends on the cut-off date itself. The dates strictly between those two
# boundaries therefore belong to neither set.
gap_days = 7
gap_span = pd.Timedelta(days=gap_days)
test_start_date = cutoff_date + gap_span

# Rebuild both partitions around the gap
train_data_with_gap = df.loc[df.index <= cutoff_date]
test_data_with_gap = df.loc[df.index >= test_start_date]

# Report the new test start date and the size of each partition
print(f"New test start date (after gap): {test_start_date}")
print(f"Training data shape (with gap): {train_data_with_gap.shape}")
print(f"Test data shape (with gap): {test_data_with_gap.shape}")

New test start date (after gap): 2011-11-16 00:00:00
Training data shape (with gap): (709, 1)
Test data shape (with gap): (24, 1)


In [None]:
########################## Visualize with gap ##########################

# --- Step 1: Combine the DataFrames for Altair ---
# A layered Altair chart is easiest to build from one DataFrame, so both
# partitions are labelled and stacked into a single frame.

# Copy each partition and tag it with a 'Dataset' label used for colouring
train_plot_df, test_plot_df = train_data_with_gap.copy(), test_data_with_gap.copy()
train_plot_df['Dataset'] = 'Training Data'
test_plot_df['Dataset'] = 'Test Data'

# Stack the two parts; reset_index turns the date index into a plottable column
plot_data = pd.concat([train_plot_df, test_plot_df]).reset_index()

# --- Step 2: Create the Chart Components ---


def _dashed_rule(when, rule_color):
    """Return a dashed vertical rule drawn at the date `when` in `rule_color`."""
    marker_frame = pd.DataFrame({'invoice_date': [when]})
    return alt.Chart(marker_frame).mark_rule(
        color=rule_color,
        strokeDash=[5, 5],  # [dash_length, gap_length]
    ).encode(x='invoice_date:T')


# Line chart of both partitions, coloured by the 'Dataset' column
dataset_color = alt.Color(
    'Dataset:N',
    scale=alt.Scale(domain=['Training Data', 'Test Data'], range=['blue', 'orange']),
    legend=alt.Legend(title="Dataset"),
)
line_chart = alt.Chart(plot_data).mark_line().encode(
    x=alt.X('invoice_date:T', title='Date'),
    y=alt.Y('total_quantity:Q', title='Total Quantity Sold'),
    color=dataset_color,
)

# Red rule: the cut-off date (last training day)
cutoff_line = _dashed_rule(cutoff_date, 'red')

# Green rule: the first day of the test set
test_start_line = _dashed_rule(test_start_date, 'green')

# --- Step 3: Layer the Components and Finalize the Chart ---
layered = line_chart + cutoff_line + test_start_line
final_chart = (
    layered
    .properties(title='Train-Test Split with 7-Day Gap', width=800, height=400)
    .configure_axis(grid=True)
    .interactive()
)

# Display the final chart
final_chart.display()
# Optionally, save the chart if display doesn't work in your environment
# final_chart.save('train_test_split_with_7day_gap.html')


#### The Limitation of a Single Train-Test Split

A single, fixed train-test split provides only **one look** at your model's performance. While it is simple and essential for a final holdout set, relying solely on it for model selection and tuning can be risky and misleading.

Here are the primary limitations of the single-split approach (shown conceptually in the time-based estimation):

1. **Split-Point Sensitivity (The "Lucky Split" Problem):** The model's performance metric (e.g., RMSE, MAE) heavily depends on the *specific period* chosen for the test set.
   - If the test set coincides with an unusually stable or predictable period, you might get an overly optimistic score, leading to deploying a model that fails when volatility returns.
   - Conversely, if the test set includes a unique, disruptive event (like the start of a pandemic or a massive promotional campaign not seen before), your score will be overly pessimistic, causing you to discard a potentially good model.

2. **Lack of Robustness and Confidence:** A single performance score offers no insight into the *variability* of your model's performance. A robust model performs well consistently across different time periods, and a single split cannot provide this assurance; you cannot determine if your score was a fluke or typical.

3. **Inefficient Use of Data:** In a single split, the model is never trained on the data from the test set. Valuable, recent information that could enhance the model is reserved solely for one-time testing, making the evaluation less comprehensive than it could be.

**Time-based cross-validation**, illustrated by the methods in the time-based cross-validation section, directly addresses these issues by creating *multiple* train-test splits across the timeline. By averaging the performance scores from these splits, we obtain a much more reliable and stable estimate of how the model will perform on future, unseen data.

### Time-Based Cross-Validation

This approach addresses the limitations of a **single train-test split** and provides a better estimate of how a model will perform on future, unseen data.


#### Types of Time-Based Cross-Validation

We’ll explore two specific techniques in the following subsections:

1.  **Rolling Forecast Origin (Expanding Window)**: Gradually increases the training data by including previous test periods.
2.  **Sliding Window Forecast Origin (Rolling Window)**: Uses a fixed-size training window that moves forward in time.


#### Rolling Forecast Origin (Expanding Window Cross-Validation)

A simple train-test split evaluates the model on only one specific period. A more rigorous approach is **walk-forward validation**, also known as evaluation on a rolling forecast origin. This method provides a better estimate of how the model is likely to perform over time in a real deployment scenario.

The process works iteratively:

1. Select an initial training period (e.g., the first 70% of the data).
2. Train the model on this initial period.
3. Forecast the next single time step (or multiple steps, depending on your needs).
4. Record the forecast and the actual value for that time step.
5. Expand the training data to include the actual value from the step you just predicted.
6. Re-train the model (or efficiently update it, if the model allows) using the expanded training set.
7. Repeat steps 3-6, moving the forecast origin forward one step at a time until you have forecasts covering the desired evaluation period (the equivalent of the test set in the simple split).

In [None]:
# --- Step 1: Prepare the Data ---
# Flat copy of the series with the date as an ordinary datetime column
plot_data = df.copy().reset_index()
plot_data['invoice_date'] = pd.to_datetime(plot_data['invoice_date'])

# --- Step 2: Define Parameters ---
n_folds, validation_days = 2, 30
total_validation_days = validation_days * n_folds

# The validation windows together cover the last total_validation_days days
# of the series (counting the final date), so the first window opens
# total_validation_days - 1 days before the final date.
latest_date = plot_data['invoice_date'].max()
cv_start_date = latest_date - pd.Timedelta(days=total_validation_days - 1)

# --- Step 3: Create Data for Visualization ---
def create_fold_data(fold_idx):
    """Build the shaded-region rows for one expanding-window fold.

    Reads the module-level `cv_start_date`, `validation_days` and `plot_data`.

    Args:
        fold_idx: Zero-based fold number; each step moves the validation
            window forward by `validation_days` days.

    Returns:
        A DataFrame with columns 'invoice_date', 'type', 'start_date' and
        'end_date'. It holds two rows for the training span followed by two
        rows for the validation span; within a span both rows carry the same
        'start_date' and 'end_date'.
    """
    # Validation window for this fold
    val_first = cv_start_date + pd.Timedelta(days=fold_idx * validation_days)
    val_last = val_first + pd.Timedelta(days=validation_days - 1)

    # Training always starts at the first date of the series and ends the day
    # before validation begins, so it grows from fold to fold.
    train_first = plot_data['invoice_date'].min()
    train_last = val_first - pd.Timedelta(days=1)

    def _span_rows(label, first, last):
        # One row per boundary date, both describing the same span
        return pd.DataFrame({
            'invoice_date': [first, last],
            'type': [label, label],
            'start_date': [first, first],
            'end_date': [last, last],
        })

    return pd.concat([
        _span_rows('Training Data', train_first, train_last),
        _span_rows('Validation Data', val_first, val_last),
    ])

# Generate data for two folds
fold1_data, fold2_data = create_fold_data(0), create_fold_data(1)

# --- Step 4: Build the Altair Charts ---


def _fold_panel(fold_data, panel_title):
    """Overlay the shaded train/validation spans of one fold on the full series."""
    series_line = alt.Chart(plot_data).mark_line(color='black').encode(
        x=alt.X('invoice_date:T', title='Date'),
        y=alt.Y('total_quantity:Q', title='Total Quantity Sold'),
    ).properties(
        title=panel_title,
        width=800,
        height=200,
    )
    shaded_spans = alt.Chart(fold_data).mark_rect(opacity=0.3).encode(
        x='start_date:T',
        x2='end_date:T',
        color=alt.Color(
            'type:N',
            scale=alt.Scale(
                domain=['Training Data', 'Validation Data'],
                range=['#4C72B0', '#DD8452'],
            ),
        ),
    )
    return series_line + shaded_spans


# One panel per fold
chart1 = _fold_panel(fold1_data, 'Fold 1')
chart2 = _fold_panel(fold2_data, 'Fold 2')

# Stack the panels vertically, with grid lines and an overall title
stacked = alt.vconcat(chart1, chart2)
final_chart = stacked.configure_axis(grid=True).properties(
    title='Time-Based Cross-Validation with Two Folds'
)

# Display the chart
final_chart.display()
# Optionally, save the chart if display doesn't work
# final_chart.save('time_based_cross_validation.html')

**Explanation of the Figure**

This chart visualizes the **Expanding Window Cross-Validation** method.

*   **Subplots for Clarity:** Each subplot, "Fold 1" and "Fold 2," represents a separate evaluation cycle.
*   **Data Usage:** The light blue area is the **Training Data** used to build the model, while the light orange area is the **Validation Data** where the model's forecast is tested.
*   **The "Expanding" Process:** From Fold 1 to Fold 2, the validation window slides forward to a more recent 30-day period. Crucially, the training data **expands** to include all historical data up to that point. This is visible as the blue region in Fold 2 is wider than in Fold 1.

This method is superior to a single train-test split because it assesses the model's performance across multiple, consecutive time periods, providing a more reliable estimate of its true forecasting ability. Because both folds are plotted over the same date range, the two panels can be compared directly.


#### Sliding Window Forecast Origin (Rolling Window Cross-Validation)

In the **sliding window cross-validation** technique, a model is trained and evaluated on windows of data that "slide" forward through the time series. Unlike an expanding window, the training window maintains a **fixed size**, dropping the oldest data as it moves.

This method is particularly useful when you believe that recent data is more representative of the future than older data, a phenomenon known as **concept drift**. By discarding the distant past, the model focuses only on the most relevant patterns.

This method is also helpful when you are working with a sequence-to-sequence model, for which both input and output sequences must have a fixed length.



In [None]:
###################### VISUALIZATION ######################
# --- Step 1: Prepare data for Altair ---
plot_data_full = df.copy().reset_index()
plot_data_full['invoice_date'] = pd.to_datetime(plot_data_full['invoice_date'])

# --- Step 2: Define Parameters for Sliding Window ---
n_folds = 2
validation_days = 30
train_days = 180  # Fixed training window size, in days
total_validation_days = n_folds * validation_days
# The validation windows together cover the last total_validation_days days of the series
cv_start_date = plot_data_full['invoice_date'].max() - pd.Timedelta(days=total_validation_days - 1)

# --- Step 3: Create the Single, Unified DataFrame for Plotting ---
# One list of records feeds both layers: 'rect' records describe the shaded
# train/validation spans, 'line' records carry the time series itself. Each
# record is tagged with the fold ('method') it is drawn in.
unified_sliding_data = []

# Generate rectangle data
for i in range(n_folds):
    fold_name = f'Fold {i + 1}'
    validation_start = cv_start_date + pd.Timedelta(days=i * validation_days)
    validation_end = validation_start + pd.Timedelta(days=validation_days - 1)

    # *** THIS IS THE KEY LOGIC CHANGE for a Sliding Window ***
    # Training ends the day before validation starts and always spans exactly
    # train_days days, so its start date moves forward with every fold.
    train_end = validation_start - pd.Timedelta(days=1)
    train_start = train_end - pd.Timedelta(days=train_days - 1)

    # Add rectangle definitions
    unified_sliding_data.append({'method': fold_name, 'type': 'Training Data', 'start_date': train_start, 'end_date': train_end, 'mark_type': 'rect'})
    unified_sliding_data.append({'method': fold_name, 'type': 'Validation Data', 'start_date': validation_start, 'end_date': validation_end, 'mark_type': 'rect'})

# Add line data for each fold: the full series is repeated once per facet.
# The (date, quantity) pairs are read once from the columns instead of
# walking the frame row by row for every fold.
series_points = list(zip(plot_data_full['invoice_date'], plot_data_full['total_quantity']))
for i in range(n_folds):
    fold_name = f'Fold {i + 1}'
    unified_sliding_data.extend(
        {'method': fold_name, 'invoice_date': day, 'total_quantity': quantity, 'mark_type': 'line'}
        for day, quantity in series_points
    )

master_sliding_df = pd.DataFrame(unified_sliding_data)


# --- Step 4: Build the Interactive, Faceted Altair Chart ---
# Interval selection bound to the x scale, for horizontal pan/zoom
brush = alt.selection_interval(bind='scales', encodings=['x'])

# Shaded spans: only the 'rect' records
rect_chart = alt.Chart().mark_rect(opacity=0.3).encode(x='start_date:T', x2='end_date:T', color='type:N').transform_filter(alt.datum.mark_type == 'rect')

# Line chart definition: only the 'line' records.
# The tooltip must reference the fields that actually exist in the data
# ('invoice_date' and 'total_quantity'); the titles set the labels shown.
line_chart = alt.Chart().mark_line(color='black', opacity=0.6).encode(
    x=alt.X('invoice_date:T', title='Date'),
    y=alt.Y('total_quantity:Q'), # Simplified y-axis encoding
    tooltip=[
        alt.Tooltip('invoice_date:T', format='%Y-%m-%d', title='Date'),
        alt.Tooltip('total_quantity:Q', format=',', title='Quantity')
    ]
).transform_filter(alt.datum.mark_type == 'line')

# Layer both marks over the shared data, then split into one column per fold
sliding_window_chart = alt.layer(
    rect_chart, line_chart, data=master_sliding_df
).add_params(brush).facet(
    column=alt.Column('method:N', header=alt.Header(title=None, labelFontSize=16, labelFontWeight='bold')),
    title=alt.TitleParams(
        "Interactive Sliding Window Cross-Validation",
        subtitle="The fixed-size training window slides forward to focus on more recent data.",
        fontSize=20, subtitleFontSize=14, anchor='start'
    )
).resolve_scale(y='shared').configure_view(stroke=None)

sliding_window_chart


**Explanation of the Figure**

*   **Subplots for Clarity:** Each subplot ("Fold 1", "Fold 2") represents a distinct model training and evaluation cycle.
*    **Data Usage:** The light blue area is the **Training Data** used to build the model; the light orange area is the **Validation Data** used to test the model's forecast.
*   **The "Sliding" Process:**
    *   The blue training window maintains a **fixed size** (e.g., 180 days).
    *    From Fold 1 to Fold 2, both the training (blue) and validation (orange) windows **slide forward** chronologically.
    * Crucially, the training window does *not* expand; it maintains its fixed size by dropping the oldest data as it moves forward.

This technique is valuable for models where recent trends are more important than long-term history, helping to prevent the model from learning outdated patterns.

