# FPT Stock Price Forecasting - Complete Pipeline

## Objective:
- Predict FPT closing prices for the next 100 days
- Use Linear, DLinear, NLinear models with univariate time series
- Input window sizes: 7d, 30d, 120d, 480d
- Train with Early Stopping (max 1000 epochs)
- Save each model's predictions to its own submission file (101 lines per file: header + 100 predictions)
- Plot and compare results

## 1. Import Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import timedelta

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Silence every warning category so the notebook output stays compact.
# NOTE: this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Global plotting look: seaborn-flavoured matplotlib style plus the 'husl' palette.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Use a CUDA device when PyTorch reports one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
print(f"PyTorch version: {torch.__version__}")

## 2. Load and Explore Data

In [None]:
# Load FPT training data, parse the 'time' column as datetimes and sort rows
# chronologically so positional slices such as "the last N rows" are the most recent.
df = pd.read_csv('FPT_train.csv')
df['time'] = pd.to_datetime(df['time'])
df = df.sort_values('time').reset_index(drop=True)

# Size and date coverage of the loaded data.
print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['time'].min()} to {df['time'].max()}")
print(f"Total data points: {len(df)} rows")
print(f"\nLast date in training data: {df['time'].iloc[-1]}")
# The reported start is simply "last date + 1 calendar day"; no trading calendar is applied.
print(f"Next 100 days will start from: {df['time'].iloc[-1] + timedelta(days=1)}")

# `display` is not defined in this file — presumably the IPython/Jupyter builtin.
display(df.head())
display(df.tail())
print("\nBasic Statistics:")
display(df.describe())

In [None]:
# Visualize FPT stock price: a 2x2 grid with closing price, price histogram,
# daily returns and trading volume.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('FPT Stock Data Analysis', fontsize=16, fontweight='bold')

# Top-left: close price over time
axes[0, 0].plot(df['time'], df['close'], linewidth=1.5, color='blue', alpha=0.8)
axes[0, 0].set_title('FPT Closing Price Over Time')
axes[0, 0].set_xlabel('Date')
axes[0, 0].set_ylabel('Price (VND)')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].tick_params(axis='x', rotation=45)

# Top-right: distribution of closing prices
axes[0, 1].hist(df['close'], bins=50, color='green', alpha=0.7, edgecolor='black')
axes[0, 1].set_title('Price Distribution')
axes[0, 1].set_xlabel('Price (VND)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].grid(True, alpha=0.3)

# Bottom-left: daily returns.
# NOTE: this stores a new 'daily_return' column on the shared `df`. Its first
# entry is NaN (there is no previous close), so any later `df.dropna()` call
# without a `subset` will discard the first row.
df['daily_return'] = df['close'].pct_change()
axes[1, 0].plot(df['time'], df['daily_return'], linewidth=1, alpha=0.7, color='red')
axes[1, 0].axhline(0, color='black', linewidth=1, alpha=0.5)
axes[1, 0].set_title('Daily Returns')
axes[1, 0].set_xlabel('Date')
axes[1, 0].set_ylabel('Return')
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].tick_params(axis='x', rotation=45)

# Bottom-right: traded volume over time
axes[1, 1].plot(df['time'], df['volume'], linewidth=1, color='purple', alpha=0.7)
axes[1, 1].set_title('Trading Volume')
axes[1, 1].set_xlabel('Date')
axes[1, 1].set_ylabel('Volume')
axes[1, 1].grid(True, alpha=0.3)
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 3. Dataset Preparation

In [None]:
# Univariate Time Series Dataset
class UnivariateTimeSeriesDataset(Dataset):
    """Sliding-window dataset over a single column of a DataFrame.

    Sample ``i`` is the pair ``(x, y)`` where ``x`` holds ``seq_len``
    consecutive values of ``target_col`` starting at position ``i`` and ``y``
    holds the ``pred_len`` values that immediately follow.

    Args:
        data: DataFrame containing ``target_col``; rows are used in their
            existing order.
        seq_len: Number of past values in each input window.
        pred_len: Number of future values in each target window.
        target_col: Name of the column to forecast.
    """

    def __init__(self, data, seq_len, pred_len, target_col='close'):
        # Drop only rows whose *target* is missing. A bare dropna() would also
        # discard rows because of NaNs in unrelated columns (for example a
        # derived returns column), silently shrinking the series.
        self.data = data.dropna(subset=[target_col]).reset_index(drop=True)
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.target_col = target_col
        self.series = self.data[target_col].values

    def __len__(self):
        # Clamp at zero: a series shorter than one full (input + target) window
        # has no samples, and len() must never be negative.
        return max(0, len(self.series) - self.seq_len - self.pred_len + 1)

    def __getitem__(self, idx):
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            # Raising here avoids returning truncated windows and lets plain
            # iteration over the dataset terminate.
            raise IndexError(
                f"index {idx} out of range for dataset with {n_samples} samples"
            )
        target_start = idx + self.seq_len
        # Copy so the returned tensors never alias the underlying series.
        x = self.series[idx:target_start].copy()
        y = self.series[target_start:target_start + self.pred_len].copy()
        return torch.FloatTensor(x), torch.FloatTensor(y)

# Build one sliding-window dataset per input window length.
seq_lengths = [7, 30, 120, 480]
pred_len = 1  # single-step targets; multi-step forecasts are generated autoregressively later

datasets = {}
for seq_len in seq_lengths:
    # Keyed by the window label ('7d', '30d', ...).
    dataset = datasets[f'{seq_len}d'] = UnivariateTimeSeriesDataset(
        df, seq_len, pred_len, target_col='close'
    )
    print(f"Dataset {seq_len}d: {len(dataset)} samples")

print(f"\nCreated {len(datasets)} datasets for window sizes: {seq_lengths}")

In [None]:
# Train/Test Split - Preserving temporal order (no shuffling)
# Given small dataset (1149 rows), use 80/20 split
def create_time_based_splits(dataset, train_ratio=0.8):
    """Split ``dataset`` chronologically into a leading and a trailing part.

    The first ``int(len(dataset) * train_ratio)`` samples form the training
    subset and the remaining samples form the test subset; nothing is shuffled.

    Returns:
        Tuple ``(train_subset, test_subset)`` of ``torch.utils.data.Subset``.
    """
    n_samples = len(dataset)
    boundary = int(n_samples * train_ratio)
    bounds = ((0, boundary), (boundary, n_samples))
    leading, trailing = (
        torch.utils.data.Subset(dataset, list(range(start, stop)))
        for start, stop in bounds
    )
    return leading, trailing

# Chronological 80/20 split for every window size.
data_splits = {}
print("Creating train/test splits (80/20):")
print("Note: Temporal order preserved to avoid data leakage\n")

for seq_name, dataset in datasets.items():
    train, test = create_time_based_splits(dataset, 0.8)
    data_splits[seq_name] = dict(train=train, test=test)
    print(f"{seq_name}: Train={len(train)}, Test={len(test)}")

print("\n✓ Data splits created successfully!")

## 4. Model Implementations

In [None]:
# Linear Model
class Linear(nn.Module):
    """Single affine layer mapping an input window straight to the forecast.

    Args:
        seq_len: Length of the input window (size of the last input dimension).
        pred_len: Number of values produced per input window.
    """

    def __init__(self, seq_len, pred_len=1):
        super().__init__()
        self.seq_len, self.pred_len = seq_len, pred_len
        self.linear = nn.Linear(in_features=seq_len, out_features=pred_len)

    def forward(self, x):
        # The affine map is applied over the last dimension of x.
        forecast = self.linear(x)
        return forecast

# DLinear Model
class DLinear(nn.Module):
    """Decomposition-Linear forecaster.

    The input window is split into a moving-average trend and the remainder
    (called "seasonal" here); each part goes through its own linear layer and
    the two forecasts are summed.

    Args:
        seq_len: Length of the input window.
        pred_len: Number of values produced per input window.
        moving_avg: Requested moving-average window. It is clamped to the
            range ``[1, seq_len - 1]`` (and to 1 when ``seq_len`` is 1).
    """

    def __init__(self, seq_len, pred_len=1, moving_avg=5):
        super().__init__()
        self.seq_len = seq_len
        self.pred_len = pred_len
        # Keep the kernel at least one element wide: with seq_len == 1 the
        # upper clamp alone would give 0, i.e. an empty kernel and a division
        # by zero below.
        self.moving_avg = max(1, min(moving_avg, seq_len - 1))

        self.linear_trend = nn.Linear(seq_len, pred_len)
        self.linear_seasonal = nn.Linear(seq_len, pred_len)

        # Uniform averaging kernel, stored as a buffer so it follows .to(device)
        # and is saved in the state dict without being trained.
        self.register_buffer('avg_kernel', torch.ones(1, 1, self.moving_avg) / self.moving_avg)

    def decompose(self, x):
        """Split ``x`` of shape (batch, seq_len) into ``(trend, seasonal)``.

        ``trend`` is a moving average of ``x`` with edge values replicated, and
        ``seasonal`` is ``x - trend``; both have the same shape as ``x``.
        """
        x_reshaped = x.unsqueeze(1)  # (batch, 1, seq_len) as expected by conv1d

        # Pad k - 1 values in total so the convolution returns exactly seq_len
        # points. For odd k this is the symmetric k // 2 on each side; for even
        # k the extra value goes on the right instead of producing seq_len + 1
        # points that would then need to be resampled.
        left = (self.moving_avg - 1) // 2
        right = self.moving_avg // 2
        if left or right:
            x_reshaped = torch.nn.functional.pad(x_reshaped, (left, right), mode='replicate')
        trend = torch.nn.functional.conv1d(x_reshaped, self.avg_kernel, padding=0)
        trend = trend.squeeze(1)

        seasonal = x - trend
        return trend, seasonal

    def forward(self, x):
        trend, seasonal = self.decompose(x)
        trend_pred = self.linear_trend(trend)
        seasonal_pred = self.linear_seasonal(seasonal)
        return trend_pred + seasonal_pred

# NLinear Model
class NLinear(nn.Module):
    """Linear forecaster applied to a window re-centred on its last value.

    The last value of each window is subtracted before the linear layer and
    added back to the layer's output.

    Args:
        seq_len: Length of the input window.
        pred_len: Number of values produced per input window.
    """

    def __init__(self, seq_len, pred_len=1):
        super().__init__()
        self.seq_len, self.pred_len = seq_len, pred_len
        self.linear = nn.Linear(in_features=seq_len, out_features=pred_len)

    def forward(self, x):
        # Keep the trailing dimension (shape (batch, 1)) so it broadcasts both
        # against the input window and against the prediction.
        anchor = x[:, -1:]
        return self.linear(x - anchor) + anchor

# Notebook progress marker: printed once the model class definitions above have executed.
print("✓ Models implemented: Linear, DLinear, NLinear")

## 5. Training with Early Stopping

In [None]:
# Early Stopping Class
class EarlyStopping:
    """Track validation loss, remember the best weights and signal when to stop.

    Call the instance once per epoch with the validation loss and the model.
    A loss counts as an improvement only if it is at least ``min_delta`` below
    the best loss seen so far; after ``patience`` consecutive calls without an
    improvement, ``early_stop`` becomes True.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        min_delta: Minimum decrease in loss that counts as an improvement.
        verbose: If True, print the counter every 10 non-improving calls.
    """

    def __init__(self, patience=50, min_delta=0.0001, verbose=True):
        self.patience = patience
        self.min_delta = min_delta
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.best_model_state = None

    @staticmethod
    def _snapshot(model):
        """Return a copy of the model's state that later training cannot alter."""
        # state_dict() hands back tensors that share storage with the live
        # parameters, and dict.copy() is shallow, so every tensor is cloned;
        # otherwise in-place optimizer updates would overwrite the "best" weights.
        return {name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()}

    def __call__(self, val_loss, model):
        # A NaN loss fails this comparison and is therefore counted as
        # "no improvement" rather than replacing the best loss.
        improved = (
            self.best_loss is None
            or val_loss <= self.best_loss - self.min_delta
        )
        if improved:
            self.best_loss = val_loss
            self.best_model_state = self._snapshot(model)
            self.counter = 0
            return

        self.counter += 1
        if self.verbose and self.counter % 10 == 0:
            print(f'EarlyStopping counter: {self.counter}/{self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True

    def load_best_model(self, model):
        """Restore the best recorded weights into ``model`` (no-op if none recorded)."""
        if self.best_model_state is not None:
            model.load_state_dict(self.best_model_state)

# Training function with Early Stopping
def train_model_with_early_stopping(
    model, train_loader, test_loader, 
    num_epochs=1000, lr=0.001, 
    patience=50, device='cpu', verbose=True
):
    """Train ``model`` with Adam on MSE loss, LR decay on plateau and early stopping.

    After every epoch the model is evaluated on ``test_loader``. That loss
    drives the ``ReduceLROnPlateau`` scheduler and the early-stopping check,
    so ``test_loader`` effectively acts as a validation set here. When
    training ends (early or after ``num_epochs``), the weights with the best
    recorded loss are loaded back into the model.

    Args:
        model: Module mapping a batch of input windows to predictions.
        train_loader: Iterable of ``(batch_x, batch_y)`` used for optimisation.
        test_loader: Iterable of ``(batch_x, batch_y)`` used for monitoring.
        num_epochs: Maximum number of epochs.
        lr: Initial Adam learning rate.
        patience: Early-stopping patience in epochs.
        device: Device the model and batches are moved to.
        verbose: If True, log every 100 epochs and when early stopping fires.

    Returns:
        Tuple ``(model, train_losses, test_losses)``; the two lists hold the
        per-epoch mean batch loss.

    Raises:
        ValueError: If either loader yields no batches.
    """
    # Fail early with a clear message instead of a ZeroDivisionError when the
    # per-epoch averages are computed (e.g. drop_last=True on a tiny dataset).
    if len(train_loader) == 0:
        raise ValueError("train_loader yields no batches")
    if len(test_loader) == 0:
        raise ValueError("test_loader yields no batches")

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # The scheduler's `verbose` argument is deprecated in recent PyTorch; it is
    # omitted here because the value previously passed (False) was the default.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=20
    )
    
    early_stopping = EarlyStopping(patience=patience, verbose=False)
    
    model.to(device)
    train_losses, test_losses = [], []
    
    for epoch in range(num_epochs):
        # Training pass
        model.train()
        epoch_train_loss = 0.0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            preds = model(batch_x)
            loss = criterion(preds, batch_y)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()
        
        # Mean of per-batch losses (not weighted by batch size).
        avg_train_loss = epoch_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        
        # Monitoring pass (no gradients)
        model.eval()
        epoch_test_loss = 0.0
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                preds = model(batch_x)
                loss = criterion(preds, batch_y)
                epoch_test_loss += loss.item()
        
        avg_test_loss = epoch_test_loss / len(test_loader)
        test_losses.append(avg_test_loss)
        
        scheduler.step(avg_test_loss)
        
        # Early stopping check (also records the best weights so far)
        early_stopping(avg_test_loss, model)
        
        if verbose and (epoch + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}] - '
                  f'Train Loss: {avg_train_loss:.6f}, '
                  f'Test Loss: {avg_test_loss:.6f}')
        
        if early_stopping.early_stop:
            if verbose:
                print(f'Early stopping triggered at epoch {epoch+1}')
            break
    
    # Restore the weights that achieved the best monitored loss
    early_stopping.load_best_model(model)
    
    return model, train_losses, test_losses

# Notebook progress marker: printed once the training helpers above are defined.
print("✓ Training function with Early Stopping ready")

In [None]:
# Evaluation function
def evaluate_model(model, test_loader, device='cpu'):
    """Run ``model`` over ``test_loader`` and compute regression metrics.

    Args:
        model: Trained module mapping a batch of input windows to predictions.
        test_loader: Iterable of ``(batch_x, batch_y)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Dict with keys ``'mse'``, ``'mae'``, ``'rmse'``, ``'r2'`` and the
        flattened ``'predictions'`` / ``'actuals'`` arrays. ``'r2'`` is the
        sentinel ``-999`` if ``r2_score`` rejects the inputs.
    """
    model.eval()
    prediction_chunks = []
    actual_chunks = []
    
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            # Collect one flat array per batch and join once at the end,
            # rather than extending a Python list element by element.
            prediction_chunks.append(outputs.cpu().numpy().ravel())
            actual_chunks.append(batch_y.cpu().numpy().ravel())
    
    predictions = np.concatenate(prediction_chunks) if prediction_chunks else np.array([])
    actuals = np.concatenate(actual_chunks) if actual_chunks else np.array([])
    
    mse = mean_squared_error(actuals, predictions)
    mae = mean_absolute_error(actuals, predictions)
    rmse = np.sqrt(mse)
    
    try:
        r2 = r2_score(actuals, predictions)
    except ValueError:
        # Only input-validation failures are expected here; a bare `except`
        # would also swallow KeyboardInterrupt and real programming errors.
        r2 = -999
    
    return {
        'mse': mse,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'predictions': predictions,
        'actuals': actuals
    }

# Notebook progress marker: printed once the evaluation helper above is defined.
print("✓ Evaluation function ready")

## 6. Train All Models

In [None]:
# Training configuration shared by every (model type, window size) combination
batch_size = 16
num_epochs = 1000
learning_rate = 0.001
early_stopping_patience = 50

# Storage for results, all keyed by "<model_type>_<window>" (e.g. "DLinear_30d")
trained_models = {}   # key -> trained nn.Module
loss_histories = {}   # key -> {'train': [...], 'test': [...]} per-epoch losses
results = {}          # key -> metrics dict returned by evaluate_model

model_types = ['Linear', 'DLinear', 'NLinear']
# Window labels must match the keys of `data_splits`.
window_sizes = ['7d', '30d', '120d', '480d']

print("="*80)
print("TRAINING ALL MODELS")
print("="*80)
print(f"Configuration:")
print(f"  - Max Epochs: {num_epochs}")
print(f"  - Early Stopping Patience: {early_stopping_patience}")
print(f"  - Learning Rate: {learning_rate}")
print(f"  - Batch Size: {batch_size}")
print(f"  - Device: {device}")
print("="*80)

for window in window_sizes:
    print(f"\n{'='*80}")
    print(f"WINDOW SIZE: {window}")
    print(f"{'='*80}")
    
    # '30d' -> 30
    seq_len = int(window.replace('d', ''))
    
    # Create data loaders. Training batches are shuffled and the last
    # incomplete batch is dropped; the test loader keeps chronological order
    # and every sample. The loaders are shared by all model types for this window.
    train_loader = DataLoader(
        data_splits[window]['train'],
        batch_size=batch_size,
        shuffle=True,
        drop_last=True
    )
    
    test_loader = DataLoader(
        data_splits[window]['test'],
        batch_size=batch_size,
        shuffle=False,
        drop_last=False
    )
    
    for model_type in model_types:
        print(f"\n--- Training {model_type} with {window} input ---")
        
        # Create a fresh, untrained model of the requested type
        if model_type == 'Linear':
            model = Linear(seq_len, pred_len=1)
        elif model_type == 'DLinear':
            model = DLinear(seq_len, pred_len=1)
        else:  # NLinear
            model = NLinear(seq_len, pred_len=1)
        
        # Train model; the test loader is also what early stopping monitors
        trained_model, train_losses, test_losses = train_model_with_early_stopping(
            model=model,
            train_loader=train_loader,
            test_loader=test_loader,
            num_epochs=num_epochs,
            lr=learning_rate,
            patience=early_stopping_patience,
            device=device,
            verbose=True
        )
        
        # Store model and losses
        key = f"{model_type}_{window}"
        trained_models[key] = trained_model
        loss_histories[key] = {
            'train': train_losses,
            'test': test_losses
        }
        
        # Evaluate one-step-ahead predictions on the test split
        test_results = evaluate_model(trained_model, test_loader, device)
        results[key] = test_results
        
        print(f"Final Test RMSE: {test_results['rmse']:.4f}")
        print(f"Final Test MAE: {test_results['mae']:.4f}")
        print(f"Epochs trained: {len(train_losses)}")

print("\n" + "="*80)
print("✓ ALL MODELS TRAINED SUCCESSFULLY!")
print("="*80)

## 7. Results Analysis and Visualization

In [None]:
# Plot loss curves for all models: one subplot per window size, one colour per
# model type, solid line for train loss and dashed line for test loss.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
fig.suptitle('Training and Test Loss Curves', fontsize=16, fontweight='bold')

# Colour per model type; this mapping is reused by later plotting cells.
colors = {'Linear': 'red', 'DLinear': 'green', 'NLinear': 'blue'}

for idx, window in enumerate(window_sizes):
    # Map the flat index onto the 2x2 grid (row-major).
    ax = axes[idx // 2, idx % 2]
    
    for model_type in model_types:
        key = f"{model_type}_{window}"
        train_losses = loss_histories[key]['train']
        test_losses = loss_histories[key]['test']
        
        ax.plot(train_losses, 
                color=colors[model_type], 
                linewidth=1.5, 
                label=f'{model_type} - Train',
                alpha=0.7)
        ax.plot(test_losses, 
                color=colors[model_type], 
                linewidth=2, 
                linestyle='--',
                label=f'{model_type} - Test',
                alpha=0.9)
    
    ax.set_title(f'Loss Curves - {window} Input', fontsize=12, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('MSE Loss')
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    # Log scale so early large losses do not flatten the later part of the curve.
    ax.set_yscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Performance comparison table: one row per (window, model) with test metrics
# formatted to 4 decimals for display.
print("\n" + "="*100)
print("PERFORMANCE COMPARISON TABLE")
print("="*100)

comparison_data = []
for window in window_sizes:
    for model_type in model_types:
        key = f"{model_type}_{window}"
        result = results[key]
        comparison_data.append({
            'Window': window,
            'Model': model_type,
            'RMSE': f"{result['rmse']:.4f}",
            'MAE': f"{result['mae']:.4f}",
            'MSE': f"{result['mse']:.4f}",
            'R²': f"{result['r2']:.4f}",
            'Epochs': len(loss_histories[key]['train'])
        })

comparison_df = pd.DataFrame(comparison_data)
display(comparison_df)

# Find best model.
# Rank on the unrounded RMSE stored in `results` rather than on the 4-decimal
# display strings, so models that differ only beyond the 4th decimal are still
# ordered correctly.
comparison_df['RMSE_num'] = [
    float(results[f"{model_name}_{window_name}"]['rmse'])
    for window_name, model_name in zip(comparison_df['Window'], comparison_df['Model'])
]
# idxmin() returns an index *label*, so the row is fetched with .loc (not .iloc).
best_idx = comparison_df['RMSE_num'].idxmin()
best_model_info = comparison_df.loc[best_idx]

print("\n" + "="*100)
print("BEST MODEL (by RMSE):")
print(f"  Model: {best_model_info['Model']}")
print(f"  Window: {best_model_info['Window']}")
print(f"  RMSE: {best_model_info['RMSE']}")
print(f"  MAE: {best_model_info['MAE']}")
print(f"  R²: {best_model_info['R²']}")
print("="*100)

In [None]:
# Visualization: Test predictions vs Actual, one subplot per window size.
fig, axes = plt.subplots(2, 2, figsize=(20, 14))
fig.suptitle('FPT Stock Price: Test Predictions vs Actual', fontsize=18, fontweight='bold')

for idx, window in enumerate(window_sizes):
    ax = axes[idx // 2, idx % 2]
    
    # Plot actual values once per window. They are taken from the Linear
    # model's results; every model for a given window was evaluated with the
    # same unshuffled test loader, so the actuals are the same for all of them.
    key = f"Linear_{window}"
    actual = results[key]['actuals']
    ax.plot(actual, 'black', linewidth=2, label='Actual', alpha=0.8, marker='o', markersize=2)
    
    # Plot predictions for each model (colours come from the `colors` mapping
    # defined in the loss-curve cell).
    for model_type in model_types:
        key = f"{model_type}_{window}"
        preds = results[key]['predictions']
        rmse = results[key]['rmse']
        
        ax.plot(preds, 
                color=colors[model_type],
                linewidth=1.5,
                label=f'{model_type} (RMSE: {rmse:.2f})',
                alpha=0.7,
                linestyle='--')
    
    ax.set_title(f'{window} Input Window', fontsize=14, fontweight='bold')
    ax.set_xlabel('Test Sample Index')
    ax.set_ylabel('Price (VND)')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Generate 100-Day Future Predictions

In [None]:
def predict_future_autoregressive(model, initial_sequence, n_steps, device='cpu'):
    """
    Roll a one-step model forward by feeding its own outputs back as inputs.

    At each step the current window is passed through the model, the first
    element of the output is recorded, and the window slides forward by one
    position with that value appended. The caller's sequence is not modified.

    Args:
        model: trained model taking a (1, seq_len) float tensor
        initial_sequence: starting window of observed values (seq_len,)
        n_steps: how many future values to generate
        device: device the input tensor is moved to

    Returns:
        predictions: array with the n_steps generated values, in order
    """
    model.eval()

    # Work on a copy so the caller's array is left untouched.
    window = initial_sequence.copy()
    forecast = []

    with torch.no_grad():
        for _step in range(n_steps):
            batch = torch.FloatTensor(window).unsqueeze(0).to(device)
            next_value = model(batch).cpu().numpy().flatten()[0]
            forecast.append(next_value)
            # Slide the window: drop the oldest value, append the new one.
            window = np.append(window[1:], next_value)

    return np.array(forecast)

# Notebook progress marker: printed once the forecasting helper above is defined.
print("✓ Autoregressive prediction function ready")

In [None]:
# Generate 100-day predictions for each model by rolling it forward from the
# most recent observed closes.
n_future_days = 100
future_predictions = {}  # "<model_type>_<window>" -> array of n_future_days forecasts

print("="*80)
print("GENERATING 100-DAY FUTURE PREDICTIONS")
print("="*80)

# Get last values from training data for each window size
for window in window_sizes:
    seq_len = int(window.replace('d', ''))
    
    # Get initial sequence (last seq_len values from training data).
    # Missing closes are dropped first, matching what the training dataset
    # does: a single NaN in the seed window would otherwise propagate through
    # every autoregressive step.
    initial_sequence = df['close'].dropna().values[-seq_len:]
    if len(initial_sequence) < seq_len:
        # A shorter window would not match the model's input size.
        raise ValueError(
            f"Need at least {seq_len} closing prices for window {window}, "
            f"got {len(initial_sequence)}"
        )
    
    print(f"\nWindow {window}:")
    print(f"  Initial sequence: last {seq_len} values from training data")
    print(f"  Initial sequence range: [{initial_sequence.min():.2f}, {initial_sequence.max():.2f}]")
    
    for model_type in model_types:
        key = f"{model_type}_{window}"
        model = trained_models[key]
        
        # Generate predictions
        preds = predict_future_autoregressive(
            model=model,
            initial_sequence=initial_sequence,
            n_steps=n_future_days,
            device=device
        )
        
        future_predictions[key] = preds
        
        print(f"  {model_type}: Generated {len(preds)} predictions")
        print(f"    Prediction range: [{preds.min():.2f}, {preds.max():.2f}]")

print("\n" + "="*80)
print("✓ All 100-day predictions generated!")
print("="*80)

## 9. Save Submission Files

In [None]:
import os

# Create submissions directory (no error if it already exists)
os.makedirs('submissions', exist_ok=True)

# Get last date from training data
last_date = df['time'].iloc[-1]
print(f"Last date in training data: {last_date}")
# The printed range is in calendar days; it is informational only and is not
# written to the submission files.
print(f"Predictions will cover: {last_date + timedelta(days=1)} to {last_date + timedelta(days=100)}")

# Save predictions for each model: one CSV per (model type, window) pair
print("\n" + "="*80)
print("SAVING SUBMISSION FILES")
print("="*80)

for window in window_sizes:
    for model_type in model_types:
        key = f"{model_type}_{window}"
        preds = future_predictions[key]
        
        # Create submission dataframe: ids 1..n_future_days and the forecast close
        submission_df = pd.DataFrame({
            'id': range(1, n_future_days + 1),
            'close': preds
        })
        
        # Save to CSV without the DataFrame index column
        filename = f'submissions/FPT_submission_{model_type}_{window}.csv'
        submission_df.to_csv(filename, index=False)
        
        print(f"✓ Saved: {filename}")
        print(f"  Shape: {submission_df.shape} (101 lines including header)")

# Also save best model prediction under a fixed file name.
# `best_model_info` comes from the performance-comparison cell.
best_key = f"{best_model_info['Model']}_{best_model_info['Window']}"
best_preds = future_predictions[best_key]

best_submission_df = pd.DataFrame({
    'id': range(1, n_future_days + 1),
    'close': best_preds
})

best_filename = 'submissions/FPT_submission_BEST.csv'
best_submission_df.to_csv(best_filename, index=False)

print("\n" + "="*80)
print(f"✓ BEST MODEL SUBMISSION: {best_filename}")
print(f"  Model: {best_model_info['Model']} with {best_model_info['Window']} window")
print(f"  Test RMSE: {best_model_info['RMSE']}")
print("="*80)

# Display sample of best submission
print("\nSample from BEST submission:")
display(best_submission_df.head(10))
display(best_submission_df.tail(10))

## 10. Visualize Future Predictions

In [None]:
# Plot historical data + future predictions, one subplot per window size.
fig, axes = plt.subplots(2, 2, figsize=(20, 14))
fig.suptitle('FPT Stock Price: Historical + 100-Day Future Predictions', 
             fontsize=18, fontweight='bold')

# Create date range for future predictions.
# NOTE(review): date_range defaults to consecutive calendar days, so weekends
# are included on the x axis. If each forecast step is meant to be one trading
# day, a business-day frequency may be more appropriate — confirm intent.
last_date = df['time'].iloc[-1]
future_dates = pd.date_range(start=last_date + timedelta(days=1), periods=n_future_days)

for idx, window in enumerate(window_sizes):
    ax = axes[idx // 2, idx % 2]
    
    # Plot historical data (last 200 rows of the training data)
    historical_window = 200
    ax.plot(df['time'].iloc[-historical_window:], 
            df['close'].iloc[-historical_window:],
            'black', linewidth=2, label='Historical', alpha=0.8)
    
    # Plot future predictions for each model
    for model_type in model_types:
        key = f"{model_type}_{window}"
        preds = future_predictions[key]
        
        ax.plot(future_dates, preds,
                color=colors[model_type],
                linewidth=2,
                label=f'{model_type} Forecast',
                linestyle='--',
                alpha=0.7,
                marker='o',
                markersize=2)
    
    # Add vertical line at prediction start; the label is anchored at the
    # current top of the y axis.
    ax.axvline(x=last_date, color='red', linestyle=':', linewidth=2, alpha=0.5)
    ax.text(last_date, ax.get_ylim()[1], 'Forecast Start', 
            rotation=90, verticalalignment='top')
    
    ax.set_title(f'{window} Input Window', fontsize=14, fontweight='bold')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price (VND)')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
# Save before show() so the written image contains the rendered figure.
plt.savefig('submissions/FPT_forecast_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Forecast comparison plot saved: submissions/FPT_forecast_comparison.png")

In [None]:
# Plot only best model prediction
# Single-axes figure: full history plus the best model's 100-step forecast.
# Relies on `future_dates` and `last_date` from the forecast-comparison cell.
fig, ax = plt.subplots(figsize=(16, 8))

# Historical data
ax.plot(df['time'], df['close'], 
        'blue', linewidth=2, label='Historical Data', alpha=0.8)

# Best model prediction
best_key = f"{best_model_info['Model']}_{best_model_info['Window']}"
best_preds = future_predictions[best_key]

ax.plot(future_dates, best_preds,
        'red', linewidth=2.5, label=f"100-Day Forecast ({best_model_info['Model']} - {best_model_info['Window']})",
        linestyle='--', marker='o', markersize=3, alpha=0.9)

# Add shading around the forecast (simple ±10% bands).
# NOTE: the band is a fixed ±10% of each predicted value; it is a visual aid
# only and is not derived from any estimate of forecast uncertainty.
upper_band = best_preds * 1.1
lower_band = best_preds * 0.9
ax.fill_between(future_dates, lower_band, upper_band, alpha=0.2, color='red')

# Vertical line at forecast start
ax.axvline(x=last_date, color='green', linestyle=':', linewidth=2, alpha=0.7)
ax.text(last_date, ax.get_ylim()[1]*0.95, 'Forecast Start →', 
        fontsize=12, fontweight='bold')

ax.set_title(f'FPT Stock Price: Best Model 100-Day Forecast\n'
             f'Model: {best_model_info["Model"]} | Window: {best_model_info["Window"]} | '
             f'Test RMSE: {best_model_info["RMSE"]}',
             fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Price (VND)', fontsize=12)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)

plt.tight_layout()
plt.savefig('submissions/FPT_best_forecast.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Best model forecast plot saved: submissions/FPT_best_forecast.png")

## 11. Summary and Statistics

In [None]:
# Summary statistics for all predictions: one row per (model, window) describing
# the 100-step forecast.
print("="*100)
print("PREDICTION SUMMARY STATISTICS")
print("="*100)

summary_stats = []
for window in window_sizes:
    for model_type in model_types:
        key = f"{model_type}_{window}"
        preds = future_predictions[key]
        
        summary_stats.append({
            'Model': model_type,
            'Window': window,
            'Mean': f"{preds.mean():.2f}",
            'Std': f"{preds.std():.2f}",
            'Min': f"{preds.min():.2f}",
            'Max': f"{preds.max():.2f}",
            # Trend and Change% compare only the first and last forecast values;
            # an unchanged value is reported as 'Down'.
            'Trend': 'Up' if preds[-1] > preds[0] else 'Down',
            'Change%': f"{((preds[-1] - preds[0]) / preds[0] * 100):.2f}%"
        })

summary_df = pd.DataFrame(summary_stats)
display(summary_df)

# Historical comparison: reference figures from the training data
print("\n" + "="*100)
print("HISTORICAL DATA REFERENCE")
print("="*100)
print(f"Last price in training data: {df['close'].iloc[-1]:.2f} VND")
print(f"Training data mean: {df['close'].mean():.2f} VND")
print(f"Training data std: {df['close'].std():.2f} VND")
print(f"Training data range: [{df['close'].min():.2f}, {df['close'].max():.2f}]")

# Final summary of what the notebook produced
print("\n" + "="*100)
print("FINAL SUMMARY")
print("="*100)
print(f"✓ Trained {len(model_types)} model types with {len(window_sizes)} window sizes")
print(f"✓ Total models trained: {len(trained_models)}")
print(f"✓ Generated {n_future_days}-day predictions for all models")
print(f"✓ All submission files saved to: submissions/")
print(f"\nBest Model: {best_model_info['Model']} with {best_model_info['Window']} window")
print(f"  - Test RMSE: {best_model_info['RMSE']}")
print(f"  - Test MAE: {best_model_info['MAE']}")
print(f"  - R²: {best_model_info['R²']}")
print("="*100)