<a href="https://colab.research.google.com/github/ManuelBagasina/DATCapstone/blob/ML-Manuel/TFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Temporal Fusion Transformer (TFT) Stock Close-Price Forecasting — Test Notebook

## Environment Setup

In [None]:
# Fetch the ML-Manuel branch of the project repository and make its data/
# folder the working directory; later cells open '_data.csv.zip' by relative path.
# NOTE(review): this cell does not look re-runnable in the same session (the
# clone target would already exist and the %cd path is relative) — confirm
# before re-executing without a fresh runtime.
!git clone -b ML-Manuel https://github.com/ManuelBagasina/DATCapstone.git
%cd DATCapstone/data

In [None]:
!pip install pytorch-lightning
!pip install pytorch-forecasting

## Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch
import lightning.pytorch as pl
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import MAE, RMSE, SMAPE
import os
import warnings
warnings.filterwarnings('ignore')
from zipfile import ZipFile
from pytorch_forecasting.data.encoders import NaNLabelEncoder

In [None]:
# Read data.csv directly out of the zip archive (no extraction to disk);
# the first CSV column is used as the DataFrame index.
with ZipFile('_data.csv.zip', 'r') as archive, archive.open('data.csv') as csv_file:
    df = pd.read_csv(csv_file, index_col=0)

In [None]:
# Choose the torch device: CUDA when PyTorch can see a GPU, otherwise the CPU.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
device = torch.device("cuda" if has_cuda else "cpu")
if has_cuda:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU")

## Data Exploration

In [None]:
# First look at the raw frame: its size, a sample of rows and the set of tickers.
print(f"Original dataframe shape: {df.shape}")

print("\nFirst few rows:", df.head(), sep="\n")

ticker_column = df['Ticker']
print("\nUnique tickers:", ticker_column.unique(), sep="\n")
print(f"Number of unique tickers: {ticker_column.nunique()}")

# Parse the Date column into datetimes (later cells rank and plot by it).
df['Date'] = pd.to_datetime(df['Date'])

## Data Preparation

In [None]:
# Pre-computed lag columns are dropped: the TFT is expected to learn temporal
# dependencies from the sequence itself.
lag_columns = list(filter(lambda name: '_lag' in name, df.columns))
print(f"\nRemoving {len(lag_columns)} lag columns from the dataset")
df_no_lag = df.drop(columns=lag_columns)

# Embedding columns are identified by their 'emb_' prefix. They could be kept
# or reduced with PCA; here they are simply removed to keep the input small.
emb_columns = list(filter(lambda name: name.startswith('emb_'), df_no_lag.columns))
print(f"\nFound {len(emb_columns)} embedding columns")
df_no_emb = df_no_lag.drop(columns=emb_columns)

# All further processing happens on the frame without lags and embeddings.
df_processed = df_no_emb

In [None]:
# Column the model learns to predict. Other candidates are the 'Target_*'
# columns (e.g. 'Target_1day', 'Target_1week', 'Target_1month', 'Target_1year').
target = 'Close'

# Columns that must never be used as model inputs: the date, every 'Target_*'
# column and the chosen target itself.
exclude_columns = ['Date'] + [col for col in df_processed.columns if col.startswith('Target_')]
if target not in exclude_columns:
    exclude_columns.append(target)

# `features` is later imputed with per-ticker medians and passed to the dataset
# as real-valued inputs, so it may only contain numeric columns. 'Ticker' is the
# group id / static categorical and is therefore kept out of this list as well.
numeric_columns = set(df_processed.select_dtypes(include='number').columns)
features = [
    col for col in df_processed.columns
    if col not in exclude_columns and col != 'Ticker' and col in numeric_columns
]
print(f"\nUsing {len(features)} features for prediction")

In [None]:
# Build the integer time index: within each ticker, dates are dense-ranked so
# the earliest date gets 0 and each later distinct date the next integer.
df_processed['time_idx'] = df_processed.groupby('Ticker')['Date'].rank(method='dense').astype(int) - 1

# Sanity check: print the time_idx range of every ticker (one groupby pass
# instead of re-filtering the frame per ticker; order of first appearance).
time_idx_range = df_processed.groupby('Ticker', sort=False)['time_idx'].agg(['min', 'max'])
for ticker, first_idx, last_idx in time_idx_range.itertuples(name=None):
    print(f"{ticker}: time_idx from {first_idx} to {last_idx}")

# Prediction horizon, in time steps:
#   Target_1day   -> max_prediction_length = 1
#   Target_1week  -> max_prediction_length = 5  (assuming 5 trading days)
#   Target_1month -> max_prediction_length = 20
max_prediction_length = 1  # Adjust based on your prediction horizon

# History window, in time steps. Kept equal to the min/max_encoder_length=15
# that the TimeSeriesDataSet cell passes, so the later "insufficient data"
# check (max_encoder_length + max_prediction_length) reflects the window the
# model really uses. Change both places together.
max_encoder_length = 15

## Training Setup

In [None]:
# Chronological split per ticker: rows whose time_idx is at most 80% of that
# ticker's largest time_idx are flagged as training rows, the rest are held out.
val_cutoff = df_processed['time_idx'].max() - max_prediction_length

# Per-ticker cutoff (0.8 * last time_idx of the ticker).
cutoffs = {
    ticker: df_processed.loc[df_processed['Ticker'] == ticker, 'time_idx'].max() * 0.8
    for ticker in df_processed['Ticker'].unique()
}

# A row is a training row unless its time_idx lies beyond its ticker's cutoff.
df_processed['is_train'] = df_processed['time_idx'] <= df_processed['Ticker'].map(cutoffs)

In [None]:
# Report how many values are missing per column (only columns with gaps).
missing_columns = df_processed.isna().sum()
print("\nColumns with missing values:", missing_columns[missing_columns > 0].sort_values(ascending=False), sep="\n")

# Turn +/- infinity into NaN so it is handled by the same imputation as real gaps.
df_processed = df_processed.replace([np.inf, -np.inf], np.nan)

# 'Inventory' gets special treatment: gaps are filled with the median of the
# same ticker, falling back to 0 when a ticker has no Inventory value at all.
if 'Inventory' in df_processed.columns:
    inventory_fill = df_processed.groupby('Ticker')['Inventory'].transform('median').fillna(0)
    df_processed['Inventory'] = df_processed['Inventory'].fillna(inventory_fill)

In [None]:
# Impute every feature that still has gaps: use the median of the same ticker,
# or 0 when that ticker has no observed value for the feature.
for feature in features:
    if not df_processed[feature].isna().any():
        continue
    print(f"Filling missing values in {feature}")
    per_ticker_median = df_processed.groupby('Ticker')[feature].transform('median').fillna(0)
    df_processed[feature] = df_processed[feature].fillna(per_ticker_median)

# Confirm that no feature has missing values left.
remaining_missing = df_processed[features].isna().sum()
still_missing = remaining_missing[remaining_missing > 0]
if still_missing.empty:
    print("All missing values have been handled.")
else:
    print("Warning: There are still missing values:")
    print(still_missing)

## Create TimeSeriesDataSets

In [None]:
# Reduced feature set for simplicity: the first 20 candidate features.
# 'Ticker' is the group id / static categorical and `target` is added
# separately below, so neither may appear among the real-valued covariates.
reduced_features = [col for col in features if col not in ('Ticker', target)][:20]

# The target's own history is given to the encoder as an "unknown" real
# (known in the past, not in the future), as the pytorch-forecasting docs
# recommend; without it the model never sees past values of the target.
unknown_reals = [target] + reduced_features

# Training dataset: built from the rows flagged is_train, one series per ticker.
training = TimeSeriesDataSet(
    data=df_processed[df_processed['is_train']],
    time_idx="time_idx",
    target=target,
    group_ids=["Ticker"],
    min_encoder_length=15,  # Reduced from 30
    max_encoder_length=15,  # Reduced from 30
    min_prediction_length=1,
    max_prediction_length=1,
    static_categoricals=["Ticker"],
    static_reals=[],
    time_varying_known_categoricals=[],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=unknown_reals,
    # Normalise the target separately for each ticker.
    target_normalizer=GroupNormalizer(
        groups=["Ticker"], transformation="softplus"
    ),
    categorical_encoders={
        "Ticker": NaNLabelEncoder(add_nan=True)
    },
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

In [None]:
# Verify that the frame actually used for modelling (df_processed) has no lag
# columns left. The raw `df` is never modified by the drop step, so checking it
# would always report the lag columns that were already removed.
print("Lag feature verification:")
lag_features = [col for col in df_processed.columns if '_lag' in col]
print(f"Number of lag features in processed dataframe: {len(lag_features)}")
if not lag_features:
    print("No lag features found in dataframe. These will be handled by TFT automatically.")
else:
    print("Lag features found in dataframe. Consider removing them to let TFT handle temporal dependencies.")
    print(f"First few lag features: {lag_features[:5]}")

In [None]:
# Number of rows available for every ticker.
ticker_counts = df_processed.groupby(by='Ticker').size()
print("\nRows per ticker:", ticker_counts, sep="\n")

In [None]:
# A ticker needs at least one full encoder window plus the prediction horizon;
# list the tickers whose row count falls short of that.
min_required = max_encoder_length + max_prediction_length
has_too_few_rows = ticker_counts.lt(min_required)
print(f"\nTickers with insufficient data (<{min_required} points):")
print(ticker_counts[has_too_few_rows])

In [None]:
# Print, for each ticker in order of first appearance, the span of its time
# index and its row count.
print("\nTime index range by ticker:")
for ticker, ticker_data in df_processed.groupby('Ticker', sort=False):
    idx_values = ticker_data['time_idx']
    print(f"{ticker}: {idx_values.min()} to {idx_values.max()} ({len(ticker_data)} rows)")

In [None]:
# Replace missing target values with 0 and report the count before and after.
# NOTE(review): this cell comes after the `training` dataset was built from a
# slice of df_processed, so it presumably only affects datasets created later
# (e.g. validation) — confirm the intended order, and whether 0 is a sensible
# fill value for this target.
target_values = df_processed[target]
print(f"Number of rows with NA in {target}: {target_values.isna().sum()}")
df_processed[target] = target_values.fillna(0)  # Fill with 0 or another appropriate value
na_left = df_processed[target].isna().sum()
print(f"NAs remaining in {target}: {na_left}")

In [None]:
# Validation dataset: same configuration as `training`, applied to the rows
# that are not flagged as training data.
holdout_rows = df_processed[~df_processed['is_train']]
validation = TimeSeriesDataSet.from_dataset(
    training,
    holdout_rows,
    predict=True,
    stop_randomization=True,
)

In [None]:
# Create dataloaders for model training
batch_size = 32  # Adjust based on GPU memory

# The training loader must shuffle: the Trainer below is configured with
# limit_train_batches=30, so an unshuffled loader would feed the model the same
# first 30 batches in every epoch and never touch the rest of the data.
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0, shuffle=True)

# Validation order is kept fixed so predictions are reproducible.
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0, shuffle=False)

## Model Training

In [None]:
# Fix the random seeds so runs are reproducible.
pl.seed_everything(42)

# Hyper-parameters of the Temporal Fusion Transformer.
tft_hparams = dict(
    learning_rate=0.01,
    hidden_size=16,
    attention_head_size=2,
    dropout=0.1,
    hidden_continuous_size=8,
    loss=RMSE(),
    log_interval=10,
    reduce_on_plateau_patience=5,
)

# Derive the model from the training dataset definition.
tft = TemporalFusionTransformer.from_dataset(training, **tft_hparams)

# Sanity check: the Trainer needs a LightningModule.
print(f"Is LightningModule: {isinstance(tft, pl.LightningModule)}")

In [None]:
# Stop training once val_loss has not improved by at least 1e-4 for 10 epochs.
early_stop_callback = pl.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    min_delta=1e-4,
    patience=10,
    verbose=False,
)
# Logs the learning rate during training.
lr_logger = pl.callbacks.LearningRateMonitor()

trainer_options = dict(
    max_epochs=30,
    accelerator='auto',        # Use GPU if available
    gradient_clip_val=0.1,
    limit_train_batches=30,    # Adjust based on dataset size
    callbacks=[early_stop_callback, lr_logger],
)
trainer = pl.Trainer(**trainer_options)

In [None]:
# Fit the TFT with the training loader, validating on the validation loader.
fit_loaders = {
    "train_dataloaders": train_dataloader,
    "val_dataloaders": val_dataloader,
}
trainer.fit(tft, **fit_loaders)

## Model Evaluation and Prediction

In [None]:
# Predict on the validation loader, asking for the true targets as well.
predictions = tft.predict(val_dataloader, return_y=True)

# Move model output and targets to host memory as NumPy arrays.
raw_predictions = predictions.output.detach().cpu().numpy()
raw_actuals = predictions.y[0].detach().cpu().numpy()

# One row per predicted value, with prediction and actual side by side.
pred_df = pd.DataFrame(
    {
        'prediction': raw_predictions.reshape(-1),
        'actual': raw_actuals.reshape(-1),
    }
)

In [None]:
# Regression metrics of the predictions against the actual values.
y_true, y_pred = pred_df['actual'], pred_df['prediction']
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)

print("\nModel Performance:")
for label, value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse), ("R²", r2)):
    print(f"{label}: {value:.4f}")

## Visualization

In [None]:
# Scatter of predicted vs. actual values with the ideal y = x line in red.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(pred_df['actual'], pred_df['prediction'], alpha=0.5)

diagonal = [pred_df['actual'].min(), pred_df['actual'].max()]
ax.plot(diagonal, diagonal, 'r--', lw=2)

ax.set_xlabel('Actual Close Price')
ax.set_ylabel('Predicted Close Price')
ax.set_title('TFT Model Performance: Actual vs Predicted')
ax.grid(True)
fig.savefig('tft_performance.png')
plt.show()

In [None]:
# Plot the held-out target series over time for a single ticker (the first one).
ticker_to_plot = df_processed['Ticker'].unique()[0]
is_selected_holdout = (df_processed['Ticker'] == ticker_to_plot) & (~df_processed['is_train'])
ticker_val_data = df_processed[is_selected_holdout].copy()

# Only draw when the ticker has held-out rows.
if len(ticker_val_data) > 0:
    fig, ax = plt.subplots(figsize=(15, 6))
    ax.plot(ticker_val_data['Date'], ticker_val_data[target], label='Actual')
    # Predictions would be added here once they are matched to dates.
    ax.set_title(f'TFT Predictions for {ticker_to_plot}')
    ax.set_xlabel('Date')
    ax.set_ylabel(target)
    ax.legend()
    ax.grid(True)
    plt.xticks(rotation=45)
    fig.tight_layout()
    fig.savefig(f'tft_predictions_{ticker_to_plot}.png')
    plt.show()

## Feature Importance

In [None]:
# Get feature importance from the TFT model

In [None]:
# Variable importance. Show graph

## Saving Model