<a href="https://colab.research.google.com/github/ManuelBagasina/DATCapstone/blob/ML-Manuel/TFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Environment Setup

In [31]:
!git clone -b ML-Manuel https://github.com/ManuelBagasina/DATCapstone.git
%cd DATCapstone/data

Cloning into 'DATCapstone'...
remote: Enumerating objects: 706, done.[K
remote: Counting objects: 100% (189/189), done.[K
remote: Compressing objects: 100% (164/164), done.[K
remote: Total 706 (delta 116), reused 25 (delta 25), pack-reused 517 (from 3)[K
Receiving objects: 100% (706/706), 68.60 MiB | 24.01 MiB/s, done.
Resolving deltas: 100% (369/369), done.
/content/DATCapstone/data/DATCapstone/data


In [32]:
!pip install pytorch-lightning
!pip install pytorch-forecasting



## Import

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import torch
import lightning.pytorch as pl
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import MAE, RMSE, SMAPE
import os
import warnings
warnings.filterwarnings('ignore')
from zipfile import ZipFile
from pytorch_forecasting.data.encoders import NaNLabelEncoder

In [34]:
# Extract and load the dataset
with ZipFile('_data.csv.zip', 'r') as z:
    with z.open('data.csv') as f:  # Ignore macOS metadata files
        df = pd.read_csv(f, index_col=0)

In [35]:
# Check if GPU is available
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

CUDA available: False
Using CPU


## Data Exploration

In [36]:
# Display basic info about the dataset
print(f"Original dataframe shape: {df.shape}")

print("\nFirst few rows:")
print(df.head())

print("\nUnique tickers:")
print(df['Ticker'].unique())
print(f"Number of unique tickers: {df['Ticker'].nunique()}")

# Convert date to datetime
df['Date'] = pd.to_datetime(df['Date'])

Original dataframe shape: (8103, 817)

First few rows:
         Date       Open       High        Low      Close    Volume  \
0  2021-06-01  75.393341  75.630189  74.351213  75.383865  10485300   
1  2021-06-02  75.507015  76.672309  75.327016  76.047028  12249300   
2  2021-06-03  75.601763  77.174428  75.459653  76.823891  12038700   
3  2021-06-04  77.136525  79.542898  77.089160  78.529190  14502900   
4  2021-06-07  78.576573  79.817653  78.491304  79.523964  10445600   

   Dividends  Stock Splits  GDP (Billions USD)  Unemployment Rate (%)  ...  \
0        0.0           0.0           23368.861                    5.9  ...   
1        0.0           0.0           23368.861                    5.9  ...   
2        0.0           0.0           23368.861                    5.9  ...   
3        0.0           0.0           23368.861                    5.9  ...   
4        0.0           0.0           23368.861                    5.9  ...   

   emb_763  emb_764  emb_765  emb_766  emb_767  T

## Data Preparation

In [37]:
# Remove lag features since TFT will handle time dependencies
lag_columns = [col for col in df.columns if '_lag' in col]
print(f"\nRemoving {len(lag_columns)} lag columns from the dataset")
df_no_lag = df.drop(columns=lag_columns)

# Handle embedding columns - Either keep them or use PCA to reduce dimensionality
# Identify embedding columns
emb_columns = [col for col in df_no_lag.columns if col.startswith('emb_')]
print(f"\nFound {len(emb_columns)} embedding columns")

# Option 1: Remove embedding columns since they might be too many for TFT
df_no_emb = df_no_lag.drop(columns=emb_columns)

# We'll work with the version without embeddings for simplicity
df_processed = df_no_emb


Removing 0 lag columns from the dataset

Found 768 embedding columns


In [38]:
# Select target and features for the model
target = 'Close'  # Or could be 'Close' , 'Target_1day', 'Target_1week', 'Target_1month', 'Target_1year'

# Select relevant features for prediction
# Exclude Date, target variables, and other non-predictive columns
exclude_columns = ['Date'] + [col for col in df_processed.columns if col.startswith('Target_')]
if target not in exclude_columns:
    exclude_columns.append(target)
features = [col for col in df_processed.columns if col not in exclude_columns]
print(f"\nUsing {len(features)} features for prediction")


Using 43 features for prediction


In [39]:
# Create time index for TFT
df_processed['time_idx'] = df_processed.groupby('Ticker')['Date'].rank(method='dense').astype(int) - 1

# Verify time_idx is properly set for each ticker
for ticker in df_processed['Ticker'].unique():
    ticker_data = df_processed[df_processed['Ticker'] == ticker]
    print(f"{ticker}: time_idx from {ticker_data['time_idx'].min()} to {ticker_data['time_idx'].max()}")

# Set parameters for prediction
# If predicting Target_1day, max_prediction_length=1
# If predicting Target_1week, max_prediction_length=5 (assuming 5 trading days)
# If predicting Target_1month, max_prediction_length=20
max_prediction_length = 1  # Adjust based on your prediction horizon
max_encoder_length = 30    # Use 30 days of history for prediction

ORCL: time_idx from 0 to 922
MSFT: time_idx from 0 to 901
AAPL: time_idx from 0 to 837
AVGO: time_idx from 0 to 815
AMD: time_idx from 0 to 773
AMZN: time_idx from 0 to 773
GOOGL: time_idx from 0 to 773
META: time_idx from 0 to 773
TSLA: time_idx from 0 to 773
NVDA: time_idx from 0 to 753


## Training Setup

In [40]:
# Create a training dataset - Use the last 20% of the data for testing
val_cutoff = df_processed['time_idx'].max() - max_prediction_length
cutoffs = {}
for ticker in df_processed['Ticker'].unique():
    ticker_data = df_processed[df_processed['Ticker'] == ticker]
    cutoffs[ticker] = ticker_data['time_idx'].max() * 0.8

df_processed['is_train'] = True
for ticker, cutoff in cutoffs.items():
    df_processed.loc[(df_processed['Ticker'] == ticker) &
                    (df_processed['time_idx'] > cutoff), 'is_train'] = False

In [41]:
# Check which columns have missing values
missing_columns = df_processed.isna().sum()
print("\nColumns with missing values:")
print(missing_columns[missing_columns > 0].sort_values(ascending=False))

# Check for infinite values
df_processed = df_processed.replace([np.inf, -np.inf], np.nan)

# Handle missing values in Inventory specifically (since that's causing the error)
if 'Inventory' in df_processed.columns:
    # For each ticker, fill missing Inventory values with median (or 0 if median is NaN)
    for ticker in df_processed['Ticker'].unique():
        ticker_mask = df_processed['Ticker'] == ticker
        ticker_inventory_median = df_processed.loc[ticker_mask, 'Inventory'].median()
        if pd.isna(ticker_inventory_median):
            ticker_inventory_median = 0
        df_processed.loc[ticker_mask, 'Inventory'] = df_processed.loc[ticker_mask, 'Inventory'].fillna(ticker_inventory_median)


Columns with missing values:
neutral                        5378
negative                       5378
vote                           5378
positive                       5378
Target_1year                   2500
Inventory                      1697
Repurchase Of Capital Stock     774
Long Term Debt                  252
Target_1month                   200
Target_1week                     50
Target_1day                      10
dtype: int64


In [43]:
# Check all features for missing values and fill appropriately
for feature in features:
    if df_processed[feature].isna().sum() > 0:
        print(f"Filling missing values in {feature}")
        # Fill by ticker
        for ticker in df_processed['Ticker'].unique():
            ticker_mask = df_processed['Ticker'] == ticker
            feature_median = df_processed.loc[ticker_mask, feature].median()
            if pd.isna(feature_median):  # If median is NaN (all values are NaN)
                feature_median = 0
            df_processed.loc[ticker_mask, feature] = df_processed.loc[ticker_mask, feature].fillna(feature_median)

# Verify all missing values are fixed
remaining_missing = df_processed[features].isna().sum()
if remaining_missing.sum() > 0:
    print("Warning: There are still missing values:")
    print(remaining_missing[remaining_missing > 0])
else:
    print("All missing values have been handled.")

All missing values have been handled.


## Create TimeSeriesDataSets

In [44]:
# Reduced feature set for simplicity
reduced_features = features[:20]  # Use only the first 20 features to simplify

# Create training dataset
training = TimeSeriesDataSet(
    data=df_processed[df_processed['is_train']],
    time_idx="time_idx",
    target=target,
    group_ids=["Ticker"],
    min_encoder_length=15,  # Reduced from 30
    max_encoder_length=15,  # Reduced from 30
    min_prediction_length=1,
    max_prediction_length=1,
    static_categoricals=["Ticker"],
    static_reals=[],
    time_varying_known_categoricals=[],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=reduced_features,  # Use reduced feature set
    target_normalizer=GroupNormalizer(
        groups=["Ticker"], transformation="softplus"
    ),
    categorical_encoders={
        "Ticker": NaNLabelEncoder(add_nan=True)
    },
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

In [45]:
# Verify there are no lag features in the data
print("Lag feature verification:")
lag_features = [col for col in df.columns if '_lag' in col]
print(f"Number of lag features in original dataframe: {len(lag_features)}")
if len(lag_features) == 0:
    print("No lag features found in dataframe. These will be handled by TFT automatically.")
else:
    print("Lag features found in dataframe. Consider removing them to let TFT handle temporal dependencies.")
    print(f"First few lag features: {lag_features[:5]}")

Lag feature verification:
Number of lag features in original dataframe: 0
No lag features found in dataframe. These will be handled by TFT automatically.


In [46]:
# Check ticker sequence organization
ticker_counts = df_processed.groupby('Ticker').size()
print("\nRows per ticker:")
print(ticker_counts)


Rows per ticker:
Ticker
AAPL     838
AMD      774
AMZN     774
AVGO     816
GOOGL    774
META     774
MSFT     902
NVDA     754
ORCL     923
TSLA     774
dtype: int64


In [47]:
# Check if there are enough data points per ticker for the encoder length
min_required = max_encoder_length + max_prediction_length
print(f"\nTickers with insufficient data (<{min_required} points):")
print(ticker_counts[ticker_counts < min_required])


Tickers with insufficient data (<31 points):
Series([], dtype: int64)


In [48]:
# Show time index consistency by ticker
print("\nTime index range by ticker:")
for ticker in df_processed['Ticker'].unique():
    ticker_data = df_processed[df_processed['Ticker'] == ticker]
    print(f"{ticker}: {ticker_data['time_idx'].min()} to {ticker_data['time_idx'].max()} ({len(ticker_data)} rows)")


Time index range by ticker:
ORCL: 0 to 922 (923 rows)
MSFT: 0 to 901 (902 rows)
AAPL: 0 to 837 (838 rows)
AVGO: 0 to 815 (816 rows)
AMD: 0 to 773 (774 rows)
AMZN: 0 to 773 (774 rows)
GOOGL: 0 to 773 (774 rows)
META: 0 to 773 (774 rows)
TSLA: 0 to 773 (774 rows)
NVDA: 0 to 753 (754 rows)


In [49]:
# Handle NaN values in target columns
print(f"Number of rows with NA in {target}: {df_processed[target].isna().sum()}")
df_processed[target] = df_processed[target].fillna(0)  # Fill with 0 or another appropriate value
print(f"NAs remaining in {target}: {df_processed[target].isna().sum()}")

Number of rows with NA in Close: 0
NAs remaining in Close: 0


In [50]:
# Create validation dataset and dataloaders
validation = TimeSeriesDataSet.from_dataset(
    training, df_processed[~df_processed['is_train']], predict=True, stop_randomization=True
)

In [51]:
# Create dataloaders for model training
batch_size = 32  # Adjust based on  GPU memory
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0, shuffle=False)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0, shuffle=False)

## Model Training

## Model Evaluation and Prediction

## Visualization

## Feature Importance

## Saving Model