In [1]:
# Update Cell 0 imports to include regression metrics
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    # Classification metrics (for DecisionTreeClassifier)
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score,
    # Regression metrics (for RandomForestRegressor)
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score
)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, cross_val_predict
import torch
from torch import nn
from tsai.basics import *
from tsai.inference import load_learner



In [2]:
df = pd.read_csv('../../data/processed/data.csv')

In [3]:
df.head()

Unnamed: 0,date,bullish_sentiment_score,bearish_sentiment_score,neutral_sentiment_score,dominant_sentiment_type,sentiment_confidence,sentiment_strength,open,high,low,close,volume,price_change
0,2020-08-16,0,0,0,neutral,0.0,0.0,11866.685547,11934.901367,11737.188477,11892.803711,20583375490,26.118164
1,2020-08-17,0,0,0,neutral,0.0,0.0,11895.658203,12359.056641,11806.696289,12254.402344,28227687027,358.744141
2,2020-08-18,0,0,0,neutral,0.0,0.0,12251.895508,12335.707031,11954.525391,11991.233398,26043227672,-260.662109
3,2020-08-19,0,0,0,neutral,0.0,0.0,11990.884766,12028.923828,11687.333008,11758.283203,24502851117,-232.601562
4,2020-08-20,0,0,0,neutral,0.0,0.0,11761.5,11900.411133,11710.063477,11878.37207,20175242945,116.87207


In [3]:
df[df['sentiment_strength'] != 0]

Unnamed: 0,date,bullish_sentiment_score,bearish_sentiment_score,neutral_sentiment_score,dominant_sentiment_type,sentiment_confidence,sentiment_strength,open,high,low,close,volume,price_change


In [10]:
df

Unnamed: 0,date,bullish_sentiment_score,bearish_sentiment_score,neutral_sentiment_score,dominant_sentiment_type,sentiment_confidence,sentiment_strength,open,high,low,close,volume,price_change


In [None]:
# Cell 3 - Prepare data for time series forecasting
for col in df.columns:
        if col != 'date' and col != 'Bearish' and col != 'Bullish' and col != 'Neutral':
                df[col] = zscore(df[col])
display(df.head())

# Keep volume as target (we'll predict August 8, 2025 volume)
target_col = df['volume']
df = df.drop(columns=['volume', 'date'], axis=1)

# Now we need to create sequences for time series forecasting
def create_forecasting_sequences(data, target, lookback=30):
    """
    Create sequences where each sample uses 'lookback' days to predict the next day
    """
    X, y = [], []
    for i in range(lookback, len(data)):
        X.append(data[i-lookback:i])
        y.append(target[i])
    return np.array(X), np.array(y)

# Create sequences
lookback = 30  # Use last 30 days to predict next day
# X_sequences, y_sequences = create_forecasting_sequences(df.values, target_col.values, lookback)

# print(f"Sequences shape: {X_sequences.shape}")
# print(f"Targets shape: {y_sequences.shape}")

# # Split data - use all data up to August 7, 2025 for training
# # The last sequence will predict August 8, 2025
# train_size = len(X_sequences) - 1  # Keep last sequence for final prediction

# X_train = X_sequences[:train_size]
# y_train = y_sequences[:train_size]
# X_test = X_sequences[train_size:]  # This will be the sequence to predict August 8
# y_test = y_sequences[train_size:]  # This is the actual August 8 volume

# print(f"Training sequences: {X_train.shape}")
# print(f"Test sequence (for August 8): {X_test.shape}")

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")
display(df.head())
# Convert to PyTorch tensors
# X_train_tensor = torch.FloatTensor(X_train).to(device)
# y_train_tensor = torch.FloatTensor(y_train).to(device)
# X_test_tensor = torch.FloatTensor(X_test).to(device)
# y_test_tensor = torch.FloatTensor(y_test).to(device)


# # LSTM model for time series forecasting
# class ForecastingLSTMModel(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, lookback, dropout=0.4):
#         super(ForecastingLSTMModel, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.lookback = lookback

#         self.lstm = nn.LSTM(
#             input_size=input_size,
#             hidden_size=hidden_size,
#             num_layers=num_layers,
#             batch_first=True,
#             dropout=dropout if num_layers > 1 else 0,
#         )

#         self.dropout = nn.Dropout(dropout)
#         self.fc = nn.Linear(hidden_size, 1)

#     def forward(self, x):
#         # x shape: (batch_size, lookback, features)
#         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
#         c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

#         out, _ = self.lstm(x, (h0, c0))
#         out = self.dropout(out[:, -1, :])  # Use last timestep
#         out = self.fc(out)
#         return out.squeeze()


# # Create and train the model
# model_2 = ForecastingLSTMModel(
#     input_size=df.shape[1],
#     hidden_size=1024,
#     num_layers=8,
#     lookback=lookback,
#     dropout=0.2,
# ).to(device)

# criterion = nn.HuberLoss()
# optimizer = torch.optim.Adam(model_2.parameters(), lr=0.0001)

# # Training loop
# model_2.train()
# epochs = 200
# for epoch in range(epochs):
#     outputs = model_2(X_train_tensor)
#     loss = criterion(outputs, y_train_tensor)

#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()

#     if epoch % 20 == 0:
#         print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}")

In [None]:
# # Make prediction for August 8, 2025
# model_2.eval()
# with torch.no_grad():
#     august_8_prediction = model_2(X_test_tensor).item()

# print("=== PREDICTION FOR AUGUST 8, 2025 ===")
# print(f"Predicted volume for August 8, 2025: {august_8_prediction:,.10f}")
# print(f"Actual volume for August 8, 2025: {y_test[0]:,.10f}")

# # Calculate prediction error
# prediction_error = abs(august_8_prediction - y_test[0])
# percentage_error = (prediction_error / y_test[0]) * 100

# print(f"Absolute error: {prediction_error:,.10f}")
# print(f"Percentage error: {percentage_error:.10f}%")

# # Compare with historical average
# historical_avg = np.mean(y_train)
# print(f"Historical average volume: {historical_avg:,.10f}")
# print(f"Prediction vs Historical avg: {((august_8_prediction - historical_avg) / historical_avg) * 100:.2f}%")

In [None]:
x, y = SlidingWindow(
        90, horizon = 1
)(df)
splits = TimeSplitter(
        100    
)(y)
tfms = [None, TSForecasting()]
batch_tfms = None
fcst = TSForecaster(x, y, splits=splits, path='models', tfms=tfms, batch_tfms=batch_tfms, bs=64, arch="TSTPlus", metrics=mse, cbs=ShowGraph())
fcst.fit_one_cycle(70, 1e-3)
