To upload local file in google colab

In [2]:
from google.colab import files
# Upload the CSV file from your computer if you are on Google Collab
uploaded = files.upload()

Saving stock_market_dataset.csv to stock_market_dataset.csv


Librairies

In [18]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import r2_score
import joblib

2. Load and Preprocess the Dataset

Download the stock market dataset.
Drop unnecessary columns and create a target column for the next day’s closing price.
Normalize the dataset using MinMaxScaler.

In [11]:
df = pd.read_csv("stock_market_dataset.csv")
# 1. Remove duplicates
df.drop_duplicates(inplace=True)

# 2. Drop unneeded columns
df.drop(labels=['date', 'symbol', 'open', 'Volume USDT'], axis=1, inplace=True)

# 3. Sort by time (oldest to newest)
df.sort_values(by='unix', inplace=True)
df.reset_index(drop=True, inplace=True)

# 4. Create next-day target
df['next_day_close'] = df['close'].shift(-1)

# 5. Remove the last row (which now has NaN as the target)
df.dropna(inplace=True)

df.head()

# Select columns to scale (exclude 'unix')
cols_to_scale = df.select_dtypes(include=['float64', 'int64']).columns.difference(['unix'])

scaler = MinMaxScaler()

# Fit scaler and transform
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

df.head()


Unnamed: 0,unix,high,low,close,Volume XRP,next_day_close
0,1525390000000.0,0.743116,0.257043,0.443982,0.001632,0.451574
1,1525480000000.0,0.432251,0.507182,0.451574,0.001158,0.429228
2,1525560000000.0,0.422897,0.47478,0.429228,0.001063,0.405729
3,1525650000000.0,0.39586,0.450449,0.405729,0.001215,0.395
4,1525740000000.0,0.384394,0.445292,0.395,0.000711,0.38802


3. Prepare the Dataset for Training

Split the dataset into training, validation, and testing sets.
Create a custom PyTorch Dataset class to handle the data.
Use DataLoader to create iterable datasets for training and evaluation.

In [15]:
#Since this is time series data, we plit chronologically:
#Training: e.g., first 70%
#Validation: next 15%
#Testing: last 15%

data_len = len(df)
train_end = int(data_len * 0.7)
val_end = int(data_len * 0.85)

train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

#Create a custom PyTorch Dataset class to handle the data.
class TimeSeriesDataset(Dataset):
    def __init__(self, dataframe, feature_cols, target_col):
        self.X = dataframe[feature_cols].values.astype(np.float32)
        self.y = dataframe[target_col].values.astype(np.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx]), torch.tensor(self.y[idx])

#Use DataLoader to create iterable datasets for training and evaluation.
#Define feature and target columns
feature_cols = ['high', 'low', 'close', 'Volume XRP']  # example features after drop
target_col = 'next_day_close'

#Create Dataset instances
train_dataset = TimeSeriesDataset(train_df, feature_cols, target_col)
val_dataset = TimeSeriesDataset(val_df, feature_cols, target_col)
test_dataset = TimeSeriesDataset(test_df, feature_cols, target_col)

#Create DataLoader for batching
batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)  # no shuffle for time series
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



Define the LSTM Model

Create an LSTM model using PyTorch.
Define the model architecture, including GRU layers, dropout, and a dense layer.

In [16]:
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout=0.2):
        super(GRUModel, self).__init__()

        # GRU layers
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        # Dropout layer after GRU
        self.dropout = nn.Dropout(dropout)

        # Final fully connected layer to output one value
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # x shape: (batch_size, seq_length, input_size)

        # Pass through GRU
        out, _ = self.gru(x)  # out shape: (batch_size, seq_length, hidden_size)

        # Use output of last time step
        out = out[:, -1, :]  # shape: (batch_size, hidden_size)

        # Dropout
        out = self.dropout(out)

        # Fully connected layer
        out = self.fc(out)   # shape: (batch_size, 1)

        return out.squeeze(1)  # output shape: (batch_size,)

model = GRUModel(input_size=4, hidden_size=64, num_layers=2, dropout=0.2)


Train the Model

Set up the optimizer and loss function.
Implement training and validation loops.
Train the model for a specified number of epochs.

In [24]:
#Optimizer and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

#Training and validation loops
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in dataloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        X_batch = X_batch.unsqueeze(1)  # Add sequence dimension
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * X_batch.size(0)
    return running_loss / len(dataloader.dataset)


def validate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            X_batch = X_batch.unsqueeze(1)  # Add sequence dimension
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            running_loss += loss.item() * X_batch.size(0)
    return running_loss / len(dataloader.dataset)

#Training for 30 epochs
num_epochs = 30

for epoch in range(1, num_epochs + 1):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = validate(model, val_loader, criterion, device)
    print(f"Epoch {epoch:02d}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")


Epoch 01: Train Loss = 0.011110, Val Loss = 0.107842
Epoch 02: Train Loss = 0.005455, Val Loss = 0.103578
Epoch 03: Train Loss = 0.004590, Val Loss = 0.103170
Epoch 04: Train Loss = 0.004731, Val Loss = 0.093438
Epoch 05: Train Loss = 0.004504, Val Loss = 0.084552
Epoch 06: Train Loss = 0.004009, Val Loss = 0.075514
Epoch 07: Train Loss = 0.003627, Val Loss = 0.064666
Epoch 08: Train Loss = 0.003051, Val Loss = 0.053189
Epoch 09: Train Loss = 0.002533, Val Loss = 0.040434
Epoch 10: Train Loss = 0.002018, Val Loss = 0.028382
Epoch 11: Train Loss = 0.001411, Val Loss = 0.017714
Epoch 12: Train Loss = 0.001075, Val Loss = 0.009494
Epoch 13: Train Loss = 0.000769, Val Loss = 0.005260
Epoch 14: Train Loss = 0.000535, Val Loss = 0.003755
Epoch 15: Train Loss = 0.000509, Val Loss = 0.003476
Epoch 16: Train Loss = 0.000500, Val Loss = 0.003553
Epoch 17: Train Loss = 0.000500, Val Loss = 0.003715
Epoch 18: Train Loss = 0.000477, Val Loss = 0.003290
Epoch 19: Train Loss = 0.000468, Val Loss = 0.

Evaluate the Model

Calculate the R² score to evaluate the model’s performance on the test set.
Save the scaler object for future predictions.

In [26]:
#Calculate the R² score
model.eval()
predictions = []
true_vals = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        X_batch = X_batch.unsqueeze(1)  # Add sequence dimension
        outputs = model(X_batch)
        predictions.extend(outputs.cpu().numpy())
        true_vals.extend(y_batch.numpy())

r2 = r2_score(true_vals, predictions)
print(f"Test R² score: {r2:.4f}")

#Save the scaler object
joblib.dump(scaler, 'scaler.save')
print("Scaler saved!")

Test R² score: 0.9331
Scaler saved!
