In [17]:
import pandas as pd

# Location of the raw TSLA message file. The file has no header row, so it is
# read with header=None and pandas assigns integer column labels.
message_csv_path = r'C:\Users\marco\Documents\AFC\Diffusion-Models-for-Time-Series\data\TSLA\TSLA_2015-01-02_2015-01-30\TSLA_2015-01-02_34200000_57600000_message_10.csv'
df = pd.read_csv(message_csv_path, header=None)

# Quick look at the first rows to check the file parsed as expected.
print(df.head())


              0  1        2      3        4  5
0  34200.006789  5        0     13  2228300  1
1  34200.034290  6       -1  17011  2228700 -1
2  34200.034290  1  5519048     11  2227500  1
3  34200.034290  1  2142274     10  2230000 -1
4  34200.034290  1  2629509    200  2226000  1


In [18]:
# Remove the column labelled 2 (presumably the order ID -- TODO confirm against the data spec).
# The drop is label-based and not in place, so re-running this cell is a no-op instead of
# silently removing whichever column currently sits at position 2.
df = df.drop(columns=[2], errors="ignore")

In [19]:
# Preview the first five rows of the current frame.
print(df.head(n=5))

              0  1      3        4  5
0  34200.006789  5     13  2228300  1
1  34200.034290  6  17011  2228700 -1
2  34200.034290  1     11  2227500  1
3  34200.034290  1     10  2230000 -1
4  34200.034290  1    200  2226000  1


In [20]:
# Add a feature "6" that, for every row, randomly takes the value 0 or 1 (50/50).
import random

random.seed(42)  # fixed seed so the random assignment is reproducible

coin_flips = []
for _ in range(len(df)):
    coin_flips.append(random.randint(0, 1))
df['6'] = coin_flips

print(df.head())

              0  1      3        4  5  6
0  34200.006789  5     13  2228300  1  0
1  34200.034290  6  17011  2228700 -1  0
2  34200.034290  1     11  2227500  1  1
3  34200.034290  1     10  2230000 -1  0
4  34200.034290  1    200  2226000  1  0


In [21]:
# Replace the positional column labels with descriptive names (assigned in column order).
df.columns = [
    "time",
    "event_type",
    "size",
    "price",
    "direction",
    "generated",
]
print(df.head())

           time  event_type   size    price  direction  generated
0  34200.006789           5     13  2228300          1          0
1  34200.034290           6  17011  2228700         -1          0
2  34200.034290           1     11  2227500          1          1
3  34200.034290           1     10  2230000         -1          0
4  34200.034290           1    200  2226000          1          0


In [22]:
# Feature encoding / scaling:
#   - direction, event_type -> one-hot indicator columns
#   - price, size           -> min-max scaled to [0, 1]
#   - time                  -> divided by 100000
# "generated" is the target and is left untouched.
# NOTE(review): min/max are computed over the whole frame, i.e. before the train/test split
# performed later in the notebook.

# direction one hot.
# dtype is pinned so the indicators are numeric 0/1 on every pandas version: pandas >= 2.0
# defaults to bool, which turns `df.values` into an object array and breaks the later
# conversion to float32 tensors.
df = pd.get_dummies(df, columns=['direction'], dtype='uint8')

# price normalization between 0 and 1
df['price'] = (df['price'] - df['price'].min()) / (df['price'].max() - df['price'].min())

# size normalization between 0 and 1
df['size'] = (df['size'] - df['size'].min()) / (df['size'].max() - df['size'].min())

# event_type one hot (same dtype pin as for direction)
df = pd.get_dummies(df, columns=['event_type'], dtype='uint8')

# time divide by 100000 so it is on a scale comparable to the other features
df['time'] = df['time'] / 100000

In [23]:
# Rich display of the first five rows of the preprocessed frame.
df.head(n=5)

Unnamed: 0,time,size,price,generated,direction_-1,direction_1,event_type_1,event_type_2,event_type_3,event_type_4,event_type_5,event_type_6
0,0.342,0.000705,0.885996,0,0,1,0,0,0,0,1,0
1,0.342,1.0,0.889587,0,1,0,0,0,0,0,0,1
2,0.342,0.000588,0.878815,1,0,1,1,0,0,0,0,0
3,0.342,0.000529,0.901257,0,1,0,1,0,0,0,0,0
4,0.342,0.011699,0.86535,0,0,1,1,0,0,0,0,0


In [29]:
# torch is imported here because, in notebook order, this cell comes before the cell that
# imports it; without this a fresh "Restart & Run All" fails with NameError.
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# df is expected to be the preprocessed DataFrame at this point.
# Inputs are every column except the binary target "generated".
features = df.drop(columns='generated').values
labels = df['generated'].values

# The LSTM consumes 3D input [samples, timesteps, features]; add a length-1 timestep axis.
features = features[:, None, :]

# 80/20 train/test split (fixed random_state for reproducibility), converting each of the
# four resulting arrays straight to a float32 tensor.
train_X, test_X, train_y, test_y = (
    torch.tensor(part, dtype=torch.float32)
    for part in train_test_split(features, labels, test_size=0.2, random_state=42)
)

# Wrap the tensors in datasets and batch them (72 samples per batch, no shuffling).
train_data = TensorDataset(train_X, train_y)
test_data = TensorDataset(test_X, test_y)
train_loader = DataLoader(train_data, batch_size=72)
test_loader = DataLoader(test_data, batch_size=72)

# Define the LSTM model
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last timestep.

    The LSTM is built with ``batch_first=True``, so inputs are expected as
    ``(batch, timesteps, input_size)``. The output is the raw linear projection
    (no activation) of the final timestep's hidden state, shape ``(batch, output_size)``.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output units of the linear head.
        device: Optional device to place the whole module on. When ``None`` the
            parameters stay on PyTorch's default device.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, device=None):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        # Move LSTM and linear head together so they can never end up on different devices.
        if device is not None:
            self.to(device)

    def forward(self, x):
        """Return the linear head's output for the last timestep of ``x``."""
        batch_size = x.size(0)
        # Zero initial hidden/cell states, created directly on the input's device.
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)
        out, _ = self.lstm(x, (h0, c0))
        # out is (batch, timesteps, hidden_size); keep only the final timestep.
        return self.fc(out[:, -1, :])

# Instantiate the model and move the whole module (LSTM + linear head) to the selected device.
# .to(device) is used instead of a constructor argument so every parameter lands on the
# same device regardless of how the layers were created.
model = LSTMModel(input_size=train_X.shape[2], hidden_size=128, num_layers=2, output_size=1).to(device)

# Define loss and optimizer.
# BCEWithLogitsLoss takes raw logits, matching the model, which applies no final sigmoid.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())

# Train the model
model.train()
for epoch in range(50):
    # Loop variables are named batch_* so they do not overwrite the full `labels` array.
    for batch_X, batch_y in train_loader:
        # Batches come from CPU tensors; move them to the model's device.
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        output = model(batch_X)
        # Targets are (batch,), logits are (batch, 1): add the trailing dim to match.
        loss = criterion(output, batch_y.unsqueeze(1))
        loss.backward()
        optimizer.step()

In [28]:
# Switch to evaluation mode
model.eval()

# Evaluate on whatever device the model's parameters live on.
model_device = next(model.parameters()).device

# Collect per-batch logits and labels, then concatenate once at the end
# (avoids re-allocating a growing tensor on every batch).
batch_logits = []
batch_targets = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        output = model(batch_X.to(model_device))
        # Bring logits back to the CPU so they can be compared with the CPU labels.
        batch_logits.append(output.cpu())
        batch_targets.append(batch_y.unsqueeze(1))

test_preds = torch.cat(batch_logits, dim=0)
test_labels = torch.cat(batch_targets, dim=0)

# Apply sigmoid function to output probabilities between 0 and 1
test_preds = torch.sigmoid(test_preds)

# Convert probabilities to binary predictions (0 or 1)
test_preds_binary = (test_preds > 0.5).float()

# Calculate the accuracy of the model.
# NOTE: the "generated" labels in this notebook were assigned at random, so an accuracy
# close to 50% is the expected outcome here.
accuracy = (test_preds_binary == test_labels).sum().item() / test_labels.numel()

print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 49.98%
