Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error
from torch.nn import Module, LSTM, Linear
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import pandas as pd
import glob

  from .autonotebook import tqdm as notebook_tqdm


Model creation

In [2]:
class Net(Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Net, self).__init__()
        self.lstm = LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = Linear(hidden_size * 2, num_classes)
        
        
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

Sentiment Analysis Dataset and declaration

In [58]:
class SentimentAnalysisDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        label = self.dataframe.iloc[idx, 0] 
        message = self.dataframe.iloc[idx, 1]  

        encoding = self.tokenizer(
            message,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze().float()

        return input_ids, torch.tensor(label, dtype=torch.long)

In [68]:
sentiment_analysis_csv = pd.read_csv('FirstReportData/sentiment_analysis.csv', header=None, encoding='ISO-8859-1')

sentiment_analysis_csv = sentiment_analysis_csv[:200000]

train_data, test_data = train_test_split(sentiment_analysis_csv, test_size=0.4, random_state=42)

print("Training data size:", len(train_data))
print("Testing data size:", len(test_data))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = SentimentAnalysisDataset(train_data, tokenizer, max_length=128)
test_dataset = SentimentAnalysisDataset(test_data, tokenizer, max_length=128)

Training data size: 120000
Testing data size: 80000


In [24]:
device = torch.device("mps")

input_size = 128
hidden_size = 64
num_layers = 2
num_classes = 2

model = Net(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Weather Forecast Dataset and declaration

In [3]:
class WeatherForecastDataset(Dataset):
    def __init__(self):
        data = pd.read_csv('FirstReportData/weather_forecast_cleaned.csv')
        data = data.dropna()
        temperatures = data["T (degC)"].values.reshape(-1, 1)
        features = data.drop(columns=["T (degC)"]).values
        
        self.sequences = []
        self.targets = []
        
        for i in range(len(features) - 5):
            self.sequences.append(features[i:i + 5])
            self.targets.append(temperatures[i + 5])
            
        self.sequences = torch.tensor(self.sequences, dtype=torch.float32)
        self.targets = torch.tensor(self.targets, dtype=torch.float32)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]
    

In [4]:
weather_forecast_csv = pd.read_csv('FirstReportData/weather_forecast_cleaned.csv')

weather_forecast_dataset = WeatherForecastDataset()

train_dataset, test_dataset = train_test_split(weather_forecast_dataset, test_size=0.2, random_state=42)

print("Training data size:", len(train_dataset))
print("Testing data size:", len(test_dataset))

  self.sequences = torch.tensor(self.sequences, dtype=torch.float32)


Training data size: 336436
Testing data size: 84110


In [5]:
device = torch.device("mps")

input_size = 11
hidden_size = 32
num_layers = 2
num_classes = 1

model = Net(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Crypto Currency Dataset and declaration

In [7]:
class CryptoDataset(Dataset):
    def __init__(self, data_file, sequence_length=3):
        data = pd.read_csv(data_file) 
        prices = data['Close'].values.reshape(-1, 1)
        features = data.drop(columns=['Date', 'Close', 'Currency']).values
        
        # self.scaler = MinMaxScaler(feature_range=(0, 1))
        # normalized_prices = self.scaler.fit_transform(prices)

        self.sequences = []
        self.targets = []
        for i in range(len(prices) - sequence_length):
            self.sequences.append(features[i:i + sequence_length])
            self.targets.append(prices[i + sequence_length])
            
        self.sequences = torch.tensor(self.sequences, dtype = torch.float32)
        self.targets = torch.tensor(self.targets, dtype = torch.float32)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]

In [8]:
crypto = CryptoDataset('FirstReportData/CryptoCurrencies/Binance USD.csv')
train_data, test_data = train_test_split(crypto, test_size=0.2, random_state=42)
device = torch.device("mps")

input_size = 4
hidden_size = 32
num_layers = 2
num_classes = 1

model = Net(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False)

Training and evaluation functions

In [6]:
def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
            
        # if len(outputs.shape) == 2:
        #     print(len(labels.shape))
        #     outputs = outputs.squeeze()
        
        # outputs = outputs.squeeze()
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    total_r2_score = 0
    correct = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)

            # if len(outputs.shape) == 2:
            #     print(len(labels.shape))
            #     outputs = outputs.squeeze()
            # outputs = outputs.squeeze()
            loss = criterion(outputs, labels)
            r2 = mean_absolute_error(outputs.cpu(), labels.cpu())
            
            # correct += (outputs == labels).sum().item()
            total_r2_score += r2
            total_loss += loss.item()
                
            # _, predicted = torch.max(outputs, 1)
            # correct += (predicted == labels).sum().item()
            # total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    avg_r2_score = total_r2_score / len(dataloader)
    # accuracy = correct / len(dataloader.dataset)
    # return avg_loss, accuracy
    return avg_loss, avg_r2_score

 
num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer)
    test_loss, test_accuracy = evaluate(model, test_dataloader, criterion)

    print(
        f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

Epoch [1/5], Train Loss: 19.8741, Test Loss: 0.9255, Test Accuracy: 0.6449
Epoch [2/5], Train Loss: 0.5391, Test Loss: 0.3245, Test Accuracy: 0.4082


KeyboardInterrupt: 

Manual testing

In [79]:
# torch.save(model.state_dict(), 'FirstReportData/sentiment_analysis_model.pth')

model = Net(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, num_classes=num_classes).to(device)
model.load_state_dict(torch.load('FirstReportData/sentiment_analysis_model.pth'))
model.eval()
message = "i loved it here"
encoding = tokenizer(
    message,
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=False,
    return_tensors='pt'
)

output = model(encoding['input_ids'].float().to(device))
_, predicted = torch.max(output, 1)
print("Predicted:", predicted.item())


Predicted: 0


  model.load_state_dict(torch.load('FirstReportData/sentiment_analysis_model.pth'))


In [19]:
test_loss, test_accuracy = evaluate(model, test_dataloader, criterion)

print(
    f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

Test Loss: 0.0836, Test Accuracy: 0.2891
