In [1]:
!git clone https://github.com/Mateusz-Wojciechowski/sentimentAnalysis.git
%cd sentimentAnalysis

fatal: destination path 'sentimentAnalysis' already exists and is not an empty directory.
/content/sentimentAnalysis


In [2]:
!pip install 'portalocker>=2.0.0'



In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as fun
from EncoderBlock import EncoderBlock
from PositionalEncoding import PositionalEncoding


class SentimentModel(nn.Module):
    def __init__(self, d_model, d_ff, num_heads, max_seq_len, num_classes, vocab_size):
        super(SentimentModel, self).__init__()
        self.encoder = EncoderBlock(d_model, d_ff, num_heads, max_seq_len)
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_len)
        self.ff_net = nn.Linear(d_model, num_classes-1)

    def forward(self, x):
        encoder_input = self.positional_encoding(self.embedding(x))
        encoder_output = self.encoder(encoder_input)
        aggregated_output = torch.mean(encoder_output, dim=1)
        net_output = self.ff_net(aggregated_output)
        return net_output


In [5]:
import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import torch.nn as nn
import numpy as np
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Tokenizer
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

def process_text(text, vocab):
    return torch.tensor(vocab(tokenizer(text)), dtype=torch.long).to(device)

def collate_batch(batch):
    label_list, text_list = [], []
    for label, text in batch:
        label_tensor = torch.tensor([label-1], dtype=torch.float).to(device)
        processed_text = process_text(text, vocab)
        label_list.append(label_tensor)
        text_list.append(processed_text)
    return torch.stack(label_list).to(device), pad_sequence(text_list, padding_value=vocab["<pad>"], batch_first=True).to(device)

def calculate_accuracy(preds, y):
    preds = torch.sigmoid(preds)
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float()
    accuracy = correct.sum() / len(correct)
    return accuracy

train_data = list(IMDB(split='train'))
random.shuffle(train_data)

vocab = build_vocab_from_iterator(yield_tokens(data_iter for label, data_iter in train_data), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

train_loader = DataLoader(train_data, batch_size=8, shuffle=False, collate_fn=collate_batch)

# test_data = list(IMDB(split='test'))
# random.shuffle(test_data)
# test_loader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collate_batch)

d_model = 512
num_heads = 8
max_seq_len = 5000
d_ff = 2048
learning_rate = 0.001
num_classes = 2
vocab_size = len(vocab)
num_epochs = 10

model = SentimentModel(d_model, d_ff, num_heads, max_seq_len, num_classes, vocab_size)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(num_epochs):
    print(f"Epoch: {epoch + 1}")
    total_loss = 0
    total_accuracy = 0
    total_examples = 0
    i = 0

    model.train()
    for labels, sequences in train_loader:
        if i % 1000 == 0:
          print(f"batch {i}")
        i += 1
        output = model(sequences)
        loss = loss_fn(output, labels)
        accuracy = calculate_accuracy(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_accuracy += accuracy.item()
        total_examples += 1

    print(f"Loss in epoch {epoch + 1} is {total_loss}")
    print(f"Accuracy in epoch {epoch + 1} is {total_accuracy / total_examples}")


Using device: cuda
Epoch: 1
batch 0
batch 1000
batch 2000
batch 3000
Loss in epoch 1 is 1858.8000011891127
Accuracy in epoch 1 is 0.67752
Epoch: 2
batch 0
batch 1000
batch 2000
batch 3000
Loss in epoch 2 is 1380.2212148308754
Accuracy in epoch 2 is 0.81036
Epoch: 3
batch 0
batch 1000
batch 2000
batch 3000
Loss in epoch 3 is 1094.700155752711
Accuracy in epoch 3 is 0.85792
Epoch: 4
batch 0
batch 1000
batch 2000
batch 3000
Loss in epoch 4 is 875.2224251364823
Accuracy in epoch 4 is 0.89256
Epoch: 5
batch 0
batch 1000
batch 2000
batch 3000
Loss in epoch 5 is 694.3559186474304
Accuracy in epoch 5 is 0.91652
Epoch: 6
batch 0
batch 1000
batch 2000
batch 3000
Loss in epoch 6 is 579.2364111022252
Accuracy in epoch 6 is 0.93052
Epoch: 7
batch 0
batch 1000
batch 2000
batch 3000
Loss in epoch 7 is 472.01755257208424
Accuracy in epoch 7 is 0.94616
Epoch: 8
batch 0
batch 1000
batch 2000
batch 3000
Loss in epoch 8 is 398.4770253817278
Accuracy in epoch 8 is 0.95416
Epoch: 9
batch 0
batch 1000
batch 

In [6]:
path = '/content/drive/My Drive/sentiment_model_state.pth'  # Ścieżka, gdzie chcesz zapisać model
torch.save(model.state_dict(), path)

In [7]:
torch.save(model.state_dict(), 'sentiment_model_state.pth')
from google.colab import files
files.download('sentiment_model_state.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>