# Analiza sentymentu (wydźwięku)

In [None]:
import torch
import numpy as np
import random

# Seed every random number generator used in this notebook with the same
# value so that repeated runs produce the same shuffles and initial weights.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

if torch.cuda.is_available():
    # Seed the current CUDA device and then all CUDA devices.
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    # Request deterministic cuDNN behaviour and switch off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# !pip install -U portalocker>=2.0.0
!pip install torch tensorboard

In [None]:
from torchtext.data.utils import get_tokenizer
from torch.utils.data import random_split
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import time

In [None]:
import torchdata.datapipes as dp

In [None]:
# Build a datapipe that opens the sentiment file in binary mode and parses it
# as tab-separated rows, each row returned as a tuple.
FILE_PATH = 'sentiment_data_clean.csv'
data_pipe = (
    dp.iter.FileOpener(dp.iter.IterableWrapper([FILE_PATH]), mode='rb')
    .parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
)

In [None]:
# Peek at the first parsed row only, to check what a sample looks like.
for sample in data_pipe:
    print(sample)
    break

('miss good friend he be georgia', '0')


In [None]:
# Tokenizer obtained from torchtext, configured for spaCy's `en_core_web_sm`.
tokenizer = get_tokenizer('spacy', 'en_core_web_sm')


def yield_tokens(data_iter):
    """Yield the tokenized text of every (text, label) pair in ``data_iter``.

    The label of each pair is ignored; only the text is passed to the
    module-level ``tokenizer``.
    """
    yield from (tokenizer(text) for text, _label in data_iter)

In [None]:
import random
# Split the data into train and test parts by hand.
data_list = list(data_pipe)

# In-place shuffle using the `random` module's global generator.
random.shuffle(data_list)

split_ratio = 0.8  # 80% for training
train_size = int(split_ratio * len(data_list))
train_data, test_data = data_list[:train_size], data_list[train_size:]
# One-shot iterators over the two splits (each can be consumed only once).
train_iter, test_iter = iter(train_data), iter(test_data)

In [None]:
# Number of samples in the training split.
len(train_data)

34838

In [None]:
# Number of samples in the held-out (test) split.
len(test_data)

8710

In [None]:
# Build the vocabulary from the training texts only, capped at MAX_TOKENS
# entries (the '<unk>' special included).
#
# The tokens are read from the `train_data` list rather than from the one-shot
# `train_iter` iterator: consuming that iterator here would leave it exhausted,
# and any later `list(train_iter)` would silently produce an empty training set.
MAX_TOKENS = 25000
vocab = build_vocab_from_iterator(
    yield_tokens(train_data),
    specials=['<unk>'],
    max_tokens=MAX_TOKENS)
# Tokens that are not in the vocabulary are mapped to the '<unk>' index.
vocab.set_default_index(vocab['<unk>'])

In [None]:
# Look up the vocabulary indices of two sample tokens.
vocab(['I', 'like'])

[2, 9]

In [None]:
def text_pipeline(x):
    """Tokenize the raw text ``x`` and map its tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Return 1 when the raw label equals the string '1', otherwise 0."""
    if x == '1':
        return 1
    return 0

def collate_batch(batch):
    """Turn a list of (text, label) samples into model-ready tensors.

    Returns a tuple ``(labels, texts, lengths)``:
      * labels  - float32 tensor with one entry per sample,
      * texts   - int64 tensor of token indices, padded by ``pad_sequence``
                  with ``batch_first=True``,
      * lengths - int64 tensor holding each sequence's length before padding.
    """
    encoded, targets = [], []
    for text, label in batch:
        targets.append(label_pipeline(label))
        encoded.append(torch.tensor(text_pipeline(text), dtype=torch.int64))

    # Record the true lengths before padding so the model can pack sequences.
    seq_lengths = [seq.size(0) for seq in encoded]

    return (
        torch.tensor(targets, dtype=torch.float32),
        pad_sequence(encoded, batch_first=True),
        torch.tensor(seq_lengths, dtype=torch.int64),
    )

In [None]:
# Materialize the datasets from the underlying lists, not from the one-shot
# `train_iter` / `test_iter` iterators: an iterator that has already been
# consumed elsewhere (e.g. while building the vocabulary) would yield an
# empty dataset here without raising any error.
train_dataset = list(train_data)
test_dataset = list(test_data)

# Carve the validation set out of the test data by cutting off 50% of it.
validation_ratio = 0.5
n_test = len(test_dataset)
n_val = int(n_test * validation_ratio)
n_test = n_test - n_val

# Use the ready-made random_split helper; the two sizes sum to the original
# test-set length, so every sample lands in exactly one of the two subsets.
val_dataset, test_dataset = random_split(test_dataset, [n_val, n_test])

In [None]:
# Number of samples in the training dataset that will feed the DataLoader.
len(train_dataset)

34838

In [None]:
# Finally, fix the batch size and create the three DataLoaders that feed
# data to the model. Only the training loader shuffles its samples.
batch_size = 64
_loader_options = {'batch_size': batch_size, 'collate_fn': collate_batch}
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_options)
test_dataloader = DataLoader(test_dataset, shuffle=False, **_loader_options)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNN(nn.Module):
    """Recurrent classifier: embedding -> nn.RNN -> linear layer.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of values produced per sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_lengths):
        """Return one output row per sequence in the batch.

        Args:
            text: batch-first tensor of padded token indices.
            text_lengths: tensor with the unpadded length of each sequence
                (sequences do not have to be sorted by length).

        Returns:
            The linear layer applied to the final hidden state of each
            sequence.
        """
        embedded = self.embedding(text)
        # Pack the sequences so the RNN stops at each sequence's real length
        # and padded positions never influence the final hidden state.
        packed_embedded = pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False)
        # Only the final hidden state is needed, so the per-step outputs are
        # discarded instead of being unpacked into a padded tensor.
        _, hidden = self.rnn(packed_embedded)
        # hidden[-1] selects the last layer's state; for this single-layer RNN
        # it is equivalent to squeezing the leading dimension.
        return self.fc(hidden[-1])

In [None]:
# Model hyperparameters.
INPUT_DIM = len(vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1           # a single output value per sequence

model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

In [None]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the number of trainable parameters (the message text is in Polish).
print(f'Model ma {count_parameters(model):,} parametrów')

Model ma 2,591,905 parametrów


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

# Binary cross-entropy that takes raw logits; the model itself applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the targets ``y``.

    ``preds`` are raw logits: they go through a sigmoid and are rounded to
    0/1 before being compared element-wise with ``y``.
    """
    hits = torch.sigmoid(preds).round().eq(y).float()
    return hits.sum() / len(hits)

def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator``.

    Every batch is expected to unpack into ``(labels, texts, lengths)``.
    Returns ``(loss, accuracy)``, each being the sum of the per-batch values
    divided by the number of batches.
    """
    model.train()
    batch_losses, batch_accs = [], []

    for labels, texts, lengths in iterator:
        texts = texts.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # The model output has a trailing dimension of size 1; drop it so the
        # logits line up with the label vector.
        logits = model(texts, lengths).squeeze(1)
        loss = criterion(logits, labels)
        acc = binary_accuracy(logits, labels)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating its weights.

    Every batch is expected to unpack into ``(labels, texts, lengths)``.
    Returns ``(loss, accuracy)``, each being the sum of the per-batch values
    divided by the number of batches.
    """
    model.eval()
    batch_losses, batch_accs = [], []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for labels, texts, lengths in iterator:
            texts = texts.to(device)
            labels = labels.to(device)

            logits = model(texts, lengths).squeeze(1)
            loss = criterion(logits, labels)
            acc = binary_accuracy(logits, labels)

            batch_losses.append(loss.item())
            batch_accs.append(acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns a ``(minutes, seconds)`` tuple of ints; fractions are truncated.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    return whole_minutes, int(elapsed - (whole_minutes * 60))

In [None]:
# Train for a fixed number of epochs and keep the checkpoint with the lowest
# validation loss.
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    # Validate on the validation split, not on the test split: the test data
    # must stay untouched by model selection so it can give an unbiased final
    # score.
    valid_loss, valid_acc = evaluate(model, val_dataloader, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'rnn-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 5s
	Train Loss: 0.698 | Train Acc: 50.40%
	 Val. Loss: 0.696 |  Val. Acc: 51.76%
Epoch: 02 | Epoch Time: 0m 5s
	Train Loss: 0.695 | Train Acc: 51.08%
	 Val. Loss: 0.694 |  Val. Acc: 52.48%
Epoch: 03 | Epoch Time: 0m 3s
	Train Loss: 0.693 | Train Acc: 51.97%
	 Val. Loss: 0.692 |  Val. Acc: 53.53%
Epoch: 04 | Epoch Time: 0m 3s
	Train Loss: 0.691 | Train Acc: 52.73%
	 Val. Loss: 0.690 |  Val. Acc: 53.96%
Epoch: 05 | Epoch Time: 0m 4s
	Train Loss: 0.689 | Train Acc: 53.55%
	 Val. Loss: 0.688 |  Val. Acc: 54.32%
