In [47]:
# Imports

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset
from string import punctuation
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Loading data

reviews_path = "/data/reviews.txt"
labels_path = "/data/labels.txt"

# Each file is read whole into one string and closed as soon as its
# `with` block ends.
with open(reviews_path, "r") as reviews_file:
    reviews = reviews_file.read()
with open(labels_path, "r") as labels_file:
    labels = labels_file.read()

In [6]:
# Preprocessing

# Delete every character listed in string.punctuation, in a single pass.
strip_punctuation = str.maketrans("", "", punctuation)
full_text = reviews.translate(strip_punctuation)

# One review per line of the input text.
reviews_split = full_text.split("\n")

# Re-join the reviews with spaces so the whole corpus can be cut into words.
full_text = ' '.join(reviews_split)
words = full_text.split()

In [7]:
# Creating vocabulary and mapping to int

counts = Counter(words)

# Words ordered from most to least frequent; ties keep first-seen order.
vocab = [word for word, _ in counts.most_common()]

# Ids start at 1, so id 0 is never assigned to a word.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

In [8]:
# Converting reviews to int

# One list of word ids per entry of reviews_split; an entry with no words
# becomes an empty list.
reviews_ints = []
for review_text in reviews_split:
    token_ids = [vocab_to_int[token] for token in review_text.split()]
    reviews_ints.append(token_ids)

In [9]:
# Preparing labels

# NOTE: `labels` is rebound here from the raw text to a NumPy array, so this
# cell cannot be run a second time without reloading the data first.
labels_split = labels.split("\n")

# "positive" -> 1; any other line (including an empty one) -> 0.
labels = np.array([int(label == "positive") for label in labels_split])

In [10]:
# Filtering reviews

# Remove empty reviews together with the label at the SAME position.
# Truncating the label array from the end (the previous approach) only stays
# aligned when every empty review is at the tail; an empty review anywhere
# else would shift all later labels onto the wrong review.
keep_idx = np.asarray(
    [pos for pos, review in enumerate(reviews_ints) if len(review) > 0],
    dtype=np.intp,
)
reviews_ints = [reviews_ints[pos] for pos in keep_idx]
# Raises IndexError if a kept review has no label, instead of silently
# pairing reviews with the wrong labels.
labels = labels[keep_idx]

In [11]:
# Fixed sequence length: every review is cut or zero-padded at the end
# ("post") so that all rows of `features` have seq_len columns.

seq_len = 200
pad_options = dict(maxlen=seq_len, padding="post", truncating="post")
features = pad_sequences(reviews_ints, **pad_options)

In [12]:
# Splitting data (train, evaluate, test)

# The first split_frac of the rows is used for training.
split_frac = 0.8
split_idx = int(len(features) * split_frac)
train_x, train_y = features[:split_idx], labels[:split_idx]
holdout_x, holdout_y = features[split_idx:], labels[split_idx:]

# The held-out remainder is cut in half: first half validation, second half test.
test_idx = int(len(holdout_x) * 0.5)
val_x, val_y = holdout_x[:test_idx], holdout_y[:test_idx]
test_x, test_y = holdout_x[test_idx:], holdout_y[test_idx:]

In [13]:
# Converting to PyTorch tensors

# Inputs first, then targets; each array is copied into a new tensor.
train_x, val_x, test_x = (torch.tensor(arr) for arr in (train_x, val_x, test_x))
train_y, val_y, test_y = (torch.tensor(arr) for arr in (train_y, val_y, test_y))

In [17]:
# Data size

# Number of samples in each split, one line per split.
size_report = [
    f"Training-set: {len(train_x)}",
    f"Test-set: {len(test_x)}",
    f"Evaluation-set: {len(val_x)}",
]
print("\n".join(size_report))

Training-set: 20000
Test-set: 2500
Evaluation-set: 2500


In [18]:
# Creating data loaders

batch_size = 500

# Pair each split's inputs with its targets.
train_data, val_data, test_data = (
    TensorDataset(inputs, targets)
    for inputs, targets in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size) for dataset in (val_data, test_data)
)

In [22]:
# Define the Neural Network


class MainNetwork(nn.Module):
    """Binary classifier: embedding -> Conv1d -> BiLSTM -> LSTM -> global max-pool -> MLP.

    Args:
        vocab_size: The embedding table is built with ``vocab_size + 1`` rows,
            and row 0 is registered as the padding index.
        embed_size: Width of each token embedding (and Conv1d input channels).
        lstm_size: Hidden size of both recurrent layers.
        seq_len: Accepted for call compatibility; not read anywhere in this class.
    """

    def __init__(self, vocab_size, embed_size=300, lstm_size=256, seq_len=200):
        super().__init__()

        conv_channels = 128

        # Token lookup table.
        self.embedding = nn.Embedding(vocab_size + 1, embed_size, padding_idx=0)

        # Local feature extractor over the token axis, followed by 2x downsampling.
        self.conv1d = nn.Conv1d(embed_size, conv_channels, kernel_size=3)
        self.maxpool = nn.MaxPool1d(2)

        # Recurrent stack: a bidirectional LSTM feeding a unidirectional one.
        self.bilstm = nn.LSTM(conv_channels, lstm_size,
                              batch_first=True, bidirectional=True)
        self.lstm = nn.LSTM(2 * lstm_size, lstm_size,
                            batch_first=True, bidirectional=False)

        # Collapses the time axis to a single step by taking the max.
        self.global_maxpool = nn.AdaptiveMaxPool1d(1)

        # Classifier head: lstm_size -> 128 -> 64 -> 32 -> 1.
        self.fc1 = nn.Linear(lstm_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

        # Shared dropout applied after each hidden dense layer.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return one sigmoid output per input row.

        Shape comments below assume ``x`` is a (batch, seq_len) tensor of token ids.
        """
        embedded = self.embedding(x)                 # (batch, seq_len, embed_size)
        channels_first = embedded.permute(0, 2, 1)   # (batch, embed_size, seq_len)

        conv_out = self.conv1d(channels_first)       # (batch, 128, seq_len - 2)
        pooled = self.maxpool(F.relu(conv_out))      # (batch, 128, (seq_len - 2) // 2)

        # The LSTMs are batch_first, so move time back in front of the channels.
        steps = pooled.permute(0, 2, 1)
        steps, _ = self.bilstm(steps)                # (..., lstm_size * 2)
        steps, _ = self.lstm(steps)                  # (..., lstm_size)

        # Max over the time axis -> (batch, lstm_size).
        hidden = self.global_maxpool(steps.permute(0, 2, 1)).squeeze(2)

        # Hidden dense layers, each followed by ReLU and dropout.
        for dense in (self.fc1, self.fc2, self.fc3):
            hidden = self.dropout(F.relu(dense(hidden)))

        # Output layer squashed into (0, 1); shape (batch, 1).
        return torch.sigmoid(self.fc4(hidden))

In [44]:
# Initialize model, loss function and optimizer

vocab_size = len(vocab)

# Build the network, then move its parameters to the selected device.
model = MainNetwork(vocab_size=vocab_size)
model = model.to(device)

# Binary cross-entropy on probabilities (the network already applies a sigmoid).
criterion = nn.BCELoss()

optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [45]:
epochs = 15


def _evaluate(net, loader, loss_fn, target_device):
    """Score ``net`` on every batch of ``loader`` without updating weights.

    Puts ``net`` in eval mode and disables gradient tracking for the pass.

    Returns:
        (loss_sum, n_correct, n_seen): the per-batch loss values added
        together, the number of rounded outputs equal to their target, and
        the number of samples visited.
    """
    net.eval()
    loss_sum, n_correct, n_seen = 0, 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(target_device), targets.to(target_device)
            probs = net(inputs).view(-1)  # (batch, 1) -> (batch,)
            loss_sum += loss_fn(probs, targets.float()).item()
            n_correct += (torch.round(probs) == targets).sum().item()
            n_seen += targets.size(0)
    return loss_sum, n_correct, n_seen


train_accuracies = []
test_accuracies = []
val_accuracies = []
for epoch in range(epochs):
    model.train()  # Training mode: dropout is active
    train_loss = 0
    train_correct = 0
    train_total = 0

    # The loop variables are deliberately NOT called `labels`: that name holds
    # the full label array built earlier in the notebook and must not be
    # overwritten by a batch tensor.
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()  # Clear gradients left from the previous step
        output = model(inputs).view(-1)  # (batch, 1) -> (batch,)

        loss = criterion(output, targets.float())
        loss.backward()  # Backpropagate to compute gradients
        optimizer.step()  # Update the weights
        train_loss += loss.item()

        # Round the outputs to hard 0/1 predictions and count the hits
        predictions = torch.round(output)
        train_correct += (predictions == targets).sum().item()

        # Add the batch size to the running sample count
        train_total += targets.size(0)

    # Epoch accuracy and mean per-batch loss (measured while dropout is active)
    train_accuracy = train_correct / train_total
    train_accuracies.append(train_accuracy)
    print(f"Epoch [{epoch + 1}/{epochs}], Train loss: {train_loss / len(train_loader):.9f}, Train accuracy: {train_accuracy * 100:.2f}%")

    # Evaluation (validation data)
    val_loss, val_correct, val_total = _evaluate(model, val_loader, criterion, device)
    val_accuracy = val_correct / val_total
    val_accuracies.append(val_accuracy)
    print(f"Val accuracy: {val_accuracy * 100:.2f}%")
    print(f"Val loss: {val_loss / len(val_loader):.9f}")

    # Evaluating the model (test data). Tracked per epoch for monitoring only;
    # nothing in this loop uses it to change training.
    test_loss, test_correct, test_total = _evaluate(model, test_loader, criterion, device)
    test_accuracy = test_correct / test_total
    test_accuracies.append(test_accuracy)

    # Print test loss and accuracy
    print(f"Test loss: {test_loss / len(test_loader):.9f}")
    print(f"Test accuracy: {test_accuracy * 100:.2f}%")


Epoch [1/15], Train loss: 0.694531310, Train accuracy: 49.84%
Val accuracy: 50.00%
Val loss: 0.693643045
Test loss: 0.693660438
Test accuracy: 50.00%
Epoch [2/15], Train loss: 0.693395093, Train accuracy: 50.81%
Val accuracy: 50.00%
Val loss: 0.692884409
Test loss: 0.692963636
Test accuracy: 50.00%
Epoch [3/15], Train loss: 0.693049328, Train accuracy: 50.57%
Val accuracy: 63.12%
Val loss: 0.690353537
Test loss: 0.690864432
Test accuracy: 62.28%
Epoch [4/15], Train loss: 0.680482647, Train accuracy: 58.71%
Val accuracy: 63.08%
Val loss: 0.670454681
Test loss: 0.675945270
Test accuracy: 61.08%
Epoch [5/15], Train loss: 0.669698909, Train accuracy: 62.70%
Val accuracy: 67.24%
Val loss: 0.643294692
Test loss: 0.650564039
Test accuracy: 65.08%
Epoch [6/15], Train loss: 0.630415747, Train accuracy: 67.78%
Val accuracy: 67.88%
Val loss: 0.612734997
Test loss: 0.624853337
Test accuracy: 67.04%
Epoch [7/15], Train loss: 0.585210395, Train accuracy: 71.95%
Val accuracy: 73.28%
Val loss: 0.55011

In [48]:
# Accuracy curve graph plot

# Each curve is labelled with the split it was measured on. Previously the
# test accuracies were drawn under the "Evaluation accuracy" label and the
# validation accuracies were not drawn at all.
plt.figure(figsize=(10,5))
plt.plot(train_accuracies, label="Train accuracy")
plt.plot(val_accuracies, label="Evaluation accuracy")
plt.plot(test_accuracies, label="Test accuracy")
plt.title("Accuracy curve")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()