# Notebook 2 IA-024-2024S2 FEEC-UNICAMP - Lucas Couto Lima RA: 220696

## Instalação e importação de pacotes

In [None]:
!pip install datasets portalocker>=2.0.0 -q

In [None]:
import torch
import random
from torch.utils.data import Dataset, DataLoader

from collections import Counter
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset

# Adicionados
import time
import string

## I - Vocabulário e Tokenização

In [None]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize raw text before whitespace tokenization.

    The text is lower-cased and every character listed in
    ``string.punctuation`` is deleted. Whitespace and all other characters
    are left untouched.

    Args:
        text: The string to normalize.

    Returns:
        The lower-cased string with ASCII punctuation removed.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

# Training split of the IMDB dataset, used only to build the vocabulary here
train_dataset = load_dataset("stanfordnlp/imdb", split="train")

# Upper bound on the number of distinct tokens kept in the vocabulary
vocab_size = 20000

# Count occurrences of every whitespace-separated token in the normalized reviews
counter = Counter()
for sample in train_dataset:
    preprocessed_text = preprocess_text(sample["text"])
    counter.update(preprocessed_text.split())

# Keep at most `vocab_size` tokens, ordered from most to least frequent
# (tokens with equal counts stay in first-seen order)
most_frequent_words = [word for word, _count in counter.most_common(vocab_size)]
# Token ids start at 1; id 0 is not assigned to any vocabulary word
vocab = dict(zip(most_frequent_words, range(1, len(most_frequent_words) + 1)))
# Replace the upper bound by the number of tokens actually kept
vocab_size = len(vocab)

In [None]:
def encode_sentence(sentence, vocab):
    """Convert a raw sentence into a list of vocabulary ids.

    The sentence is normalized with ``preprocess_text`` and split on
    whitespace; each token is then looked up in ``vocab``.

    Args:
        sentence: Raw text to encode.
        vocab: Mapping from token string to integer id.

    Returns:
        One id per token, in order. Tokens missing from ``vocab`` map to 0.
    """
    tokens = preprocess_text(sentence).split()
    unknown_id = 0  # id used for out-of-vocabulary tokens
    return [vocab.get(token, unknown_id) for token in tokens]

## II - Dataset

In [None]:
from torch.nn.functional import one_hot
# Dataset that encodes every review up front as a fixed-length 0/1 vector
class IMDBDataset(Dataset):
    """IMDB reviews pre-encoded as (feature vector, label) pairs.

    Each feature vector has ``len(vocab) + 1`` entries and holds a 1 at the
    index of every token id returned by ``encode_sentence`` for the review
    (index 0 is the out-of-vocabulary id) and 0 everywhere else. All samples
    are encoded in ``__init__`` and kept in ``self.encoded_data``.
    """

    def __init__(self, split, vocab):
        """Load ``split`` of the IMDB dataset and encode every sample.

        Args:
            split: Dataset split name passed to ``load_dataset``.
            vocab: Mapping from token string to integer id.
        """
        self.data = load_dataset("stanfordnlp/imdb", split=split)
        self.vocab = vocab
        # Encode everything once so that __getitem__ is a plain list lookup
        self.encoded_data = [
            (
                self._to_vector(sample["text"]),
                # Label is 1 when the source label equals 1, otherwise 0
                torch.tensor(1 if sample["label"] == 1 else 0),
            )
            for sample in self.data
        ]

    def _to_vector(self, text):
        """Return the 0/1 presence vector of the token ids found in ``text``."""
        features = torch.zeros(len(self.vocab) + 1)
        for token_id in encode_sentence(text, self.vocab):
            features[token_id] = 1
        return features

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.encoded_data)

    def __getitem__(self, idx):
        """Return the pre-computed (features, label) pair at position ``idx``."""
        return self.encoded_data[idx]

# Build the encoded training and test splits with the shared vocabulary
train_data = IMDBDataset(split='train', vocab=vocab)
test_data = IMDBDataset(split='test', vocab=vocab)

## III - Data Loader

In [None]:
# Number of samples per mini-batch, shared by both loaders
batch_size = 128
# Training batches are shuffled; test batches keep the dataset order
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)


## IV - Modelo

In [None]:
class OneHotMLP(nn.Module):
    """Two-layer perceptron producing one logit per input vector.

    The network is ``Linear(vocab_size + 1, hidden_size) -> ReLU ->
    Linear(hidden_size, 1)``. The output is a raw logit; no sigmoid is
    applied inside the model.

    Args:
        vocab_size: Number of vocabulary entries. The input layer has
            ``vocab_size + 1`` features.
        hidden_size: Width of the hidden layer. Defaults to 200.
    """

    def __init__(self, vocab_size, hidden_size=200):
        super().__init__()

        self.fc1 = nn.Linear(vocab_size + 1, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the logits for ``x``.

        Args:
            x: Tensor whose last dimension has ``vocab_size + 1`` entries.
                It is cast to float before the first layer.

        Returns:
            Tensor of logits with a trailing dimension of size 1.
        """
        hidden = self.relu(self.fc1(x.float()))
        return self.fc2(hidden)

# Create the classifier sized for the vocabulary built above
model = OneHotMLP(vocab_size=vocab_size)

## V - Laço de Treinamento - Otimização da função de Perda pelo Gradiente descendente

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU,
# and report which one was selected
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU:', torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    device = torch.device('cpu')
    print('using CPU')

using CPU


In [None]:
# Place the model on the selected device
model = model.to(device)

# Loss: binary cross-entropy computed directly on the logits
criterion = nn.BCEWithLogitsLoss()
# Optimizer: SGD over all model parameters with a fixed learning rate
optimizer = optim.SGD(params=model.parameters(), lr=0.001)

# Compute the average loss and the accuracy of a model over a data loader
def evaluate(model, data_loader, loss_fn=None, target_device=None):
    """Return the mean loss and accuracy of ``model`` over ``data_loader``.

    The model is put in eval mode and gradients are disabled. The per-batch
    loss is weighted by batch size, so the result is the mean over all
    samples seen. A sample counts as predicted positive when
    ``sigmoid(logit) >= 0.5``.

    Args:
        model: Module mapping a batch of inputs to logits with a trailing
            dimension of size 1.
        data_loader: Iterable yielding ``(inputs, targets)`` batches.
        loss_fn: Loss applied to ``(logits, float targets)``. Defaults to the
            module-level ``criterion``.
        target_device: Device the batches are moved to. Defaults to the
            module-level ``device``.

    Returns:
        Tuple ``(average_loss, accuracy)`` as Python floats.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no samples.
    """
    if loss_fn is None:
        loss_fn = criterion
    if target_device is None:
        target_device = device

    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs = inputs.to(target_device)
            targets = targets.to(target_device)
            # Drop only the trailing size-1 dimension. A bare squeeze() would
            # also remove the batch dimension when the batch has one sample,
            # and the loss would then fail on the shape mismatch.
            logits = model(inputs).squeeze(-1)
            batch_count = targets.size(0)
            loss = loss_fn(logits, targets.float())
            total_loss += loss.item() * batch_count
            predicted = (torch.sigmoid(logits) >= 0.5).float()
            correct_predictions += (predicted == targets).sum().item()
            total_samples += batch_count

    # Average over the samples actually seen, consistently for both metrics
    average_loss = total_loss / total_samples
    accuracy = correct_predictions / total_samples
    return average_loss, accuracy

# Baseline metrics of the model before any training step
initial_train_loss, _ = evaluate(model=model, data_loader=train_loader)
initial_val_loss, initial_val_accuracy = evaluate(model=model, data_loader=test_loader)

# Report the baseline: training loss first, then validation loss and accuracy
print(f'Initial Train Loss: {initial_train_loss:.4f}')
print(f'Initial Validation Loss: {initial_val_loss:.4f}, Validation Accuracy: {initial_val_accuracy:.4f}')

# Training loop: one pass over train_loader per epoch, followed by an
# evaluation on test_loader and a one-line progress report
num_epochs = 5
for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    total_epoch_loss = 0.0
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Forward pass. Drop only the trailing size-1 dimension: a bare
        # squeeze() would also remove the batch dimension when the last batch
        # holds a single sample, and the loss would fail on the shape mismatch.
        logits = model(inputs).squeeze(-1)
        loss = criterion(logits, targets.float())
        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss weighted by the batch size
        total_epoch_loss += loss.item() * inputs.size(0)

    # Mean training loss over the epoch
    average_epoch_loss = total_epoch_loss / len(train_loader.dataset)

    # Evaluate on the held-out loader
    val_loss, val_accuracy = evaluate(model, test_loader)

    # The elapsed time covers both the training pass and the evaluation
    end_time = time.time()
    epoch_duration = end_time - start_time

    print(f'Epoch [{epoch+1}/{num_epochs}], '
          f'Train Loss: {average_epoch_loss:.4f}, '
          f'Validation Loss: {val_loss:.4f}, '
          f'Validation Accuracy: {val_accuracy:.4f}, '
          f'Elapsed Time: {epoch_duration:.2f} sec')

Initial Train Loss: 0.6930
Initial Validation Loss: 0.6930, Validation Accuracy: 0.5026
Epoch [1/5], Train Loss: 0.6924, Validation Loss: 0.6918, Validation Accuracy: 0.5180, Elapsed Time: 14.63 sec
Epoch [2/5], Train Loss: 0.6911, Validation Loss: 0.6906, Validation Accuracy: 0.5355, Elapsed Time: 14.66 sec
Epoch [3/5], Train Loss: 0.6898, Validation Loss: 0.6893, Validation Accuracy: 0.5610, Elapsed Time: 15.12 sec
Epoch [4/5], Train Loss: 0.6884, Validation Loss: 0.6878, Validation Accuracy: 0.5907, Elapsed Time: 15.04 sec
Epoch [5/5], Train Loss: 0.6868, Validation Loss: 0.6862, Validation Accuracy: 0.6248, Elapsed Time: 14.63 sec
