In [2]:
# Train an RNN (with LSTM) to classify short movie reviews as positive or negative.
# Why RNN is suitable

In [3]:
texts = [
    "I loved the movie!",        # 1
    "Absolutely terrible.",      # 0
    "Not bad at all",            # 1
    "Waste of time.",            # 0
    "Fantastic performance!"     # 1
]
labels = [1, 0, 1, 0, 1]


In [18]:
# tokenization and encoding

from collections import defaultdict
import torch
from torch.nn.utils.rnn import pad_sequence

# Build vocabulary
# word2idx: A special dictionary that creates a new unique index for every new word. 
# At this point, it’s just an empty dictionary with auto-indexing behavior.
# for example "I" will get a specific ID, and will always have the same
word2idx = defaultdict(lambda: len(word2idx)) # defaultdict is a special kind of dictionary, it automatically creates a default value for any key that doesn't exist yet.
print("step1", word2idx)

# Assign the token <PAD> to index 0. This token will be used to fill empty space.
# Padding is important because RNNs require uniform sequence lengths for batching
word2idx["<PAD>"] = 0  # padding token
tokenized = [[word2idx[word.lower()] for word in text.split()] for text in texts]
print("tokenized", tokenized)
padded = pad_sequence([torch.tensor(t) for t in tokenized], batch_first=True)
print ("padded", padded)
labels_tensor = torch.tensor(labels).float().unsqueeze(1)


step1 defaultdict(<function <lambda> at 0x123b13ce0>, {})
defaultdict(<function <lambda> at 0x123b13ce0>, {'<PAD>': 0})
tokenized [[1, 2, 3, 4], [5, 6], [7, 8, 9, 10], [11, 12, 13], [14, 15]]
padded tensor([[ 1,  2,  3,  4],
        [ 5,  6,  0,  0],
        [ 7,  8,  9, 10],
        [11, 12, 13,  0],
        [14, 15,  0,  0]])


In [5]:
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=16, hidden_dim=8):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        out = self.fc(hidden[-1])
        return self.sigmoid(out)


In [6]:
model = RNNModel(len(word2idx))
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    model.train()
    optimizer.zero_grad()
    output = model(padded)
    loss = loss_fn(output, labels_tensor)
    loss.backward()
    optimizer.step()
    
    pred = (output > 0.5).int()
    acc = (pred == labels_tensor.int()).float().mean()
    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f} | Accuracy: {acc:.2f}")


Epoch 1 | Loss: 0.7339 | Accuracy: 0.20
Epoch 2 | Loss: 0.7079 | Accuracy: 0.40
Epoch 3 | Loss: 0.6831 | Accuracy: 0.40
Epoch 4 | Loss: 0.6599 | Accuracy: 1.00
Epoch 5 | Loss: 0.6384 | Accuracy: 0.80
Epoch 6 | Loss: 0.6186 | Accuracy: 0.80
Epoch 7 | Loss: 0.6001 | Accuracy: 0.80
Epoch 8 | Loss: 0.5820 | Accuracy: 0.80
Epoch 9 | Loss: 0.5633 | Accuracy: 0.80
Epoch 10 | Loss: 0.5435 | Accuracy: 0.80
Epoch 11 | Loss: 0.5222 | Accuracy: 0.80
Epoch 12 | Loss: 0.4989 | Accuracy: 0.80
Epoch 13 | Loss: 0.4734 | Accuracy: 0.80
Epoch 14 | Loss: 0.4456 | Accuracy: 1.00
Epoch 15 | Loss: 0.4162 | Accuracy: 1.00
Epoch 16 | Loss: 0.3867 | Accuracy: 1.00
Epoch 17 | Loss: 0.3586 | Accuracy: 1.00
Epoch 18 | Loss: 0.3319 | Accuracy: 1.00
Epoch 19 | Loss: 0.3059 | Accuracy: 1.00
Epoch 20 | Loss: 0.2805 | Accuracy: 1.00


In [10]:
new_texts = [
    "I hated the ending.",
    "What a masterpiece!",
    "Not bad, could be better.",
    "This movie was a total waste of time.",
    "Absolutely loved every second.", 
    "not bad I really like it"
]

# Tokenize new samples
new_tokenized = [[word2idx[word.lower()] if word.lower() in word2idx else 0
                  for word in text.split()] for text in new_texts]

# Pad sequences to same length
new_padded = pad_sequence([torch.tensor(t) for t in new_tokenized], batch_first=True)


In [11]:
model.eval()  # set model to evaluation mode
with torch.no_grad():
    predictions = model(new_padded)
    predicted_classes = (predictions > 0.5).int().squeeze()

for text, pred in zip(new_texts, predicted_classes):
    sentiment = "Positive 😊" if pred.item() == 1 else "Negative 😞"
    print(f"{text} → {sentiment}")


I hated the ending. → Positive 😊
What a masterpiece! → Positive 😊
Not bad, could be better. → Negative 😞
This movie was a total waste of time. → Negative 😞
Absolutely loved every second. → Positive 😊
not bad I really like it → Positive 😊
