In [None]:
!pip install torchtext




In [None]:
!pip install torch==2.0.1 torchtext==0.15.2




In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


In [None]:
!pip install portalocker




In [None]:
from torchtext.datasets import IMDB
train_iter, test_iter = IMDB(split=('train', 'test'))


In [None]:
# Uninstall conflicting versions (if not already done)
!pip uninstall -y torch torchtext

# Install compatible versions of torch and torchtext
!pip install torch==2.0.1 torchtext==0.15.2

# Install portalocker dependency
!pip install portalocker


Found existing installation: torch 2.0.1
Uninstalling torch-2.0.1:
  Successfully uninstalled torch-2.0.1
Found existing installation: torchtext 0.15.2
Uninstalling torchtext-0.15.2:
  Successfully uninstalled torchtext-0.15.2
Collecting torch==2.0.1
  Using cached torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchtext==0.15.2
  Using cached torchtext-0.15.2-cp311-cp311-manylinux1_x86_64.whl.metadata (7.4 kB)
Using cached torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl (619.9 MB)
Using cached torchtext-0.15.2-cp311-cp311-manylinux1_x86_64.whl (2.0 MB)
Installing collected packages: torch, torchtext
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchtune 0.6.1 requires torchdata==0.11.0, but you have torchdata 0.6.1 which is incompatible.
torchvision 0.21.0+cu124 requires torch==2.6.0, but you have torch 2.0.1 which is incompati



In [None]:
# 1. Clean uninstall of existing versions
!pip uninstall -y torch torchtext torchdata portalocker

# 2. Install compatible versions
!pip install torch==2.0.1 torchtext==0.15.2 torchdata==0.6.1 portalocker==2.7.0

# 3. Restart runtime after this cell!


Found existing installation: torch 2.0.1
Uninstalling torch-2.0.1:
  Successfully uninstalled torch-2.0.1
Found existing installation: torchtext 0.15.2
Uninstalling torchtext-0.15.2:
  Successfully uninstalled torchtext-0.15.2
Found existing installation: torchdata 0.6.1
Uninstalling torchdata-0.6.1:
  Successfully uninstalled torchdata-0.6.1
Found existing installation: portalocker 2.7.0
Uninstalling portalocker-2.7.0:
  Successfully uninstalled portalocker-2.7.0
Collecting torch==2.0.1
  Using cached torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchtext==0.15.2
  Using cached torchtext-0.15.2-cp311-cp311-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting torchdata==0.6.1
  Using cached torchdata-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting portalocker==2.7.0
  Using cached portalocker-2.7.0-py2.py3-none-any.whl.metadata (6.8 kB)
Using cached torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl (619.9 MB)
Using cac

In [None]:
# Imports
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Load dataset only ONCE and store it
raw_train_iter, raw_test_iter = list(IMDB(split='train')), list(IMDB(split='test'))

# Tokenizer
tokenizer = get_tokenizer('basic_english')

# Yield tokens
def yield_tokens(data_iter):
    for label, text in data_iter:
        yield tokenizer(text)

# Build vocabulary (use saved raw iterator)
vocab = build_vocab_from_iterator(yield_tokens(raw_train_iter), specials=["<pad>"])
vocab.set_default_index(vocab["<pad>"])



In [None]:
def process(text):
    return torch.tensor(vocab(tokenizer(text)), dtype=torch.long)

def label_to_int(label):
    return 1 if label == 'pos' else 0


In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


In [None]:
def collate_batch(batch):
    texts = [process(text) for label, text in batch]
    labels = torch.tensor([label_to_int(label) for label, text in batch], dtype=torch.float32)
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=vocab["<pad>"])
    return texts_padded, labels

# Reload datasets again (important)
train_iter, test_iter = IMDB(split=('train', 'test'))

# Create dataloaders
train_loader = DataLoader(list(train_iter), batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(list(test_iter), batch_size=32, collate_fn=collate_batch)

In [None]:
import torch.nn as nn
import torch.optim as optim

In [None]:
class SentimentANN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(SentimentANN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)             # (batch, seq_len, embed_dim)
        x = x.mean(dim=1)                 # Average over sequence length
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x).squeeze()


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentANN(len(vocab), embed_dim=100, hidden_dim=64).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
for epoch in range(5):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 11.1158
Epoch 2, Loss: 0.0068
Epoch 3, Loss: 0.0019
Epoch 4, Loss: 0.0008
Epoch 5, Loss: 0.0004


In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")


Test Accuracy: 100.00%
