In [None]:
!pip install torch==2.2.0 torchtext==0.16.2
!pip install 'portalocker>=2.0.0'




In [None]:
import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import matplotlib.pyplot as plt



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

In [None]:
if torch.cuda.is_available():
    device=torch.device(type='cuda',index=0)
else:
    device=torch.device(type='cpu',index=0)

In [None]:
train_iter, test_iter = IMDB()
print("Loaded IMDb dataset (25,000 train + 25,000 test)")

Loaded IMDb dataset (25,000 train + 25,000 test)


In [None]:
print("Creating tokenizer and building vocabulary...")

tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for label, text in data_iter:
        yield tokenizer(text)


vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
print(f"\nVocabulary built successfully!")
print(f"Total vocabulary size: {len(vocab)} words\n")


print(" Sample words from vocabulary:")
print(list(vocab.get_stoi().keys())[:200])


Creating tokenizer and building vocabulary...

Vocabulary built successfully!
Total vocabulary size: 100683 words

 Sample words from vocabulary:
['₤250', '₤100', '“you’ve', '“x”', '“sanatorium”', '“playboy”', '“mr', '“jean', '“it’s', '“him”', '“family”', '“consider', '“b”', '“at', '’', '‘revenge’', '‘lifer’', 'üvegtigris', 'über-spy', 'über-annoying', 'ø', 'ís', 'évery', 'étc', 'émigrés', 'écran', 'åmål', 'äänekoski', 'ääliöt', 'ángela', 'ángel', 'ánd', 'álvaro', 'álex', '¿acting', '½/*****', '½*', 'º', '´cos', '´83', '®', '«there', '«the', '«syvsoverskens', '«modern', '«lexx»', '«i', '«farscape»', '«boy', '«bazar»', '«battlestar', '¨una', '¨town', '¨the', '¨scandal', '¨sabretooth', '¨nuit', '¨invitation', '¨gore', '¨by', '¨abe', '¨a', '§12', '£9', '£8', '£300', '£200', '£18', '£16', '¡§october', '¡§just', '¡§galaxy', '¡§astronauts', '¡viva', '¡the', '¡colombians', '\x97two', '\x97he', '\x97are', '\x97all', '\x96sensitive', '\x96russwill', '\x96organized', '\x96like', '\x96knit', '\x9

In [None]:
def text_pipeline(x):
    return vocab(tokenizer(x))

def label_pipeline(y):
    return 1 if y == 'pos' else 0
sample_label, sample_text = next(iter(train_iter))
print("\n Example sample:")
print("Label:", sample_label)
print("Text:", sample_text[:200], "...")
print("Encoded label:", label_pipeline(sample_label))
print("Tokenized sample words:", tokenizer(sample_text)[:10])


 Example sample:
Label: 1
Text: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev ...
Encoded label: 0
Tokenized sample words: ['i', 'rented', 'i', 'am', 'curious-yellow', 'from', 'my', 'video', 'store', 'because']


In [None]:
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for label, text in batch:
        label_list.append(label_pipeline(label))
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(len(processed_text))

    text_list = pad_sequence(text_list, batch_first=True)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    lengths = torch.tensor(lengths)
    return text_list, label_list, lengths

train_iter, test_iter = IMDB()

train_dataloader = DataLoader(list(train_iter), batch_size=64, shuffle=True,
                              collate_fn=collate_batch)
test_dataloader = DataLoader(list(test_iter), batch_size=64, shuffle=False,
                             collate_fn=collate_batch)
print(f" DataLoader created!  Train batches: {len(train_dataloader)}, Test batches: {len(test_dataloader)}")

 DataLoader created!  Train batches: 391, Test batches: 391


In [None]:
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers,
                          batch_first=True, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, lengths):
        embedded = self.embedding(text)
        packed_output, hidden = self.gru(embedded)
        hidden_cat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        out = self.fc(self.dropout(hidden_cat))
        return out


In [None]:
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
output_dim = 2
n_layers = 2
dropout = 0.3

model = GRUClassifier(vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

print(f"Model initialized on device: {device}")
print(f"Model Parameters:\nEmbedding={embed_dim}, Hidden={hidden_dim}, Layers={n_layers}, Dropout={dropout}")

Model initialized on device: cpu
Model Parameters:
Embedding=100, Hidden=128, Layers=2, Dropout=0.3


In [None]:
def train_epoch(model, dataloader, criterion, optimizer):
    model.train()
    total_loss, total_correct = 0, 0

    for batch_idx, (text, labels, lengths) in enumerate(dataloader):
        text, labels = text.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(text, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += (outputs.argmax(1) == labels).sum().item()


        if batch_idx + 1 == len(dataloader):
            print(f"  Batch {batch_idx+1}/{len(dataloader)} processed.")

    return total_loss / len(dataloader), total_correct / len(dataloader.dataset)


In [None]:
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for text, labels, lengths in dataloader:
            text, labels = text.to(device), labels.to(device)
            outputs = model(text, lengths)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            total_correct += (outputs.argmax(1) == labels).sum().item()
    return total_loss / len(dataloader), total_correct / len(dataloader.dataset)

In [15]:
num_epochs = 10
for epoch in range(num_epochs):
    print(f"\n Epoch {epoch+1}/{num_epochs}")
    train_loss, train_acc = train_epoch(model, train_dataloader, criterion, optimizer)

    eval_loss, eval_acc = evaluate(model, test_dataloader, criterion)

    print(f"Epoch={epoch+1}, Train Loss={train_loss:.4f}, Train Acc={train_acc*100:.2f}")
    print(f"Eval Loss={eval_loss:.4f}, Eval Acc={eval_acc*100:.2f}")


 Epoch 1/10
  Batch 391/391 processed.
Epoch=1, Train Loss=0.0000, Train Acc=100.00
Eval Loss=0.0000, Eval Acc=100.00

 Epoch 2/10
  Batch 391/391 processed.
Epoch=2, Train Loss=0.0000, Train Acc=100.00
Eval Loss=0.0000, Eval Acc=100.00

 Epoch 3/10
  Batch 391/391 processed.
Epoch=3, Train Loss=0.0000, Train Acc=100.00
Eval Loss=0.0000, Eval Acc=100.00

 Epoch 4/10
  Batch 391/391 processed.
Epoch=4, Train Loss=0.0000, Train Acc=100.00
Eval Loss=0.0000, Eval Acc=100.00

 Epoch 5/10
  Batch 391/391 processed.
Epoch=5, Train Loss=0.0000, Train Acc=100.00
Eval Loss=0.0000, Eval Acc=100.00

 Epoch 6/10
  Batch 391/391 processed.
Epoch=6, Train Loss=0.0000, Train Acc=100.00
Eval Loss=0.0000, Eval Acc=100.00

 Epoch 7/10
  Batch 391/391 processed.
Epoch=7, Train Loss=0.0000, Train Acc=100.00
Eval Loss=0.0000, Eval Acc=100.00

 Epoch 8/10
  Batch 391/391 processed.
Epoch=8, Train Loss=0.0000, Train Acc=100.00
Eval Loss=0.0000, Eval Acc=100.00

 Epoch 9/10
  Batch 391/391 processed.
Epoch=9,