In [4]:
import os
import re

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# NOTE: torchtext.legacy only exists in torchtext 0.9-0.11; a newer torchtext
# raises ModuleNotFoundError here, so install a matching torch/torchtext pair.
from torchtext.legacy import data # type: ignore 

# Download IMDB dataset
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
!wget -qO- $url | tar xvz -C /content/

# Load dataset
def load_imdb_data(path):
    """Read labelled reviews from ``path`` into a DataFrame.

    Args:
        path: Directory containing ``pos`` and ``neg`` sub-directories, each
            holding one text file per review.

    Returns:
        pandas.DataFrame with columns ``review`` (file contents) and
        ``sentiment`` (the string ``"pos"`` or ``"neg"``). Rows are ordered
        ``pos`` first, then ``neg``, with files sorted by name inside each.

    Raises:
        FileNotFoundError: If ``path/pos`` or ``path/neg`` does not exist.
    """
    records = []
    for label in ["pos", "neg"]:
        label_dir = os.path.join(path, label)
        # os.listdir order is arbitrary; sorting keeps the row order (and thus
        # any seeded split performed later) reproducible across machines.
        for file_name in sorted(os.listdir(label_dir)):
            # Explicit encoding: the platform default may not be UTF-8.
            with open(os.path.join(label_dir, file_name), "r", encoding="utf-8") as f:
                records.append((f.read(), label))
    return pd.DataFrame(records, columns=["review", "sentiment"])

# Load the two official folders of the dataset
train_raw = load_imdb_data("/content/aclImdb/train")
test_raw = load_imdb_data("/content/aclImdb/test")

# Combine data for preprocessing.
# The frame is deliberately NOT called `data`: that name is the torchtext
# module imported at the top of the notebook, and rebinding it to a DataFrame
# breaks every later `data.Field` / `data.Example` / `data.Dataset` call.
reviews_df = pd.concat([train_raw, test_raw], ignore_index=True)

# Encode the string labels ("neg"/"pos") as integers
label_encoder = LabelEncoder()
reviews_df["sentiment"] = label_encoder.fit_transform(reviews_df["sentiment"])

# Split the pooled reviews into training and testing sets (80/20, seeded)
train_data, test_data = train_test_split(reviews_df, test_size=0.2, random_state=42)

# Preprocess text
def preprocess_text(text):
    """Normalise a raw review to lowercase, space-separated alphabetic words.

    The literal ``<br />`` tag and every character outside ``a-z``/``A-Z`` are
    replaced by a space, the result is lower-cased, and whitespace runs are
    collapsed to single spaces.

    Args:
        text: Raw review string.

    Returns:
        The cleaned review string (empty if no letters remain).
    """
    without_breaks = re.sub(r"<br />", " ", text)
    letters_only = re.sub(r"[^a-zA-Z]", " ", without_breaks)
    # split() with no argument drops leading/trailing and repeated whitespace.
    words = letters_only.lower().split()
    return " ".join(words)

# Clean the review text. assign() builds new frames rather than writing into
# the objects returned by train_test_split (which can trigger pandas'
# SettingWithCopyWarning).
train_data = train_data.assign(review=train_data["review"].apply(preprocess_text))
test_data = test_data.assign(review=test_data["review"].apply(preprocess_text))

# Convert to Torchtext format
# batch_first=True yields (batch, seq_len) tensors, the layout expected by the
# LSTM model in this notebook, which is constructed with batch_first=True.
TEXT = data.Field(tokenize="spacy", tokenizer_language="en_core_web_sm", batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

fields = [("review", TEXT), ("sentiment", LABEL)]


def _to_examples(frame):
    """Turn a DataFrame with `review`/`sentiment` columns into torchtext Examples."""
    # Iterating the two columns directly avoids building a Series per row,
    # which is what iterrows() does.
    return [
        data.Example.fromlist([review, sentiment], fields)
        for review, sentiment in zip(frame["review"], frame["sentiment"])
    ]


train_examples = _to_examples(train_data)
test_examples = _to_examples(test_data)

train_dataset = data.Dataset(train_examples, fields)
test_dataset = data.Dataset(test_examples, fields)

# Build vocabulary (from the training split only)
TEXT.build_vocab(train_dataset, max_size=25000, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_dataset)

# Create iterators
# A Dataset built from a plain list of Examples has no sort_key of its own, so
# one is supplied here: the non-training iterator sorts its examples and needs
# a key to compare them by (token count of the review).
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_dataset, test_dataset),
    batch_size=64,
    sort_key=lambda example: len(example.review),
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))


ModuleNotFoundError: No module named 'torchtext.legacy'

In [None]:
import torch.nn as nn

class LSTMModel(nn.Module):
    """Embedding -> (bi)LSTM -> linear classifier over the final hidden state.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of outputs of the final linear layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            # nn.LSTM applies dropout only between stacked
                            # layers; with a single layer a non-zero value has
                            # no effect and just raises a UserWarning.
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)
        # A bidirectional LSTM contributes one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Compute raw (pre-sigmoid) scores for a batch of token-index sequences.

        Args:
            text: Integer tensor of token indices shaped (batch, seq_len); the
                LSTM is configured with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        embedded = self.dropout(self.embedding(text))
        lstm_out, (hidden, cell) = self.lstm(embedded)
        if self.lstm.bidirectional:
            # The last two entries of `hidden` are the top layer's forward and
            # backward final states; concatenate them feature-wise.
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        return self.fc(hidden)

# Hyperparameters
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100           # must match the pretrained vectors copied below
HIDDEN_DIM = 256
OUTPUT_DIM = 1                # a single logit per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Target device: GPU when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTMModel(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

# Initialise the embedding table from the vectors attached to the vocabulary
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Optimizer (Adam with default settings) and loss on raw logits
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)

# Move the model to the target device
model = model.to(device)


In [None]:
def binary_accuracy(preds, y):
    """Fraction of predictions that match the targets.

    Args:
        preds: Tensor of raw logits.
        y: Tensor of targets compared element-wise against the rounded
            sigmoid of ``preds``.

    Returns:
        Scalar tensor: number of matches divided by the length of the first
        dimension.
    """
    predicted_labels = torch.sigmoid(preds).round()
    matches = predicted_labels.eq(y).float()
    return matches.sum() / len(matches)

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: Module called on ``batch.review``; its output is squeezed on
            dimension 1 before being scored.
        iterator: Sized iterable of batches exposing ``review`` and
            ``sentiment`` attributes.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable taking (predictions, targets).

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``: per-batch values averaged over
        the number of batches.
    """
    model.train()
    running_loss, running_acc = 0, 0

    for batch in iterator:
        optimizer.zero_grad()
        logits = model(batch.review).squeeze(1)
        targets = batch.sentiment
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Args:
        model: Module called on ``batch.review``; its output is squeezed on
            dimension 1 before being scored.
        iterator: Sized iterable of batches exposing ``review`` and
            ``sentiment`` attributes.
        criterion: Loss callable taking (predictions, targets).

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``: per-batch values averaged over
        the number of batches.
    """
    model.eval()
    running_loss, running_acc = 0, 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.review).squeeze(1)
            targets = batch.sentiment
            running_loss += criterion(logits, targets).item()
            running_acc += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')


In [None]:
import matplotlib.pyplot as plt

# Plot training and validation loss
epochs = range(1, N_EPOCHS + 1)
train_losses = []
test_losses = []

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    train_losses.append(train_loss)
    test_losses.append(test_loss)

plt.plot(epochs, train_losses, 'b', label='Training loss')
plt.plot(epochs, test_losses, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
