In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import BertTokenizer

In [2]:
# 1. Load dataset
dataset = load_dataset("imdb")

In [3]:
# 2. Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 3. Encode texts with tokenizer
def encode_with_bert(example):
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"],
        "label": example["label"]
    }

encoded_dataset = dataset.map(encode_with_bert, batched=True)
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


In [4]:
# 4. Prepare DataLoader
train_loader = DataLoader(encoded_dataset["train"], batch_size=32, shuffle=True)
test_loader = DataLoader(encoded_dataset["test"], batch_size=32)

# 5. Define Transformer-based model without LoRA
class TextTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        embedded = embedded.masked_fill(attention_mask.unsqueeze(-1) == 0, 0)
        out = self.transformer(embedded)
        out = out[:, 0, :]
        return self.fc(out)



In [5]:
# 6. Initialize model
model = TextTransformer(
    vocab_size=tokenizer.vocab_size,
    embed_dim=128,
    num_heads=4,
    num_layers=2,
    num_classes=4
)

# Move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 7. Define optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()


In [6]:
# 8. Train model
for epoch in range(10):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")
torch.save(model.state_dict(), "Transformer_imdb.pth")

Epoch 1, Loss: 0.5836
Epoch 2, Loss: 0.3887
Epoch 3, Loss: 0.3054
Epoch 4, Loss: 0.2398
Epoch 5, Loss: 0.1846
Epoch 6, Loss: 0.1450
Epoch 7, Loss: 0.1095
Epoch 8, Loss: 0.0861
Epoch 9, Loss: 0.0745
Epoch 10, Loss: 0.0668


In [10]:
# 9. Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask)
        predicted = torch.argmax(outputs, dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {correct / total:.2%}")


Test Accuracy: 77.24%
