<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/lstm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tqdm



In [3]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/521.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/521.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from datasets import load_dataset
from transformers import BertTokenizer
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Seed PyTorch's global RNG so that weight initialisation and DataLoader
# shuffling are repeatable across "Restart & Run All" executions.
SEED = 42
torch.manual_seed(SEED)

# Set device: use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the IMDb dataset (downloaded and cached on first use) and keep the two splits
dataset = load_dataset("imdb")
train_data = dataset['train']
test_data = dataset['test']

# Initialize the tokenizer (pretrained vocabulary only; no BERT weights are loaded)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization and encoding of the dataset
def encode(examples):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw texts of the batch.

    Returns:
        Whatever ``tokenizer`` returns for those texts when asked to truncate
        and pad every sequence to exactly 256 tokens.
    """
    texts = examples['text']
    # Fixed-length output: longer inputs are cut, shorter ones are padded.
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=256)
    return tokenizer(texts, **tokenizer_options)

# Tokenize both splits in batches
train_data = train_data.map(encode, batched=True)
test_data = test_data.map(encode, batched=True)

# Expose only the columns the model needs, as torch tensors
train_data.set_format(type='torch', columns=['input_ids', 'label'])
test_data.set_format(type='torch', columns=['input_ids', 'label'])

# Train on a small subset to keep the demo fast.
# The subset is drawn at random (with a fixed seed) rather than taken from the
# front of the split: the IMDb train split is ordered by label, so the first
# rows all belong to one class and would give the model nothing to learn.
N_TRAIN_SAMPLES = 100  # increase (e.g. to 1_000) for a more meaningful run
subset_rng = torch.Generator().manual_seed(0)
subset_indices = torch.randperm(len(train_data), generator=subset_rng)[:N_TRAIN_SAMPLES].tolist()
train_data = Subset(train_data, subset_indices)

# Create data loaders
BATCH_SIZE = 32
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

# Define the LSTM model
class LSTM(nn.Module):
    """Single-layer LSTM classifier over sequences of token ids.

    The model embeds the token ids, runs them through an LSTM and feeds the
    final hidden state to a linear layer.

    Args:
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits per sequence.
        vocab_size: Number of rows in the embedding table. Defaults to
            ``tokenizer.vocab_size`` of the notebook-level tokenizer.
        pad_token_id: Id of the padding token. Defaults to
            ``tokenizer.pad_token_id``. When it is not ``None``, padding
            positions are excluded from the recurrence (see ``forward``).
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size=None, pad_token_id=None):
        super().__init__()
        # Only touch the global tokenizer when the caller did not supply the values.
        if vocab_size is None:
            vocab_size = tokenizer.vocab_size
        if pad_token_id is None:
            pad_token_id = tokenizer.pad_token_id
        self.pad_token_id = pad_token_id
        # padding_idx keeps the pad embedding at zero and out of the gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_token_id)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return one row of logits per input sequence.

        Args:
            text: Integer tensor of token ids, shape ``(batch, seq_len)``.
                Padding is assumed to be on the right-hand side.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        embedded = self.embedding(text)
        if self.pad_token_id is None:
            _, (hidden, _) = self.rnn(embedded)
        else:
            # Without packing, the final hidden state would be taken after the
            # LSTM has consumed every trailing pad token, which washes out the
            # signal of short inputs. Packing stops each sequence at its true
            # length. clamp(min=1) guards against an all-padding row, which
            # pack_padded_sequence rejects; lengths must live on the CPU.
            lengths = (text != self.pad_token_id).sum(dim=1).clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            _, (hidden, _) = self.rnn(packed)
        # hidden is (num_layers, batch, hidden_dim); take the (only) layer.
        return self.fc(hidden[-1])

# Hyperparameters of the classifier
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 100, 256, 1

# Build the network and place it on the selected device
model = LSTM(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Adam with its default settings, plus a loss that takes raw logits
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)

# Train function
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Module mapping a batch of token ids to logits of shape ``(batch, 1)``.
        iterator: Iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'label'`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, float_labels)``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no batches.
    """
    model.train()
    # Batches come off the DataLoader on the CPU; send them wherever the model
    # lives so the forward pass does not fail when the model is on a GPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    num_batches = 0
    progress_bar = tqdm(iterator, desc='Training', leave=False)
    for num_batches, batch in enumerate(progress_bar, start=1):
        input_ids = batch['input_ids']
        labels = batch['label']
        if target_device is not None:
            input_ids = input_ids.to(target_device)
            labels = labels.to(target_device)

        optimizer.zero_grad()
        predictions = model(input_ids).squeeze(1)
        loss = criterion(predictions, labels.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # Divide by our own step counter: tqdm only refreshes its ``n``
        # attribute at display intervals, so it can lag behind the real step.
        progress_bar.set_postfix({'Training Loss': '{:.4f}'.format(total_loss / num_batches)})
    return total_loss / num_batches

# Evaluate function
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss of ``model`` over ``iterator`` without training.

    Args:
        model: Module mapping a batch of token ids to logits of shape ``(batch, 1)``.
        iterator: Iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'label'`` tensors.
        criterion: Loss called as ``criterion(predictions, float_labels)``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no batches.
    """
    model.eval()
    # Batches come off the DataLoader on the CPU; send them wherever the model
    # lives so the forward pass does not fail when the model is on a GPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch in iterator:
            input_ids = batch['input_ids']
            labels = batch['label']
            if target_device is not None:
                input_ids = input_ids.to(target_device)
                labels = labels.to(target_device)

            predictions = model(input_ids).squeeze(1)
            loss = criterion(predictions, labels.float())
            total_loss += loss.item()
            num_batches += 1
    return total_loss / num_batches

# Training and evaluation driver


# Train for some epochs
N_EPOCHS = 1
for epoch in range(N_EPOCHS):
    # Keep and report the returned losses; a bare call inside a loop displays
    # nothing in a notebook, so the per-epoch evaluation would be invisible.
    train_loss = train(model, train_loader, optimizer, criterion)
    test_loss = evaluate(model, test_loader, criterion)
    print(f'Epoch {epoch + 1}/{N_EPOCHS} - train loss: {train_loss:.4f} - test loss: {test_loss:.4f}')

# Test the model (last expression of the cell, so its value is displayed)
evaluate(model, test_loader, criterion)

