                                                            NLP ASSIGNMENT - 3

In [8]:
import re
import csv
from datetime import datetime

def parse_date(text):
    """
    Parses a date from a given text string and formats it as DD/MM/YYYY.
    This function uses regular expressions to handle various date formats.

    Args:
        text (str): The input text containing a date.

    Returns:
        str: The extracted and formatted date in DD/MM/YYYY format, or None if no date is found.
    """
    # A dictionary to map month names to numbers
    month_mapping = {
        'january': '01', 'jan': '01',
        'february': '02', 'feb': '02',
        'march': '03', 'mar': '03',
        'april': '04', 'apr': '04',
        'may': '05',
        'june': '06', 'jun': '06',
        'july': '07', 'jul': '06',
        'august': '08', 'aug': '08',
        'september': '09', 'sep': '09',
        'october': '10', 'oct': '10',
        'november': '11', 'nov': '11',
        'december': '12', 'dec': '12'
    }

    # Regex patterns for different date formats
    patterns = [
        # YYYY-MM-DD, YYYY.MM.DD
        r'(\d{4})[-.\/](\d{1,2})[-.\/](\d{1,2})',
        # DD-MM-YYYY, DD/MM/YYYY, DD.MM.YYYY
        r'(\d{1,2})[-.\/](\d{1,2})[-.\/](\d{4})',
        # MM/DD/YY or YYYY
        r'(\d{1,2})[-.\/](\d{1,2})[-.\/](\d{2,4})',
        # Month Day, Year (e.g., March 5, 2023)
        r'(?:on|for|is on|starts on|is set for|was born on|on the) (?:the )?(\d{1,2})(?:st|nd|rd|th)?(?: of)? ([a-zA-Z]+)[,.]? (\d{4})',
        # Month Day, Year (e.g., March 5, 2023) without 'on the' part
        r'([a-zA-Z]+) (\d{1,2})(?:st|nd|rd|th)?[,.]? (\d{4})',
        # Day Month Year (e.g., 5th March 2023)
        r'(\d{1,2})(?:st|nd|rd|th)?(?: of)? ([a-zA-Z]+)[,.]? (\d{4})'
    ]

    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            parts = match.groups()
            
            # Case: DD-MM-YYYY or similar
            if len(parts) == 3 and parts[0].isdigit() and parts[1].isdigit():
                day = parts[0]
                month = parts[1]
                year = parts[2]
            
            # Case: YYYY-MM-DD or similar
            elif len(parts) == 3 and parts[0].isdigit() and parts[1].isdigit() and len(parts[0]) == 4:
                day = parts[2]
                month = parts[1]
                year = parts[0]

            # Case: Month Day, Year
            elif len(parts) == 3 and not parts[0].isdigit() and parts[1].isdigit():
                month = month_mapping.get(parts[0].lower())
                day = parts[1]
                year = parts[2]
                if not month:
                    continue
            
            # Case: Day Month, Year or Month Day, Year
            elif len(parts) == 3 and parts[0].isdigit() and not parts[1].isdigit():
                day = parts[0]
                month = month_mapping.get(parts[1].lower())
                year = parts[2]
                if not month:
                    continue
            
            # Format the day and month with leading zeros if necessary
            day = day.zfill(2)
            month = month.zfill(2)
            
            # Handle 2-digit years
            if len(year) == 2:
                year = f"20{year}" if int(year) < 50 else f"19{year}"

            return f"{day}/{month}/{year}"

    return None

def process_testcases(filename):
    """
    Reads a CSV file of test cases, processes each line, and prints the output.

    Args:
        filename (str): The path to the CSV file.
    """
    print("Running date parser test cases...")
    with open(filename, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # Skip header row
        for i, row in enumerate(reader):
            input_text = row[0]
            expected_output = row[1]
            extracted_date = parse_date(input_text)
            print(f"Test Case {i+1}:")
            print(f"  Input Text:      '{input_text}'")
            print(f"  Extracted Date:  '{extracted_date}'")
            print(f"  Expected Output: '{expected_output}'")
            print(f"  Status:          {'PASS' if extracted_date == expected_output else 'FAIL'}")
            print("-" * 30)

if __name__ == "__main__":
    process_testcases("date_parser_testcases.csv")


Running date parser test cases...
Test Case 1:
  Input Text:      'The event will take place on March 5, 2023.'
  Extracted Date:  '05/03/2023'
  Expected Output: '05/03/2023'
  Status:          PASS
------------------------------
Test Case 2:
  Input Text:      'Her birthday is on 07/08/1990.'
  Extracted Date:  '07/08/1990'
  Expected Output: '07/08/1990'
  Status:          PASS
------------------------------
Test Case 3:
  Input Text:      'The deadline is 2022-12-31.'
  Extracted Date:  '2022/12/2031'
  Expected Output: '31/12/2022'
  Status:          FAIL
------------------------------
Test Case 4:
  Input Text:      'We met on 1st of January 2000.'
  Extracted Date:  '01/01/2000'
  Expected Output: '01/01/2000'
  Status:          PASS
------------------------------
Test Case 5:
  Input Text:      'The concert is scheduled for 15th September, 2021.'
  Extracted Date:  '15/09/2021'
  Expected Output: '15/09/2021'
  Status:          PASS
------------------------------
Test Case 6:
 

In [14]:
! pip install spacy
! python -m spacy download en_core_web_sm


import spacy
nlp = spacy.load("en_core_web_sm")


text = [
    "This is her book.",
    "Give the book back to her."
]


for t in text:
    doc = nlp(t)

    for tok in doc:
        print(tok.text, tok.dep_)
    
    print()




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
  import scipy.sparse as _sparse

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 148, in _get_module_details
  File "<frozen runpy>", line 112, in _get_module_details
  File "c:\Users\Yatha\AppData\Local\Programs\Python\Python311\Lib\site-packages\spacy\__init__.py", line 6, in <module>
  File "c:\Users\Yatha\AppData\Local\Programs\Python\Python311\Lib\site-pac

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 3.0 MB/s eta 0:00:04
     ---- ----------------------------------- 1.6/12.8 MB 3.4 MB/s eta 0:00:04
     ------ --------------------------------- 2.1/12.8 MB 3.3 MB/s eta 0:00:04
     ---------- ----------------------------- 3.4/12.8 MB 3.7 MB/s eta 0:00:03
     ------------- -------------------------- 4.5/12.8 MB 4.1 MB/s eta 0:00:03
     ------------------ --------------------- 5.8/12.8 MB 4.3 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 4.5 MB/s eta 0:00:02
     ------------------------- -------------- 8.1/12.8 MB 4.6 MB/s eta 0:00:02
     ----------------------------- ---------- 9.

In [16]:
import csv
import re
import spacy

# Note: As you correctly pointed out in your previous error,
# the spacy model 'en_core_web_sm' must be downloaded separately.
# Ensure you have run:
# ! python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def transform_pronouns(text, target_gender):
    """
    Transforms gendered pronouns in a sentence to the opposite gender.

    Args:
        text (str): The input sentence.
        target_gender (str): The target gender ('male' or 'female').

    Returns:
        str: The sentence with transformed pronouns.
    """
    # A single, comprehensive mapping for pronoun transformations.
    pronoun_map = {
        'he': 'she', 'He': 'She', 'HE': 'SHE',
        'she': 'he', 'She': 'He', 'SHE': 'HE',
        'him': 'her', 'Him': 'Her', 'HIM': 'HER',
        'her': 'him', 'Her': 'Him', 'HER': 'HIM',
        'his': 'her', 'His': 'Her', 'HIS': 'HER',
        'hers': 'his', 'Hers': 'His', 'HERS': 'HIS',
        'himself': 'herself', 'Himself': 'Herself', 'HIMSELF': 'HERSELF',
        'herself': 'himself', 'Herself': 'Himself', 'HERSELF': 'HIMSELF'
    }

    # Decide which direction to swap based on the target gender.
    if target_gender.lower() == 'female':
        # Create a mapping to convert male pronouns to female ones.
        swap_map = {
            'he': 'she', 'He': 'She', 'HE': 'SHE',
            'him': 'her', 'Him': 'Her', 'HIM': 'HER',
            'his': 'her', 'His': 'Her', 'HIS': 'HER',
            'himself': 'herself', 'Himself': 'Herself', 'HIMSELF': 'HERSELF'
        }
    elif target_gender.lower() == 'male':
        # Create a mapping to convert female pronouns to male ones.
        swap_map = {
            'she': 'he', 'She': 'He', 'SHE': 'HE',
            'her': 'him', 'Her': 'Him', 'HER': 'HIM',
            'hers': 'his', 'Hers': 'His', 'HERS': 'HIS',
            'herself': 'himself', 'Herself': 'Himself', 'HERSELF': 'HIMSELF'
        }
    else:
        return text

    words = text.split()
    transformed_words = []
    
    for word in words:
        # Check if the word, after stripping punctuation, is a key in our swap map.
        clean_word = re.sub(r'[^a-zA-Z]', '', word)
        
        if clean_word in swap_map:
            transformed_word = swap_map[clean_word]
            # Preserve the original capitalization and punctuation.
            # For example, if the word was "He.", the new word should be "She."
            if not clean_word == word:
                transformed_word += word[len(clean_word):]
            transformed_words.append(transformed_word)
        else:
            transformed_words.append(word)

    # Join the words back into a sentence.
    return " ".join(transformed_words)

def process_testcases(filename):
    """
    Reads a CSV of test cases, processes each line, and prints the output.

    Args:
        filename (str): The path to the CSV file.
    """
    print("Running pronoun transformer test cases...")
    with open(filename, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # Skip header row
        for i, row in enumerate(reader):
            input_text = row[0]
            target_gender = row[1]
            expected_output = row[2]
            
            transformed_text = transform_pronouns(input_text, target_gender)

            print(f"Test Case {i+1}:")
            print(f"  Input Text:       '{input_text}'")
            print(f"  Target Gender:    '{target_gender}'")
            print(f"  Transformed Text: '{transformed_text}'")
            print(f"  Expected Output:  '{expected_output}'")
            print(f"  Status:           {'PASS' if transformed_text.strip() == expected_output.strip() else 'FAIL'}")
            print("-" * 30)


if __name__ == "__main__":
    process_testcases("pronoun_testcases.csv")

Running pronoun transformer test cases...
Test Case 1:
  Input Text:       'He is going to the market.'
  Target Gender:    'female'
  Transformed Text: 'She is going to the market.'
  Expected Output:  'She is going to the market.'
  Status:           PASS
------------------------------
Test Case 2:
  Input Text:       'His book is on the table.'
  Target Gender:    'female'
  Transformed Text: 'Her book is on the table.'
  Expected Output:  'Her book is on the table.'
  Status:           PASS
------------------------------
Test Case 3:
  Input Text:       'I saw him yesterday.'
  Target Gender:    'female'
  Transformed Text: 'I saw her yesterday.'
  Expected Output:  'I saw her yesterday.'
  Status:           PASS
------------------------------
Test Case 4:
  Input Text:       'He hurt himself.'
  Target Gender:    'female'
  Transformed Text: 'She hurt herself.'
  Expected Output:  'She hurt herself.'
  Status:           PASS
------------------------------
Test Case 5:
  Input Text

In [19]:
!pip install torchtext

Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-win_amd64.whl.metadata (7.9 kB)
Collecting torch>=2.3.0 (from torchtext)
  Using cached torch-2.8.0-cp311-cp311-win_amd64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch>=2.3.0->torchtext)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torchtext-0.18.0-cp311-cp311-win_amd64.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.9 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/1.9 MB 1.2 MB/s eta 0:00:02
   ---------------- ----------------------- 0.8/1.9 MB 1.1 MB/s eta 0:00:02
   --------------------- ------------------ 1.0/1.9 MB 1.2 MB/s eta 0:00:01
   -------------------------- ------------- 1.3/1.9 MB 1.3 MB/s eta 0:00:01
   ------------------------------------- -- 1.8/1.9 MB 1.5 MB/s eta 0:00:01
   ---------------------

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.2.2+cpu requires torch==2.2.2, but you have torch 2.8.0 which is incompatible.
torchvision 0.17.2+cpu requires torch==2.2.2, but you have torch 2.8.0 which is incompatible.
ultralytics 8.3.93 requires numpy<=2.1.1,>=1.23.0, but you have numpy 2.3.2 which is incompatible.

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from torchtext.vocab import GloVe
from collections import Counter
import re
import random
import os

# Set a seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True

# --- 1. Data Loading and Preprocessing ---
def load_and_preprocess_data(file_path):
    """
    Loads the IMDB dataset, cleans the text, and splits into train/test sets.
    
    Args:
        file_path (str): The path to the IMDB Dataset CSV file.
        
    Returns:
        tuple: A tuple containing lists of train reviews, train labels,
               test reviews, and test labels.
    """
    df = pd.read_csv(file_path)
    
    # Simple text cleaning
    df['review'] = df['review'].apply(lambda x: re.sub(r'<br\s*/>', ' ', x).strip())
    
    # Convert sentiment labels to numerical values (0 for 'negative', 1 for 'positive')
    df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})
    
    # Split data into training and testing sets
    reviews_train, reviews_test, labels_train, labels_test = train_test_split(
        df['review'].tolist(), 
        df['sentiment'].tolist(), 
        test_size=0.2, 
        random_state=SEED
    )
    return reviews_train, reviews_test, labels_train, labels_test

class TextDataset(Dataset):
    """
    A custom PyTorch Dataset for text data.
    """
    def __init__(self, texts, labels, vocab, max_len):
        self.texts = [self.tokenize(text, vocab, max_len) for text in texts]
        self.labels = labels
        
    def __len__(self):
        return len(self.texts)
        
    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx]), torch.tensor(self.labels[idx])
    
    def tokenize(self, text, vocab, max_len):
        """
        Tokenizes and pads/truncates a text.
        The function is corrected to use the standard Python dictionary `vocab`
        instead of a non-existent `.stoi` attribute.
        """
        tokens = text.lower().split()
        
        # Look up word indices, defaulting to the '<unk>' token's index for unknown words.
        indexed_tokens = [vocab.get(token, vocab['<unk>']) for token in tokens]
        
        if len(indexed_tokens) < max_len:
            # Pad with the index of the padding token (0)
            indexed_tokens += [vocab['<pad>']] * (max_len - len(indexed_tokens))
        elif len(indexed_tokens) > max_len:
            # Truncate
            indexed_tokens = indexed_tokens[:max_len]
        
        return indexed_tokens

# --- 2. GloVe Embeddings & Vocabulary ---
def build_glove_vocab(reviews_train):
    """
    Builds a vocabulary and loads GloVe embeddings.
    
    Note: This will download the GloVe embeddings file (~822MB) the first time it's run.
    """
    counter = Counter(' '.join(reviews_train).lower().split())
    glove = GloVe(name='6B', dim=100)
    
    # Build a vocabulary that includes all words in the dataset and in GloVe
    vocab_itos = ['<pad>', '<unk>'] + [word for word in counter if word in glove.stoi]
    vocab_stoi = {word: i for i, word in enumerate(vocab_itos)}
    
    # Create the embedding matrix
    weights_matrix = torch.zeros(len(vocab_itos), glove.dim)
    for i, word in enumerate(vocab_itos):
        if word in glove.stoi:
            weights_matrix[i] = glove[word]
    
    return vocab_stoi, weights_matrix

# --- 3. Model Architectures ---
class RNN(nn.Module):
    """Vanilla RNN with an Embedding layer."""
    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size, pretrained_embeddings=None):
        super().__init__()
        
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=0)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
            
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        hidden_output = hidden.squeeze(0) # Use the last hidden state
        return self.fc(hidden_output)

class LSTM(nn.Module):
    """LSTM with an Embedding layer."""
    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size, pretrained_embeddings=None):
        super().__init__()
        
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=0)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
            
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden_output = hidden.squeeze(0) # Use the last hidden state
        return self.fc(hidden_output)

# --- 4. Training and Evaluation Functions ---
def train_model(model, iterator, optimizer, criterion):
    """Trains the model for one epoch."""
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for batch in iterator:
        optimizer.zero_grad()
        text, labels = batch
        predictions = model(text).squeeze(1)
        
        # Calculate loss and accuracy
        loss = criterion(predictions, labels.float())
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate_model(model, iterator, criterion):
    """Evaluates the model's performance."""
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch
            predictions = model(text).squeeze(1)
            
            loss = criterion(predictions, labels.float())
            acc = binary_accuracy(predictions, labels)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def binary_accuracy(preds, y):
    """Calculates accuracy for binary classification."""
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

# --- 5. Main Execution Block ---
if __name__ == "__main__":
    
    # Hyperparameters
    MAX_LEN = 256
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 256
    OUTPUT_DIM = 1
    N_EPOCHS = 5
    BATCH_SIZE = 64
    LEARNING_RATE = 1e-3
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    reviews_train, reviews_test, labels_train, labels_test = load_and_preprocess_data('IMDB Dataset.csv')

    # --- Part 1: GloVe Embeddings with Vanilla RNN and LSTMs ---
    print("\n--- Part 1: Training with GloVe Embeddings ---")
    
    # Build vocabulary and get pre-trained weights
    vocab_stoi, weights_matrix = build_glove_vocab(reviews_train)
    VOCAB_SIZE = len(vocab_stoi)

    train_data = TextDataset(reviews_train, labels_train, vocab_stoi, MAX_LEN)
    test_data = TextDataset(reviews_test, labels_test, vocab_stoi, MAX_LEN)
    
    train_iterator = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_iterator = DataLoader(test_data, batch_size=BATCH_SIZE)
    
    # 1.1 GloVe with Vanilla RNN
    print("\n--- Training GloVe + Vanilla RNN ---")
    rnn_glove_model = RNN(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, VOCAB_SIZE, weights_matrix).to(device)
    optimizer_rnn_glove = optim.Adam(rnn_glove_model.parameters(), lr=LEARNING_RATE)
    criterion = nn.BCEWithLogitsLoss().to(device)
    
    for epoch in range(N_EPOCHS):
        train_loss, train_acc = train_model(rnn_glove_model, train_iterator, optimizer_rnn_glove, criterion)
        valid_loss, valid_acc = evaluate_model(rnn_glove_model, test_iterator, criterion)
        print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
        
    # 1.2 GloVe with LSTM
    print("\n--- Training GloVe + LSTM ---")
    lstm_glove_model = LSTM(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, VOCAB_SIZE, weights_matrix).to(device)
    optimizer_lstm_glove = optim.Adam(lstm_glove_model.parameters(), lr=LEARNING_RATE)
    
    for epoch in range(N_EPOCHS):
        train_loss, train_acc = train_model(lstm_glove_model, train_iterator, optimizer_lstm_glove, criterion)
        valid_loss, valid_acc = evaluate_model(lstm_glove_model, test_iterator, criterion)
        print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

    # --- Part 2: On-the-fly Embeddings with Vanilla RNN and LSTMs ---
    print("\n--- Part 2: Training with On-the-fly Embeddings ---")
    
    # A simple vocabulary for on-the-fly embeddings. No need for GloVe.
    counter_simple = Counter(' '.join(reviews_train).lower().split())
    vocab_stoi_simple = {word: i+2 for i, word in enumerate(counter_simple)}
    vocab_stoi_simple['<pad>'] = 0
    vocab_stoi_simple['<unk>'] = 1
    VOCAB_SIZE_SIMPLE = len(vocab_stoi_simple)
    
    train_data_simple = TextDataset(reviews_train, labels_train, vocab_stoi_simple, MAX_LEN)
    test_data_simple = TextDataset(reviews_test, labels_test, vocab_stoi_simple, MAX_LEN)
    
    train_iterator_simple = DataLoader(train_data_simple, batch_size=BATCH_SIZE, shuffle=True)
    test_iterator_simple = DataLoader(test_data_simple, batch_size=BATCH_SIZE)
    
    # 2.1 On-the-fly Embeddings with Vanilla RNN
    print("\n--- Training On-the-fly + Vanilla RNN ---")
    rnn_simple_model = RNN(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, VOCAB_SIZE_SIMPLE).to(device)
    optimizer_rnn_simple = optim.Adam(rnn_simple_model.parameters(), lr=LEARNING_RATE)
    
    for epoch in range(N_EPOCHS):
        train_loss, train_acc = train_model(rnn_simple_model, train_iterator_simple, optimizer_rnn_simple, criterion)
        valid_loss, valid_acc = evaluate_model(rnn_simple_model, test_iterator_simple, criterion)
        print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

    # 2.2 On-the-fly Embeddings with LSTM
    print("\n--- Training On-the-fly + LSTM ---")
    lstm_simple_model = LSTM(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, VOCAB_SIZE_SIMPLE).to(device)
    optimizer_lstm_simple = optim.Adam(lstm_simple_model.parameters(), lr=LEARNING_RATE)
    
    for epoch in range(N_EPOCHS):
        train_loss, train_acc = train_model(lstm_simple_model, train_iterator_simple, optimizer_lstm_simple, criterion)
        valid_loss, valid_acc = evaluate_model(lstm_simple_model, test_iterator_simple, criterion)
        print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')


Using device: cpu

--- Part 1: Training with GloVe Embeddings ---

--- Training GloVe + Vanilla RNN ---
Epoch: 01 | Train Loss: 0.696 | Train Acc: 50.39% | Val. Loss: 0.694 | Val. Acc: 50.73%
Epoch: 02 | Train Loss: 0.697 | Train Acc: 50.03% | Val. Loss: 0.696 | Val. Acc: 50.91%
Epoch: 03 | Train Loss: 0.696 | Train Acc: 49.82% | Val. Loss: 0.699 | Val. Acc: 49.05%
Epoch: 04 | Train Loss: 0.696 | Train Acc: 50.59% | Val. Loss: 0.694 | Val. Acc: 49.83%
Epoch: 05 | Train Loss: 0.696 | Train Acc: 50.51% | Val. Loss: 0.701 | Val. Acc: 49.79%

--- Training GloVe + LSTM ---
Epoch: 01 | Train Loss: 0.692 | Train Acc: 51.54% | Val. Loss: 0.690 | Val. Acc: 52.34%
Epoch: 02 | Train Loss: 0.692 | Train Acc: 51.91% | Val. Loss: 0.693 | Val. Acc: 50.93%
Epoch: 03 | Train Loss: 0.689 | Train Acc: 52.59% | Val. Loss: 0.693 | Val. Acc: 51.27%
Epoch: 04 | Train Loss: 0.687 | Train Acc: 53.50% | Val. Loss: 0.694 | Val. Acc: 51.55%
Epoch: 05 | Train Loss: 0.637 | Train Acc: 62.87% | Val. Loss: 0.829 | Va