Add RNNs and LSTMs notebook, remove unused Untitled-1.ipynb, and update .DS_Store


In [8]:
import os

# Report where the kernel is running so relative paths can be interpreted.
cwd = os.getcwd()
print("Current working directory:", cwd)


Current working directory: /Users/mubaraqolojo/Downloads/RNN and LSTMs


In [9]:
import json

train_file = "/Users/mubaraqolojo/Downloads/RNN and LSTMs/train.jsonl"

# Peek at the first non-blank record to confirm the JSONL schema
# (each record is expected to carry "prompt" and "completion" fields).
with open(train_file, "r", encoding="utf-8") as handle:
    first_line = next((raw for raw in handle if raw.strip()), None)

if first_line is not None:
    obj = json.loads(first_line)
    print("🔍 JSON Sample Entry:")
    print(json.dumps(obj, indent=2))


🔍 JSON Sample Entry:
{
  "prompt": "are occasions on which the governors and the governed meet together,at festivals, on a journey, voyaging or fighting. the sturdy pauper finds that in the hour of danger he is not despised; he sees the rich man puffing and panting, and",
  "completion": "draws"
}


In [None]:
#!/usr/bin/env python3
"""
Foundational AI Project 2 – Language Modeling with RNNs, LSTMs, and Transformer (Graduate Version)

This script trains three different language models (RNN, LSTM, Transformer) for text generation.
It tokenizes the input data using a BPE tokenizer (SentencePiece), builds fixed-length token
sequences, and then trains the models with early stopping and learning rate scheduling.
Evaluation metrics such as perplexity and token accuracy are computed, loss curves are plotted,
and sample text is generated using a prompt. After training, each model is saved to file.
Timing measurements are used to display the duration of each epoch and overall training per model.
"""

import os
import math
import json
import random
import time  # For measuring time durations
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import sentencepiece as spm
import matplotlib.pyplot as plt
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Dataset location. The default is the original author's directory; set the
# LM_DATA_DIR environment variable to point at another folder containing
# train.jsonl and test.jsonl instead of editing this cell.
DATA_DIR = os.environ.get("LM_DATA_DIR", "/Users/mubaraqolojo/Downloads/RNN and LSTMs")
train_file = os.path.join(DATA_DIR, "train.jsonl")
test_file = os.path.join(DATA_DIR, "test.jsonl")

# Fail early, naming exactly which dataset file(s) could not be found.
missing_files = [path for path in (train_file, test_file) if not os.path.exists(path)]
if missing_files:
    raise FileNotFoundError(
        "Dataset file(s) not found: " + ", ".join(missing_files)
        + ". Set LM_DATA_DIR to the directory containing train.jsonl and test.jsonl."
    )


def generate_text(model, tokenizer, prompt_text, max_length=50, temperature=0.0, max_context=None):
    """
    Generate text from a trained model given a prompt.

    The model is called on the running token sequence once per generated token
    and the logits of the last position select the next token.

    Args:
        model (nn.Module): Trained PyTorch model (RNN/LSTM/Transformer) mapping a
            (1, seq) tensor of token ids to (1, seq, vocab) logits.
        tokenizer (SentencePieceProcessor): Tokenizer for encoding/decoding text.
            Must provide encode(), decode() and eos_id().
        prompt_text (str): Initial input prompt.
        max_length (int): Maximum number of tokens to generate.
        temperature (float): Sampling temperature. Values <= 0 select greedy
            decoding (a negative temperature would otherwise invert the
            distribution and favour the least likely tokens).
        max_context (int | None): If given, only the last ``max_context`` tokens
            are fed to the model at each step. None feeds the whole sequence.

    Returns:
        str: Decoded text (prompt plus generated tokens).

    Raises:
        ValueError: If the prompt encodes to no tokens, or max_context < 1.
    """
    if max_context is not None and max_context < 1:
        raise ValueError("max_context must be a positive integer or None.")

    token_ids = tokenizer.encode(prompt_text, out_type=int)
    if not token_ids:
        # Without at least one token there is no "last position" to read logits from.
        raise ValueError("prompt_text produced no tokens; cannot generate from an empty prompt.")

    device = next(model.parameters()).device
    generated = list(token_ids)
    eos_id = tokenizer.eos_id()
    greedy = temperature <= 0.0

    # Generate in eval mode, then put the model back in whatever mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_length):
                context = generated if max_context is None else generated[-max_context:]
                input_ids = torch.tensor([context], dtype=torch.long, device=device)
                next_token_logits = model(input_ids)[0, -1, :]
                if greedy:
                    next_token = torch.argmax(next_token_logits).item()
                else:
                    probs = torch.softmax(next_token_logits / temperature, dim=0)
                    next_token = torch.multinomial(probs, num_samples=1).item()
                generated.append(next_token)
                if next_token == eos_id:
                    break
    finally:
        model.train(was_training)
    return tokenizer.decode(generated)


class PositionalEncoding(nn.Module):
    """
    Implements sinusoidal positional encoding as described in 'Attention is All You Need'.

    A table of shape (1, max_len, d_model) is precomputed and stored as the
    buffer ``pe``. ``forward`` adds the first ``seq`` rows to a batch-first
    input of shape (batch, seq, d_model) and applies dropout.

    Args:
        d_model (int): Embedding dimension (odd values are supported).
        dropout (float): Dropout probability applied after adding the encoding.
        max_len (int): Number of positions precomputed in the buffer.
    """
    def __init__(self, d_model, dropout=0.1, max_len=512):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.register_buffer('pe', self._build_table(max_len, d_model))

    @staticmethod
    def _build_table(length, d_model):
        """Return the (1, length, d_model) sinusoidal table."""
        position = torch.arange(0, length, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        table = torch.zeros(length, d_model)
        table[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine column.
        table[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        return table.unsqueeze(0)

    def forward(self, x):
        seq_len = x.size(1)
        if seq_len <= self.pe.size(1):
            encoding = self.pe[:, :seq_len]
        else:
            # Longer than the precomputed table: compute the encoding on the fly
            # (the buffer itself is left untouched so checkpoints keep their shape).
            encoding = self._build_table(seq_len, self.pe.size(2)).to(
                device=self.pe.device, dtype=self.pe.dtype)
        x = x + encoding
        return self.dropout(x)


class LanguageModelDataset(Dataset):
    """
    Next-token prediction dataset built from token-id windows.

    Every window whose length is exactly ``seq_length + 1`` becomes one sample:
    the input is the window without its last token and the target is the window
    without its first token. Windows of any other length are dropped.
    """
    def __init__(self, sequences, seq_length):
        expected_len = seq_length + 1
        pairs = []
        for window in sequences:
            if len(window) != expected_len:
                continue
            pairs.append((window[:-1], window[1:]))
        self.samples = pairs

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        source_ids, shifted_ids = self.samples[idx]
        source = torch.tensor(source_ids, dtype=torch.long)
        shifted = torch.tensor(shifted_ids, dtype=torch.long)
        return source, shifted


class RNNLanguageModel(nn.Module):
    """
    Embedding -> multi-layer nn.RNN -> linear projection onto the vocabulary.

    ``forward`` maps a batch-first tensor of token ids (batch, seq) to logits
    of shape (batch, seq, vocab_size).
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers):
        super().__init__()
        # Layers are created in this order so parameter initialisation is reproducible.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        # The final hidden state returned by the RNN is not needed here.
        hidden_states, _ = self.rnn(self.embedding(x))
        logits = self.fc(hidden_states)
        return logits

    def prompt(self, tokenizer, prompt_text, max_length=50, temperature=0.0):
        """Generate a continuation of ``prompt_text`` via ``generate_text``."""
        return generate_text(self, tokenizer, prompt_text, max_length, temperature)


class LSTMLanguageModel(nn.Module):
    """
    Embedding -> multi-layer nn.LSTM -> linear projection onto the vocabulary.

    ``forward`` maps a batch-first tensor of token ids (batch, seq) to logits
    of shape (batch, seq, vocab_size).
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers):
        super().__init__()
        # Layers are created in this order so parameter initialisation is reproducible.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        # The (h_n, c_n) state tuple returned by the LSTM is not needed here.
        hidden_states, _ = self.lstm(self.embedding(x))
        logits = self.fc(hidden_states)
        return logits

    def prompt(self, tokenizer, prompt_text, max_length=50, temperature=0.0):
        """Generate a continuation of ``prompt_text`` via ``generate_text``."""
        return generate_text(self, tokenizer, prompt_text, max_length, temperature)


class TransformerLanguageModel(nn.Module):
    """
    Transformer-based autoregressive language model.

    Token embeddings plus positional encodings are passed through a stack of
    nn.TransformerEncoder layers under a causal (upper-triangular) attention
    mask, so position t only attends to positions <= t. Without the mask each
    position could attend to the very token it is trained to predict.

    ``forward`` maps token ids of shape (batch, seq) to logits of shape
    (batch, seq', vocab_size), where seq' = min(seq, max_seq_length): inputs
    longer than ``max_seq_length`` are truncated to their most recent
    ``max_seq_length`` tokens, which lets generation continue past the
    context window instead of failing.

    NOTE(review): checkpoints trained with the earlier, unmasked forward pass
    will not behave the same under this masked forward pass; retrain them.
    """
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_seq_length, dropout=0.1):
        super().__init__()
        self.max_seq_length = max_seq_length
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoder = PositionalEncoding(embed_dim, dropout, max_seq_length)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads,
                                                   dim_feedforward=hidden_dim, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        # Keep only the most recent tokens that fit in the context window.
        if x.size(1) > self.max_seq_length:
            x = x[:, -self.max_seq_length:]
        seq_len = x.size(1)

        embedded = self.embedding(x)
        encoded = self.pos_encoder(embedded)

        # Additive attention mask: -inf strictly above the diagonal blocks
        # attention to future positions; zeros elsewhere leave scores unchanged.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"),
                       dtype=encoded.dtype, device=encoded.device),
            diagonal=1,
        )

        # The encoder layers were built with the default sequence-first layout.
        encoded = encoded.transpose(0, 1)
        out = self.transformer_encoder(encoded, mask=causal_mask)
        out = out.transpose(0, 1)
        return self.fc(out)

    def prompt(self, tokenizer, prompt_text, max_length=50, temperature=0.0):
        """Generate a continuation of ``prompt_text`` via ``generate_text``."""
        return generate_text(self, tokenizer, prompt_text, max_length, temperature)


def train_model(model, train_loader, val_loader, num_epochs, criterion, optimizer, scheduler, device, patience=5):
    """
    Train ``model`` with per-epoch validation, LR scheduling and early stopping.

    Each epoch runs one pass over ``train_loader`` (with gradient-norm clipping
    at 5) followed by one pass over ``val_loader``. The mean validation loss is
    passed to ``scheduler.step``. Training stops early once the validation loss
    has failed to improve for ``patience`` consecutive epochs. On return the
    model holds the weights of the epoch with the lowest validation loss, not
    the weights of the last epoch run.

    Args:
        model (nn.Module): Model mapping inputs to (batch, seq, vocab) logits.
        train_loader: Sized iterable of (inputs, targets) batches.
        val_loader: Sized iterable of (inputs, targets) batches.
        num_epochs (int): Maximum number of epochs.
        criterion: Callable(logits_2d, targets_1d) -> scalar loss tensor.
        optimizer: Optimizer over ``model.parameters()``.
        scheduler: Object with ``step(metric)``, e.g. ReduceLROnPlateau.
        device (torch.device): Device to train on.
        patience (int): Epochs without improvement tolerated before stopping.

    Returns:
        tuple[list[float], list[float]]: Per-epoch mean train and validation losses.

    Raises:
        ValueError: If either loader yields no batches.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each yield at least one batch.")

    model.to(device)
    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(1, num_epochs + 1):
        epoch_start = time.time()
        model.train()
        total_train_loss = 0.0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            output = model(inputs)
            # reshape (not view) so non-contiguous model outputs are accepted too.
            loss = criterion(output.reshape(-1, output.size(-1)), targets.reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()
            total_train_loss += loss.item()
        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                output = model(inputs)
                loss = criterion(output.reshape(-1, output.size(-1)), targets.reshape(-1))
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        scheduler.step(avg_val_loss)

        epoch_duration = time.time() - epoch_start
        print(f"Epoch {epoch} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Duration: {epoch_duration:.2f}s")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Snapshot the best weights; clone so later optimizer steps cannot alter them.
            best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered.")
                break

    # Hand back the best checkpoint rather than the (possibly worse) final epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    return train_losses, val_losses


def evaluate_model(model, data_loader, criterion, device):
    """
    Compute the mean per-batch loss of ``model`` over ``data_loader``.

    Args:
        model (nn.Module): Model mapping inputs to (batch, seq, vocab) logits.
        data_loader: Sized iterable of (inputs, targets) batches.
        criterion: Callable(logits_2d, targets_1d) -> scalar loss tensor.
        device (torch.device): Device the batches are moved to.

    Returns:
        float: Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        raise ValueError("data_loader is empty; cannot compute an average loss.")

    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            # reshape (not view) so non-contiguous model outputs are accepted too.
            batch_loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
            running_loss += batch_loss.item()
    return running_loss / num_batches


def compute_perplexity(loss):
    """Return the perplexity for a mean cross-entropy ``loss``, i.e. exp(loss)."""
    perplexity = np.exp(loss)
    return perplexity


def compute_token_accuracy(model, data_loader, device, ignore_index=None):
    """
    Fraction of target tokens whose arg-max prediction is correct.

    Args:
        model (nn.Module): Model mapping inputs to (batch, seq, vocab) logits.
        data_loader: Iterable of (inputs, targets) batches.
        device (torch.device): Device the batches are moved to.
        ignore_index (int | None): Target id to exclude from both the numerator
            and the denominator (e.g. the padding id the loss ignores). None
            counts every target position.

    Returns:
        float: correct / counted, in [0, 1]; 0.0 when no target was counted
        (empty loader, or every target equals ``ignore_index``).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            preds = torch.argmax(model(inputs), dim=-1)
            hits = preds == targets
            if ignore_index is None:
                total += targets.numel()
            else:
                counted = targets != ignore_index
                hits = hits & counted
                total += counted.sum().item()
            correct += hits.sum().item()
    # Avoid a ZeroDivisionError when nothing was counted.
    return correct / total if total else 0.0


def compute_bleu(reference, candidate):
    """
    Sentence-level BLEU of ``candidate`` against a single ``reference`` string.

    Both strings are split with ``nltk.word_tokenize`` and scored with
    ``sentence_bleu`` using its default settings.
    NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
    downloaded beforehand; nothing in this file downloads it. Confirm.
    """
    reference_tokens = nltk.word_tokenize(reference)
    candidate_tokens = nltk.word_tokenize(candidate)
    return sentence_bleu([reference_tokens], candidate_tokens)


def plot_loss_curve(train_losses, val_losses, model_name):
    """
    Plot per-epoch train/validation loss, save it to disk and display it.

    The x axis is numbered from 1 so it matches the "Epoch N" lines printed
    during training. The figure is written to ``{model_name}_loss.png`` in the
    current working directory, shown, and then closed so repeated calls do not
    accumulate open figures.

    Args:
        train_losses (list[float]): Mean training loss per epoch.
        val_losses (list[float]): Mean validation loss per epoch.
        model_name (str): Used in the title and the output file name.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(range(1, len(train_losses) + 1), train_losses,
            label="Train Loss", marker='o', linestyle='-', linewidth=2)
    ax.plot(range(1, len(val_losses) + 1), val_losses,
            label="Validation Loss", marker='s', linestyle='-', linewidth=2)
    ax.set_xlabel("Epoch", fontsize=12)
    ax.set_ylabel("Loss", fontsize=12)
    ax.set_title(f"{model_name} Loss Curve", fontsize=14)
    ax.legend(fontsize=12)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    fig.tight_layout()
    fig.savefig(f"{model_name}_loss.png", dpi=300)
    plt.show()
    plt.close(fig)


def load_and_tokenize(file, sp):
    """
    Read a JSONL file of prompt/completion records and tokenize it as one stream.

    Each line is parsed as JSON. Lines that are not valid JSON, or whose value
    is not a JSON object, are skipped. For every remaining record the "prompt"
    and "completion" fields (missing or null treated as empty) are stripped and
    joined with a single space; empty results are dropped. All texts are then
    joined with newlines and encoded in one call.

    Args:
        file (str): Path to the .jsonl file.
        sp: Tokenizer exposing ``encode(text, out_type=int)``.

    Returns:
        list[int]: Token ids for the whole file.
    """
    def _as_text(value):
        # JSON null (or a missing key) counts as empty; other scalars are stringified.
        return "" if value is None else str(value)

    texts = []
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(obj, dict):
                # e.g. a bare list or number on the line: there are no fields to read.
                continue
            prompt = _as_text(obj.get("prompt"))
            completion = _as_text(obj.get("completion"))
            text = (prompt.strip() + " " + completion.strip()).strip()
            if text:
                texts.append(text)
    print(f"Found {len(texts)} valid text entries in {file}")
    # Use newline as the separator so that each entry is on its own line.
    combined = "\n".join(texts)
    print(f"Combined text length for {file}: {len(combined)} characters")
    return sp.encode(combined, out_type=int)


def build_sequences(token_ids, max_len, stride=1):
    """
    Slice a token stream into windows of ``max_len + 1`` tokens.

    Each window holds ``max_len`` input tokens plus one extra token so that a
    shifted-by-one target of the same length can be taken from it.

    Args:
        token_ids (list[int]): The full token stream.
        max_len (int): Number of input tokens per window.
        stride (int): Distance between consecutive window starts. The default
            of 1 yields every overlapping window; a larger stride (e.g.
            ``max_len``) yields proportionally fewer windows and less memory.

    Returns:
        list[list[int]]: The windows, empty if the stream has at most
        ``max_len`` tokens.

    Raises:
        ValueError: If ``stride`` is less than 1.
    """
    if stride < 1:
        raise ValueError("stride must be a positive integer.")
    # A window starting at i needs tokens i .. i + max_len, so i < len - max_len.
    last_start = len(token_ids) - max_len
    return [token_ids[start:start + max_len + 1] for start in range(0, last_start, stride)]


def _select_device():
    """Return the CUDA device if available, else Apple MPS if available, else CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    # torch.backends.mps does not exist in older PyTorch builds.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")


def _read_jsonl_texts(path):
    """Return the non-empty "prompt completion" strings found in a JSONL file."""
    texts = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(obj, dict):
                continue
            prompt = obj.get("prompt") or ""
            completion = obj.get("completion") or ""
            text = (str(prompt).strip() + " " + str(completion).strip()).strip()
            if text:
                texts.append(text)
    return texts


def _ensure_tokenizer(model_prefix, corpus_file, vocab_size):
    """Train a BPE SentencePiece model on corpus_file unless '<model_prefix>.model' exists; return the loaded processor."""
    if not os.path.exists(f"{model_prefix}.model"):
        print("Training tokenizer...")
        texts = _read_jsonl_texts(corpus_file)
        if not texts:
            raise ValueError("No valid text found in the training file. Cannot train tokenizer.")
        combined = "\n".join(texts)
        print(f"Combined training text length: {len(combined)} characters")
        try:
            with open("temp.txt", "w", encoding="utf-8") as f:
                f.write(combined)
            spm.SentencePieceTrainer.train(
                input="temp.txt",
                model_prefix=model_prefix,
                vocab_size=vocab_size,
                model_type="bpe",
                character_coverage=1.0
            )
        finally:
            # Remove the scratch corpus even when tokenizer training fails.
            if os.path.exists("temp.txt"):
                os.remove("temp.txt")
    return spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")


def main():
    """
    Train, evaluate and save the RNN, LSTM and Transformer language models.

    Steps: seed the RNGs, pick a device, train or load the SentencePiece
    tokenizer, tokenize train.jsonl and test.jsonl, build fixed-length windows,
    then for each model: train with early stopping, plot the loss curves,
    report perplexity and token accuracy, save the weights to
    ``{name}_model.pt`` and print a sample generation.

    Note: test.jsonl serves as the validation set for LR scheduling and early
    stopping, so the reported metrics are not from held-out data.

    Raises:
        ValueError: If no training text is found, or if either dataset is too
            short to produce a single training window.
    """
    global_start = time.time()
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    device = _select_device()
    print(f"[✓] Using device: {device}")

    vocab_size = 10000  # vocabulary size requested when training the tokenizer
    embed_dim = 256
    hidden_dim = 512
    num_layers = 2
    num_heads = 8
    max_seq_length = 50
    batch_size = 128
    num_epochs = 30
    learning_rate = 1e-3

    tokenizer_model_prefix = "tokenizer"
    sp = _ensure_tokenizer(tokenizer_model_prefix, train_file, vocab_size)

    # Size the models from the tokenizer actually loaded: a tokenizer.model left
    # over from an earlier run may not have the vocabulary size requested above.
    vocab_size = sp.get_piece_size()

    # Ask the tokenizer for its padding id instead of assuming one. The trainer
    # call above does not set a pad id; if the tokenizer reports none (negative
    # id), fall back to CrossEntropyLoss's default ignore_index so that no real
    # vocabulary piece is excluded from the loss.
    pad_token_id = sp.pad_id()
    ignore_index = pad_token_id if pad_token_id >= 0 else -100

    train_tokens = load_and_tokenize(train_file, sp)
    val_tokens = load_and_tokenize(test_file, sp)

    train_seqs = build_sequences(train_tokens, max_seq_length)
    val_seqs = build_sequences(val_tokens, max_seq_length)

    print(f"Train tokens length: {len(train_tokens)}")
    print(f"Validation tokens length: {len(val_tokens)}")
    print(f"Generated train sequences: {len(train_seqs)}")
    print(f"Generated validation sequences: {len(val_seqs)}")

    num_workers = 0
    pin_memory = device.type == "cuda"

    train_dataset = LanguageModelDataset(train_seqs, max_seq_length)
    val_dataset = LanguageModelDataset(val_seqs, max_seq_length)
    if len(train_dataset) == 0 or len(val_dataset) == 0:
        raise ValueError(
            f"Need more than {max_seq_length} tokens in both the training and "
            "validation data to build at least one sequence."
        )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, pin_memory=pin_memory)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                            num_workers=num_workers, pin_memory=pin_memory)

    models = {
        "RNN": RNNLanguageModel(vocab_size, embed_dim, hidden_dim, num_layers),
        "LSTM": LSTMLanguageModel(vocab_size, embed_dim, hidden_dim, num_layers),
        "Transformer": TransformerLanguageModel(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_seq_length)
    }

    for name, model in models.items():
        print(f"\n--- Training {name} model ---")
        model_start_time = time.time()
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
        scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=2)

        train_losses, val_losses = train_model(
            model, train_loader, val_loader,
            num_epochs, criterion, optimizer, scheduler, device, patience=5
        )
        total_model_time = time.time() - model_start_time
        print(f"Total training time for {name} model: {total_model_time:.2f} seconds")

        plot_loss_curve(train_losses, val_losses, name)

        val_loss = evaluate_model(model, val_loader, criterion, device)
        perplexity = compute_perplexity(val_loss)
        accuracy = compute_token_accuracy(model, val_loader, device) * 100

        print(f"{name} | Perplexity: {perplexity:.2f} | Token Accuracy: {accuracy:.2f}%")

        # Save before generating the sample so a failure during generation
        # cannot lose the trained weights.
        model_save_path = f"{name}_model.pt"
        torch.save(model.state_dict(), model_save_path)
        print(f"{name} model saved to {model_save_path}")

        sample_output = model.prompt(sp, "Which do you prefer? Dogs or cats?", max_length=50)
        print("Sample output:", sample_output)

    global_end = time.time()
    print(f"\nTotal training process time: {global_end - global_start:.2f} seconds")


# Run the full training pipeline when this code is executed directly
# (as a script, or as a notebook cell, where __name__ is also "__main__").
if __name__ == "__main__":
    main()
