In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

torch.manual_seed(42)
np.random.seed(42)

In [4]:
TEXT_PATH = "/content/sample_data/office_script_clean.txt"

with open(TEXT_PATH, "r", encoding="utf-8") as f:
    text = f.read()

print("Total characters:", len(text))
print(text[:500])

Total characters: 3427466
Michael: All right Jim. Your quarterlies look very good. How are things at the library?
Jim: Oh, I told you. I couldn't close it. So...
Michael: So you've come to the master for guidance? Is this what you're saying, grasshopper?
Jim: Actually, you called me in here, but yeah.
Michael: All right. Well, let me show you how it's done.
Michael:  Yes, I'd like to speak to your office manager, please. Yes, hello. This is Michael Scott. I am the Regional Manager of Dunder Mifflin Paper Products. Just w


In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

char2idx = {ch: i for i, ch in enumerate(chars)}
idx2char = {i: ch for i, ch in enumerate(chars)}

print("Vocab size:", vocab_size)

Vocab size: 72


In [6]:
encoded = np.array([char2idx[ch] for ch in text], dtype=np.int64)

In [7]:
split_ratio = 0.9
split_idx = int(len(encoded) * split_ratio)

train_data = encoded[:split_idx]
val_data   = encoded[split_idx:]

print(len(train_data), len(val_data))

3084719 342747


In [8]:
class CharDataset(Dataset):
    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data) - self.seq_len

    def __getitem__(self, idx):
        x = self.data[idx:idx + self.seq_len]
        y = self.data[idx + 1:idx + self.seq_len + 1]

        return (
            torch.tensor(x, dtype=torch.long),
            torch.tensor(y, dtype=torch.long)
        )

In [9]:
SEQ_LEN = 100
BATCH_SIZE = 64

train_ds = CharDataset(train_data, SEQ_LEN)
val_ds   = CharDataset(val_data, SEQ_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=False)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
class CharLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=512, num_layers=2):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        logits = self.fc(out)
        return logits, hidden

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CharLSTM(vocab_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)

In [12]:
def train_epoch(model, loader):
    model.train()
    total_loss = 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)

        logits, _ = model(x)
        loss = criterion(
            logits.reshape(-1, vocab_size),
            y.reshape(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(loader)


def eval_epoch(model, loader):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            loss = criterion(
                logits.reshape(-1, vocab_size),
                y.reshape(-1)
            )
            total_loss += loss.item()

    return total_loss / len(loader)

In [None]:
EPOCHS = 15

train_losses, val_losses = [], []

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader)
    val_loss   = eval_epoch(model, val_loader)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(
        f"Epoch {epoch+1:02d} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f} | "
        f"Val Perplexity: {np.exp(val_loss):.2f}"
    )

Epoch 01 | Train Loss: 1.3996 | Val Loss: 1.5184 | Val Perplexity: 4.56
Epoch 02 | Train Loss: 1.3505 | Val Loss: 1.4835 | Val Perplexity: 4.41


In [None]:
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Val Loss")
plt.legend()
plt.title("Training & Validation Loss")
plt.show()

plt.plot(np.exp(val_losses), label="Validation Perplexity")
plt.legend()
plt.title("Validation Perplexity")
plt.show()

In [None]:
def generate_text(model, start_text, length=500, temperature=0.8):
    model.eval()

    input_idxs = torch.tensor(
        [char2idx[c] for c in start_text],
        dtype=torch.long
    ).unsqueeze(0).to(device)

    generated = start_text
    hidden = None

    with torch.no_grad():
        for _ in range(length):
            logits, hidden = model(input_idxs, hidden)
            logits = logits[:, -1, :] / temperature
            probs = torch.softmax(logits, dim=-1)

            next_idx = torch.multinomial(probs, 1).item()
            generated += idx2char[next_idx]

            input_idxs = torch.tensor([[next_idx]], dtype=torch.long).to(device)

    return generated

In [None]:
for T in [0.4, 0.7, 1.0]:
    print(f"\n--- Temperature {T} ---\n")
    print(generate_text(model, "Michael:", 400, temperature=T))