In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [4]:
with open("office_script_clean.txt", "r", encoding="utf-8") as f: #utf-8 tells python how to read bytes from file
    text = f.read()

print("Total characters in dataset:", len(text))

Total characters in dataset: 3427466


In [None]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Vocabulary size:", vocab_size)

char2idx = {ch: i for i, ch in enumerate(chars)} # dictionary mapping
idx2char = {i: ch for i, ch in enumerate(chars)}

text_encoded = np.array([char2idx[c] for c in text])
# assigned each character a number between 0 to 71

Vocabulary size: 72


In [None]:
class CharDataset(Dataset):
    def __init__(self, data, seq_length=100):
        self.data = data
        self.seq_length = seq_length
        
    def __len__(self):
        return len(self.data) - self.seq_length # if like 100 characters in data set and sequence length 10 u can start sequence at positions 0 to 90 
    
    def __getitem__(self, idx):
        x = self.data[idx: idx + self.seq_length] # not pre storing all sequnces coz wastes memory
        y = self.data[idx + 1: idx + self.seq_length + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

seq_length = 100
dataset = CharDataset(text_encoded, seq_length)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) # gradients become correlated, overfitting risk if no shuffling
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True) # drop last = true if total samples not divisible by 128

In [None]:
class CharLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden=None):
        x = self.embedding(x)
        if hidden is None:
            output, hidden = self.lstm(x) # output contains hidden state at each time step just like weather forecasting one, hidden is final a and c of each sequence
        else:
            output, hidden = self.lstm(x, hidden)
        logits = self.fc(output)
        return logits, hidden

In [None]:
model = CharLSTM(vocab_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
criterion = nn.CrossEntropyLoss()

def calculate_perplexity(loss):
    return torch.exp(loss)

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"): #tqdm to show progress
        x, y = x.to(device), y.to(device) # note that y was never embedded
        optimizer.zero_grad()
        logits, _ = model(x)
        loss = criterion(logits.view(-1, vocab_size), y.view(-1)) # logits is batchsize, sequence length, vocab size  y is just batchsize, sequence length whose value is integer which is index of currect word in vocab
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            logits, _ = model(x)
            loss = criterion(logits.view(-1, vocab_size), y.view(-1))
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)
    
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val PPL: {calculate_perplexity(torch.tensor(avg_val_loss)):.4f}")

Epoch 1/5: 100%|██████████| 24098/24098 [14:24<00:00, 27.87it/s]


Epoch 1 | Train Loss: 1.0859 | Val Loss: 0.9934 | Val PPL: 2.7004


Epoch 2/5: 100%|██████████| 24098/24098 [14:40<00:00, 27.36it/s]


Epoch 2 | Train Loss: 0.9654 | Val Loss: 0.9491 | Val PPL: 2.5833


Epoch 3/5: 100%|██████████| 24098/24098 [13:00<00:00, 30.89it/s] 


Epoch 3 | Train Loss: 0.9388 | Val Loss: 0.9345 | Val PPL: 2.5460


Epoch 4/5: 100%|██████████| 24098/24098 [12:06<00:00, 33.15it/s]


Epoch 4 | Train Loss: 0.9276 | Val Loss: 0.9277 | Val PPL: 2.5287


Epoch 5/5: 100%|██████████| 24098/24098 [12:08<00:00, 33.08it/s]


Epoch 5 | Train Loss: 0.9218 | Val Loss: 0.9225 | Val PPL: 2.5157


In [None]:
def generate_script(model, seed_text, temperature=1.0, num_chars=500):
    model.eval()
    generated = seed_text
    input_seq = torch.tensor([char2idx[c] for c in seed_text], dtype=torch.long).unsqueeze(0).to(device) # inital shape was (seedtext's length, ) after unsqueeze (0, len)
    hidden = None
    
    for _ in range(num_chars):
        logits, hidden = model(input_seq, hidden)
        # model.forward is called - embedds, lstm gives output (hidden final memory), fc gives logits
        logits = logits[:, -1, :] / temperature # final output of sequences divided by temp, shape is (batch size, V) = (1, vocab size)
        probs = torch.softmax(logits, dim=-1) 
        next_idx = torch.multinomial(probs, num_samples=1).item() # random sampling weighted by probability
        next_char = idx2char[next_idx] 
        generated += next_char
        input_seq = torch.tensor([[next_idx]], dtype=torch.long).to(device) # embedding layer will do the embedding, only giving one character but hidden layer holds context
    
    return generated


In [10]:
seed = "Michael: "
sample_1 = generate_script(model, seed, temperature=0.3, num_chars=1000)
sample_2 = generate_script(model, seed, temperature=0.7, num_chars=1000)
sample_3 = generate_script(model, seed, temperature=1.0, num_chars=1000)

print("=== Temp 0.3 ===\n", sample_1)
print("=== Temp 0.7 ===\n", sample_2)
print("=== Temp 1.0 ===\n", sample_3)

=== Temp 0.3 ===
 Michael: Alright. Well, I guess I could be ready.
Michael: What are you doing here?
Andy: I have a song on a paper company in the world. And then you will not be a formal guy, and then I was thinking about some decisive on you. I was a family on Pam falls away from me. And I can make sure that you were the same time that I want to see the word I can be confidence and prepared to confinnie with her. I think I go to the store in the workplace.
Michael: Well, I can't be so much subshing company for a while. I don't know when they are going to take a look at the top to the machine that they are not a completely disressing the office. I would love that. That is a good idea to go to the store and I said I was gonna be my father anymore. I want you to be more specific. Welcome to the store in the world. And I will see you tonight. I think it's a good sales call.
Kevin: What is that?
Angela: Yes. In a way to get a strong conseque. I saw the paper in the face of the day to mak