In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import re
from collections import Counter
import math
import random

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU,
# and report the choice once at import/cell-execution time.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input tensor.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        n_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``n_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        # Fail fast: otherwise the head split in forward() breaks later with
        # an opaque ``view`` size error.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"n_heads ({n_heads})"
            )
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch_size, seq_len, d_model).
            mask: Optional tensor that broadcasts against the score tensor of
                shape (batch_size, n_heads, seq_len, seq_len). Positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape (batch_size, seq_len, d_model).
        """
        batch_size, seq_len, d_model = x.size()

        # Linear transformations and split into heads
        # Q, K, V: (batch_size, n_heads, seq_len, d_k)
        Q = self.w_q(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        K = self.w_k(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        V = self.w_v(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention
        # scores: (batch_size, n_heads, seq_len, seq_len)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Use the most negative value representable in the score dtype
            # instead of a fixed -1e9, which is not representable in float16.
            # After softmax the masked positions still receive zero weight.
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)

        attention_weights = F.softmax(scores, dim=-1)
        attention_output = torch.matmul(attention_weights, V)

        # Concatenate heads and put through final linear layer
        # attention_output: (batch_size, seq_len, d_model)
        attention_output = attention_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len, d_model)

        return self.w_o(attention_output)

class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Size of the input and output feature dimension.
        d_ff: Size of the hidden layer.
        dropout: Dropout probability applied after the ReLU. Defaults to 0.1,
            the value that used to be hard-coded.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (..., d_model) to a tensor of the same shape."""
        return self.linear2(self.dropout(F.relu(self.linear1(x))))

class TransformerBlock(nn.Module):
    """One post-norm transformer layer: self-attention then feed-forward.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalized.
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1, self.norm2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def _add_and_norm(self, residual, branch_output, norm):
        """Return ``norm(residual + dropout(branch_output))``."""
        return norm(residual + self.dropout(branch_output))

    def forward(self, x, mask=None):
        """Run attention and feed-forward sub-layers on ``x``.

        ``mask`` is forwarded unchanged to the attention sub-layer.
        """
        attended = self._add_and_norm(x, self.attention(x, mask), self.norm1)
        return self._add_and_norm(
            attended, self.feed_forward(attended), self.norm2
        )

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Even feature indices hold sines and odd feature indices hold cosines of
    the position scaled by geometrically decreasing frequencies.

    Args:
        d_model: Feature dimension of the embeddings (even or odd).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length=5000):
        super().__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                           (-math.log(10000.0) / d_model))

        # angles: (max_seq_length, ceil(d_model / 2))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency; for even d_model this slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # pe is (max_seq_length, d_model)
        # self.register_buffer makes 'pe' part of model's state_dict
        # and moves it to GPU if model.to(device) is called.
        self.register_buffer('pe', pe.unsqueeze(0)) # (1, max_seq_length, d_model)

    def forward(self, x):
        """Add positional encodings to ``x``.

        Args:
            x: Tensor of shape (batch_size, seq_len, d_model).

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of precomputed
                positions.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum of "
                f"{self.pe.size(1)} precomputed positions"
            )
        # self.pe[:, :seq_len] is (1, seq_len, d_model), will broadcast with x
        return x + self.pe[:, :seq_len]

class BasicTransformer(nn.Module):
    """Causal (autoregressive) transformer language model.

    Pipeline: token embedding -> positional encoding -> dropout ->
    ``n_layers`` transformer blocks with a causal mask -> LayerNorm ->
    linear projection to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / hidden size.
        n_heads: Attention heads per block.
        n_layers: Number of transformer blocks.
        d_ff: Hidden size of each block's feed-forward network.
        max_seq_length: Longest sequence the model accepts; also the number
            of precomputed positional encodings.
    """

    def __init__(self, vocab_size, d_model=512, n_heads=8, n_layers=6, d_ff=2048, max_seq_length=512):
        super().__init__()
        self.d_model = d_model
        # max_seq_length is crucial for PE and for slicing inputs during generation
        self.max_seq_length = max_seq_length

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_length)

        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff) for _ in range(n_layers)
        ])

        self.layer_norm = nn.LayerNorm(d_model)
        self.output_projection = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def create_causal_mask(self, seq_len, device=None):
        """Build a lower-triangular attention mask.

        Args:
            seq_len: Sequence length to cover.
            device: Device to allocate the mask on. ``None`` keeps the
                PyTorch default device, matching the previous behavior.

        Returns:
            Float tensor of shape (1, 1, seq_len, seq_len) holding 1 on and
            below the diagonal and 0 above it.
        """
        # Allocating directly on the target device avoids a host-to-device
        # copy on every forward pass.
        mask = torch.tril(torch.ones(seq_len, seq_len, device=device))
        return mask.unsqueeze(0).unsqueeze(0)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: Integer tensor of token ids, shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, seq_len, vocab_size).

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_seq_length``.
        """
        seq_len = x.size(1)
        # Fail fast with a clear message instead of an opaque shape error
        # from the positional-encoding addition.
        if seq_len > self.max_seq_length:
            raise ValueError(
                f"Input sequence length {seq_len} exceeds max_seq_length "
                f"{self.max_seq_length}"
            )

        # Token embeddings and positional encoding
        # x: (batch_size, seq_len, d_model)
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)

        # Causal mask so each position only attends to itself and earlier ones
        mask = self.create_causal_mask(seq_len, device=x.device)

        # Pass through transformer blocks
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x, mask) # mask is (1,1,seq_len,seq_len)

        x = self.layer_norm(x)

        # Project to vocabulary size
        return self.output_projection(x)

class TextProcessor:
    """Word-level tokenizer with a frequency-thresholded vocabulary.

    Attributes are empty until ``build_vocab`` has been called.
    """

    def __init__(self):
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.vocab_size = 0

    def tokenize(self, text):
        """Lower-case ``text`` and split it into words and single punctuation marks."""
        return re.findall(r'\b\w+\b|[^\w\s]', text.lower())

    def build_vocab(self, text, min_freq=2):
        """Build the token <-> index maps from ``text``.

        The four special tokens come first; every token occurring at least
        ``min_freq`` times follows in first-seen order. Prints the resulting
        vocabulary size and returns the vocabulary list.
        """
        frequencies = Counter(self.tokenize(text))

        vocab = ['<pad>', '<unk>', '<start>', '<end>']
        for word, count in frequencies.items():
            if count >= min_freq:
                vocab.append(word)

        self.word_to_idx = dict(zip(vocab, range(len(vocab))))
        self.idx_to_word = dict(
            zip(self.word_to_idx.values(), self.word_to_idx.keys())
        )
        self.vocab_size = len(vocab)

        print(f"Vocabulary size: {self.vocab_size}")
        return vocab

    def text_to_indices(self, text):
        """Tokenize ``text`` and map each token to its id (``<unk>`` if unseen)."""
        encoded = []
        for token in self.tokenize(text):
            encoded.append(self.word_to_idx.get(token, self.word_to_idx['<unk>']))
        return encoded

    def indices_to_text(self, indices):
        """Map ids back to tokens and join them with single spaces."""
        words = (self.idx_to_word.get(idx, '<unk>') for idx in indices)
        return ' '.join(words)

def create_training_data(text_indices, seq_length):
    """Build next-token-prediction pairs with a sliding window of stride 1.

    Sample ``i`` has input ``text_indices[i : i + seq_length]`` and target
    ``text_indices[i + 1 : i + seq_length + 1]``.

    Args:
        text_indices: 1-D sequence of integer token ids.
        seq_length: Window length; must be at least 1.

    Returns:
        Tuple ``(inputs, targets)`` of int64 CPU tensors, each of shape
        ``(len(text_indices) - seq_length, seq_length)``. When there are too
        few tokens for a single sample, both have shape ``(0, seq_length)``.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    data = torch.as_tensor(text_indices, dtype=torch.long)
    num_samples = data.numel() - seq_length
    if num_samples <= 0:
        # Keep dtype and trailing dimension consistent with the non-empty case.
        empty = torch.empty((0, seq_length), dtype=torch.long)
        return empty, empty.clone()

    # One strided view of (seq_length + 1)-token windows replaces building
    # hundreds of thousands of Python lists; input and target are the window
    # without its last / first token respectively.
    windows = data.unfold(0, seq_length + 1, 1)
    inputs = windows[:, :-1].contiguous()
    targets = windows[:, 1:].contiguous()
    return inputs, targets

def train_model(model, train_inputs, train_targets, epochs=10, batch_size=32, lr=0.001):
    """Train ``model`` for next-token prediction with Adam and cross-entropy.

    Data is reshuffled every epoch and each mini-batch is moved to the device
    the model's parameters live on.

    Args:
        model: Module mapping (batch, seq_len) token ids to
            (batch, seq_len, vocab_size) logits.
        train_inputs: Tensor of input token ids, one row per sample.
        train_targets: Tensor of target token ids, same shape as
            ``train_inputs``.
        epochs: Number of passes over the data.
        batch_size: Samples per optimization step.
        lr: Adam learning rate.

    Returns:
        List with the average batch loss of each epoch.

    Raises:
        ValueError: If there are no samples or inputs and targets differ in
            length.
    """
    if len(train_inputs) != len(train_targets):
        raise ValueError(
            f"train_inputs ({len(train_inputs)}) and train_targets "
            f"({len(train_targets)}) must contain the same number of samples"
        )
    if len(train_inputs) == 0:
        # Previously this surfaced as a ZeroDivisionError after the loop.
        raise ValueError("train_model requires at least one training sample")

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss() # Can add ignore_index=processor.word_to_idx['<pad>'] if padding is used

    # Follow the model's own placement instead of relying on a module-level
    # global, so the function also works for models on another device.
    model_device = next(model.parameters()).device

    model.train() # Set model to training mode

    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0

        # Shuffle data (on CPU is fine)
        indices = torch.randperm(len(train_inputs))
        train_inputs_shuffled = train_inputs[indices]
        train_targets_shuffled = train_targets[indices]

        for i in range(0, len(train_inputs_shuffled), batch_size):
            # Move batches to the target device
            batch_inputs = train_inputs_shuffled[i:i + batch_size].to(model_device)
            batch_targets = train_targets_shuffled[i:i + batch_size].to(model_device)

            optimizer.zero_grad()

            # Forward pass
            # outputs: (batch_size, seq_len, vocab_size)
            outputs = model(batch_inputs)

            # Reshape for CrossEntropyLoss:
            # outputs needs to be (N, C) where C = num_classes (vocab_size)
            # targets needs to be (N)
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), batch_targets.reshape(-1))

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f'Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}')

    return epoch_losses

def generate_text(model, processor, start_text, max_length=100, temperature=1.0):
    """Sample a continuation of ``start_text`` from ``model``.

    Tokens are drawn one at a time from the temperature-scaled softmax of the
    logits at the last position. Generation stops after ``max_length`` new
    tokens or as soon as the ``<end>`` token is sampled. The model is left in
    evaluation mode.

    Args:
        model: Module with a ``max_seq_length`` attribute that maps
            (1, seq_len) token ids to (1, seq_len, vocab_size) logits.
        processor: Object providing ``text_to_indices``, ``indices_to_text``
            and a ``word_to_idx`` mapping.
        start_text: Prompt; must contain at least one token.
        max_length: Maximum number of tokens to generate.
        temperature: Positive softmax temperature; lower is more greedy.

    Returns:
        The prompt plus the generated tokens, as produced by
        ``processor.indices_to_text``.

    Raises:
        ValueError: If ``temperature`` is not positive or the prompt yields
            no tokens.
    """
    if temperature <= 0:
        # Dividing logits by zero or a negative value gives NaN/inf or an
        # inverted distribution.
        raise ValueError(f"temperature must be > 0, got {temperature}")

    # Reuse the processor's own encoding instead of duplicating its logic.
    generated_indices = list(processor.text_to_indices(start_text))
    if not generated_indices:
        # An empty context would reach the model as a zero-length sequence.
        raise ValueError("start_text must contain at least one token")

    # None never equals a sampled index, so a vocabulary without <end>
    # simply never triggers the early stop.
    end_idx = processor.word_to_idx.get('<end>')

    # Build inputs on the model's own device rather than a module-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval() # Set model to evaluation mode

    with torch.no_grad(): # Disable gradient calculations
        for _ in range(max_length):
            # Feed at most the last `model.max_seq_length` tokens as context
            context = generated_indices[-model.max_seq_length:]
            input_tensor = torch.tensor([context], dtype=torch.long, device=model_device)

            # outputs: (1, current_seq_len, vocab_size)
            outputs = model(input_tensor)

            # Logits for the token following the last input position
            next_token_logits = outputs[0, -1, :] / temperature

            # Sample from the distribution
            probs = F.softmax(next_token_logits, dim=-1)
            next_token_idx = torch.multinomial(probs, 1).item()

            generated_indices.append(next_token_idx)

            # Stop if <end> token is generated
            if next_token_idx == end_idx:
                break

    return processor.indices_to_text(generated_indices)

def main():
    """Run the demo end to end: load text, train briefly, sample once.

    Reads 'David_Copperfield.txt' (falling back to a short built-in sample),
    builds the vocabulary and sliding-window training pairs, trains a small
    transformer for one epoch on the global `device`, and prints a generated
    continuation of "David".

    Returns:
        ``(model, processor)`` on success, or ``(None, None)`` when the text
        is too short to form a single training sample.
    """
    print(f"Main function running, will use device: {device}")

    try:
        with open('David_Copperfield.txt', 'r', encoding='utf-8') as file:
            corpus = file.read()
    except FileNotFoundError:
        corpus = """
        David Copperfield is a novel by Charles Dickens. The story follows the life of David from childhood to maturity.
        He faces many challenges and meets various characters along his journey. The novel explores themes of growth,
        love, and the human condition. David learns valuable lessons about life, friendship, and perseverance.
        Through his experiences, he develops into a mature and wise individual.
        This is a short sample text. For better results, provide a larger corpus.
        The model learns patterns from the data it is trained on. More data means more patterns.
        """
        print("Using sample text. For better results, please provide 'David_Copperfield.txt' or a similar large text file.")

    text_processor = TextProcessor()
    text_processor.build_vocab(corpus, min_freq=2)
    token_ids = text_processor.text_to_indices(corpus)

    # The training window length doubles as the model's max_seq_length.
    context_len = 64
    train_inputs, train_targets = create_training_data(token_ids, context_len)

    sample_count = len(train_inputs)
    if sample_count == 0:
        print(f"Not enough data to create training samples. Text length (tokens): {len(token_ids)}, Sequence length: {context_len}.")
        print(f"Need at least {context_len + 1} tokens to create one sample.")
        return None, None

    print(f"Training samples: {sample_count}")

    # Deliberately small configuration so the example trains quickly.
    net = BasicTransformer(
        vocab_size=text_processor.vocab_size,
        d_model=256,
        n_heads=8,
        n_layers=4,
        d_ff=1024,
        max_seq_length=context_len,
    )
    net = net.to(device)

    param_count = sum(p.numel() for p in net.parameters())
    print(f"Model parameters: {param_count:,}")

    # Report where the weights actually ended up.
    first_param = next(net.parameters(), None)
    if first_param is None:
        print("Model has no parameters.")
    else:
        print(f"Model is on device: {first_param.device}")

    print("\nStarting training...")
    train_model(net, train_inputs, train_targets, epochs=1, batch_size=128, lr=0.001)

    print("\nGenerating text...")
    sample = generate_text(net, text_processor, "David", max_length=50, temperature=0.8)
    print(f"Generated text: {sample}")

    return net, text_processor

# Entry point: run the full pipeline and bind its results at module level so
# later code (e.g. the follow-up generation cell) can reuse `model` and
# `processor`. NOTE: main() returns (None, None) when the text is too short
# to build a training sample, in which case both names are None.
if __name__ == "__main__":
    model, processor = main()

Using device: cuda
Main function running, will use device: cuda
Vocabulary size: 8790
Training samples: 454098
Model parameters: 7,668,822
Model is on device: cuda:0

Starting training...
Epoch 1/1, Average Loss: 3.0255

Generating text...
Generated text: david , ” said mr . spenlow . “ what can you believe , ” said i , “ what shall i do without you ? ” replied mr . spenlow . i looked aside , as he looked alternately on us downstairs . “ we shall see the <unk> of


In [None]:
# Sample a longer continuation (up to 128 new tokens) from a custom prompt.
# Relies on `model`, `processor` and `generate_text` already being defined in
# the session by the code above.
start_text = "my deepest sorrows are"
generated_text = generate_text(model, processor, start_text, max_length=128, temperature=0.8)
print(f"Generated text: {generated_text}")

Generated text: my deepest sorrows are wings that she should come to see her house ; but she hurt bodily and all to me . ” “ i cannot leave her , ” said i , “ bear more than i can count , if i had had been brought up stronger than ever since . if i have been unhappy , i have loved her promise nothing , and the gentle cheerfulness way of doing anything but it ⁠ — in the <unk> ! ” “ oh , my goodness gracious sake , trotwood , ” i said , “ you ’ ll pray for me . if i couldn ’ t touch her ! ” “ i am very miserable , ” said i , “ and i never was of it
