# Setup and Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import random
import numpy as np
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

#Load Warren Buffet Letters Dataset

In [3]:
# Load text from local .txt file of Warren Buffet's letters
with open("/content/WarrenBuffet.txt", 'r', encoding='utf-8') as f:
    text = f.read()

# Build vocabulary
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for ch,i in stoi.items()}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
train_data = data[:int(0.9 * len(data))]
val_data = data[int(0.9 * len(data)):]

#Define transformer model

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, num_heads):
        super().__init__()
        assert embed_size % num_heads == 0
        self.head_size = embed_size // num_heads
        self.num_heads = num_heads

        self.key = nn.Linear(embed_size, embed_size, bias=False)
        self.query = nn.Linear(embed_size, embed_size, bias=False)
        self.value = nn.Linear(embed_size, embed_size, bias=False)
        self.proj = nn.Linear(embed_size, embed_size)

        self.register_buffer("tril", torch.tril(torch.ones(1024, 1024)))

    def forward(self, x):
        B, T, C = x.shape

        k = self.key(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)
        q = self.query(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)
        v = self.value(x).view(B, T, self.num_heads, self.head_size).transpose(1, 2)

        scores = q @ k.transpose(-2, -1) / self.head_size**0.5
        mask = self.tril[:T, :T]
        scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)

        out = weights @ v  # (B, heads, T, head_size)
        out = out.transpose(1, 2).contiguous().view(B, T, C)  # concat heads
        return self.proj(out)

In [7]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, num_heads):
        super().__init__()
        self.sa = MultiHeadAttention(embed_size, num_heads)
        self.ffwd = FeedForward(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [8]:
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_size, block_size, n_heads):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_size)
        self.pos_emb = nn.Embedding(block_size, embed_size)
        self.blocks = nn.Sequential(*[TransformerBlock(embed_size, n_heads) for _ in range(4)])
        self.ln_f = nn.LayerNorm(embed_size)
        self.head = nn.Linear(embed_size, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_emb(idx)
        pos = torch.arange(T, device=idx.device)
        pos_emb = self.pos_emb(pos)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)

        if targets is None:
            return logits, None
        B, T, C = logits.shape
        loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(idx[:, -block_size:])
            probs = F.softmax(logits[:, -1, :], dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

#Train the model

In [12]:
# Hyperparameters
block_size = 128
batch_size = 32
embed_size = 128
n_heads = 4
learning_rate = 3e-4
epochs = 10
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = TransformerLM(vocab_size, embed_size, block_size, n_heads).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

def get_batch(split):
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

for epoch in range(epochs):
    model.train()
    for _ in range(1000):  # steps per epoch
        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} | Train Loss: {loss.item():.4f}")

Epoch 1 | Train Loss: 2.1454
Epoch 2 | Train Loss: 1.7174
Epoch 3 | Train Loss: 1.5447
Epoch 4 | Train Loss: 1.3516
Epoch 5 | Train Loss: 1.3257
Epoch 6 | Train Loss: 1.2652
Epoch 7 | Train Loss: 1.1910
Epoch 8 | Train Loss: 1.1281
Epoch 9 | Train Loss: 1.0888
Epoch 10 | Train Loss: 1.0563


#Evaluate Model with perplexity

In [13]:
model.eval()
val_losses = []
with torch.no_grad():
    for _ in range(100):
        xb, yb = get_batch('val')
        _, val_loss = model(xb, yb)
        val_losses.append(val_loss.item())

avg_val_loss = sum(val_losses) / len(val_losses)
perplexity = math.exp(avg_val_loss)
print(f"Validation Perplexity: {perplexity:.2f}")

Validation Perplexity: 4.31


#Generate text like warren Buffet

In [14]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=500)[0].tolist()
print(decode(generated))


The one every diminished to 
about 200 largest fable in several in the after-tax we insurance creates, my the float will be used on aggregate to before certaint that recoverning and out operating can come can be of to plan fock-sationally, however, community realized the area fetch anstory. Amonitially that came and 
twise $19.8 billion 
to zero. (Sept of cours that normarkets and - that titles of about Kim's centry no costing now, GEICO 
managed by Bob 6.8% of to bundress as improved to the lat


# Most Impressive Generated Text
“The one every diminished to about 200 largest fable in several in the after-tax we insurance creates, my the float will be used on aggregate to before certaint that recoverning and out operating can come...”

“...twise $19.8 billion to zero. (Sept of cours that normarkets and - that titles of about Kim's centry no costing now, GEICO managed by Bob 6.8%...”

# What’s Impressive?
1. Financial Style Mimicry
Words like “after-tax,” “float,” “insurance,” “aggregate,” “operating,” “GEICO,” and dollar amounts are classic Warren Buffett vocabulary.

model is reproducing Buffett’s financial tone convincingly.

2. Numerical Fluency
Phrases like "$19.8 billion to zero" and "6.8%" shows model has learned to generate financial figures that are syntactically correct.

3. Domain-Specific Phrasing
Terms like “float will be used,” “operating can come,” “insurance creates,” and “realized the area fetch” may not always be grammatical, but they capture the rhythm of real shareholder letters.

These are not random words — the model clearly learned patterns from Buffett's writings.

4. Capitalization & Punctuation
The model is producing capitalized company names, parentheses, commas, and dollar signs in the right context — great for a character-level model.

"The model successfully captures domain-specific phrasing and financial vocabulary characteristic of Warren Buffett’s style. Phrases such as 'float will be used on aggregate' and '$19.8 billion to zero' demonstrate the model’s ability to mimic realistic financial discourse. Key design choices — such as multi-head attention, positional embeddings, and training exclusively on Buffett’s shareholder letters — contributed significantly to its success."