In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import random

# 1. Tiny toy corpus
text = """
hej jag heter martin
hej jag heter kerstin
hej hej hej
"""

# 2. The vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print("Vokabulär:", chars)
print("vocab_size:", vocab_size)

# 3. Lookup tables: character -> integer id, and integer id -> character
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))

def encode(s: str):
    """Map every character of ``s`` to its integer id using the ``stoi`` table.

    Returns a list of ints, one per character; a character missing from
    ``stoi`` raises ``KeyError``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids

def decode(ids):
    """Turn an iterable of integer ids back into a string using the ``itos`` table."""
    pieces = [itos[token_id] for token_id in ids]
    return "".join(pieces)

# Round-trip a sample word through encode() and decode()
print("Ex encode:", encode("hej"))
print("Ex decode:", decode(encode(str("hej"))))


Vokabulär: ['\n', ' ', 'a', 'e', 'g', 'h', 'i', 'j', 'k', 'm', 'n', 'r', 's', 't']
vocab_size: 14
Ex encode: [5, 3, 7]
Ex decode: hej


In [3]:
# Fixed window length: the longest sequence the model sees at once
block_size = 16

# Encode the whole corpus as one long 1-D tensor of token ids;
# (input, target) windows of block_size tokens are sliced out of it later
data = torch.tensor(encode(text), dtype=torch.long)
print("Total sekvenslängd:", data.size(0))

def get_batch(batch_size=4, source=None, context_len=None):
    """Sample a random batch of (input, target) windows for next-token training.

    Args:
        batch_size: Number of windows to draw.
        source: 1-D tensor of token ids to sample from. Defaults to the
            notebook-level ``data`` tensor.
        context_len: Length of each window. Defaults to the notebook-level
            ``block_size``.

    Returns:
        A pair ``(x_batch, y_batch)``, each of shape ``(batch_size, context_len)``.
        ``y_batch`` holds the same windows as ``x_batch`` shifted one position
        forward, i.e. the next token for every input position.

    Raises:
        ValueError: If ``source`` holds fewer than ``context_len + 1`` tokens,
            so that not even one (input, target) window fits.
    """
    if source is None:
        source = data
    if context_len is None:
        context_len = block_size

    # A window starting at i reads source[i : i + context_len + 1], so the
    # valid start positions are 0 .. len(source) - context_len - 1 inclusive.
    num_starts = len(source) - context_len
    if num_starts < 1:
        raise ValueError(
            f"need at least {context_len + 1} tokens to build a window, got {len(source)}"
        )

    # torch.randint excludes its upper bound; passing num_starts keeps the
    # last valid start position (and thus the final token as a target) reachable.
    starts = torch.randint(0, num_starts, (batch_size,)).tolist()

    x_batch = torch.stack([source[i:i + context_len] for i in starts])          # (B, T)
    y_batch = torch.stack([source[i + 1:i + context_len + 1] for i in starts])  # (B, T)
    return x_batch, y_batch

# Draw a small batch and inspect its first (input, target) pair
xb, yb = get_batch(batch_size=2)
print("data:", data)
print("xb shape:", xb.size())
print("yb shape:", yb.size())
print("xb[0]:", xb[0, :])
print("yb[0]:", yb[0, :])
# The decoded target is the decoded input shifted one character forward
print("xb[0] decoded:", decode(xb.tolist()[0]))
print("yb[0] decoded:", decode(yb.tolist()[0]))


Total sekvenslängd: 56
data: tensor([ 0,  5,  3,  7,  1,  7,  2,  4,  1,  5,  3, 13,  3, 11,  1,  9,  2, 11,
        13,  6, 10,  0,  5,  3,  7,  1,  7,  2,  4,  1,  5,  3, 13,  3, 11,  1,
         8,  3, 11, 12, 13,  6, 10,  0,  5,  3,  7,  1,  5,  3,  7,  1,  5,  3,
         7,  0])
xb shape: torch.Size([2, 16])
yb shape: torch.Size([2, 16])
xb[0]: tensor([ 3, 11,  1,  9,  2, 11, 13,  6, 10,  0,  5,  3,  7,  1,  7,  2])
yb[0]: tensor([11,  1,  9,  2, 11, 13,  6, 10,  0,  5,  3,  7,  1,  7,  2,  4])
xb[0] decoded: er martin
hej ja
yb[0] decoded: r martin
hej jag


In [4]:
class MultiHeadSelfAttention(nn.Module):
    """Causal multi-head self-attention over an input of shape (B, T, d_model).

    The model width ``d_model`` is split evenly across ``num_heads`` heads of
    width ``d_k = d_model // num_heads``; each position may attend only to
    itself and to earlier positions.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Shared linear projections producing queries, keys and values
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Output projection applied after the heads are merged again
        self.W_o = nn.Linear(d_model, d_model)

    def _split_heads(self, t):
        """Reshape (B, T, D) into per-head layout (B, num_heads, T, d_k)."""
        batch, seq_len, _ = t.shape
        return t.view(batch, seq_len, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, d_model).

        Returns a tensor of the same shape (B, T, d_model).
        """
        batch, seq_len, width = x.shape

        # Project, then split into heads: each is (B, H, T, d_k)
        queries = self._split_heads(self.W_q(x))
        keys = self._split_heads(self.W_k(x))
        values = self._split_heads(self.W_v(x))

        # Scaled dot-product scores: Q K^T / sqrt(d_k) -> (B, H, T, T)
        scores = (queries @ keys.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Causal mask: the lower triangle (incl. diagonal) is visible,
        # everything above it is a future position and is set to -inf
        visible = torch.ones(seq_len, seq_len, device=x.device).tril().bool()
        scores = scores.masked_fill(~visible, float('-inf'))

        # Normalise over the key-position axis, then take weighted sums of V
        weights = F.softmax(scores, dim=-1)   # (B, H, T, T)
        mixed = weights @ values              # (B, H, T, d_k)

        # Merge the heads back into (B, T, D) and apply the output projection
        merged = mixed.transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.W_o(merged)


In [5]:
class TransformerBlock(nn.Module):
    """One transformer layer: self-attention followed by a two-layer ReLU FFN.

    Each sub-layer is wrapped in a residual connection, and layer
    normalisation is applied after the residual addition.
    """

    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.attn = MultiHeadSelfAttention(d_model, num_heads)
        self.ln1 = nn.LayerNorm(d_model)
        # Position-wise feed-forward network: d_model -> d_ff -> d_model
        expand = nn.Linear(d_model, d_ff)
        project = nn.Linear(d_ff, d_model)
        self.ffn = nn.Sequential(expand, nn.ReLU(), project)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Transform ``x`` of shape (B, T, d_model) into a tensor of the same shape."""
        # Sub-layer 1: self-attention, residual add, layer norm
        x = self.ln1(x + self.attn(x))
        # Sub-layer 2: feed-forward network, residual add, layer norm
        return self.ln2(x + self.ffn(x))


In [6]:
class MiniTransformerLM(nn.Module):
    """Small transformer language model over integer token ids.

    Token and learned positional embeddings are summed, passed through
    ``num_layers`` transformer blocks, and projected to vocabulary logits.
    """

    def __init__(self, vocab_size, d_model=32, num_heads=2, d_ff=64, num_layers=1, block_size=16):
        super().__init__()
        self.block_size = block_size

        # Token embedding plus a learned embedding per position (0 .. block_size-1)
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(block_size, d_model)

        # Stack of one or more transformer blocks
        layers = []
        for _ in range(num_layers):
            layers.append(TransformerBlock(d_model, num_heads, d_ff))
        self.blocks = nn.ModuleList(layers)

        # Final projection from model width to vocabulary size
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, x, targets=None):
        """Compute logits, and the cross-entropy loss when ``targets`` is given.

        Args:
            x: Token ids of shape (B, T) with T <= ``block_size``.
            targets: Optional token ids of shape (B, T) to score against.

        Returns:
            ``(logits, loss)`` where ``logits`` is (B, T, vocab_size) and
            ``loss`` is ``None`` when no targets were supplied.
        """
        B, T = x.shape
        assert T <= self.block_size

        # Sum token embeddings (B, T, d_model) with positional ones (1, T, d_model)
        positions = torch.arange(T, device=x.device)
        h = self.token_emb(x) + self.pos_emb(positions).unsqueeze(0)

        for layer in self.blocks:
            h = layer(h)

        logits = self.lm_head(h)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits and (N,) targets, so flatten B and T
        B, T, C = logits.shape
        n_rows = B * T
        loss = F.cross_entropy(logits.view(n_rows, C), targets.view(n_rows))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=50):
        """Extend ``idx`` of shape (B, T) by sampling ``max_new_tokens`` tokens.

        Each step feeds at most the last ``block_size`` tokens to the model
        and samples the next token from the softmax of the final position.
        Returns the extended tensor of shape (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            # Crop the context to the window the positional embedding supports
            context = idx[:, -self.block_size:]
            step_logits, _unused_loss = self(context)
            # Only the last time step predicts the next token
            probs = F.softmax(step_logits[:, -1, :], dim=-1)  # (B, vocab_size)
            sampled = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, sampled), dim=1)
        return idx


In [7]:
# Seed PyTorch's global RNG so weight initialisation and the batches drawn by
# get_batch() repeat on every Restart & Run All (same device and library versions)
SEED = 1337
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

model = MiniTransformerLM(vocab_size, d_model=32, num_heads=2, d_ff=64, num_layers=1, block_size=block_size)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Simple training loop. Training mode never changes inside the loop,
# so it is enabled once up front instead of on every step.
model.train()
for step in range(500):  # 500 steps is usually enough to see progress on data this small
    xb, yb = get_batch(batch_size=16)
    xb, yb = xb.to(device), yb.to(device)

    logits, loss = model(xb, yb)
    # Clear gradients from the previous step before backpropagating this one
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if step % 50 == 0:
        print(f"Steg {step}, loss {loss.item():.4f}")


Device: cpu
Steg 0, loss 2.8101
Steg 50, loss 1.5255
Steg 100, loss 0.7185
Steg 150, loss 0.3904
Steg 200, loss 0.2344
Steg 250, loss 0.1506
Steg 300, loss 0.1360
Steg 350, loss 0.1288
Steg 400, loss 0.1081
Steg 450, loss 0.1123


In [9]:
# Simple inference: sample a continuation of a short prompt
model.eval()
start = "jag "
# Encode the prompt and add a leading batch dimension -> shape (1, T)
start_ids = torch.tensor(encode(start), dtype=torch.long).unsqueeze(0).to(device)
generated = model.generate(start_ids, max_new_tokens=50)
print(decode(generated.tolist()[0]))


jag heter kerstin
hej hej hej jag heter kerstin
hej he
