<a href="https://colab.research.google.com/github/JustinJiangNext/Tiny-LLM/blob/main/Tiny_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torch tqdm requests


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [11]:
#@title cuda
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [12]:

import requests

datasets = {
    "shakespeare": "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt",
    "poe_telltale": "https://www.gutenberg.org/cache/epub/2148/pg2148.txt",#gluten berg
    "sherlock_holmes": "https://www.gutenberg.org/cache/epub/1661/pg1661.txt",
}

selected_dataset = "poe_telltale"

if selected_dataset in datasets:
    url = datasets[selected_dataset]
    response = requests.get(url)
    data = response.text
    print(f"downloading dataset {selected_dataset}")
else:
    raise ValueError("dataset doesnt exist")

#Too much data
max_chars = 500_000
if len(data) > max_chars:
    data = data[:max_chars]

# Save to file
with open("training_text.txt", "w", encoding='utf-8') as f:
    f.write(data)

print(f"number of chars in data {len(data)}")
print("Preview:\n" + data[:400])



downloading dataset poe_telltale
number of chars in data 500000
Preview:
﻿The Project Gutenberg eBook of The Works of Edgar Allan Poe — Volume 2
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are no


In [13]:
import os
import requests
import tarfile
import json
import glob

LITTLE_STORIES = True
if LITTLE_STORIES:
  DATA_DIR = "data"
  ARCHIVE_NAME = "TinyStories_all_data.tar.gz"
  ARCHIVE_URL = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz"
  EXTRACTED_DIR = os.path.join(DATA_DIR, "TinyStories_all_data")
  OUTPUT_FILE = "training_text.txt"

  os.makedirs(DATA_DIR, exist_ok=True)

  archive_path = os.path.join(DATA_DIR, ARCHIVE_NAME)
  if not os.path.exists(archive_path):
      print(f"downloading {ARCHIVE_URL}...")
      response = requests.get(ARCHIVE_URL, stream=True)
      response.raise_for_status()
      with open(archive_path, "wb") as f:
          for chunk in response.iter_content(chunk_size=8192):
              f.write(chunk)
      print("finished downloading stories")
  else:
      print("already downloaded the stories")

  if not os.path.exists(EXTRACTED_DIR):
      print("unzipping archive of tories")
      with tarfile.open(archive_path, "r:gz") as tar:
          tar.extractall(path=EXTRACTED_DIR)
      print("finish unzipping")
  else:
      print("already downloaded and unzipped stories")

  print("Going through JSON file to find the stories")
  MAX_STORIES = 1000
  story_texts = []
  json_files = glob.glob(os.path.join(EXTRACTED_DIR, "*.json"))
  for json_file in json_files:
      with open(json_file, "r", encoding="utf-8") as f:
          stories = json.load(f)
          for story in stories:
              if MAX_STORIES < 0:
                  break
              MAX_STORIES -= 1
              text = story.get("story", "").strip()
              if text:
                  story_texts.append(text)

  data = "\n\n".join(story_texts)


  with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
      f.write(data)

  print(f"number of chars in data {len(data)}")
  print("Preview:\n" + data[:400])


already downloaded the stories
already downloaded and unzipped stories
Going through JSON file to find the stories
number of chars in data 766557
Preview:
Once upon a time, there was a little girl named May. She was three years old, and she loved to explore. One day, May asked her mom if she could go explore the subway. Her mom said yes, and May was so excited!
May put on her coat and shoes, and off she went to the subway. She couldn't believe all the incredible things she saw! There were so many people, and the noise was incredible.
As May was look


In [14]:
# vocabs
chars = sorted(list(set(data)))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

#basic encode & decode
def encode(s): return [stoi[c] for c in s]
def decode(l): return ''.join([itos[i] for i in l])

# encode dataset
data_ids = encode(data)

#splitting dataset
import torch

data_tensor = torch.tensor(data_ids, dtype=torch.long)
n = int(0.9 * len(data_tensor))
train_data = data_tensor[:n].to(device)
val_data = data_tensor[n:].to(device)


In [15]:
import random

block_size = 64
batch_size = 32

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,), device=device)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y
#

In [16]:
import torch.nn as nn
import torch.nn.functional as F
#stupid
class TinyCharModel(nn.Module):
    def __init__(self, vocab_size, n_embed):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, x, targets=None):
        embeds = self.token_embedding(x)
        logits = self.lm_head(embeds)

        if targets is None:
            return logits, None
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss


In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# === Hyperparameters ===
vocab_size = len(chars)      # number of unique tokens (from tokenizer)
n_embed = 128                # embedding size
block_size = 256              # context length
n_heads = 24                  # number of attention heads
n_layers = 4                 # number of transformer blocks
dropout = 0.1                # dropout rate

class TinyTransformer(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.pos_embedding = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embed, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        pos_emb = self.pos_embedding(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, vocab_size), targets.view(-1))
        return logits, loss

class TransformerBlock(nn.Module):
    def __init__(self, n_embed, n_heads):
        super().__init__()
        head_size = n_embed // n_heads
        self.sa = MultiHeadAttention(n_heads, head_size)
        self.ff = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_heads * head_size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

class SelfAttentionHead(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) / (C ** 0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v


In [21]:
#model = TinyCharModel()
model = TinyTransformer().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for step in range(1500):
    model.train()
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print(f"Step {step}: train loss {loss.item():.4f}")


Step 0: train loss 4.4804
Step 100: train loss 2.3380
Step 200: train loss 2.2883
Step 300: train loss 2.2475
Step 400: train loss 2.1702
Step 500: train loss 2.0281
Step 600: train loss 1.8686
Step 700: train loss 1.7504
Step 800: train loss 1.7565
Step 900: train loss 1.5542
Step 1000: train loss 1.4942
Step 1100: train loss 1.4837
Step 1200: train loss 1.4519
Step 1300: train loss 1.3749
Step 1400: train loss 1.2928


In [22]:
def generate(model, start_str, max_new_tokens=200):
    model.eval()
    idx = torch.tensor(encode(start_str), dtype=torch.long)[None, :].to(device)

    for _ in range(max_new_tokens):
        logits, _ = model(idx[:, -block_size:])
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, next_token), dim=1)

    return decode(idx[0].tolist())

def generate(model, start_str, max_new_tokens = 200, temperature=0.3, top_k=20):
    model.eval()
    idx = torch.tensor(encode(start_str), dtype=torch.long)[None, :].to(device)

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -block_size:]
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            v, _ = torch.topk(logits, top_k)
            logits[logits < v[:, [-1]]] = -float('Inf')
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, next_token), dim=1)
    return decode(idx[0].tolist())

print(generate(model, "Tetris is the best game in the "))


Tetris is the best game in the store and the little bear for the dog. They took the the park with the boy and saw a big his friends her friends. They was sad so happy and said, "What to the can the store and the park. It was be to 
