In [1]:
!pip install huggingface_hub tiktoken

Defaulting to user installation because normal site-packages is not writeable
Collecting huggingface_hub
  Downloading huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.5/450.5 KB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Collecting tqdm>=4.42.1
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 KB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting regex>=2022.1.18
  Downloa

In [None]:
scp -i C:\Users\James\.ssh\id_ed25519 ubuntu@209.20.159.87:/home/ubuntu/llm/finetune.ipynb C:\Users\James\git\LLMs\finetune.ipynb

In [71]:
import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
# -----------------------------------------------------------------------------

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        # output projection
        y = self.c_proj(y)
        return y

class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu    = nn.GELU(approximate='tanh')
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int = 1024 # max sequence length
    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    n_layer: int = 12 # number of layers
    n_head: int = 12 # number of heads
    n_embd: int = 768 # embedding dimension

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme
        self.transformer.wte.weight = self.lm_head.weight

        # init params
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and posisition embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_openai_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def generate(self, tokens, num_return_sequences=1, max_length=64):
        self.eval()
        tokens = torch.tensor(tokens, dtype=torch.long)
        tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
        xgen = tokens.to(device)
        while xgen.size(1) < max_length:
            with torch.no_grad():
                with torch.autocast(device_type=device, dtype=torch.bfloat16):
                    logits, loss = model(xgen) # (B, T, vocab_size)
                logits = logits[:, -1, :] # (B, vocab_size)
                probs = F.softmax(logits, dim=-1)
                topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
                ix = torch.multinomial(topk_probs, 1) # (B, 1)
                xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
                xgen = torch.cat((xgen, xcol), dim=1)
        return xgen[:, :max_length]

    def prompt(self, prompt, max_length=64):
        enc = tiktoken.get_encoding("gpt2")
        tokens = enc.encode(prompt)
        xgen = self.generate(tokens, num_return_sequences=1, max_length=max_length)
        tokens = xgen[0].tolist()
        decoded = enc.decode(tokens)
        return decoded


    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"

        print(f"using fused AdamW: {use_fused}")
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer


# api.upload_file(
#     path_or_fileobj="/home/ubuntu/llm/build-nanogpt/log/model_19072.pt",
#     path_in_repo="gpt-v1.pt",
#     repo_id= "molten-ice/gpt"
# )

In [60]:
import os
import json
import huggingface_hub

api = huggingface_hub.HfApi()
with open('hugging_apikey.json', 'r') as f:
    api_key = json.load(f)['api_key']

huggingface_hub.login(token=api_key)

os.makedirs('models', exist_ok=True)
model_path = huggingface_hub.hf_hub_download(
    repo_id="Molten-Ice/gpt", # https://huggingface.co/Molten-Ice/gpt/tree/main
    filename="gpt-v1.pt",
    local_dir='models'
)

  from .autonotebook import tqdm as notebook_tqdm


In [61]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab_size = 50304
model = GPT(GPTConfig(vocab_size=vocab_size))
model.to(device)
print(f'device: {device}')

model_path = 'models/gpt-v1.pt'
loaded_checkpoint = torch.load(model_path)
model.load_state_dict(loaded_checkpoint['model'])

device: cuda


  loaded_checkpoint = torch.load(model_path)


<All keys matched successfully>

In [62]:
import tiktoken
model.prompt("Hello, I'm a language model,")

"Hello, I'm a language model, meaning I'm a model of models, that is, a system of ways that language, like systems or structures, come together across the language and help to make certain the same rules apply to different ways. So, I'd like to say language models are useful, I think,"

In [16]:
!wget https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json

--2024-12-25 11:12:03--  https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22773992 (22M) [text/plain]
Saving to: ‘alpaca_data.json.1’


2024-12-25 11:12:11 (2.87 MB/s) - ‘alpaca_data.json.1’ saved [22773992/22773992]



In [20]:
import json
with open('alpaca_data.json', 'r') as f:
    data = json.load(f)

base_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
{output}"""

all_instructions = [base_prompt.format(instruction=f'{d["instruction"]}', output=d['output']) for d in data if not d['input']]

In [22]:
for instruction in all_instructions[:5]:
    print(instruction)
    print('-'*30)


Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Give three tips for staying healthy.

### Response:
1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 
2. Exercise regularly to keep your body active and strong. 
3. Get enough sleep and maintain a consistent sleep schedule.
------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What are the three primary colors?

### Response:
The three primary colors are red, blue, and yellow.
------------------------------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Describe the structure of an atom.

### Response:
An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutron

In [63]:
import numpy as np
# init the tokenizer
enc = tiktoken.get_encoding("gpt2")
eot = enc._special_tokens['<|endoftext|>'] # end of text token
def tokenize(text):
    # tokenizes a single document and returns a numpy array of uint16 tokens
    tokens = [eot] # the special <|endoftext|> token delimits all documents
    tokens.extend(enc.encode_ordinary(text))
    tokens_np = np.array(tokens)
    assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
    tokens_np_uint16 = tokens_np.astype(np.uint16)
    return tokens_np_uint16

def load_tokens(filename):
    npt = np.load(filename)
    npt = npt.astype(np.int32) # added after video
    ptt = torch.tensor(npt, dtype=torch.long)
    return ptt

all_tokens = []
for instruction in all_instructions:
    tokens = tokenize(instruction)
    all_tokens.extend(tokens)

print(f'len(all_tokens): {len(all_tokens):,}')
np.save('alpaca_tokens.npy', all_tokens)

len(all_tokens): 3,525,454


In [64]:

def load_tokens(filename):
    npt = np.load(filename)
    npt = npt.astype(np.int32) # added after video
    ptt = torch.tensor(npt, dtype=torch.long)
    return ptt

class AlpacaDataLoader:
    def __init__(self, B, T, split='train'):
        self.B = B
        self.T = T
        self.current_position = 0

        assert split in {'train', 'val'}
        self.tokens = load_tokens('alpaca_tokens.npy')
        if split == 'train':
            self.tokens = self.tokens[:int(0.9 * len(self.tokens))]
        else:
            self.tokens = self.tokens[int(0.9 * len(self.tokens)):]

        batches = len(self.tokens) // (B * T)
        print(f'[{split}] {len(self.tokens):,} tokens | {batches:,} batches | {self.tokens.tolist().count(50256):,} eot tokens')

    def reset(self):
        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T
        buf = self.tokens[self.current_position : self.current_position+B*T+1]
        x = (buf[:-1]).view(B, T) # inputs
        y = (buf[1:]).view(B, T) # targets
        self.current_position += B * T
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y

In [72]:
torch.set_float32_matmul_precision('high')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab_size = 50304
model = GPT(GPTConfig(vocab_size=vocab_size))
model.to(device)
print(f'device: {device}')

model_path = 'models/gpt-v1.pt'
loaded_checkpoint = torch.load(model_path)
model.load_state_dict(loaded_checkpoint['model'])

device: cuda


  loaded_checkpoint = torch.load(model_path)


<All keys matched successfully>

In [73]:
log_dir = "log"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f"log.txt")

In [74]:
def evaluate_and_save(model, val_loader, step):
    val_loss_steps = 40
    val_loader.reset()
    with torch.no_grad():
        val_loss_accum = 0.0
        for _ in range(val_loss_steps):
            x, y = val_loader.next_batch()
            x, y = x.to(device), y.to(device)
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits, loss = model(x, y)
            loss = loss / val_loss_steps
            val_loss_accum += loss.detach()
    print(f"validation loss: {val_loss_accum.item():.4f}")
    with open(log_file, "a") as f:
        f.write(f"{step} val {val_loss_accum.item():.4f}\n")

    # Save model to file
    # with open(log_file, "a") as f:
    #     f.write(f"{step} val {val_loss_accum.item():.4f}\n")
    # if step > 0 and (step % 5000 == 0 or last_step):
    #     # optionally write model checkpoints
    #     checkpoint_path = os.path.join(log_dir, f"model_{step:05d}.pt")
    #     checkpoint = {
    #         'model': raw_model.state_dict(),
    #         'config': raw_model.config,
    #         'step': step,
    #         'val_loss': val_loss_accum.item()
    #     }
    #     # you might also want to add optimizer.state_dict() and
    #     # rng seeds etc., if you wanted to more exactly resume training
    #     torch.save(checkpoint, checkpoint_path)

    # Generate example outputs based on instructions.

In [75]:
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 715
max_steps = 19073 # 19,073 steps is ~1 epoch, if data is 10B tokens and batch size 0.5M tokens
def get_lr(it):
    # 1) linear warmup for warmup_iters steps
    if it < warmup_steps:
        return max_lr * (it+1) / warmup_steps
    # 2) if it > lr_decay_iters, return min learning rate
    if it > max_steps:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff starts at 1 and goes to 0
    return min_lr + coeff * (max_lr - min_lr)

In [78]:
max_steps = 1200

train_loader = AlpacaDataLoader(B, T, split='train')
val_loader = AlpacaDataLoader(B, T, split='val')


optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)


for step in range(50): # ~3 epochs of 32k example instructions
    t0 = time.time()
    last_step = (step == max_steps - 1)

    model.train()
    optimizer.zero_grad()
    x, y = train_loader.next_batch()
    x, y = x.to(device), y.to(device)
    with torch.autocast(device_type=device, dtype=torch.bfloat16):
        logits, loss = model(x, y)
    loss.backward()

    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    t1 = time.time()
    dt = t1 - t0
    tokens_processed = train_loader.B * train_loader.T
    tokens_per_sec = tokens_processed / dt

    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
   
    print(f"step {step:5d} | loss: {loss.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}")

[train] 3,172,908 tokens | 387 batches | 28,210 eot tokens
[val] 352,546 tokens | 43 batches | 3,113 eot tokens
num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
using fused AdamW: True
step     0 | loss: 2.927885 | lr 8.3916e-07 | norm: 13.1722 | dt: 393.85ms | tok/sec: 20799.94
step     1 | loss: 2.991701 | lr 1.6783e-06 | norm: 15.1542 | dt: 56.20ms | tok/sec: 145752.69
step     2 | loss: 2.858903 | lr 2.5175e-06 | norm: 15.9752 | dt: 50.09ms | tok/sec: 163532.14
step     3 | loss: 2.753336 | lr 3.3566e-06 | norm: 11.5365 | dt: 49.16ms | tok/sec: 166645.19
step     4 | loss: 2.727894 | lr 4.1958e-06 | norm: 9.8177 | dt: 53.44ms | tok/sec: 153300.67
step     5 | loss: 2.480538 | lr 5.0350e-06 | norm: 7.4541 | dt: 51.58ms | tok/sec: 158830.21
step     6 | loss: 2.649940 | lr 5.8741e-06 | norm: 6.0123 | dt: 51.60ms | tok/sec: 158758.29
step     7 | loss: 2.408336 | lr 6.7133e-06 | norm: 4.9929 | dt: 51.50ms | t