In [1]:
import torch
from torch import tensor, nn, optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader

In [2]:
import huggingface_hub

huggingface_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = 'meta-llama/Llama-3.2-1B'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
from datasets import load_dataset

ds = load_dataset("myothiha/jokes")

In [5]:
len(set(ds['train']['text']))

187641

In [6]:
vocab_size = tokenizer.vocab_size
context_length = 32
n_embs = 128
n_heads = 16
n_blocks = 8
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text'],
        num_rows: 187641
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'text'],
        num_rows: 20850
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text'],
        num_rows: 23166
    })
})

In [8]:
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=64,
        return_tensors='pt'
    )

tokenized_dataset = ds.map(tokenize_function, batched=True, remove_columns=ds['train'].column_names)

Map:   0%|          | 0/23166 [00:00<?, ? examples/s]

In [9]:
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [10]:
torch.set_float32_matmul_precision('high')

In [11]:
train_dataset = tokenized_dataset["train"]
dl = DataLoader(train_dataset, shuffle=True, batch_size=16)

In [12]:
len(dl)

11728

In [13]:
import gc
import torch

# Clear Python garbage collector
gc.collect()

# Empty PyTorch CUDA cache
torch.cuda.empty_cache()

In [14]:
llama = AutoModelForCausalLM.from_pretrained(model_name)
llama_params = sum(p.numel() for p in llama.parameters())
print("base model parameters: ", llama_params)

base model parameters:  1235814400


In [15]:
llama

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [16]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,               # Rank of the low-rank decomposition
    lora_alpha=16,     # Scaling factor for LoRA
    target_modules=["q_proj", "k_proj", "v_proj", 'o_proj', 'fc1', 'fc2', 'gate_proj'],
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none"        # Bias handling (options: "none", "all", or "lora_only")
)

# Apply LoRA to the model
model = get_peft_model(llama, lora_config).to(device)
# model.apply(initialize)

print("Lora parameters: ", sum(p.numel() for p in model.parameters()) - llama_params)

Lora parameters:  3014656


In [17]:
batch = next(iter(dl))
out = model(input_ids=batch['input_ids'].to(device), attention_mask=batch['attention_mask'].to(device), labels=batch['input_ids'].to(device))

In [18]:
batch

{'input_ids': tensor([[128000,  15546,    596,  ..., 128001, 128001, 128001],
         [128000,  20484,     56,  ..., 128001, 128001, 128001],
         [128000,  90243,    596,  ..., 128001, 128001, 128001],
         ...,
         [128000,   3923,    374,  ..., 128001, 128001, 128001],
         [128000,   5159,   7555,  ..., 128001, 128001, 128001],
         [128000,  66230,   3776,  ..., 128001, 128001, 128001]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [19]:
len(batch['attention_mask'])

16

In [20]:
lr = 7e-4
opt = optim.AdamW(model.parameters(), lr)

In [21]:
# Training loop for fine-tuning a huggingface model
epochs = 1
sched = CosineAnnealingLR(opt, epochs * len(dl), lr * 0.01)
model.train()
for i in range(epochs):
    for step, batch in enumerate(dl):
        xb, mask = batch['input_ids'], batch['attention_mask']
        xb = xb.to(device)
        mask = mask.to(device)
        opt.zero_grad()
        output = model(input_ids=xb, attention_mask=mask, labels=xb)
        loss = output.loss
        loss.backward()
        opt.step()
        sched.step()
        if step % (len(dl)//10000) == 0 or step == len(dl):
            print(f"Epoch: {i}, Step: {step}, Loss: {loss}")

Epoch: 0, Step: 0, Loss: 7.4637532234191895
Epoch: 0, Step: 1, Loss: 2.0398666858673096
Epoch: 0, Step: 2, Loss: 1.7745153903961182
Epoch: 0, Step: 3, Loss: 1.404415249824524
Epoch: 0, Step: 4, Loss: 1.686361312866211
Epoch: 0, Step: 5, Loss: 1.4681618213653564
Epoch: 0, Step: 6, Loss: 1.7048354148864746
Epoch: 0, Step: 7, Loss: 1.428846836090088
Epoch: 0, Step: 8, Loss: 1.278314232826233
Epoch: 0, Step: 9, Loss: 1.5783041715621948
Epoch: 0, Step: 10, Loss: 1.3558272123336792
Epoch: 0, Step: 11, Loss: 1.4858266115188599
Epoch: 0, Step: 12, Loss: 1.3189113140106201
Epoch: 0, Step: 13, Loss: 1.2566765546798706
Epoch: 0, Step: 14, Loss: 1.2840001583099365
Epoch: 0, Step: 15, Loss: 1.7297327518463135
Epoch: 0, Step: 16, Loss: 1.26711106300354
Epoch: 0, Step: 17, Loss: 1.40896475315094
Epoch: 0, Step: 18, Loss: 1.1706717014312744
Epoch: 0, Step: 19, Loss: 1.2455848455429077
Epoch: 0, Step: 20, Loss: 1.1351265907287598
Epoch: 0, Step: 21, Loss: 1.3215280771255493
Epoch: 0, Step: 22, Loss: 1.

In [22]:
def generate(idx, max_tokens):
    model.eval()
    tokens = idx
    for i in range(max_tokens):
        logits = model(tokens[:, -context_length:])
        topk_values, topk_indices = torch.topk(logits[:, -1, :], 50)
        probs = topk_values.softmax(dim=-1)
        sample = torch.multinomial(probs, 1)
        token = torch.gather(topk_indices, 1, sample)
        tokens = torch.cat((tokens, token), dim=-1)
    
    return tokens

In [23]:
def generate_finetune(input_ids, attention_mask, max_tokens=50):
    model.eval()
    tokens = input_ids
    for i in range(max_tokens):
        logits = model(input_ids, attention_mask=attention_mask).logits
        topk_values, topk_indices = torch.topk(logits[:, -1, :], 200)
        probs = topk_values.softmax(dim=-1)
        sample = torch.multinomial(probs, 1)
        token = torch.gather(topk_indices, 1, sample)
        tokens = torch.cat((tokens, token), dim=-1)
        
    return tokens

In [24]:
start_tokens = tokenizer("How many", return_tensors='pt').to(device)
print(tokenizer.decode(generate_finetune(**start_tokens, max_tokens=100)[0]))

<|begin_of_text|>How many Japanese gang times fat Jews cann mice Germans psychologists black therapists Canadians kids apples cops snow hip feminists Irish feminists programmers Jews women n people lawyers jew lesbians men millennials cops J potatoes hip veget cows Germans years black people Americans cops pirates guys women Jews g guys n / black ped cops cops Americans / children dead Buddh feminists n people dicks ducks redd Mexicans jew hip redd hip Jews police millennials gang Jewish Reddit dead Democrats people cann ar homeless rabbits red hip hip gay black black dead apples feminists feminists g mice people police feminists hip Trump


In [25]:
tokenizer.decode(next(iter(dl))[0][0])

KeyError: 0

In [None]:
tokenizer.decode(model.generate(max_length=200, top_k=50)[0])

In [None]:
model.generate??

In [26]:
input_ids = tokenizer("", return_tensors='pt').to(device)

In [27]:
output = model.generate(
    **input_ids,
    max_length=50,          # Maximum length of generated text
    min_length=10,          # Minimum length
    do_sample=True,         # Enable sampling
    temperature=1.,        # Sampling temperature
    top_k=50,               # Top-k sampling
    top_p=0.9,              # Top-p nucleus sampling
    repetition_penalty=1.2, # Penalize repetition
    num_beams=5,            # Use beam search with 5 beams
    no_repeat_ngram_size=2, # Avoid repeating bigrams
    eos_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [28]:
print(tokenizer.decode(output[0]))

<|begin_of_text|>What do you get when you cross an elephant and a rhinoceros? Elephino.
<|end_of_text|>


In [29]:
model

PeftModel(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
        

In [35]:
for i in range(1000):
    output = model.generate(
        **input_ids,
        max_length=50,          # Maximum length of generated text
        min_length=10,          # Minimum length
        do_sample=True,         # Enable sampling
        temperature=2.,        # Sampling temperature
        top_k=50,               # Top-k sampling
        top_p=0.9,              # Top-p nucleus sampling
        repetition_penalty=1.2, # Penalize repetition
        num_beams=5,            # Use beam search with 5 beams
        no_repeat_ngram_size=2, # Avoid repeating bigrams
        eos_token_id=128001,
        pad_token_id=128001
    )
    print(tokenizer.decode(output[0][1:-1]))

What do you get when you cross an elephant and a bear? Elebore

Why did the little mermaid go on a date? She wanted to find a seagull! Why did she not find any? Because she was a little sea mammal.

What's the difference between a pizza and a hooker? One is a crusty whore and the other is one of the best things you'll ever put into your mouth.

What did the girl give to the guy at the bar for Christmas? An empty bottle and a shot glass.

I've always wanted to write a comedy about an old lady with no arms and no legs......but I can't think of a good title for it.

I don't like it when my friends talk about how I'm weird. I mean, if I had a dollar for every time I thought about them, I'd have a lot of dollars.

What's the difference between a woman and a computer? A computer can have a hardware failure.

I love it when people ask me what I do for a living. I tell them I'm a writer.

What is black, white and red all over? A nun with a gun

What's the difference between a cow and a chicken