<a href="https://colab.research.google.com/github/Matterelloo/Cloud-Computing-Performance-Testing/blob/main/index_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 Modello

In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig
from peft import get_peft_model
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import torch


In [None]:
def preprocess_dataset(record, tokenizer, max_length=250):
  """
  Tokenize one question/answer record and build the labels used for the loss.

  The record is rendered with the tokenizer's chat template, tokenized to a
  fixed length (padded / truncated to `max_length`) and paired with a `labels`
  tensor in which every position that must NOT contribute to the loss is -100.

  Args:
    record: mapping with the keys 'question' and 'answer'.
    tokenizer: object exposing `apply_chat_template(...)` and `__call__(...)`
      the way Hugging Face tokenizers do.
    max_length: fixed sequence length used for padding and truncation.

  Returns:
    dict with 'chat' (rendered string), 'input_ids', 'attention_mask' and
    'labels' (1-D tensors of length `max_length`, as produced by the tokenizer).
  """
  end_header_id = 128007  # token that closes a role header; the last one precedes the answer
  ignore_index = -100     # label value skipped by the loss

  conversation = [
    {"role": "user", "content": f"{record['question']}"},
    {"role": "assistant", "content": f"{record['answer']}"},
  ]
  # Render the conversation as text only; tokenization happens in the next call.
  chat = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
  # Fixed-length encoding so that samples can be stacked into batches.
  tokens = tokenizer(chat, padding='max_length', return_tensors='pt', add_special_tokens=False, max_length=max_length, truncation=True)
  input_ids = tokens['input_ids'][0]
  attention_mask = tokens['attention_mask'][0]

  labels = input_ids.clone()
  header_ends = (input_ids == end_header_id).nonzero()
  if len(header_ends) == 0:
    # No header token survived (e.g. heavy truncation): nothing to learn from.
    labels[:] = ignore_index
  else:
    # +2 skips the header-closing token and the token that follows it,
    # so the first supervised position is the first answer token.
    starting_answer_idx = int(header_ends[-1][-1]) + 2
    labels[:starting_answer_idx] = ignore_index
    # Mask padding through the attention mask rather than by searching for the
    # end-of-turn id: the notebook sets pad == eos, so searching for that id
    # also hid the answer's own terminator and the model never learned to stop.
    labels[attention_mask == 0] = ignore_index

  return {
    'chat': chat,
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    "labels": labels,
  }

In [None]:
# Hub identifier of the checkpoint; used below for both the tokenizer and the model.
model_name = "meta-llama/Llama-3.2-1B-Instruct"

In [None]:
# Load the tokenizer that matches `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding (needed for padding='max_length' in preprocessing).
# Consequence: padding positions carry the same id as the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
gsm8k = load_dataset("gsm8k", "main")

# Tokenize every question/answer pair and drop the raw text columns in the same pass.
dataset = gsm8k.map(
  lambda example: preprocess_dataset(example, tokenizer),
  remove_columns=['question', 'answer'],
)

# Return tensors (instead of Python lists) for every remaining column.
dataset.set_format(type="torch", columns=dataset['train'].column_names)

# Shuffle only the training split; both loaders use batches of 8.
train_loader = DataLoader(dataset['train'], batch_size=8, shuffle=True) # type: ignore
test_loader = DataLoader(dataset['test'], batch_size=8) # type: ignore

In [None]:
# looking at how many tokens every instruction-answer have in order to understand when to truncate the answer.
# Inspect how many tokens each rendered question/answer pair really needs, to decide where to truncate.
# The lengths are measured on the rendered chat text WITHOUT padding or truncation:
# `input_ids` in `dataset` is already padded/truncated to a fixed size, so its length is constant
# and says nothing about the data.
length = [
  len(tokenizer(chat, add_special_tokens=False)['input_ids'])
  for chat in dataset['train']['chat']
]

plt.bar(range(len(length)), length)
plt.xlabel('Sample Index')
plt.ylabel('Length')
plt.title('Barplot of Lengths')
plt.show()

We can truncate at 250 tokens

In [None]:
# Use the GPU when one is available; fall back to the CPU otherwise.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Half precision only on the GPU: on the CPU float16 is slow and numerically fragile for training.
dtype = torch.float16 if device == 'cuda' else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_name, dtype=dtype).to(device)
config = LoraConfig(
  r=8,
  lora_alpha=32,
  # target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
  target_modules=["q_proj"], # TODO: to change
  lora_dropout=0.05,
)
# Wrap the base model with the LoRA adapters and report how many parameters are trainable.
lora_model = get_peft_model(model, config)
lora_model.print_trainable_parameters()

In [None]:
# Label value ignored by the loss.
ignore_index = -100
# Optimize the model that is actually trained (`lora_model`) and hand the optimizer only
# the parameters that require gradients, i.e. not the frozen ones.
# (The name `optmizer` is kept because the training cell below refers to it.)
optmizer = torch.optim.AdamW(
  [param for param in lora_model.parameters() if param.requires_grad],
  lr=1e-4,
)

In [None]:
losses = []  # average training loss of each epoch
epochs = 2

lora_model.train()
for epoch in range(epochs):
  total_loss = 0
  for i, batch in enumerate(train_loader):
    input_ids = batch['input_ids'].to(lora_model.device)
    attention_mask = batch['attention_mask'].to(lora_model.device)
    # Use the labels prepared during preprocessing (positions set to -100 are skipped).
    # They must not be replaced by a copy of `input_ids`: that would put the prompt
    # and the padding back into the loss.
    labels = batch['labels'].to(lora_model.device)
    output = lora_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) # loss directly computed here (CrossEntropyLoss)
    optmizer.zero_grad()
    output.loss.backward()
    optmizer.step()
    batch_loss = output.loss.item()
    total_loss += batch_loss
    print(f"[Batch {i + 1}/{len(train_loader)}] Loss: {batch_loss}")

  # max(..., 1) avoids a division by zero when the loader is empty.
  losses.append(total_loss / max(len(train_loader), 1))
  print(f"[Epoch {epoch + 1}/{epochs}] Loss: {losses[-1]}")

# 2 Modello

In [None]:
import torch.nn as nn
from transformers import LlamaForCausalLM, LlamaTokenizer
from torch.nn import CrossEntropyLoss


In [None]:
# Register a dedicated mask token, look up the id the tokenizer assigned to it,
# and grow the embedding table so that the new id has a row.
# NOTE(review): if `model` was already wrapped/frozen by an earlier cell, confirm that
# the new embedding row is actually trainable.
mask_token = "<mask>"
tokenizer.add_special_tokens({"additional_special_tokens": [mask_token]})
MASK_TOKEN_ID = tokenizer.convert_tokens_to_ids(mask_token)
model.resize_token_embeddings(len(tokenizer))

In [None]:
class LlamaWrapperPromptResp(nn.Module):
    """Wrap a causal LM so that prompt tokens attend causally and response tokens attend to everything."""

    def __init__(self, model: "LlamaForCausalLM"):
        super().__init__()
        self.model = model

    def make_attention_mask(self, input_ids, prompt_len):
        """
        Build the boolean prompt/response visibility mask.

        Args:
            input_ids: tensor of shape (batch, seq); only its shape and device are used.
            prompt_len: prompt length shared by the whole batch (int), or one
                length per example (list, tuple or 1-D tensor).

        Returns:
            Bool tensor of shape (batch, seq, seq) on the device of `input_ids`;
            mask[b, i, j] is True when token i of sequence b may look at token j.
        """
        batch_size, seq_size = input_ids.size()
        # The lower-triangular (causal) pattern is the same for every example: build it once,
        # directly on the target device.
        causal = torch.tril(torch.ones(seq_size, seq_size, dtype=torch.bool, device=input_ids.device))
        mask = causal.unsqueeze(0).repeat(batch_size, 1, 1)  # prompt rows: look left only

        per_example = isinstance(prompt_len, (list, tuple)) or (
            torch.is_tensor(prompt_len) and prompt_len.dim() > 0
        )
        for b in range(batch_size):
            p = int(prompt_len[b]) if per_example else int(prompt_len)
            mask[b, p:, :] = True  # response rows: look left and right (every column)

        return mask

    def forward(self, input_ids, prompt_len, **kwargs):
        """
        Run the wrapped model with the custom prompt/response attention mask.

        The mask gets a singleton "head" dimension, (batch, 1, seq, seq), because the
        model cannot consume the 3-D (batch, seq, seq) form. Extra keyword arguments
        are forwarded to the wrapped model; the KV cache is disabled.
        """
        # NOTE(review): a boolean 4-D mask means True = "may attend" for PyTorch SDPA;
        # confirm the model is not running an attention path that expects an additive float mask.
        attention_mask = self.make_attention_mask(input_ids, prompt_len).unsqueeze(1)
        return self.model(input_ids=input_ids, attention_mask=attention_mask,
                          use_cache=False, **kwargs)

In [None]:
def forward_diffusion_step(r0, t, MASK_TOKEN_ID):
    """
    Randomly replace tokens of `r0` with the mask token.

    Each position of row b is masked independently with probability t[b].

    Args:
        r0: integer tensor of token ids, shape (batch, seq).
        t: tensor with one masking probability per row (batch elements).
        MASK_TOKEN_ID: id written into the masked positions.

    Returns:
        (rt, mask): the noised copy of `r0` and a long tensor of the same shape
        with 1 where a token was masked and 0 elsewhere. `r0` is not modified.
    """
    n_rows, n_cols = r0.shape

    # One threshold per row, repeated along the sequence axis.
    threshold = t.view(-1, 1).expand(n_rows, n_cols)
    # Uniform draws in [0, 1): a position is masked when its draw falls below the threshold.
    draws = torch.rand_like(r0, dtype=torch.float)
    mask = torch.lt(draws, threshold).long()

    # Arithmetic select: keep the original id where mask == 0, write the mask id where mask == 1.
    keep = 1 - mask
    rt = r0 * keep + MASK_TOKEN_ID * mask
    return rt, mask

In [None]:
# Label value ignored by the loss.
ignore_index = -100
# Hand the optimizer only the parameters that require gradients (frozen ones are never updated).
# (The name `optmizer` is kept because the training cell below refers to it.)
optmizer = torch.optim.AdamW(
    [param for param in model.parameters() if param.requires_grad],
    lr=1e-4,
)
# Wrap the model so the forward pass uses the custom prompt/response attention mask.
wrapped_model = LlamaWrapperPromptResp(model).to(device)

In [None]:
epochs = 5
losses = []  # average training loss of each epoch

# The criterion is stateless: build it once instead of once per batch.
loss_fct = CrossEntropyLoss(ignore_index=ignore_index)

for epoch in range(epochs):
    total_loss = 0
    for i, batch in enumerate(train_loader):
        r0 = batch['input_ids'].to(device)
        batch_size, seq_len = r0.shape

        # One random masking probability per example.
        t = torch.rand(batch_size, device=device)

        # Forward diffusion: replace a random subset of tokens with the mask token.
        rt, mask = forward_diffusion_step(r0, t, MASK_TOKEN_ID)

        # Forward pass. No `labels` are passed: the loss is computed below on the masked
        # positions only, so a loss computed inside the model would be thrown away.
        # prompt_len=seq_len//2 treats the first half of every sequence as prompt and the
        # second half as response when building the attention mask.
        # NOTE(review): the real prompt/response boundary differs per example, and the noise
        # above is applied to every position (prompt and padding included) - confirm intended.
        outputs = wrapped_model(input_ids=rt, prompt_len=seq_len//2)

        logits = outputs.logits  # [batch_size, seq_len, vocab_size]

        # Loss on the masked tokens only: every position that was not masked is ignored.
        labels = r0.clone()
        labels[mask == 0] = ignore_index
        # CrossEntropyLoss expects [N, C] logits and [N] targets, hence the flattening:
        # [batch_size, seq_len, vocab_size] -> [batch_size * seq_len, vocab_size]
        # [batch_size, seq_len]             -> [batch_size * seq_len]
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Backward pass and optimization step.
        optmizer.zero_grad()
        loss.backward()
        optmizer.step()

        # Accumulate for the epoch average.
        total_loss += loss.item()
        print(f"[Batch {i+1}/{len(train_loader)}] Loss: {loss.item():.4f}")

    # max(..., 1) avoids a division by zero when the loader is empty.
    avg_loss = total_loss / max(len(train_loader), 1)
    losses.append(avg_loss)
    print(f"[Epoch {epoch+1}/{epochs}] Avg Loss: {avg_loss:.4f}")

# Parte che funziona

Le funzioni sopra non funzionavano per un problema di compatibilità delle dimensioni dell'attention mask (mancava la dimensione "head"), quindi ho chiesto aiuto a chat.

In [None]:
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import LlamaForCausalLM


class LlamaWrapperPromptResp(nn.Module):
    """Wrap a causal LM so that prompt tokens attend causally and response tokens attend to everything."""

    def __init__(self, model: "LlamaForCausalLM"):
        super().__init__()
        self.model = model

    def make_attention_mask(self, input_ids, prompt_len):
        """
        Build the boolean prompt/response visibility mask.

        Args:
            input_ids: tensor of shape (batch, seq); only its shape and device are used.
            prompt_len: prompt length shared by the whole batch (int), or one
                length per example (list, tuple or 1-D tensor).

        Returns:
            Bool tensor of shape (batch, seq, seq) on the device of `input_ids`;
            mask[b, i, j] is True when token i of sequence b may look at token j.
        """
        batch_size, seq_size = input_ids.size()
        # Loop-invariant causal pattern: built once, directly on the target device
        # (no per-example rebuild, no CPU-to-device copy at the end).
        causal = torch.tril(torch.ones(seq_size, seq_size, dtype=torch.bool, device=input_ids.device))
        mask = causal.unsqueeze(0).repeat(batch_size, 1, 1)

        per_example = isinstance(prompt_len, (list, tuple)) or (
            torch.is_tensor(prompt_len) and prompt_len.dim() > 0
        )
        for b in range(batch_size):
            p = int(prompt_len[b]) if per_example else int(prompt_len)
            mask[b, p:, :] = True  # response rows see every column

        return mask

    def forward(self, input_ids, prompt_len, **kwargs):
        """
        Run the wrapped model with the custom attention mask.

        Extra keyword arguments are forwarded to the wrapped model; the KV cache is disabled.
        """
        attention_mask = self.make_attention_mask(input_ids, prompt_len)
        # Add a singleton "head" dimension for compatibility: (batch, 1, seq, seq).
        # NOTE(review): a boolean 4-D mask means True = "may attend" for PyTorch SDPA;
        # confirm the model is not running an attention path that expects an additive float mask.
        attention_mask = attention_mask.unsqueeze(1)
        return self.model(input_ids=input_ids,
                          attention_mask=attention_mask,
                          use_cache=False,
                          **kwargs)


def forward_diffusion_step(r0, t, MASK_TOKEN_ID):
    """
    Noise a batch of token ids by masking each position at random.

    Row b of `r0` has every token replaced by `MASK_TOKEN_ID` independently with
    probability t[b]. Returns `(rt, mask)`: the noised ids and a long 0/1 tensor
    of the same shape marking the replaced positions. `r0` is left untouched.
    """
    rows, cols = r0.shape

    # Per-row probability stretched over the whole sequence.
    prob_per_token = t.view(-1, 1).expand(rows, cols)
    uniform = torch.rand_like(r0, dtype=torch.float)
    mask = (uniform < prob_per_token).long()

    # mask == 1 -> mask id, mask == 0 -> original id.
    untouched = r0 * (1 - mask)
    replaced = MASK_TOKEN_ID * mask
    rt = untouched + replaced
    return rt, mask

epochs = 5
losses = []  # average training loss of each epoch

ignore_index = -100  # label value ignored by the loss
wrapped_model = LlamaWrapperPromptResp(model).to(device)
# Hand the optimizer only the parameters that require gradients (frozen ones are never updated).
optmizer = torch.optim.AdamW(
    [param for param in model.parameters() if param.requires_grad],
    lr=1e-4,
)
# The criterion is stateless: build it once instead of once per batch.
loss_fct = CrossEntropyLoss(ignore_index=ignore_index)

for epoch in range(epochs):
    total_loss = 0
    for i, batch in enumerate(train_loader):
        r0 = batch['input_ids'].to(device)
        batch_size, seq_len = r0.shape

        # One random masking probability per example.
        t = torch.rand(batch_size, device=device)

        # Replace a random subset of tokens with the mask token.
        rt, mask = forward_diffusion_step(r0, t, MASK_TOKEN_ID)

        # Forward pass with the custom attention mask. No `labels` are passed: the loss is
        # computed below on the masked positions only, so a loss computed inside the model
        # would be thrown away.
        # NOTE(review): seq_len//2 assumes half prompt / half response for every example - confirm.
        outputs = wrapped_model(input_ids=rt,
                                prompt_len=seq_len//2)

        logits = outputs.logits

        # Only the masked positions contribute to the loss.
        labels = r0.clone()
        labels[mask == 0] = ignore_index
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        optmizer.zero_grad()
        loss.backward()
        optmizer.step()

        total_loss += loss.item()
        print(f"[Batch {i+1}/{len(train_loader)}] Loss: {loss.item():.4f}")

    # max(..., 1) avoids a division by zero when the loader is empty.
    avg_loss = total_loss / max(len(train_loader), 1)
    losses.append(avg_loss)
    print(f"[Epoch {epoch+1}/{epochs}] Avg Loss: {avg_loss:.4f}")


# Prova solo masking prompt-resp

Avevo raggiunto il limite massimo di utilizzo della GPU su Colab e mi mancava ancora da verificare se il masking faceva effettivamente quello che avevo pensato. Questa cella gira quindi solo su CPU, con input casuali e senza caricare il modello.

In [None]:
import torch
from torch import nn
from transformers import LlamaForCausalLM


model_name = "meta-llama/Llama-3.2-1B-Instruct"
# This check runs on the CPU only (the previous conditional returned "cpu" in both branches).
device = "cpu"

# Fixed seed so the printed example is reproducible.
torch.manual_seed(0)

# Random inputs are enough for this check.
vocab_size = 100
seq_len = 10
batch_size = 2
MASK_TOKEN_ID = vocab_size - 1  # example mask id: the last id of the toy vocabulary
prompt_len = 4      # the first 4 of the 10 positions are the prompt

# Draw ids strictly below MASK_TOKEN_ID so an original token can never be mistaken
# for a masked one in the printed output.
r0 = torch.randint(0, MASK_TOKEN_ID, (batch_size, seq_len)).to(device)


class LlamaWrapperPromptResp(nn.Module):
    """Model-free variant used to inspect the mask: `forward` returns the attention mask itself."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def make_attention_mask(self, input_ids, prompt_len):
        """
        Return a bool tensor of shape (batch, seq, seq).

        Rows before the prompt length are lower-triangular (causal); rows from the
        prompt length onwards are entirely True. `prompt_len` is either one value
        for the whole batch or a list/tuple with one value per sequence.
        """
        n_seqs, n_tokens = input_ids.size()
        lower_triangle = torch.ones(n_tokens, n_tokens, dtype=torch.bool).tril()
        # Start every sequence from the causal pattern ...
        mask = lower_triangle.repeat(n_seqs, 1, 1)
        per_sequence = isinstance(prompt_len, (list, tuple))
        for row in range(n_seqs):
            start = prompt_len[row] if per_sequence else prompt_len
            # ... then open up the response rows completely.
            mask[row, start:, :] = True
        return mask

    def forward(self, input_ids, prompt_len, **kwargs):
        """Build and return the attention mask; no model is called and extra kwargs are ignored."""
        return self.make_attention_mask(input_ids, prompt_len)


def forward_diffusion_step(r0, t, MASK_TOKEN_ID):
    """
    Mask tokens of `r0` at random: position (b, s) becomes `MASK_TOKEN_ID` with probability t[b].

    Returns the noised ids and a long 0/1 tensor flagging the masked positions;
    `r0` itself is not modified.
    """
    n_seqs, n_tokens = r0.shape
    probs = t.view(-1, 1).expand(n_seqs, n_tokens)
    noise = torch.rand_like(r0, dtype=torch.float)
    mask = torch.lt(noise, probs).long()
    # Keep the original id where mask == 0, write the mask id where mask == 1.
    rt = r0 * (1 - mask) + MASK_TOKEN_ID * mask
    return rt, mask


# No real model is needed here: the wrapper's forward only returns the attention mask.
wrapped_model = LlamaWrapperPromptResp(None)
t = torch.rand(batch_size)
rt, mask = forward_diffusion_step(r0, t, MASK_TOKEN_ID)
attention_mask = wrapped_model(input_ids=rt, prompt_len=prompt_len)

# Print, for each sequence, the original ids, the noised ids, the diffusion mask
# and the attention pattern.
for b, (original, noised, flags, visibility) in enumerate(zip(r0, rt, mask, attention_mask)):
    print(f"\nEsempio {b}:")
    print("Original input:", original.tolist())
    print("Masked input  :", noised.tolist())
    print("Diffusion mask:", flags.tolist())
    print("Attention mask:\n", visibility.int())
