In [26]:
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer

# Locations of the saved tokenizer and of the pre-tokenized dataset on disk.
# NOTE(review): these are absolute, machine-specific paths; the cell will only run
# where they exist.
tokenizer_path = "/home/hpc01/Marcos/Patch_Assesment/Tokenizer"
tokenized_dataset_path = "/home/hpc01/Marcos/Patch_Assesment/Dataset/TokenizedDatasets/small"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenized_datasets = load_from_disk(tokenized_dataset_path)

pad_token_id = tokenizer.convert_tokens_to_ids("<|end_of_text|>")  # id of <|end_of_text|>, used here as the padding token
batch_size = 4

def create_batches(dataset, batch_size, pad_token_id = 128001):
    """
    Generator that splits a tokenized dataset into right-padded mini-batches.

    Every sequence in a batch is padded to ``longest sequence in the batch + 1``,
    so each row always ends with at least one ``pad_token_id`` (even the longest
    one). Padding positions get ``0`` in the attention mask. Labels are passed
    through unpadded.

    Args:
    - dataset: mapping-like tokenized split (e.g., tokenized_datasets["train"])
      exposing the columns 'input_ids', 'attention_mask' and 'labels'.
      'input_ids' and 'attention_mask' are lists of int lists; 'labels' must be
      convertible by ``torch.tensor`` (presumably one scalar per example - verify
      against the dataset).
    - batch_size: number of examples per batch; must be a positive integer.
      The last batch may be smaller.
    - pad_token_id: id of the token appended as padding (defaults to 128001).

    Yields:
    - batch: dict with 'input_ids', 'attention_mask' and 'labels' as
      ``torch.long`` tensors.

    Raises:
    - ValueError: if ``batch_size`` is not positive (raised when the generator
      is first advanced).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Read each column once. Indexing the dataset by column name inside the loop
    # would repeat that lookup (and, for an on-disk dataset, potentially the
    # materialisation of the whole column) for every single batch.
    all_input_ids = dataset['input_ids']
    all_attention_mask = dataset['attention_mask']
    all_labels = dataset['labels']

    for start in range(0, len(all_input_ids), batch_size):
        stop = start + batch_size
        batch_input_ids = all_input_ids[start:stop]
        batch_attention_mask = all_attention_mask[start:stop]
        batch_labels = all_labels[start:stop]

        # +1 guarantees that even the longest sequence receives one pad token.
        max_length = max(len(input_ids) for input_ids in batch_input_ids) + 1

        padded_input_ids = []
        padded_attention_mask = []
        for input_ids, attention_mask in zip(batch_input_ids, batch_attention_mask):
            padding_length = max_length - len(input_ids)
            padded_input_ids.append(input_ids + [pad_token_id] * padding_length)
            # 0 marks padding positions in the attention mask.
            padded_attention_mask.append(attention_mask + [0] * padding_length)

        yield {
            'input_ids': torch.tensor(padded_input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(padded_attention_mask, dtype=torch.long),
            'labels': torch.tensor(batch_labels, dtype=torch.long)
        }


# `pad_token_id` and `batch_size` are already defined earlier in this cell (right
# after the tokenizer is loaded), so they are not re-assigned here.
batch_gen_train = create_batches(tokenized_datasets["train"], batch_size, pad_token_id)

# Pull a single batch to experiment with in the cells below.
batch = next(batch_gen_train)

In [128]:
from transformers import PreTrainedModel, LlamaConfig, AutoModel, AutoConfig
from torch import nn

class PatchALlama(PreTrainedModel):
    """
    Llama-3.2-1B backbone with a single-logit linear head for binary classification.

    For every sequence in the batch, the hidden state at the position of its first
    ``EOS_TOKEN_ID`` token is fed to a linear layer that produces one logit. When
    labels are supplied, a ``BCEWithLogitsLoss`` is computed on those logits.
    """

    config_class = LlamaConfig

    # Token id whose first occurrence marks the position to read the hidden state
    # from. 128001 is the id the rest of this notebook uses for padding.
    EOS_TOKEN_ID = 128001

    def __init__(self):
        super(PatchALlama, self).__init__(AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B"))

        self.llama = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B")
        # One output unit: a single logit per sequence.
        self.linear = nn.Linear(self.llama.config.hidden_size, 1)

    def first_eots_pos(self, input_ids):
        """
        Return, for each sequence, the index of its first ``EOS_TOKEN_ID`` token.

        Args:
            input_ids: iterable of 1-D token-id tensors (e.g. a 2-D tensor).

        Returns:
            list[int] with one position per sequence.

        Raises:
            IndexError: if a sequence contains no ``EOS_TOKEN_ID`` token.
        """
        positions = []

        for row_index, sequence in enumerate(input_ids):
            matches = torch.nonzero(sequence == self.EOS_TOKEN_ID)
            if matches.numel() == 0:
                raise IndexError(
                    f"sequence {row_index} contains no token with id {self.EOS_TOKEN_ID}"
                )
            positions.append(matches[0].item())

        return positions

    def get_last_tokens(self, last_hidden_state, input_ids):
        """
        Gather the hidden state at each sequence's first ``EOS_TOKEN_ID`` position.

        Args:
            last_hidden_state: tensor indexed as [sequence][position].
            input_ids: token ids the hidden states were computed from.

        Returns:
            Tensor stacking one selected hidden vector per sequence.
        """
        positions = self.first_eots_pos(input_ids)

        return torch.stack(
            [last_hidden_state[row][position] for row, position in enumerate(positions)]
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Run the backbone and the classification head.

        Args:
            input_ids: batch of token ids; every row must contain ``EOS_TOKEN_ID``.
            attention_mask: attention mask forwarded to the backbone.
            labels: optional targets, one per sequence; cast to float for the loss.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits`` alone.
            ``logits`` is a flat tensor with one value per sequence.
        """
        output = self.llama(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = output["last_hidden_state"]
        # Read the representation at the first end-of-text position of each sequence.
        pooled = self.get_last_tokens(last_hidden_state, input_ids)

        # Linear head gives a trailing dimension of size 1; drop it and flatten so
        # there is exactly one logit per sequence.
        logits = self.linear(pooled).squeeze(-1).view(-1)

        if labels is None:
            return logits

        loss_fnc = nn.BCEWithLogitsLoss()
        loss = loss_fnc(logits, labels.float())

        return (loss, logits)


In [129]:
# NOTE(review): `config_path` is not passed to PatchALlama() below (its __init__ takes
# no arguments and names the checkpoint itself), so it appears unused in this cell.
config_path = "meta-llama/Llama-3.2-1B"

# Instantiate the classifier; this loads the pretrained backbone weights.
model = PatchALlama()

In [130]:
# Unpack the tensors of the sample batch produced by create_batches.
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]

In [132]:
# Full forward pass on the sample batch; with labels supplied it returns (loss, logits).
model(input_ids, attention_mask, labels)

(tensor(0.7106, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 tensor([1.1449, 0.0908, 0.5000, 0.4496], grad_fn=<ViewBackward0>))

In [117]:
# Labels cast to float, i.e. the form in which they are handed to the loss function.
print(labels.float())

tensor([1., 1., 0., 0.])


In [71]:
# Decode the first padded sequence back to text to inspect the content and its padding.
print(tokenizer.decode(input_ids[0]))

<|begin_of_text|>@@ -140,7 +140,6 @@
        */
       if (NodeUtil.hasFinally(n)) {
         Node finallyBlock = n.getLastChild();
-        tryMinimizeExits(finallyBlock, exitType, labelName);
       }
     }

<|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_

In [98]:
def first_eots_pos(input_ids, eos_token_id=128001):
    """
    Return, for each sequence, the index of its first end-of-text token.

    Args:
        input_ids: iterable of 1-D token-id tensors (e.g. a 2-D tensor).
        eos_token_id: token id to search for (defaults to 128001).

    Returns:
        list[int] with one position per sequence.

    Raises:
        IndexError: if a sequence does not contain ``eos_token_id``.
    """
    positions = []

    for row_index, sequence in enumerate(input_ids):
        matches = torch.nonzero(sequence == eos_token_id)
        if matches.numel() == 0:
            # Name the offending row instead of surfacing an opaque
            # "index 0 is out of bounds" from the empty match tensor.
            raise IndexError(
                f"sequence {row_index} contains no token with id {eos_token_id}"
            )
        positions.append(matches[0].item())

    return positions

# Position of the first token with id 128001 in every sequence of the sample batch.
first_eots = first_eots_pos(input_ids)
first_eots

[53, 183, 69, 118]

In [99]:
# Run only the Llama backbone (without the linear head) to inspect its hidden states.
output = model.llama(input_ids=input_ids, attention_mask=attention_mask)
last_hidden_state = output["last_hidden_state"]
# From the last hidden state, the vector of interest for each sequence in the batch is
# the one at its first end-of-text position.
last_hidden_state

tensor([[[ 1.5497, -1.5041,  2.7454,  ..., -0.6332,  0.6005, -1.0105],
         [-3.0679,  3.7994,  1.1108,  ..., -8.1647, -2.2493, -1.1095],
         [ 0.5897,  2.4325,  0.9655,  ..., -4.2744,  0.1402, -0.4675],
         ...,
         [-0.1433,  3.2924, -0.7329,  ...,  0.5978,  2.6403,  1.5293],
         [-0.1700,  3.2424, -0.7118,  ...,  0.6912,  2.5687,  1.5834],
         [-0.1510,  3.2228, -0.6738,  ...,  0.7782,  2.4936,  1.5973]],

        [[ 1.5497, -1.5041,  2.7454,  ..., -0.6332,  0.6005, -1.0105],
         [-3.0679,  3.7994,  1.1108,  ..., -8.1647, -2.2492, -1.1095],
         [ 0.5897,  2.4325,  0.9655,  ..., -4.2744,  0.1402, -0.4675],
         ...,
         [ 0.8978, -1.5651,  1.3260,  ..., -0.8632, -3.4923, -1.1031],
         [-2.2748, -1.1865,  1.3182,  ..., -0.7239, -0.1708,  0.0831],
         [ 1.3420,  3.1930, -2.2678,  ..., -0.9257,  4.1256,  1.7292]],

        [[ 1.5497, -1.5041,  2.7454,  ..., -0.6332,  0.6005, -1.0105],
         [-3.0679,  3.7994,  1.1108,  ..., -8

In [106]:
def get_last_tokens(last_hidden_state, input_ids):
    """
    Pick, for every sequence, the hidden vector at its first end-of-text position.

    The positions come from ``first_eots_pos(input_ids)`` and are printed for
    inspection before the selected vectors are stacked into one tensor.
    """
    eos_positions = first_eots_pos(input_ids)
    print(eos_positions)

    selected = [
        last_hidden_state[row][eos_positions[row]]
        for row in range(len(input_ids))
    ]
    return torch.stack(selected)

# One hidden vector per sequence, taken at the positions printed by get_last_tokens.
last_tokens = get_last_tokens(last_hidden_state, input_ids)
last_tokens

[53, 183, 69, 118]


tensor([[ 1.5489,  4.9625, -3.5646,  ..., -1.9848,  4.0793,  1.2826],
        [ 1.3420,  3.1930, -2.2678,  ..., -0.9257,  4.1256,  1.7292],
        [ 2.0299,  4.0167, -3.0058,  ..., -0.8630,  4.5257,  1.7899],
        [ 2.6225,  4.2697, -3.1690,  ..., -0.3946,  4.6472,  1.9237]],
       grad_fn=<StackBackward0>)

In [107]:

output_lin = model.linear(last_tokens).squeeze(-1)
# model.linear has a single output unit; squeeze(-1) drops that trailing size-1
# dimension, leaving one logit per sequence in the batch.
output_lin

tensor([-0.8786, -0.3365, -0.6402, -0.8607], grad_fn=<SqueezeBackward1>)

In [108]:

        # Flatten output tensor
# view(-1) reshapes the logits to a flat 1-D tensor.
output = output_lin.view(-1)

output


tensor([-0.8786, -0.3365, -0.6402, -0.8607], grad_fn=<ViewBackward0>)

In [118]:

        # Labels 
# BCEWithLogitsLoss takes raw logits, so `output` is passed in directly;
# the integer labels are cast to float to match.
loss_fnc = nn.BCEWithLogitsLoss()
loss = loss_fnc(output, labels.float())

loss

tensor(0.7194, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)