In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DefaultDataCollator
import torch
from datasets import load_from_disk

# Filesystem locations of the saved model, tokenizer and pre-tokenized dataset.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs unchanged on a host that has this directory layout.
model_path = "/home/hpc01/Marcos/Patch_Assesment/Model"
tokenizer_path = "/home/hpc01/Marcos/Patch_Assesment/Tokenizer"
tokenized_dataset_path = "/home/hpc01/Marcos/Patch_Assesment/Dataset/TokenizedDatasets/large"


# Load the tokenizer and the already-tokenized dataset from disk.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenized_datasets = load_from_disk(tokenized_dataset_path)
# Load the checkpoint with a sequence-classification head, forwarding
# problem_type="single_label_classification" to from_pretrained.
model = AutoModelForSequenceClassification.from_pretrained(model_path, problem_type="single_label_classification")
# Reuse the tokenizer's EOS id as the model's padding id
# (presumably because the tokenizer has no dedicated pad token -- TODO confirm).
model.config.pad_token_id = tokenizer.eos_token_id


def create_batches(dataset, batch_size, pad_token_id):
    """
    Yield mini-batches from a tokenized dataset, right-padding every sequence
    to the length of the longest sequence in its own batch.

    Args:
    - dataset: mapping-like object indexable by column name that exposes
      'input_ids', 'attention_mask' and 'labels' columns
      (e.g. tokenized_datasets["train"]). Each 'input_ids' / 'attention_mask'
      entry must be a Python list, since padding is appended by list
      concatenation.
    - batch_size: number of examples per batch; must be positive. The final
      batch is shorter when the dataset size is not a multiple of it.
    - pad_token_id: token id appended to 'input_ids' as padding.

    Yields:
    - batch: dict with 'input_ids', 'attention_mask' and 'labels', each a
      torch.long tensor. Padded positions carry 0 in 'attention_mask'.

    Raises:
    - ValueError: if batch_size is not positive. Because this is a generator,
      the error surfaces when the first batch is requested.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Fetch each column once. Looking the column up inside the loop repeats the
    # access for every batch, which is costly when the lookup itself has to
    # materialise the whole column.
    all_input_ids = dataset['input_ids']
    all_attention_mask = dataset['attention_mask']
    all_labels = dataset['labels']

    for start in range(0, len(all_input_ids), batch_size):
        stop = start + batch_size
        batch_input_ids = all_input_ids[start:stop]
        batch_attention_mask = all_attention_mask[start:stop]
        batch_labels = all_labels[start:stop]

        # Pad only up to the longest sequence of this particular batch.
        max_length = max(len(input_ids) for input_ids in batch_input_ids)

        padded_input_ids = []
        padded_attention_mask = []
        for input_ids, attention_mask in zip(batch_input_ids, batch_attention_mask):
            padding_length = max_length - len(input_ids)
            padded_input_ids.append(input_ids + [pad_token_id] * padding_length)
            # 0 marks the padded positions in the attention mask.
            padded_attention_mask.append(attention_mask + [0] * padding_length)

        yield {
            'input_ids': torch.tensor(padded_input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(padded_attention_mask, dtype=torch.long),
            'labels': torch.tensor(batch_labels, dtype=torch.long)
        }


# Pad batches with the id the model itself treats as padding, i.e.
# model.config.pad_token_id (set from tokenizer.eos_token_id earlier in this cell).
# Looking up the literal "<|end_of_text|>" token instead only matches when that
# token happens to be the tokenizer's EOS; a mismatch would make the model see
# padding as real tokens.
pad_token_id = model.config.pad_token_id
# Number of examples per batch produced by create_batches.
batch_size = 4




Loading checkpoint shards: 100%|██████████| 7/7 [00:01<00:00,  5.59it/s]


In [12]:
# Print the loaded model's module tree to inspect its architecture.
print(model)

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      

In [14]:
# Create the batch generator over the training split (nothing is computed
# until the first batch is requested).
batch_gen_train = create_batches(tokenized_datasets["train"], batch_size, pad_token_id)
# Equivalent generator for the test split, currently disabled.
#batch_gen_test = create_batches(tokenized_datasets["test"], batch_size, pad_token_id)


# Pull a single batch as a smoke test and dump its tensors.
# next() raises StopIteration if the generator has nothing to yield.
batch = next(batch_gen_train)
print(batch)
# Disabled alternative: print only each tensor's shape.
#for k, v in batch.items():
    #print(k,v.shape)

# Disabled check: run one forward pass and inspect the loss and logits shape.
#outputs = model(**batch)
#print(outputs.loss, outputs.logits.shape)


{'input_ids': tensor([[128000,  19741,    482,   5037,     11,     22,    489,   5037,     11,
             22,  88220,   7163,    748,    320,   8514,      8,    341,  13342,
           5939,   2594,   2986,    446,  21831,    311,   3047,    489,   2576,
           5180,   1449,  13342,   5939,   2594,   2986,  22274,   2956,   3288,
           7338,     77,      1,    489,    828,   5180,   1449,     12,    298,
           5939,     13,    414,  40426,    396,   2312,  22274,    803,     10,
            298,   5939,   2594,  40426,    396,   2312,  22274,    803,   7163,
            197,    534,    720,   7163,    197,    322,   2175,    279,   2077,
            198,  19741,    482,   9674,     11,     22,    489,   9674,     11,
             22,  88220,   7163,    862,   4635,    280,    220,    197,    534,
            720,     12,   2514,   1118,  11664,   1373,  16533,  83922,   1988,
            340,     10,   2514,  11664,   1373,  16533,  83922,   1988,    340,
          1334