<a href="https://colab.research.google.com/github/JuanRivera182003/LLM-/blob/main/Tokenizar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fuerza reset, quita toda versión conflictiva y reinstala en limpio
!pip uninstall -y transformers datasets seqeval
!pip install -U transformers datasets seqeval

Found existing installation: transformers 4.53.0
Uninstalling transformers-4.53.0:
  Successfully uninstalled transformers-4.53.0
Found existing installation: datasets 2.14.4
Uninstalling datasets-2.14.4:
  Successfully uninstalled datasets-2.14.4
[0mCollecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset, checkpoints and generator
# sources used by this notebook are read from paths under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
from datasets import Dataset

# Load the JSONL file manually: one JSON object per line, each providing a
# "tokens" list and a parallel "labels" list.
examples = []
with open("/content/drive/My Drive/LLM/dataset.jsonl", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a stray empty line at the end of the file),
        # which json.loads would otherwise reject.
        if line.strip():
            examples.append(json.loads(line))

dataset = Dataset.from_dict({
    "tokens": [ex["tokens"] for ex in examples],
    "labels": [ex["labels"] for ex in examples]
})

print(f"Cargado: {len(dataset)} ejemplos")

# Build the label vocabulary from the raw examples (same content as the
# dataset, without a second pass over the Arrow table).
unique_labels = set()
for ex in examples:
    unique_labels.update(ex["labels"])
# Sorting makes the label -> id assignment deterministic across runs.
label_list = sorted(unique_labels)
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}
print(f"Labels: {label_list}")

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level labels to word-pieces.

    Args:
        examples: A batch with a "tokens" column (list of word lists) and a
            "labels" column (list of label lists, one label per word).

    Returns:
        The tokenizer output with an added "labels" entry holding one label id
        per word-piece:
          * special tokens (no source word) get -100, which the metrics code
            in this notebook filters out;
          * the first piece of a word gets the word's own label id;
          * later pieces of the same word get the matching "I-" label when the
            word is tagged "B-X" and "I-X" exists in ``label2id``; otherwise
            they keep the word's label.

    The original code assigned the identical label in both non-special
    branches, so continuation pieces of a "B-X" word repeated "B-X" and
    entity-level scoring saw each piece as a separate entity.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    def continuation_label_id(tag):
        # Map "B-X" to "I-X" when that label exists; fall back to the tag itself.
        if isinstance(tag, str) and tag.startswith("B-"):
            return label2id.get("I-" + tag[2:], label2id[tag])
        return label2id[tag]

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token: no source word to take a label from.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First word-piece of a new word.
                label_ids.append(label2id[label[word_idx]])
            else:
                # Continuation word-piece of the current word.
                label_ids.append(continuation_label_id(label[word_idx]))
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split into train / evaluation sets. A fixed seed keeps the held-out 10 %
# identical across runs, so reported metrics are comparable.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = dataset["train"]
eval_ds = dataset["test"]


Cargado: 1000 ejemplos
Labels: ['B-ATTR', 'B-CONTENT', 'B-FILTER', 'B-LOC', 'B-OBJ', 'B-PERSON', 'B-VERB', 'I-CONTENT', 'I-FILTER', 'I-LOC', 'I-PERSON', 'I-VERB', 'O']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from transformers import (
    BertForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load the pretrained encoder with a token-classification head sized to the
# label vocabulary; the label maps are passed in so they are stored on the
# model config.
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(p):
    """Score predictions with seqeval, ignoring positions labelled -100.

    Args:
        p: A (predictions, labels) pair. Predictions are reduced with an
            argmax over axis 2; labels hold one id per position, with -100
            marking positions to skip.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Gold tag sequences, dropping the masked (-100) positions.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2label[gold] for gold in gold_row if gold != -100])

    # Predicted tag sequences, kept only where the gold label is not masked.
    true_preds = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = []
        for predicted, gold in zip(predicted_row, gold_row):
            if gold != -100:
                kept.append(id2label[predicted])
        true_preds.append(kept)

    precision = precision_score(true_labels, true_preds)
    recall = recall_score(true_labels, true_preds)
    f1 = f1_score(true_labels, true_preds)
    accuracy = accuracy_score(true_labels, true_preds)
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

# Training configuration. TrainingArguments is already imported at the top of
# this cell, so it is not imported a second time here.
args = TrainingArguments(
    output_dir="/content/drive/My Drive/LLM/bert_token_classification_model",
    eval_strategy="epoch",
    # Log once per epoch so the training loss shows up next to the evaluation
    # metrics; the default step interval is longer than this short run.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",  # disable W&B and other experiment trackers
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer; the tokenizer is still saved alongside each checkpoint.
    processing_class=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.013458,1.0,1.0,1.0,1.0
2,No log,0.004096,1.0,1.0,1.0,1.0
3,No log,0.003213,1.0,1.0,1.0,1.0


TrainOutput(global_step=339, training_loss=0.19093943908151273, metrics={'train_runtime': 20.8897, 'train_samples_per_second': 129.25, 'train_steps_per_second': 16.228, 'total_flos': 27369474138768.0, 'train_loss': 0.19093943908151273, 'epoch': 3.0})

In [None]:
import sys

# Make the command-generator sources importable. The membership check keeps
# the cell idempotent: re-running it does not add duplicate entries.
generator_src = "/content/drive/My Drive/LLM/CommandGenerator/src"
if generator_src not in sys.path:
    sys.path.append(generator_src)
print(sys.path)


['/content', '/env/python', '/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '', '/usr/local/lib/python3.11/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.11/dist-packages/IPython/extensions', '/usr/local/lib/python3.11/dist-packages/setuptools/_vendor', '/root/.ipython', '/tmp/tmpy0lwrteg', '/content/drive/My Drive/LLM/CommandGenerator/src']


In [None]:
# Sanity check: list the generator package directory on the mounted Drive.
!ls "/content/drive/My Drive/LLM/CommandGenerator/src/robocupathome_generator"


database.py	   gpsr_commands.py  __pycache__	      validator.py
egpsr_commands.py  __init__.py	     robocupathome_generator
generator.py	   labeler.py	     to_jsonl.py


In [None]:
import random
import re
import warnings
import os

from robocupathome_generator.gpsr_commands import CommandGenerator
from robocupathome_generator.egpsr_commands import EgpsrCommandGenerator

from transformers import BertTokenizerFast, BertForTokenClassification
import torch

# === LOAD MODEL ===
# NOTE(review): the checkpoint step (339) is hard-coded. It presumably matches
# the final global step of the training run; it will differ if the dataset
# size, batch size or epoch count changes — confirm before re-running.
MODEL_PATH = "/content/drive/My Drive/LLM/bert_token_classification_model/checkpoint-339"
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
model = BertForTokenClassification.from_pretrained(MODEL_PATH)
# Put the model in evaluation mode for inference.
model.eval()
# Rebinds the notebook-level `id2label` to the mapping stored in the
# checkpoint's config.
id2label = model.config.id2label


def run_inference(command):
    """Tag a command with the loaded model and print one label per word-piece.

    The command is split on whitespace, tokenized as pre-split words, and run
    through the model without gradient tracking. A TOKEN/LABEL table is
    printed. Special tokens such as [CLS] and [SEP] are skipped: they belong
    to no input word, so a label printed for them carries no meaning.

    Args:
        command: The command text. If it contains no words, a warning is
            printed and nothing else happens.

    Returns:
        None. Results are printed.
    """
    words = command.strip().split()
    if not words:
        print("\n[⚠️] Empty command. Skipping inference.\n")
        return

    inputs = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,  # keep over-long commands within the model's limit
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model(**inputs)

    # Select the single batch row explicitly instead of relying on squeeze().
    predictions = outputs.logits.argmax(dim=-1)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    # word_ids() yields None for special tokens and the word index otherwise.
    word_ids = inputs.word_ids(batch_index=0)

    print("\n=== INFERENCE ===")
    print("TOKEN\tLABEL")
    for token, pred_id, word_idx in zip(tokens, predictions, word_ids):
        if word_idx is None:
            continue
        label = id2label.get(pred_id, "O")
        print(f"{token}\t{label}")
    print("---------------------------\n")


def read_data(file_path):
    """Return the full text content of ``file_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()


def parse_names(data):
    """Extract names from pipe-delimited text.

    Every ``| letters |`` cell found in ``data`` is collected. The first match
    is discarded and the remaining ones are returned in order.

    Args:
        data: Text to scan.

    Returns:
        A list of names; empty when fewer than two cells match.
    """
    cell_pattern = re.compile(r"\|\s*([A-Za-z]+)\s*\|", re.DOTALL)
    names = []
    # Slicing from index 1 drops the first match; an empty result stays empty.
    for cell in cell_pattern.findall(data)[1:]:
        names.append(cell.strip())
    return names


def parse_locations(data):
    """Extract location names from ``| number | name |`` rows.

    A name containing the marker ``(p)`` is additionally reported in the
    second list. The marker is removed from the names in both lists.

    Args:
        data: Text to scan.

    Returns:
        A ``(locs, placement)`` tuple: all location names, and the subset
        whose raw text contained ``(p)``.
    """
    row_pattern = r"\|\s*([0-9]+)\s*\|\s*([A-Za-z,\s, \(,\)]+)\|"
    locs = []
    placement = []
    for _number, raw_name in re.findall(row_pattern, data, re.DOTALL):
        raw_name = raw_name.strip()
        clean_name = raw_name.replace("(p)", "").strip()
        locs.append(clean_name)
        if "(p)" in raw_name:
            placement.append(clean_name)
    return locs, placement


def parse_rooms(data):
    """Extract room names from pipe-delimited text.

    A cell matches when it holds a word, a space, and an optional second
    word. The first match is discarded and the rest are returned with
    surrounding whitespace removed.

    Args:
        data: Text to scan.

    Returns:
        A list of room names; empty when fewer than two cells match.
    """
    cell_pattern = re.compile(r"\|\s*(\w+ \w*)\s*\|", re.DOTALL)
    rooms = []
    # Slicing from index 1 drops the first match; an empty result stays empty.
    for cell in cell_pattern.findall(data)[1:]:
        rooms.append(cell.strip())
    return rooms


def parse_objects(data):
    """Extract object names and category names from the objects text.

    Object names come from single-word ``| word |`` cells, excluding the
    literal ``Objectname``; underscores become spaces. Category names come
    from ``# Class ...`` headings: after removing parentheses, the first word
    is taken as the plural form and the second as the singular form.

    Args:
        data: Text to scan.

    Returns:
        An ``(objects, plural, singular)`` tuple of lists.

    Raises:
        IndexError: If a ``# Class`` heading yields fewer than two words.
    """
    objects = []
    for cell in re.findall(r"\|\s*(\w+)\s*\|", data, re.DOTALL):
        if cell == "Objectname":
            continue
        objects.append(cell.replace("_", " "))

    plural = []
    singular = []
    for heading in re.findall(r"# Class \s*([\w,\s, \(,\)]+)\s*", data, re.DOTALL):
        words = heading.replace("(", "").replace(")", "").split()
        plural.append(words[0].replace("_", " "))
        singular.append(words[1].replace("_", " "))
    return objects, plural, singular


# === Resource locations on the mounted Drive ===
data_dir = "/content/drive/My Drive/LLM/CommandGenerator"

# Parse the markdown resources that feed the command generators.
names = parse_names(read_data(f"{data_dir}/names/names.md"))
locs, plocs = parse_locations(read_data(f"{data_dir}/maps/location_names.md"))
rooms = parse_rooms(read_data(f"{data_dir}/maps/room_names.md"))
objs, cats_plural, cats_singular = parse_objects(read_data(f"{data_dir}/objects/objects.md"))

gpsr_gen = CommandGenerator(names, locs, plocs, rooms, objs, cats_plural, cats_singular)
egpsr_gen = EgpsrCommandGenerator(gpsr_gen)

# Menu text, printed once before the prompt loop.
for menu_line in (
    "=== OPTIONS ===",
    "'1': Any GPSR command",
    "'2': GPSR no manipulation",
    "'3': GPSR manipulation",
    "'4': EGPSR multi-task (5 tasks)",
    "'q': Quit\n",
):
    print(menu_line)

# GPSR options: menu key -> (argument for generate_command_start, message
# raised when the generator returns the "WARNING" sentinel).
gpsr_modes = {
    "1": ("", "Empty GPSR command"),
    "2": ("people", "Empty GPSR command [people]"),
    "3": ("objects", "Empty GPSR command [objects]"),
}

while True:
    option = input("Select option: ").strip()
    command = ""

    if option == "q":
        break
    if option != "4" and option not in gpsr_modes:
        print("[⚠️] Invalid option")
        continue

    try:
        if option == "4":
            setups = egpsr_gen.generate_setup(5)
            if not setups:
                raise ValueError("EGPSR setup returned empty")
            # Number the generated tasks starting at 1, one per line.
            command = "\n".join(
                f"{position}) {setup.task}"
                for position, setup in enumerate(setups, start=1)
            )
        else:
            category, failure_message = gpsr_modes[option]
            command = gpsr_gen.generate_command_start(category)
            if command == "WARNING":
                raise ValueError(failure_message)
    except Exception as e:
        # Report the failure and prompt again instead of ending the session.
        print(f"[❌] Failed to generate command: {e}")
        continue

    print(f"\n=== COMMAND ===\n{command}\n")
    run_inference(command)


=== OPTIONS ===
'1': Any GPSR command
'2': GPSR no manipulation
'3': GPSR manipulation
'4': EGPSR multi-task (5 tasks)
'q': Quit

Select option: 1

=== COMMAND ===
go to the workshop then locate the waving person and answer a question


=== INFERENCE ===
TOKEN	LABEL
[CLS]	O
go	B-VERB
to	O
the	O
workshop	B-LOC
then	O
locate	B-VERB
the	O
waving	B-FILTER
person	B-PERSON
and	O
answer	B-VERB
a	O
question	B-CONTENT
[SEP]	I-FILTER
---------------------------

Select option: 1

=== COMMAND ===
tell me what is the lightest object on the counter


=== INFERENCE ===
TOKEN	LABEL
[CLS]	O
tell	B-VERB
me	I-VERB
what	I-VERB
is	O
the	O
light	B-ATTR
##est	B-ATTR
object	O
on	B-LOC
the	I-LOC
counter	I-LOC
[SEP]	I-FILTER
---------------------------

Select option: 1

=== COMMAND ===
tell me how many cup there are on the desk


=== INFERENCE ===
TOKEN	LABEL
[CLS]	O
tell	B-VERB
me	I-VERB
how	I-VERB
many	I-VERB
cup	B-OBJ
there	O
are	O
on	B-LOC
the	I-LOC
desk	I-LOC
[SEP]	I-FILTER
---------------------------

S