In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab filesystem at /content/drive so that
# files stored in Drive can be opened with ordinary paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import AutoTokenizer
import json

# Load the labeled JSON export from the mounted Drive folder.
labeled_json_path = "/content/drive/My Drive/Resultado_etiquetado.json"
with open(labeled_json_path, encoding="utf-8") as f:
    data = json.load(f)

# Tokenizer used by convert_to_bio to obtain tokens and character offsets.
# NOTE(review): the training cell fine-tunes
# "dccuchile/bert-base-spanish-wwm-cased" and re-tokenizes the tokens produced
# here with that checkpoint's tokenizer -- confirm that using a different
# vocabulary at this stage is intended.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Convert character-span annotations into token-level BIO tags
def convert_to_bio(data, bio_tokenizer=None):
    """Turn span-annotated texts into token/BIO-tag sequences.

    Args:
        data: iterable of dicts. Each dict may carry a ``"text"`` string and a
            ``"label"`` list whose items have ``"start"``, ``"end"`` (character
            offsets into the text) and ``"labels"`` (the first element is used
            as the entity name).
        bio_tokenizer: optional tokenizer to use. It must be callable as
            ``tok(text, return_offsets_mapping=True, truncation=True)`` and
            return a mapping with ``"input_ids"`` and ``"offset_mapping"``, and
            it must provide ``convert_ids_to_tokens``. When omitted, the
            module-level ``tokenizer`` is used.

    Returns:
        A list of ``{"tokens": [...], "ner_tags": [...]}`` dicts, one per entry
        that has non-empty text. Both lists have the same length.
    """
    tok = tokenizer if bio_tokenizer is None else bio_tokenizer

    formatted_data = []
    for entry in data:
        # Skip entries whose text is missing or empty.
        text = entry.get("text")
        if not text:
            continue

        # Treat a missing *or null* "label" field as "no annotations".
        labels = entry.get("label") or []

        # Tokenize and keep the character offsets of every token.
        tokenized = tok(text, return_offsets_mapping=True, truncation=True)
        token_ids = tokenized["input_ids"]
        offsets = tokenized["offset_mapping"]

        # Every token starts as "outside".
        bio_tags = ["O"] * len(offsets)

        for label in labels:
            start, end, entity = label["start"], label["end"], label["labels"][0]

            # The first real token inside the span gets B-, the rest get I-,
            # even if the annotated span begins on whitespace or mid-token.
            inside_entity = False
            for idx, (char_start, char_end) in enumerate(offsets):
                # Special tokens such as [CLS]/[SEP] are reported with an empty
                # (0, 0) span; without this guard a span starting at character
                # 0 would tag them as entity beginnings.
                if char_start is None or char_end is None or char_start == char_end:
                    continue
                if start <= char_start < end:
                    prefix = "I" if inside_entity else "B"
                    bio_tags[idx] = f"{prefix}-{entity}"
                    inside_entity = True

        # Store the token strings next to their tags.
        formatted_data.append({
            "tokens": tok.convert_ids_to_tokens(token_ids),
            "ner_tags": bio_tags
        })
    return formatted_data

# Run the BIO conversion over the loaded annotations.
bio_data = convert_to_bio(data)

# Persist the converted examples as indented JSON in the working directory.
with open("bio_data.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(bio_data, indent=4))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable Weights & Biases (WandB) logging

from datasets import Dataset
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, EarlyStoppingCallback
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch

# Collect every distinct tag present in the converted data and build the
# tag <-> integer id lookup tables (ids follow alphabetical tag order).
unique_tags = list({tag for entry in bio_data for tag in entry["ner_tags"]})
sorted_tags = sorted(unique_tags)
label2id = dict(zip(sorted_tags, range(len(sorted_tags))))
id2label = dict(enumerate(sorted_tags))

print("Etiquetas encontradas:", label2id)

# Map tag strings to their integer ids
def convert_tags_to_ids(batch):
    """Replace the tag strings in ``batch["ner_tags"]`` with ``label2id`` ids.

    ``batch["ner_tags"]`` is expected to be a list of tag sequences; the batch
    is updated in place and returned.
    """
    encoded_sequences = []
    for tag_sequence in batch["ner_tags"]:
        encoded_sequences.append([label2id[tag] for tag in tag_sequence])
    batch["ner_tags"] = encoded_sequences
    return batch

# Build a datasets.Dataset from the converted examples
def prepare_dataset(bio_data):
    """Return a ``Dataset`` with ``tokens`` and ``ner_tags`` columns.

    Each element of ``bio_data`` must be a dict with ``"tokens"`` and
    ``"ner_tags"`` entries; row order is preserved.
    """
    columns = {
        "tokens": [example["tokens"] for example in bio_data],
        "ner_tags": [example["ner_tags"] for example in bio_data],
    }
    return Dataset.from_dict(columns)

# Use the first 80% of the examples for training and the remainder for
# validation (the split follows the existing order; nothing is shuffled),
# then turn the tag strings into integer ids.
split_idx = int(0.8 * len(bio_data))
train_data = prepare_dataset(bio_data[:split_idx]).map(convert_tags_to_ids, batched=True)
val_data = prepare_dataset(bio_data[split_idx:]).map(convert_tags_to_ids, batched=True)

# "Balanced" class weights computed over the tag strings of all examples.
all_labels = []
for entry in bio_data:
    all_labels.extend(entry["ner_tags"])
class_weights = torch.tensor(
    compute_class_weight(
        class_weight='balanced',
        classes=np.unique(all_labels),
        y=all_labels
    ),
    dtype=torch.float
)

# Load the tokenizer and the token-classification model from the same checkpoint.
checkpoint = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Cross-entropy loss carrying the class weights.
# NOTE(review): loss_fct is not referenced again in the visible cells --
# confirm how it is meant to reach the training loop.
from torch.nn import CrossEntropyLoss
loss_fct = CrossEntropyLoss(weight=class_weights)

# Tokenize pre-split tokens and align the label ids with the sub-tokens
def tokenize_and_align_labels(batch):
    """Tokenize ``batch["tokens"]`` and attach per-sub-token ``labels``.

    Args:
        batch: mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of label-id lists, one id per word).

    Returns:
        The tokenizer output with an extra ``"labels"`` entry. Only the first
        sub-token of each word carries the word's label id; special tokens,
        padding and continuation sub-tokens are set to -100, the value ignored
        by the cross-entropy loss and filtered out in ``compute_metrics``.
    """
    # Sequences are padded here so that labels and inputs have equal length
    # within each mapped batch.
    tokenized_inputs = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    labels = []
    for i, word_labels in enumerate(batch["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                # Special/padding token, or a continuation piece of the same
                # word: copying the word label here would repeat B- tags inside
                # one entity and produce invalid BIO sequences.
                aligned_labels.append(-100)
            else:
                aligned_labels.append(word_labels[word_id])
            previous_word_id = word_id
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize both splits (training first, then validation) and attach labels.
train_data, val_data = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_data, val_data)
)

# Evaluation metric
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute overall seqeval precision/recall/F1/accuracy for an evaluation pass.

    ``p`` unpacks into ``(predictions, labels)``; predictions are reduced with
    an argmax over axis 2, and every position whose label is -100 is dropped
    before the ids are mapped back to tag strings via ``id2label``.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        kept_predictions = []
        kept_labels = []
        for pred_id, label_id in zip(pred_row, label_row):
            # -100 marks positions that must be ignored.
            if label_id == -100:
                continue
            kept_predictions.append(id2label[pred_id])
            kept_labels.append(id2label[label_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best eval precision when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=50,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    metric_for_best_model="eval_precision",
    save_strategy="epoch"
)


class WeightedLossTrainer(Trainer):
    """Trainer that applies ``class_weights`` in the cross-entropy loss.

    The stock Trainer uses the model's built-in unweighted loss, so the class
    weights computed earlier in the notebook would otherwise never be used.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra arguments that some transformers versions
        # pass (e.g. num_items_in_batch).
        labels = inputs["labels"]
        # Keep the labels out of the forward pass so the model does not also
        # compute its own unweighted loss.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits
        # The weights must live on the same device as the logits; positions
        # labelled -100 are skipped by CrossEntropyLoss' default ignore_index.
        weighted_loss = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = weighted_loss(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


# Initialize the Trainer; training stops early after 15 evaluations without
# improvement of the monitored metric.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=15)]
)

# Train and evaluate the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("./modelo_bert_optimizado")
tokenizer.save_pretrained("./modelo_bert_optimizado")


Etiquetas encontradas: {'B-AREA_H': 0, 'B-AREA_M': 1, 'B-EXTENSION': 2, 'B-HECTAREA': 3, 'B-METROS_2': 4, 'I-AREA_H': 5, 'I-AREA_M': 6, 'I-EXTENSION': 7, 'I-HECTAREA': 8, 'I-METROS_2': 9, 'O': 10}


Map:   0%|          | 0/1644 [00:00<?, ? examples/s]

Map:   0%|          | 0/412 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1644 [00:00<?, ? examples/s]

Map:   0%|          | 0/412 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.198406,0.349073,0.427151,0.384186,0.95145
2,No log,0.069502,0.734713,0.750233,0.742392,0.978857
3,No log,0.061056,0.828436,0.814539,0.821429,0.980304
4,No log,0.053087,0.841774,0.872631,0.856925,0.983732
5,No log,0.05623,0.831835,0.891271,0.860528,0.981718
6,No log,0.048507,0.863719,0.885989,0.874712,0.984562
7,No log,0.042569,0.888382,0.907425,0.897802,0.989064
8,No log,0.056006,0.875492,0.897794,0.886503,0.984452
9,No log,0.048837,0.875406,0.921094,0.897669,0.986737
10,0.068100,0.049701,0.896857,0.913016,0.904865,0.987626


('./modelo_bert_optimizado/tokenizer_config.json',
 './modelo_bert_optimizado/special_tokens_map.json',
 './modelo_bert_optimizado/vocab.txt',
 './modelo_bert_optimizado/added_tokens.json',
 './modelo_bert_optimizado/tokenizer.json')