In [1]:
import ast
import os

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the semicolon-delimited CSV; the path is relative to the working directory.
# Later cells read its "sample" and "annotation" columns.
path = "train.csv"
data = pd.read_csv(path, sep=";")

In [3]:
# Tag inventory: the outside tag "O" followed by a B-/I- pair for each entity type.
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in ("TYPE", "BRAND", "VOLUME", "PERCENT")
    for prefix in ("B", "I")
]

# Lookup tables between tag strings and integer class ids (the position in label_list).
id_to_label = dict(enumerate(label_list))
label_to_id = {tag: class_id for class_id, tag in id_to_label.items()}

In [4]:
# Device selection: CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load the tokenizer and the token-classification model from the same checkpoint name.
# The classification head is sized to label_list and carries both label mappings;
# the loaded model is moved to the selected device straight away.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
).to(device)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def parse_annotation_row_simple(row):
    """Turn one raw data row into a ``{"text", "annotations_str"}`` record.

    The row is indexed by ``"sample"`` and ``"annotation"``. Three substring
    substitutions rewrite a quoted digit ``0`` as the letter ``O`` in the annotation
    string; the string itself is not parsed here.

    Args:
        row: Mapping-like object supporting ``row["sample"]`` and ``row["annotation"]``.

    Returns:
        Dict with the sample under ``"text"`` and the repaired annotation string under
        ``"annotations_str"``.
    """
    # (pattern, replacement) pairs, applied one after another in this order.
    zero_to_o_fixes = (
        ("'0'", "'O'"),
        ("'0,", "'O',"),
        (",0'", ",O'"),
    )

    sample_text = row["sample"]
    repaired = row["annotation"]
    for pattern, replacement in zero_to_o_fixes:
        repaired = repaired.replace(pattern, replacement)

    return {"text": sample_text, "annotations_str": repaired}

In [None]:
def tokenize_and_align_labels(example):
    """Tokenize one example and attach a per-token label id under ``"labels"``.

    Args:
        example: Mapping with ``"text"`` (the raw string) and ``"annotations_str"``
            (a Python-literal string of ``(start, end, label)`` character spans).

    Returns:
        The tokenizer output with an added ``"labels"`` list holding one id per token:
        ``-100`` for tokens whose offset is ``(0, 0)``, the id of ``"O"`` for tokens
        lying inside no span, and otherwise the id derived from the covering span.
    """
    # The annotation text comes straight from the CSV. ast.literal_eval accepts only
    # Python literals, so a malformed or hostile cell cannot execute code as eval() would.
    try:
        annotations = ast.literal_eval(example["annotations_str"])
    except (ValueError, SyntaxError, TypeError):
        # Best effort, as before: report the row and continue with no spans,
        # which labels every token of this example as "O".
        print(f"Ошибка парсинга: {example['annotations_str']}")
        annotations = []

    # Tokenize the raw string and keep character offsets for span alignment.
    tokenized = tokenizer(
        example["text"],
        truncation=True,
        padding=True,
        return_offsets_mapping=True,
        is_split_into_words=False,
    )

    labels = []
    for start, end in tokenized["offset_mapping"]:
        # Offset (0, 0) gets -100 (presumably special tokens, to be skipped by the
        # loss — TODO confirm against the tokenizer/loss in use).
        if start == end == 0:
            labels.append(-100)
            continue

        label_found = False
        for ann_start, ann_end, label in annotations:
            # A token belongs to a span only when it lies entirely inside it.
            if start >= ann_start and end <= ann_end:
                if start == ann_start:
                    # First token of the span keeps the annotated tag as is.
                    labels.append(label_to_id[label])
                else:
                    # Later tokens of the span get the I- variant of the tag.
                    labels.append(label_to_id[label.replace("B-", "I-")])
                label_found = True
                break

        if not label_found:
            labels.append(label_to_id["O"])

    tokenized["labels"] = labels
    return tokenized

In [8]:
# Convert every CSV row to a {"text", "annotations_str"} record, wrap the records in a
# Dataset, then tokenize and label each example.
parsed_data = list(
    map(parse_annotation_row_simple, (row for _, row in data.iterrows()))
)
dataset = Dataset.from_list(parsed_data)
tokenized_dataset = dataset.map(tokenize_and_align_labels)
print(f"Создан датасет с {len(dataset)} примерами")

Map: 100%|██████████| 27251/27251 [00:08<00:00, 3328.35 examples/s]

Создан датасет с 27251 примерами





In [9]:
# Batch collator handed to the Trainer below; built from the same tokenizer, with padding enabled.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

In [10]:
# Split the tokenized data: 20% goes to the "test" part; the seed is fixed so the split repeats.
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

In [56]:
# Fine-tuning configuration.
# Checkpoints are written on the same 500-step cadence as evaluation. With the previous
# save_steps=1000, every other evaluated step (500, 1500, 2500, ...) had no checkpoint,
# so load_best_model_at_end could not restore the model from such a step even when it
# had the lowest eval_loss.
training_args = TrainingArguments(
    output_dir="./results",
    lr_scheduler_type="linear",
    warmup_steps=500,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.15,
    logging_dir="./logs",
    logging_steps=500,
    eval_steps=500,
    save_steps=500,  # keep equal to eval_steps so each evaluation has a checkpoint
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    # Lower eval_loss is better, hence greater_is_better=False.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    disable_tqdm=False,
)

In [57]:
# Wire the model, the training arguments, both halves of the split and the collator
# into a Trainer. An EarlyStoppingCallback is attached with patience 4 and threshold 0.01.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test["train"],
    eval_dataset=train_test["test"],
    data_collator=data_collator,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=4, early_stopping_threshold=0.01)
    ],
)

In [59]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

Step,Training Loss,Validation Loss
500,0.0657,0.327925
1000,0.0578,0.337442
1500,0.0728,0.341201
2000,0.0514,0.335988
2500,0.0599,0.297786
3000,0.0508,0.35451
3500,0.0387,0.322251
4000,0.0427,0.360993
4500,0.0297,0.370858


TrainOutput(global_step=4500, training_loss=0.0521611631181505, metrics={'train_runtime': 652.4981, 'train_samples_per_second': 167.05, 'train_steps_per_second': 10.444, 'total_flos': 188100983610288.0, 'train_loss': 0.0521611631181505, 'epoch': 3.301540719002201})

In [60]:
# Write the model and the tokenizer into the same directory, then report the location.
model.save_pretrained("./ner_model_v4")
tokenizer.save_pretrained("./ner_model_v4")
print("Модель сохранена в папку ./ner_model_v4")

Модель сохранена в папку ./ner_model_v4


### Первый вариант

In [None]:
# Read the semicolon-delimited CSV to predict on; its "sample" column is used below.
submission_data = pd.read_csv("submission.csv", sep=";")

In [None]:
def predictions_to_annotations(text, predictions, offset_mapping):
    """Collapse token-level predictions into ``(start, end, label)`` spans.

    Consecutive tokens whose character offsets touch (next start == current end) are
    merged into one span, labelled with the prediction of the span's first token.
    Tokens with offset ``(0, 0)`` are skipped.

    Args:
        text: The source string. Not used; kept so existing callers keep working.
        predictions: Per-token class ids, indexable by token position.
        offset_mapping: Per-token ``(start, end)`` character offsets. Elements may be
            plain ints, NumPy integers or 0-d tensors.

    Returns:
        List of ``(start, end, label)`` tuples with plain ``int`` offsets and the label
        string looked up in ``id_to_label``.
    """
    annotations = []

    i = 0
    while i < len(offset_mapping):
        start, end = offset_mapping[i]
        if start == end == 0:
            i += 1
            continue

        # int() normalises NumPy/tensor scalars so the dict lookup works for any of them.
        word_label = id_to_label[int(predictions[i])]

        # Extend the span while the next token starts exactly where this one ends.
        j = i + 1
        while j < len(offset_mapping):
            next_start, next_end = offset_mapping[j]
            if next_start == end:
                end = next_end
                j += 1
            else:
                break

        # int() rather than .item(): also accepts offsets that are already plain ints.
        annotations.append((int(start), int(end), word_label))
        i = j

    return annotations

In [None]:
def predict_annotations(text):
    """Predict ``(start, end, label)`` spans for a single string.

    Tokenizes ``text``, runs the module-level ``model`` on ``device`` and converts the
    per-token argmax predictions into spans via ``predictions_to_annotations``.

    Args:
        text: The string to annotate.

    Returns:
        The list of ``(start, end, label)`` tuples built by
        ``predictions_to_annotations``.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        return_offsets_mapping=True,
    )
    # Offsets are only needed to build spans; they are removed before the model call.
    offset_mapping = inputs.pop("offset_mapping")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Make sure the model is in evaluation mode: layers that behave differently during
    # training (e.g. dropout) would otherwise make predictions noisy and non-repeatable.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        # A single text was encoded, so take row 0 of the batch.
        predictions = torch.argmax(outputs.logits, dim=-1)[0].cpu().numpy()

    return predictions_to_annotations(text, predictions, offset_mapping[0])

In [None]:
# Predict spans for every submission sample and keep each result in its str() form.
submission_annotations = [
    str(predict_annotations(sample_text)) for sample_text in submission_data["sample"]
]

In [None]:
# Two-column result: the original "sample" column plus the stringified predictions.
result_df = submission_data[["sample"]].assign(annotation=submission_annotations)

In [67]:
# Save to file: semicolon-delimited CSV without the index column.
# The target directory is created first so the write does not fail when it is missing.
os.makedirs("test_submission", exist_ok=True)
result_df.to_csv("test_submission/submission_2.csv", sep=";", index=False)
print("Предсказания сохранены в submission_2.csv")

Предсказания сохранены в submission_2.csv
