Файл предназначен для его запуска в Google Colab. Предварительная подготовка описана в Readme.md

# Ячейка 0 импорты, конфиг, закрепление основных переменных

In [64]:
!pip -q install transformers==4.44.2 seqeval==1.2.2 pytorch-crf==0.7.2 accelerate==0.34.2


Все необходимые импорты

In [65]:
from __future__ import annotations
import os, re, ast, json, math, time, random, logging
from typing import List, Tuple, Dict, Any

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torchcrf import CRF
from seqeval.metrics import f1_score, classification_report
import re
from collections import Counter, defaultdict

from copy import deepcopy

Конфиг

In [66]:
SEED = 42
MODEL_NAME = "xlm-roberta-base"
EPOCHS = 8
BATCH_SIZE = 32 if torch.cuda.is_available() else 8
LR = 3e-5
MAX_LEN = 96
WARMUP_RATIO = 0.1
GRAD_CLIP = 1.0
BIAS_SCALE = 1

USE_SYNTH_NEG = True
SYNTH_NEG_FRAC = 0.05
SYNTH_NEG_SEED = 42
SYNTH_DROP_DUPES = False
SYNTH_NEG_WEIGHT = 0.5


Подключаемся к google drive

In [67]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Настраиваем пути для файлов, которые подтягиваем

In [68]:
TRAIN_CSV = "/content/drive/MyDrive/train.csv"
OUT_DIR   = "/content/drive/MyDrive/var_final/"
SUB_IN  = "/content/drive/MyDrive/submission.csv"
SUB_OUT = "/content/drive/MyDrive/sub_folder/submission_final.csv"
SYNTH_NEG_PATH   = "/content/drive/MyDrive/synth_negative_products.csv"
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(os.path.dirname(SUB_OUT), exist_ok=True)

Настраиваем случайности и выбор device

In [69]:
def set_seed(seed: int):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BIO - метки

In [70]:
LABELS = [
    "O",
    "B-BRAND","I-BRAND",
    "B-TYPE","I-TYPE",
    "B-VOLUME","I-VOLUME",
    "B-PERCENT","I-PERCENT",
]
LABEL2ID = {l:i for i,l in enumerate(LABELS)}
ID2LABEL = {i:l for l,i in LABEL2ID.items()}
ENTITY_TAGS = {"BRAND","TYPE","VOLUME","PERCENT"}

Оверсэмплим volume и percent так как они имеют наименьшие число экземпляров и каждая ошибка очень сильно отражается на итоговом F1-macro

In [71]:
ROW_BOOST_BY_ENTITY = {
    "BRAND":   4.0,
    "TYPE":    2.0,
    "VOLUME":  1.2,
    "PERCENT": 1.2,
    "O": 2,
}

ROW_BOOST_COMBINE = "max"
ROW_BOOST_CAP     = 4

SAMPLER_SEED = SEED

Добавляем гиперпараметры для аугменатции и CharCNN, их гимерпараметры и регулярки

In [72]:
AUGMENT_TRAIN = True
AUG_P_BRANDTYPE = 0.25
AUG_P_O         = 0.12
AUG_SWAP_P      = 0.50


CYR_LO = "абвгдеёжзийклмнопрстуфхцчшщьыъэюя"
CYR_UP = CYR_LO.upper()
LAT_LO = "abcdefghijklmnopqrstuvwxyz"
LAT_UP = LAT_LO.upper()
DIGITS = "0123456789"
PUNCT  = "-–—._'/\\%+&()[],:;!?@"

RE_CYR = re.compile(r"[А-Яа-яЁё]")
RE_LAT = re.compile(r"[A-Za-z]")

NUM_RE   = re.compile(r"^\d+[.,]?\d*$")
UNIT_RE  = re.compile(r"(?i)^(л|литр[а-я]*|мл|г|гр|грамм[а-я]*|кг|килограмм[а-я]*|шт|штук[аи])$")
PCT_RE   = re.compile(r"^%|процент[а-я]*$", re.IGNORECASE)

CHAR_ALPHABET = ["<pad>", "<unk>"] + list(CYR_LO + CYR_UP + LAT_LO + LAT_UP + DIGITS + PUNCT)
CHAR2ID = {c:i for i,c in enumerate(CHAR_ALPHABET)}
PAD_CHAR, UNK_CHAR = 0, 1

In [73]:
# === CharCNN ===
USE_CHARCNN        = True
CHAR_MAXLEN        = 16
CHAR_EMB_DIM       = 24
CHAR_CHANNELS      = 64
CHAR_KERNEL_SIZES  = (2, 3, 4)
CHAR_SCALE         = 1

# Ячейка 1 - вспомогательные утилиты

считываем train

In [74]:
def read_csv(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, sep=';', dtype=str, keep_default_na=False)
    return df

Перевод из строки в формат массива кортеджей

In [75]:
def safe_parse_annotation(s: str) -> List[Tuple[int,int,str]]:
    out: List[Tuple[int,int,str]] = []
    # literal_eval
    try:
        v = ast.literal_eval(s)
        for it in v:
            out.append((int(it[0]), int(it[1]), it[2]))
        return sorted(out, key=lambda z:(z[0],z[1]))
    except: pass
    # json
    try:
        v = json.loads(s)
        for it in v:
            out.append((int(it[0]), int(it[1]), it[2]))
        return sorted(out, key=lambda z:(z[0],z[1]))
    except:
        return []

Выделение BIO сущностей

In [76]:
WS_RE = re.compile(r"\S+")
def ws_tokens_with_offsets(text: str) -> List[Tuple[str,int,int]]:
    return [(m.group(0), m.start(), m.end()) for m in WS_RE.finditer(text or "")]

In [77]:
def spans_to_bio_on_words(text: str, spans: List[Tuple[int,int,str]]) -> List[str]:
    toks = ws_tokens_with_offsets(text)
    labels = ["O"]*len(toks)
    for i, (_, ts, te) in enumerate(toks):
        best_iou, best_tag = 0.0, "O"
        for (a,b,t) in spans:
            inter = max(0, min(te,b)-max(ts,a))
            union = max(te,b)-min(ts,a)
            iou = inter/union if union>0 else 0.0
            if iou > best_iou:
                best_iou = iou
                best_tag = t.split("-")[-1] if t!="O" else "O"
        if best_iou > 0 and best_tag in ENTITY_TAGS:
            labels[i] = ("I-" if i>0 and labels[i-1].endswith(best_tag) else "B-") + best_tag
    return labels

Оверсэмплинг

In [78]:
def _entities_in_spans(spans: List[Tuple[int,int,str]]):
    ents = set()
    for _,_,lab in spans or []:
        if not lab or lab == "O":
            continue
        ents.add(str(lab).split("-")[-1].upper())
    return ents

def _combine_boosts(values, mode=None, cap=None) -> float:
    if not values:
        return 1.0
    mode = mode if mode is not None else (ROW_BOOST_COMBINE if 'ROW_BOOST_COMBINE' in globals() else 'max')
    cap  = float(cap if cap is not None else (ROW_BOOST_CAP if 'ROW_BOOST_CAP' in globals() else 3.0))
    vals = [float(v) for v in values]
    if mode == "prod":
        out = 1.0
        for v in vals: out *= v
    elif mode == "mean":
        out = sum(vals) / len(vals)
    elif mode == "sum":
        out = sum(vals)
    else:  # 'max'
        out = max(vals)
    return float(min(cap, out))

def row_boost_from_spans(text: str, spans: List[Tuple[int,int,str]]) -> float:
    """
    Итоговый множитель строки:
      • по сущностям: берём множители из ROW_BOOST_BY_ENTITY для {BRAND, TYPE, VOLUME, PERCENT, ...}
      • по 'O' (если указан в ROW_BOOST_BY_ENTITY): добавляем вклад пропорционально доле O-меток.
        Пример: O=2.0 и доля_O=0.8 → o_mult = 1 + (2.0-1)*0.8 = 1.8
    Затем смешиваем мультипликаторы по правилу ROW_BOOST_COMBINE и обрезаем CAP-ом.
    """
    # 1) сущности, кроме 'O'
    try:
        mapping = {k.upper(): float(v) for k,v in ROW_BOOST_BY_ENTITY.items()}
    except Exception:
        mapping = {"BRAND": 1.2, "TYPE": 1.0, "VOLUME": 2.0, "PERCENT": 2.0}  # дефолт
    ents = _entities_in_spans(spans)
    mults = [mapping.get(e, 1.0) for e in ents]

    # 2) вклад 'O' — по доле 'O' в разметке слов
    if "O" in mapping:
        ws = [w for w,_,_ in ws_tokens_with_offsets(text)]
        labs = spans_to_bio_on_words(text, spans)
        if ws and labs and len(ws) == len(labs):
            o_ratio = sum(1 for t in labs if t == "O") / max(1, len(labs))  # 0..1
            o_mult  = 1.0 + (mapping["O"] - 1.0) * o_ratio
            mults.append(o_mult)

    return _combine_boosts(mults)

Создание еще сырого train, но уже со всеми необходимыми столбцами

In [79]:
# 1) базовый train
train_real = pd.read_csv(TRAIN_CSV, sep=';', dtype=str, keep_default_na=False)
for c in ("sample","annotation"):
    assert c in train_real.columns, f"В {TRAIN_CSV} нет колонки {c!r}"
train_real["sample"] = train_real["sample"].astype(str).str.strip()
train_real["annotation"] = train_real["annotation"].astype(str)
train_real["__src"] = "real"

# 2) синтетика (по желанию)
synth = pd.DataFrame(columns=["sample","annotation"])
if USE_SYNTH_NEG and os.path.exists(SYNTH_NEG_PATH):
    synth = pd.read_csv(SYNTH_NEG_PATH, sep=';', dtype=str, keep_default_na=False)
    for c in ("sample","annotation"):
        assert c in synth.columns, f"В {SYNTH_NEG_PATH} нет колонки {c!r}"
    synth["sample"] = synth["sample"].astype(str).str.strip()
    synth["annotation"] = synth["annotation"].astype(str)

    # убрать пустые и, опционально, дубли по sample относительно real
    synth = synth[synth["sample"].str.len() > 0].copy()
    if SYNTH_DROP_DUPES:
        before = len(synth)
        synth = synth[~synth["sample"].isin(set(train_real["sample"]))].copy()
        print(f"[info] synth: drop dup samples vs real: {before} -> {len(synth)}")

    # взять долю от размера real-train
    k_target = int(round(len(train_real) * float(SYNTH_NEG_FRAC)))
    k = min(max(k_target, 0), len(synth))
    if k > 0:
        synth = synth.sample(n=k, random_state=SYNTH_NEG_SEED).reset_index(drop=True)
        synth["__src"] = "synth"
    else:
        synth = synth.iloc[0:0].copy()
else:
    print(f"[info] synth disabled or not found: {SYNTH_NEG_PATH}")

# 3) объединение и перемешивание
train_df = pd.concat([train_real, synth], ignore_index=True)
train_df = train_df.sample(frac=1.0, random_state=SYNTH_NEG_SEED).reset_index(drop=True)

# 4) служебные колонки — твоими же функциями
train_df["__spans"]      = train_df["annotation"].map(safe_parse_annotation)
train_df["__row_boost"] = train_df.apply(lambda r: row_boost_from_spans(r["sample"], r["__spans"]), axis=1).astype("float32")
train_df["__has_minority"] = (train_df["__row_boost"] > 1.0)

print(f"[info] train real:  {len(train_real)}")
print(f"[info] train synth: {len(synth)}")
print(f"[info] train final: {len(train_df)}")
print(train_df["__src"].value_counts(dropna=False).to_string())

[info] train real:  28848
[info] train synth: 1442
[info] train final: 30290
__src
real     28848
synth     1442


# Ячейка 2 - корректировка весов и сборка итогового датасета

Добавляем подсказки в соответствии с исследованиями

In [80]:
UNITS = {"л","литр","литра","литров","мл","гр","г","кг","шт","килограммов", "грамм", "граммов", "миллилитров"}
PCT_WORDS = {"%","процент","проц", "процентов"}
NUM_RE = re.compile(r"^\d+[\d,.]*$"); ASCII_RE = re.compile(r"^[A-Za-z]+$")

def token_feature_bias(words: List[str]) -> List[Dict[str,float]]:
    feats = []
    for i,w in enumerate(words):
        wl = w.lower()
        f: Dict[str,float] = {}
        is_num   = bool(NUM_RE.match(wl))
        is_ascii = bool(ASCII_RE.match(w))
        prev = words[i-1].lower() if i-1>=0 else ""
        nxt  = words[i+1].lower() if i+1<len(words) else ""
        if is_ascii and wl not in UNITS:
            f["B-BRAND"] = f.get("B-BRAND", 0.0) + 0.25
        if is_num and (nxt in PCT_WORDS or prev in PCT_WORDS or "%" in nxt or "%" in prev):
            f["B-PERCENT"] = f.get("B-PERCENT", 0.0) + 0.35
            f["I-PERCENT"] = f.get("I-PERCENT", 0.0) + 0.15
        if is_num and (nxt in UNITS):
            f["B-VOLUME"] = f.get("B-VOLUME", 0.0) + 0.35
        if wl in UNITS and bool(NUM_RE.match(prev)):
            f["I-VOLUME"] = f.get("I-VOLUME", 0.0) + 0.20
        feats.append(f)
    return feats

Утилиты для аугментации

In [81]:
def _script_of(word: str) -> str:
    has_cyr = bool(RE_CYR.search(word))
    has_lat = bool(RE_LAT.search(word))
    if has_cyr and not has_lat: return "cyr"
    if has_lat and not has_cyr: return "lat"
    return "mixed"  # не трогаем такие, чтобы не плодить мусор

def _sub_same_script(word: str) -> str:
    """Одна безопасная подстановка в пределах того же алфавита (сохраняем регистр)."""
    idxs = [i for i,ch in enumerate(word) if ch.isalpha()]  # только буквы
    if not idxs: return word
    i = random.choice(idxs)
    ch = word[i]
    scr = _script_of(word)
    if scr == "cyr":
        pool = CYR_UP if ch.isupper() else CYR_LO
    elif scr == "lat":
        pool = LAT_UP if ch.isupper() else LAT_LO
    else:
        return word  # mixed — не трогаем
    c = random.choice(pool)
    if c == ch:  # редко — перегенерим
        pool2 = pool.replace(ch, "") or pool
        c = random.choice(pool2)
    s = list(word); s[i] = c
    return "".join(s)

def _swap_adjacent(word: str) -> str:
    if len(word) < 4: return word
    i = random.randrange(0, len(word)-1)
    # не меняем пробелы/дефисы на границе
    if word[i].isspace() or word[i+1].isspace(): return word
    s = list(word); s[i], s[i+1] = s[i+1], s[i]
    return "".join(s)

def _is_protected_token(w: str, lab: str) -> bool:
    if len(w) < 3: return True
    wl = w.lower()
    if NUM_RE.match(wl) or UNIT_RE.match(wl) or PCT_RE.match(wl): return True
    if all(ch in "-–—.,:;()[]{}+/\\|&!?" for ch in w): return True
    if _script_of(w) == "mixed": return True
    return False

def encode_word_chars(word: str, maxlen: int = CHAR_MAXLEN):
    out = []
    for ch in word[:maxlen]:
        out.append(CHAR2ID.get(ch, UNK_CHAR))
    while len(out) < maxlen:
        out.append(PAD_CHAR)
    return out

Обработчик данных

In [82]:
class NERDataset(Dataset):
    def __init__(self, frame: pd.DataFrame, tok: AutoTokenizer, max_len: int,
                 augment: bool = False):
        self.tok = tok
        self.max_len = max_len
        self.augment = augment

        self.texts = frame["sample"].astype(str).tolist()
        self.spans = frame["__spans"].tolist()

        # слова и BIO по словам
        self.ws_tokens = [[t for t,_,_ in ws_tokens_with_offsets(x)] for x in self.texts]
        self.ws_labels = [spans_to_bio_on_words(x, s) for x, s in zip(self.texts, self.spans)]
        assert all(len(a)==len(b) for a,b in zip(self.ws_tokens, self.ws_labels)), "Длины слов/меток не совпали"

        # фиксированные биасы по БАЗОВЫМ словам (не меняем их аугментацией)
        self.biases = [token_feature_bias(ws) for ws in self.ws_tokens]

    def _maybe_augment_words(self, words, labels):
        if not self.augment: return words
        out = words[:]
        for i,(w, lab) in enumerate(zip(words, labels)):
            base = lab.split("-")[-1] if lab != "O" else "O"
            p = AUG_P_BRANDTYPE if base in {"BRAND","TYPE"} else AUG_P_O
            if p <= 0 or random.random() > p:
                continue
            if _is_protected_token(w, lab):
                continue
            out[i] = _swap_adjacent(w) if random.random() < AUG_SWAP_P else _sub_same_script(w)
        return out

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        words = self.ws_tokens[idx]
        labs  = self.ws_labels[idx]
        words_aug = self._maybe_augment_words(words, labs)

        # char-IDs для слов (для CharCNN)
        chars_per_word = [encode_word_chars(w) for w in words_aug]
        feats = self.biases[idx]  # биасы — стабильные (по базовым словам)

        enc = self.tok(words_aug, is_split_into_words=True, truncation=True, max_length=self.max_len)
        word_ids = enc.word_ids()

        aligned_labels, aligned_bias, aligned_chars = [], [], []
        prev = None
        for wi in word_ids:
            if wi is None:
                aligned_labels.append(-100)
                aligned_bias.append([0.0]*len(LABELS))
                aligned_chars.append([0]*CHAR_MAXLEN)
            elif wi != prev:
                aligned_labels.append(LABEL2ID.get(labs[wi], 0))
                vec = [0.0]*len(LABELS)
                for k,v in feats[wi].items():
                    if k in LABEL2ID: vec[LABEL2ID[k]] += v
                aligned_bias.append(vec)
                aligned_chars.append(chars_per_word[wi])
            else:
                aligned_labels.append(-100)
                aligned_bias.append([0.0]*len(LABELS))
                aligned_chars.append([0]*CHAR_MAXLEN)
            prev = wi

        enc["labels"]    = aligned_labels
        enc["feat_bias"] = aligned_bias
        enc["char_ids"]  = aligned_chars
        return enc


Класс для создания батчей тензоров

In [83]:
class Collator:
    def __init__(self, tok): self.tok = tok
    def __call__(self, batch):
        model_inputs, labels_list, bias_list, char_list = [], [], [], []
        for ex in batch:
            labels_list.append(ex.pop("labels"))
            bias_list.append(ex.pop("feat_bias"))
            char_list.append(ex.pop("char_ids"))
            model_inputs.append(ex)
        padded = self.tok.pad(model_inputs, padding=True, return_tensors="pt")
        # на всякий случай уберём token_type_ids, если появится
        if "token_type_ids" in padded:
            padded.pop("token_type_ids")

        T = padded["input_ids"].size(1); B = len(labels_list); C = len(LABELS)
        labels = torch.full((B,T), -100, dtype=torch.long)
        bias   = torch.zeros((B,T,C), dtype=torch.float32)
        chars  = torch.zeros((B,T,CHAR_MAXLEN), dtype=torch.long)

        for i,(lab,fb,ch) in enumerate(zip(labels_list, bias_list, char_list)):
            L = min(len(lab), T)
            labels[i,:L] = torch.tensor(lab[:L], dtype=torch.long)

            fb_arr = torch.tensor(fb, dtype=torch.float32)
            Lb = min(fb_arr.shape[0], T)
            bias[i,:Lb,:C] = fb_arr[:Lb,:C]

            ch_arr = torch.tensor(ch, dtype=torch.long)
            Lc = min(ch_arr.shape[0], T)
            chars[i,:Lc,:CHAR_MAXLEN] = ch_arr[:Lc,:CHAR_MAXLEN]

        padded["labels"]    = labels
        padded["feat_bias"] = bias
        padded["char_ids"]  = chars
        return padded

Сборка итоговых обучающий данных

In [84]:
# Вес источника: синтетику приглушаем
src_weight = np.where(train_df["__src"].values == "synth", float(SYNTH_NEG_WEIGHT), 1.0).astype("float32")

# Пер-сущностный приоритет на строку — берём из заранее посчитанного __row_boost
ent_weight = train_df["__row_boost"].values.astype("float32")

# Итоговый вес строки = источник × пер-сущностный множитель
row_weights = src_weight * ent_weight
row_weights = np.clip(row_weights, 1e-6, None).astype("float32")
row_weights /= row_weights.mean()

# Сборка лоадера
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_ds  = NERDataset(train_df, tokenizer, MAX_LEN, augment=AUGMENT_TRAIN)
collate   = Collator(tokenizer)

weights_t = torch.as_tensor(row_weights, dtype=torch.float)
g = torch.Generator().manual_seed(int(SAMPLER_SEED))
sampler = WeightedRandomSampler(weights=weights_t, num_samples=len(weights_t), replacement=True, generator=g)

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    collate_fn=collate,
    num_workers=0 if (os.name == "nt") else 2,
    pin_memory=torch.cuda.is_available()
)

# Короткий отчёт
real_mean  = float(row_weights[train_df["__src"].values == "real"].mean()) if (train_df["__src"]=="real").any() else 0.0
synth_mean = float(row_weights[train_df["__src"].values == "synth"].mean()) if (train_df["__src"]=="synth").any() else 0.0
print(f"[sampler] rows={len(row_weights)}, mean={row_weights.mean():.3f}, real_mean={real_mean:.3f}, synth_mean={synth_mean:.3f}")

# разложение по сущностям — удобный sanity-check
def _rows_with(ent):
    ent = ent.upper()
    return train_df["__spans"].map(lambda s: any((t!='O' and t.split('-')[-1].upper()==ent) for _,_,t in s)).values

for ent, mult in ROW_BOOST_BY_ENTITY.items():
    m = _rows_with(ent)
    if m.any():
        print(f"  - {ent}: cfg_boost={float(mult):.2f}, mean_w={float(row_weights[m].mean()):.3f}, share={m.mean():.3f}")
print(f"  combine='{ROW_BOOST_COMBINE}', cap={ROW_BOOST_CAP}, SYNTH_NEG_WEIGHT={SYNTH_NEG_WEIGHT}")




[sampler] rows=30290, mean=1.000, real_mean=1.028, synth_mean=0.441
  - BRAND: cfg_boost=4.00, mean_w=1.566, share=0.292
  - TYPE: cfg_boost=2.00, mean_w=0.969, share=0.892
  - VOLUME: cfg_boost=1.20, mean_w=1.509, share=0.051
  - PERCENT: cfg_boost=1.20, mean_w=1.541, share=0.045
  combine='max', cap=4, SYNTH_NEG_WEIGHT=0.5


# Ячейка 3 - Сборка трансформера (основной модели) и его обучение

Основной трансформер

In [85]:
class TransformerCRF(nn.Module):
    """
    XLM-R/Roberta-бэкбон → CharCNN → Linear(emissions) → CRF.

    Параметры:
      model_name:     имя модели HuggingFace
      num_labels:     количество меток BIO
      o_id:           индекс класса "O" в вашей разметке
      bias_scale:     коэффициент к внешним эмиссиям feat_bias
      use_charcnn:    подключать ли CharCNN над символами
      char_vocab_size, char_emb_dim, char_channels, char_kernels, char_scale:
                      гиперпараметры символного блока
      lean_to_O:      включить ли склонение к "O"
      lean_tau:       порог «сомнительности»: top1 - top2 < tau
      lean_delta:     насколько прибавлять к логиту "O" на сомнительных позициях
      lean_in_train:  применять ли наклон и во время обучения (иначе только на инференсе)
      pad_char_id:    индекс паддинга для символов
    """
    def __init__(
        self,
        model_name: str,
        num_labels: int,
        o_id: int,
        bias_scale: float = 1.0,
        use_charcnn: bool = False,
        char_vocab_size: int = 0,
        char_emb_dim: int = 24,
        char_channels: int = 64,
        char_kernels: tuple = (2, 3, 4),
        char_scale: float = 0.5,
        lean_to_O: bool = True,
        lean_tau: float = 0.25,
        lean_delta: float = 0.35,
        lean_in_train: bool = False,
        pad_char_id: int = 0,
    ):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        hidden = self.backbone.config.hidden_size

        # --- CharCNN ---
        self.use_charcnn = bool(use_charcnn)
        self.pad_char_id = int(pad_char_id)
        if self.use_charcnn:
            self.char_emb   = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=self.pad_char_id)
            self.char_convs = nn.ModuleList([
                nn.Conv1d(char_emb_dim, char_channels, k, padding=k//2) for k in char_kernels
            ])
            self.char_proj  = nn.Linear(char_channels * len(char_kernels), hidden)
            self.char_scale = float(char_scale)

        # --- Emissions + CRF ---
        self.emissions = nn.Linear(hidden, num_labels)
        self.crf = CRF(num_labels, batch_first=True)
        self.bias_scale = float(bias_scale)

        # --- Наклон к O ---
        self.o_id = int(o_id)
        self.lean_to_O   = bool(lean_to_O)
        self.lean_tau    = float(lean_tau)
        self.lean_delta  = float(lean_delta)
        self.lean_in_train = bool(lean_in_train)

    # --- Символьные признаки ---
    def _char_features(self, char_ids: torch.Tensor) -> torch.Tensor:
        """
        char_ids: [B, T, L]
        Возвращает тензор [B, T, H] выровненный по токенам.
        """
        if char_ids is None:
            return None
        B, T, L = char_ids.shape
        x = self.char_emb(char_ids)                        # [B,T,L,E]
        x = x.view(B*T, L, x.size(-1)).transpose(1, 2)     # [B*T, E, L]
        convs = [torch.relu(conv(x)).amax(dim=2) for conv in self.char_convs]  # список [B*T, C]
        feat = torch.cat(convs, dim=1)                     # [B*T, C*K]
        feat = self.char_proj(feat).view(B, T, -1)         # [B, T, H]

        mask = (char_ids.ne(self.pad_char_id).any(dim=-1)).unsqueeze(-1)  # [B, T, 1]
        return feat * mask

    # --- Наклон к O ДО CRF ---
    @torch.no_grad()
    def _uncertain_mask(self, logits: torch.Tensor, tau: float) -> torch.Tensor:

        top2 = logits.topk(2, dim=-1).values
        return (top2[..., 0] - top2[..., 1]) < tau

    def _apply_lean_to_O(self, logits: torch.Tensor, tok_mask: torch.Tensor) -> torch.Tensor:
        """
        Мягко смещаем эмиссии в сторону 'O' на сомнительных позициях.
        """
        if not self.lean_to_O:
            return logits
        if self.training and not self.lean_in_train:
            return logits

        with torch.no_grad():
            uncertain = self._uncertain_mask(logits, self.lean_tau) & tok_mask

        bump = uncertain.to(logits.dtype) * self.lean_delta
        logits[..., self.o_id] = logits[..., self.o_id] + bump
        return logits

    # --- Основной forward ---
    def forward(self, input_ids, attention_mask, labels=None, feat_bias=None, char_ids=None):
        """
        Если labels переданы — возвращаем loss (mean NLL).
        Иначе — decode CRF (список путей по батчу).
        """
        # Бэкбон
        h = self.backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state  # [B, T_full, H]

        # CharCNN (опционально)
        if self.use_charcnn and char_ids is not None:
            h = h + self.char_scale * self._char_features(char_ids)

        # Линейный слой в эмиссии
        logits = self.emissions(h)  # [B, T_full, C]

        # Внешние байасы
        if feat_bias is not None:
            logits = logits + self.bias_scale * feat_bias.to(logits.dtype)

        # Вырезаем спец-токены <s> ... </s> (Roberta-подобные)
        logits   = logits[:, 1:-1, :]                      # [B, T, C]
        tok_mask = attention_mask[:, 1:-1].to(torch.bool)  # [B, T]

        # --- Наклон к O ДО CRF ---
        logits = self._apply_lean_to_O(logits, tok_mask)

        if labels is not None:
            # маска валидных меток (игнорим -100)
            gold = labels[:, 1:-1]                         # [B, T]
            tag  = gold.clone()
            tag[gold == -100] = 0
            mask_gold = (gold != -100)                     # [B, T]

            nll = -self.crf(logits, tag.long(), mask=mask_gold, reduction='mean')
            return nll
        else:
            # Viterbi decode
            return self.crf.decode(logits, mask=tok_mask)


Заданиче гимерпараметров модели, шудлера, оптимизатора

In [86]:
model = TransformerCRF(
    model_name=MODEL_NAME,
    num_labels=len(LABELS),
    o_id=LABELS.index("O"),
    bias_scale=BIAS_SCALE,

    # --- CharCNN ---
    use_charcnn=USE_CHARCNN,
    char_vocab_size=len(CHAR2ID),
    char_emb_dim=CHAR_EMB_DIM,
    char_channels=CHAR_CHANNELS,
    char_kernels=CHAR_KERNEL_SIZES,
    char_scale=CHAR_SCALE,
    pad_char_id=PAD_CHAR,

    # --- «наклон к O» до CRF ---
    lean_to_O=True,
    lean_tau=0.25,
    lean_delta=0.35,
    lean_in_train=False
).to(device)


optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01, fused=torch.cuda.is_available())
steps_per_epoch = max(1, math.ceil(len(train_df)/BATCH_SIZE))
total_steps = EPOCHS * steps_per_epoch
scheduler = get_linear_schedule_with_warmup(optimizer, int(WARMUP_RATIO*total_steps), total_steps)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())


Обучение

In [87]:
@torch.no_grad()
def eval_on_train(model, loader_eval):
    model.eval(); y_true, y_pred = [], []
    pbar = tqdm(loader_eval, desc="eval[train]", leave=False)
    for batch in pbar:
        batch = {k:v.to(device) for k,v in batch.items()}
        paths = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], feat_bias=batch["feat_bias"])
        gold  = batch["labels"].cpu().tolist()
        for i, path in enumerate(paths):
            inner = gold[i][1:-1]
            first_mask = [x != -100 for x in inner]
            true_seq = [ID2LABEL[x] for x,m in zip(inner, first_mask) if m]
            pred_seq = [ID2LABEL[p] for p,m in zip(path,  first_mask) if m]
            y_true.append(true_seq); y_pred.append(pred_seq)
    rep = classification_report(y_true, y_pred, digits=3, zero_division=0)
    macro = f1_score(y_true, y_pred)  # macro-F1 без 'O'
    return macro, rep

best_f1 = -1.0
for epoch in range(1, EPOCHS+1):
    model.train(); total_loss = 0.0; t0 = time.time(); tokens_seen = 0
    pbar = tqdm(train_loader, desc=f"train[{epoch}/{EPOCHS}]", leave=True)
    for step, batch in enumerate(pbar, start=1):
        batch = {k:v.to(device) for k,v in batch.items()}
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            loss = model(**batch)
        scaler.scale(loss).backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        scaler.step(optimizer); scaler.update(); optimizer.zero_grad(set_to_none=True)
        scheduler.step()
        total_loss += loss.item()
        tokens_seen += int(batch['attention_mask'][:,1:-1].sum().item())
        tokps = tokens_seen / max(1e-6, time.time()-t0)
        pbar.set_postfix({"loss": f"{total_loss/step:.4f}", "lr": f"{scheduler.get_last_lr()[0]:.2e}", "tok/s": f"{tokps:.0f}"})
    # отчёт по train
    eval_loader = DataLoader(train_ds, batch_size=max(32,BATCH_SIZE), shuffle=False, collate_fn=collate)
    f1m, rep = eval_on_train(model, eval_loader)
    print(rep)
    if f1m > best_f1:
        best_f1 = f1m
        torch.save(model.state_dict(), os.path.join(OUT_DIR, "model.pt"))
        tokenizer.save_pretrained(os.path.join(OUT_DIR, "tokenizer"))
        with open(os.path.join(OUT_DIR, "labels.txt"), "w", encoding="utf-8") as f:
            for l in LABELS: f.write(l+"\n")

train[1/8]:   0%|          | 0/947 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
Excep

eval[train]:   0%|          | 0/947 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


              precision    recall  f1-score   support

       BRAND      0.849     0.846     0.847      8837
     PERCENT      0.983     0.991     0.987      1375
        TYPE      0.905     0.946     0.925     27059
      VOLUME      0.955     0.952     0.954      1552

   micro avg      0.897     0.925     0.911     38823
   macro avg      0.923     0.934     0.928     38823
weighted avg      0.897     0.925     0.910     38823



train[2/8]:   0%|          | 0/947 [00:00<?, ?it/s]

eval[train]:   0%|          | 0/947 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       BRAND      0.846     0.917     0.880      8837
     PERCENT      0.986     0.994     0.990      1375
        TYPE      0.934     0.945     0.940     27059
      VOLUME      0.957     0.952     0.954      1552

   micro avg      0.916     0.941     0.928     38823
   macro avg      0.931     0.952     0.941     38823
weighted avg      0.917     0.941     0.928     38823



train[3/8]:   0%|          | 0/947 [00:00<?, ?it/s]

eval[train]:   0%|          | 0/947 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       BRAND      0.900     0.930     0.915      8837
     PERCENT      0.987     0.993     0.990      1375
        TYPE      0.946     0.960     0.953     27059
      VOLUME      0.981     0.985     0.983      1552

   micro avg      0.939     0.955     0.947     38823
   macro avg      0.954     0.967     0.960     38823
weighted avg      0.939     0.955     0.947     38823



train[4/8]:   0%|          | 0/947 [00:00<?, ?it/s]

eval[train]:   0%|          | 0/947 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       BRAND      0.894     0.954     0.923      8837
     PERCENT      0.989     0.985     0.987      1375
        TYPE      0.959     0.954     0.957     27059
      VOLUME      0.980     0.987     0.984      1552

   micro avg      0.945     0.956     0.951     38823
   macro avg      0.956     0.970     0.963     38823
weighted avg      0.946     0.956     0.951     38823



train[5/8]:   0%|          | 0/947 [00:00<?, ?it/s]

eval[train]:   0%|          | 0/947 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       BRAND      0.932     0.947     0.939      8837
     PERCENT      0.993     0.992     0.993      1375
        TYPE      0.960     0.966     0.963     27059
      VOLUME      0.983     0.986     0.985      1552

   micro avg      0.956     0.964     0.960     38823
   macro avg      0.967     0.973     0.970     38823
weighted avg      0.956     0.964     0.960     38823



train[6/8]:   0%|          | 0/947 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c4da3815b20>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionErrorcan only test a child process: 
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c4da3815b20>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

eval[train]:   0%|          | 0/947 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       BRAND      0.895     0.965     0.928      8837
     PERCENT      0.993     0.993     0.993      1375
        TYPE      0.968     0.954     0.961     27059
      VOLUME      0.985     0.989     0.987      1552

   micro avg      0.952     0.959     0.955     38823
   macro avg      0.960     0.975     0.967     38823
weighted avg      0.953     0.959     0.956     38823



train[7/8]:   0%|          | 0/947 [00:00<?, ?it/s]

eval[train]:   0%|          | 0/947 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       BRAND      0.935     0.958     0.946      8837
     PERCENT      0.993     0.993     0.993      1375
        TYPE      0.967     0.966     0.966     27059
      VOLUME      0.986     0.988     0.987      1552

   micro avg      0.961     0.966     0.964     38823
   macro avg      0.970     0.976     0.973     38823
weighted avg      0.961     0.966     0.964     38823



train[8/8]:   0%|          | 0/947 [00:00<?, ?it/s]

eval[train]:   0%|          | 0/947 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       BRAND      0.936     0.958     0.947      8837
     PERCENT      0.993     0.993     0.993      1375
        TYPE      0.970     0.968     0.969     27059
      VOLUME      0.985     0.988     0.986      1552

   micro avg      0.963     0.967     0.965     38823
   macro avg      0.971     0.977     0.974     38823
weighted avg      0.963     0.967     0.965     38823



# Ячейка 4 - постпроцессинг и загрузка sabmission

Регулярки и сам постпроцессинг

In [88]:
STOP_BRIDGE = {"и","с","в","на","по","без","для","к","от","из","под","у","со","при"}
UNITS = {"л","литр","литра","литров","мл","г","гр","кг","шт","пач","бут","банка","бутылка","уп","упак"}
NUM_RE = re.compile(r"^\d+[.,]?\d*$", re.I)
VOL_TOKEN_RE = re.compile(r"^\d+[.,]?\d*(?:мл|л|г|гр|кг|шт)$", re.I)
PCT_STANDALONE_RE = re.compile(r"^\d+[.,]?\d*%$", re.I)

In [89]:
_PUNKT_EDGES = re.compile(r"^\W+|\W+$")
def _norm(w: str) -> str:
    return _PUNKT_EDGES.sub("", w.lower())

def postprocess_bio(words, tags, bias_lean_O=True):
    n = len(words)
    T = tags[:]  # копия

    def set_tag(i, lab):
        if 0 <= i < n:
            T[i] = lab

    # ---------- 1) Правила для PERCENT ----------
    for i, w in enumerate(words):
        wl = w.lower()
        if PCT_STANDALONE_RE.match(wl):
            set_tag(i, "B-PERCENT"); continue
        if NUM_RE.match(wl) and i+1 < n:
            nxt = words[i+1]
            if nxt == "%":
                set_tag(i, "B-PERCENT"); set_tag(i+1, "I-PERCENT"); continue
            if nxt.lower().startswith("процент"):
                set_tag(i, "B-PERCENT"); set_tag(i+1, "I-PERCENT"); continue

    # ---------- 2) Правила для VOLUME ----------
    for i, w in enumerate(words):
        wl = w.lower()
        if VOL_TOKEN_RE.match(wl):
            set_tag(i, "B-VOLUME"); continue
        if NUM_RE.match(wl) and i+1 < n and words[i+1].lower() in UNITS:
            set_tag(i, "B-VOLUME"); set_tag(i+1, "I-VOLUME"); continue
        if T[i].endswith("VOLUME"):
            m = re.match(r"^\d+[.,]?\d*([a-zа-я]+)$", wl, re.I)
            if m and m.group(1) not in UNITS:
                T[i] = "O"


    # ---------- 6) Мягкий наклон одиноких B-* к O (эвристика) ----------
    if bias_lean_O:
        i = 0
        while i < n:
            if T[i].startswith("B-") and (i+1 == n or not T[i+1].startswith("I-")):
                lw = _norm(words[i])
                # короткие мостики/пустые токены — в фон
                if (len(lw) <= 2) or (lw in STOP_BRIDGE) or (not any(ch.isalpha() for ch in lw)):
                    T[i] = "O"
            i += 1

    return T

функция для формирования предсказаний

In [90]:
@torch.no_grad()
def infer_texts(texts: List[str]) -> List[str]:
    model.eval()
    preds = []
    encs, wids, toks_info = [], [], []
    for tx in texts:
        toks = ws_tokens_with_offsets(tx)
        words = [t for t,_,_ in toks]
        enc = tokenizer(words, is_split_into_words=True, truncation=True, max_length=MAX_LEN)
        encs.append(enc); wids.append(enc.word_ids()); toks_info.append(toks)

    BS = max(32, BATCH_SIZE)
    for i in tqdm(range(0, len(encs), BS), desc="infer", leave=True):
        part  = encs[i:i+BS]
        wpart = wids[i:i+BS]
        tpart = toks_info[i:i+BS]

        padded = tokenizer.pad(part, padding=True, return_tensors="pt")
        inp = {k: v.to(device) for k,v in padded.items()}

        # мягкие подсказки (нуджи) только на первые сабтокены слов
        feat_bias = torch.zeros((inp["input_ids"].size(0), inp["input_ids"].size(1), len(LABELS)),
                                dtype=torch.float32, device=device)
        for bi, wi in enumerate(wpart):
            words = [t for t,_,_ in tpart[bi]]
            feats = token_feature_bias(words)
            prev = None
            for pos, wid in enumerate(wi):
                if wid is None:
                    continue
                if wid != prev:
                    vec = [0.0]*len(LABELS)
                    for k,v in feats[wid].items():
                        if k in LABEL2ID: vec[LABEL2ID[k]] += v
                    feat_bias[bi, pos, :] = torch.tensor(vec, device=device)
                prev = wid

        paths = model(input_ids=inp["input_ids"], attention_mask=inp["attention_mask"], feat_bias=feat_bias)

        # в word-метки и строку с кортежами (start, end, 'TAG') по словам
        for k, path in enumerate(paths):
            wi   = wpart[k]
            toks = tpart[k]
            first = []
            prev = None
            for wid in wi[1:-1]:
                if wid is None:
                    first.append(False)
                elif wid != prev:
                    first.append(True)
                else:
                    first.append(False)
                prev = wid

            word_labels = [ID2LABEL[p] for p,m in zip(path, first) if m]
            words = [t for t,_,_ in toks]
            word_labels = postprocess_bio(words, word_labels, bias_lean_O=True)
            # выравниваем длину под число слов
            if len(word_labels) < len(toks):
                word_labels += ["O"] * (len(toks) - len(word_labels))
            elif len(word_labels) > len(toks):
                word_labels = word_labels[:len(toks)]

            spans_str = "[" + ", ".join(f"({s}, {e}, '{lab}')" for (_,s,e), lab in zip(toks, word_labels)) + "]"
            preds.append(spans_str)

    return preds

Считывание данных

In [91]:
sub_df = pd.read_csv(SUB_IN, sep=';', dtype=str, keep_default_na=False)
assert "sample" in sub_df.columns, "В файле сабмишна должна быть колонка 'sample'"

# подгружаем лучшие веса (если не загружены)
best_path = os.path.join(OUT_DIR, "model.pt")
if os.path.exists(best_path):
    model.load_state_dict(torch.load(best_path, map_location=device))

texts = sub_df["sample"].astype(str).tolist()

Получение предсказаний и загрузка sabmission

In [92]:
pred_ann = infer_texts(texts)

submission = pd.DataFrame({
    "id": np.arange(1, len(sub_df) + 1),
    "search_query": sub_df["sample"].astype(str).values,
    "annotation": pred_ann
})
submission.to_csv(SUB_OUT, sep=';', index=False)
print(submission.head(3))

infer:   0%|          | 0/157 [00:00<?, ?it/s]

   id               search_query  \
0   1          форма для выпечки   
1   2                фарш свиной   
2   3  сок ананасовый без сахара   

                                          annotation  
0     [(0, 5, 'B-TYPE'), (6, 9, 'O'), (10, 17, 'O')]  
1              [(0, 4, 'B-TYPE'), (5, 11, 'I-TYPE')]  
2  [(0, 3, 'B-TYPE'), (4, 14, 'I-TYPE'), (15, 18,...  
