In [None]:
!pip -q install transformers==4.44.2 seqeval==1.2.2 pytorch-crf==0.7.2 accelerate==0.34.2

In [None]:
from __future__ import annotations
import os, re, ast, json, math, time, random, logging
from typing import List, Tuple, Dict, Any

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torchcrf import CRF
from seqeval.metrics import f1_score, classification_report

In [None]:
# --- Config (training only, no separate validation split) ---
SEED = 42
MODEL_NAME = "xlm-roberta-base"
EPOCHS = 3                      # as requested
BATCH_SIZE = 32 if torch.cuda.is_available() else 8
LR = 3e-5                       # AdamW learning rate
MAX_LEN = 96                    # tokenizer max_length (sub-tokens, truncation on)
WARMUP_RATIO = 0.1              # fraction of total steps used for LR warm-up
GRAD_CLIP = 1.0                 # max gradient norm passed to clip_grad_norm_
BIAS_SCALE = 0.7                # multiplier applied to the heuristic feat_bias added to the emissions

In [None]:
# Paths: all inputs and outputs live on the mounted Google Drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
TRAIN_CSV = "/content/drive/MyDrive/train.csv"         # sample;annotation
OUT_DIR   = "/content/drive/MyDrive/var_model_3/"
SUB_IN  = "/content/drive/MyDrive/submission.csv"
SUB_OUT = "/content/drive/MyDrive/sub_folder/submission.csv"
os.makedirs(OUT_DIR, exist_ok=True)
# SUB_OUT is a *file* path: create its parent directory, not a directory named
# "submission.csv" (which would make the final to_csv(SUB_OUT) fail).
if os.path.isdir(SUB_OUT) and not os.listdir(SUB_OUT):
    # Clean up the empty directory an earlier run may have created at the file path.
    os.rmdir(SUB_OUT)
os.makedirs(os.path.dirname(SUB_OUT), exist_ok=True)

In [None]:
def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) RNGs.

    Also switches cuDNN to deterministic mode and turns its benchmark
    auto-tuner off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
set_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BIO tagging scheme: "O" plus a B-/I- pair for every entity type.
LABELS = [
    "O",
    "B-BRAND","I-BRAND",
    "B-TYPE","I-TYPE",
    "B-VOLUME","I-VOLUME",
    "B-PERCENT","I-PERCENT",
]
LABEL2ID = {l:i for i,l in enumerate(LABELS)}
ID2LABEL = {i:l for l,i in LABEL2ID.items()}
ENTITY_TAGS = {"BRAND","TYPE","VOLUME","PERCENT"}

# Entity types to oversample: rows containing any of them get sampling
# weight MINORITY_BOOST instead of 1.0 when the training sampler is built.
MINORITY_CLASSES = {"VOLUME","PERCENT"}
MINORITY_BOOST   = 6.0

In [None]:
def read_csv(path: str) -> pd.DataFrame:
    """Load a ';'-separated CSV with every cell kept as a string.

    Empty cells stay empty strings (no NaN conversion). Raises AssertionError
    when the ``sample`` or ``annotation`` column is missing.
    """
    frame = pd.read_csv(path, sep=';', dtype=str, keep_default_na=False)
    required = ("sample", "annotation")
    has_required = all(column in frame.columns for column in required)
    assert has_required, "Ожидаю колонки sample;annotation"
    return frame

def safe_parse_annotation(s: str) -> List[Tuple[int,int,str]]:
    """Parse an annotation cell into a sorted list of ``(start, end, label)`` spans.

    The cell is first read as a Python literal and, if that fails, as JSON.
    The parsed value must be a list; each item may be a sequence
    ``[start, end, label, ...]`` or a dict with ``start``/``end`` and
    ``label`` (or ``tag``). Items that cannot be normalised are skipped.

    Labels are normalised to ``"O"`` or ``"B-<TAG>"``/``"I-<TAG>"`` where
    ``<TAG>`` is in ``ENTITY_TAGS``; a label without a prefix becomes ``B-``.

    Returns an empty list for empty/NaN-like cells, unparsable text or a
    non-list payload. Never raises on malformed input.
    """
    if s is None:
        return []
    txt = str(s).strip()
    if txt == "" or txt.lower() in {"nan", "none", "null"}:
        return []

    def _normalize(start, end, tag):
        """Return a cleaned ``(start, end, label)`` tuple, or None to drop the item."""
        try:
            start, end = int(start), int(end)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric offsets: best-effort parsing, silently skip this item.
            return None
        if end <= start:
            return None
        if tag == "O":
            return (start, end, "O")
        raw = str(tag)
        kind = raw.split("-")[-1].upper()
        if kind not in ENTITY_TAGS:
            return None
        prefix = raw.split("-")[0] if "-" in raw else "B"
        prefix = "I" if prefix.upper().startswith("I") else "B"
        return (start, end, f"{prefix}-{kind}")

    def _collect(items):
        """Normalise every list/tuple/dict item and sort by (start, end)."""
        spans: List[Tuple[int,int,str]] = []
        for it in items:
            if isinstance(it, (list, tuple)) and len(it) >= 3:
                span = _normalize(it[0], it[1], it[2])
            elif isinstance(it, dict):
                span = _normalize(it.get("start", 0), it.get("end", 0),
                                  it.get("label", it.get("tag", "O")))
            else:
                span = None
            if span is not None:
                spans.append(span)
        return sorted(spans, key=lambda z: (z[0], z[1]))

    # Try the Python-literal form first, then JSON (needed e.g. for null/true/false).
    parsers = (
        (ast.literal_eval, (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)),
        (json.loads, (ValueError, RecursionError)),
    )
    for parse, expected_errors in parsers:
        try:
            value = parse(txt)
        except expected_errors:
            continue
        if isinstance(value, list):
            return _collect(value)
    return []

WS_RE = re.compile(r"\S+")
def ws_tokens_with_offsets(text: str) -> List[Tuple[str,int,int]]:
    """Split ``text`` on whitespace into ``(token, start, end)`` triples.

    ``start``/``end`` are character offsets into ``text`` (end exclusive).
    A falsy ``text`` (``None`` or ``""``) yields an empty list.
    """
    triples: List[Tuple[str,int,int]] = []
    for hit in WS_RE.finditer(text if text else ""):
        begin, stop = hit.span()
        triples.append((hit.group(), begin, stop))
    return triples

def spans_to_bio_on_words(text: str, spans: List[Tuple[int,int,str]]) -> List[str]:
    """Project character spans onto whitespace words and emit one BIO tag per word.

    Each word takes the entity type of the span it overlaps with the highest
    IoU (first span wins ties). The B-/I- prefix of the span itself is ignored:
    a word gets ``I-`` when the previous word's tag ends with the same entity
    type and ``B-`` otherwise. Words with no overlapping entity span get "O".
    """
    bio: List[str] = []
    for _, w_start, w_end in ws_tokens_with_offsets(text):
        top_score, top_tag = 0.0, "O"
        for s_start, s_end, s_label in spans:
            overlap = max(0, min(w_end, s_end) - max(w_start, s_start))
            extent = max(w_end, s_end) - min(w_start, s_start)
            score = overlap / extent if extent > 0 else 0.0
            if score > top_score:
                top_score = score
                top_tag = "O" if s_label == "O" else s_label.split("-")[-1]

        if top_score > 0 and top_tag in ENTITY_TAGS:
            # Continue the entity only if the previous word carries the same type.
            continues = bool(bio) and bio[-1].endswith(top_tag)
            bio.append(("I-" if continues else "B-") + top_tag)
        else:
            bio.append("O")
    return bio

def contains_minority(spans: List[Tuple[int,int,str]]) -> bool:
    """Return True if any non-"O" span has an entity type in ``MINORITY_CLASSES``."""
    return any(
        label != "O" and label.split("-")[-1] in MINORITY_CLASSES
        for _start, _end, label in spans
    )

# Load the training table and add two helper columns:
#   __spans         parsed (start, end, label) spans of the annotation cell
#   __has_minority  whether the row contains a VOLUME/PERCENT span (used for oversampling)
train_df = read_csv(TRAIN_CSV)
train_df["__spans"] = train_df["annotation"].map(safe_parse_annotation)
train_df["__has_minority"] = train_df["__spans"].map(contains_minority)

In [None]:
# === Cell 3. Dataset / collator with soft "nudges" and oversampling ==========
UNITS = {"л","литр","литра","литров","мл","гр","г","кг","шт","килограммов", "грамм", "граммов", "миллилитров"}
PCT_WORDS = {"%","процент","проц", "процентов"}
NUM_RE = re.compile(r"^\d+[\d,.]*$")
ASCII_RE = re.compile(r"^[A-Za-z]+$")

def token_feature_bias(words: List[str]) -> List[Dict[str,float]]:
    """Compute heuristic per-word label nudges.

    Returns one dict per word mapping a label name to an additive bias:
      * Latin-only word that is not a unit           -> B-BRAND  +0.25
      * number next to a percent word / '%' token    -> B-PERCENT +0.35, I-PERCENT +0.15
      * number followed by a unit word               -> B-VOLUME +0.35
      * unit word preceded by a number               -> I-VOLUME +0.20
    Words matching no rule get an empty dict.
    """
    lowered = [word.lower() for word in words]
    last_index = len(words) - 1
    nudges_per_word: List[Dict[str,float]] = []

    for pos, (word, low) in enumerate(zip(words, lowered)):
        before = lowered[pos - 1] if pos > 0 else ""
        after = lowered[pos + 1] if pos < last_index else ""
        numeric = NUM_RE.match(low) is not None
        latin = ASCII_RE.match(word) is not None

        nudges: Dict[str,float] = {}
        if latin and low not in UNITS:
            nudges["B-BRAND"] = 0.25
        if numeric:
            near_percent = any(nb in PCT_WORDS or "%" in nb for nb in (after, before))
            if near_percent:
                nudges["B-PERCENT"] = 0.35
                nudges["I-PERCENT"] = 0.15
            if after in UNITS:
                nudges["B-VOLUME"] = 0.35
        if low in UNITS and NUM_RE.match(before) is not None:
            nudges["I-VOLUME"] = 0.20
        nudges_per_word.append(nudges)

    return nudges_per_word

class NERDataset(Dataset):
    """Word-level NER examples that are sub-tokenised on access.

    Built from a frame with ``sample``, ``__spans`` and ``__has_minority``
    columns. Whitespace words, their BIO labels and heuristic biases are
    pre-computed once; ``__getitem__`` tokenises the words and aligns labels
    and biases to the first sub-token of every word.
    """

    def __init__(self, frame: pd.DataFrame, tok: AutoTokenizer, max_len: int):
        self.tok = tok
        self.max_len = max_len
        self.texts = frame["sample"].astype(str).tolist()
        self.spans = frame["__spans"].tolist()

        self.ws_tokens, self.ws_labels, self.biases = [], [], []
        for text, spans in zip(self.texts, self.spans):
            words = [piece for piece, _, _ in ws_tokens_with_offsets(text)]
            self.ws_tokens.append(words)
            self.ws_labels.append(spans_to_bio_on_words(text, spans))
            self.biases.append(token_feature_bias(words))

        self.has_minority = frame["__has_minority"].tolist()
        lengths_match = all(
            len(words) == len(tags)
            for words, tags in zip(self.ws_tokens, self.ws_labels)
        )
        assert lengths_match, "Длины слов/меток не совпали"

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer encoding with extra ``labels`` and ``feat_bias`` entries.

        ``labels`` holds the label id on the first sub-token of each word and
        -100 everywhere else (special tokens, continuation sub-tokens).
        ``feat_bias`` holds one ``len(LABELS)``-sized row per token, non-zero
        only on first sub-tokens.
        """
        words = self.ws_tokens[idx]
        word_tags = self.ws_labels[idx]
        word_feats = self.biases[idx]
        enc = self.tok(words, is_split_into_words=True, truncation=True, max_length=self.max_len)

        n_labels = len(LABELS)
        aligned_labels, aligned_bias = [], []
        last_word = None
        for word_idx in enc.word_ids():
            row = [0.0] * n_labels
            opens_word = word_idx is not None and word_idx != last_word
            if opens_word:
                aligned_labels.append(LABEL2ID.get(word_tags[word_idx], 0))
                for label_name, weight in word_feats[word_idx].items():
                    if label_name in LABEL2ID:
                        row[LABEL2ID[label_name]] += weight
            else:
                aligned_labels.append(-100)
            aligned_bias.append(row)
            last_word = word_idx

        enc["labels"] = aligned_labels
        enc["feat_bias"] = aligned_bias
        return enc

class Collator:
    """Pad a list of encoded examples into a batch of tensors.

    Token fields are padded by the tokenizer; ``labels`` are padded with -100
    and ``feat_bias`` with zeros up to the padded sequence width.
    """

    def __init__(self, tok):
        self.tok = tok

    def __call__(self, batch):
        # Strip the custom fields first: the tokenizer only pads its own keys.
        label_rows, bias_rows, token_inputs = [], [], []
        for example in batch:
            label_rows.append(example.pop("labels"))
            bias_rows.append(example.pop("feat_bias"))
            token_inputs.append(example)

        padded = self.tok.pad(token_inputs, padding=True, return_tensors="pt")
        width = padded["input_ids"].size(1)
        n_rows = len(label_rows)
        n_tags = len(LABELS)

        labels = torch.full((n_rows, width), -100, dtype=torch.long)
        bias = torch.zeros((n_rows, width, n_tags), dtype=torch.float32)
        for row, (tags, nudges) in enumerate(zip(label_rows, bias_rows)):
            tag_len = min(len(tags), width)
            labels[row, :tag_len] = torch.tensor(tags[:tag_len], dtype=torch.long)

            nudge_tensor = torch.tensor(nudges, dtype=torch.float32)
            nudge_len = min(nudge_tensor.shape[0], width)
            bias[row, :nudge_len, :n_tags] = nudge_tensor[:nudge_len, :n_tags]

        padded["labels"] = labels
        padded["feat_bias"] = bias
        return padded

# Tokenizer, training dataset and a weighted sampler: rows flagged
# __has_minority are drawn with weight MINORITY_BOOST, all others with 1.0
# (sampling with replacement, len(train_df) draws per epoch).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
train_ds  = NERDataset(train_df, tokenizer, MAX_LEN)
weights   = np.where(train_df["__has_minority"].values, MINORITY_BOOST, 1.0).astype("float32")
sampler   = WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)
collate   = Collator(tokenizer)

# The sampler replaces shuffling; batches are padded by the Collator.
train_loader = DataLoader(
    train_ds, batch_size=BATCH_SIZE, sampler=sampler, drop_last=False,
    collate_fn=collate, num_workers=2, pin_memory=torch.cuda.is_available(), persistent_workers=True
)

In [None]:
# === Cell 4. Model (XLM-R -> Linear -> CRF) ===================================
class TransformerCRF(nn.Module):
    """Transformer encoder with a linear emission layer and a CRF on top.

    ``forward`` has two modes:
      * ``labels`` given  -> returns the CRF negative log-likelihood (scalar,
        averaged over the rows that have at least one labelled position);
      * ``labels`` absent -> returns the decoded tag-id path for every row
        (a list of lists), covering the attended positions after the first
        and last columns have been dropped.

    NOTE(review): training scores only labelled (first sub-token) positions,
    while decoding runs over every attended position; callers pick the
    first-sub-token entries out of the decoded path.
    """

    def __init__(self, model_name: str, num_labels: int, bias_scale: float=1.0):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        hidden = self.backbone.config.hidden_size
        self.emissions = nn.Linear(hidden, num_labels)
        self.crf = CRF(num_labels, batch_first=True)
        self.bias_scale = bias_scale

    @staticmethod
    def _left_align(emissions, tags, keep):
        """Pack the kept timesteps of every row to the front, preserving order.

        Args:
            emissions: (rows, steps, num_labels) scores.
            tags: (rows, steps) tag ids.
            keep: (rows, steps) bool mask of timesteps to keep; every row must
                keep at least one timestep.

        Returns:
            ``(emissions, tags, mask)`` trimmed to the longest kept length,
            where ``mask`` is True on a contiguous prefix of each row.
        """
        lengths = keep.sum(dim=1).tolist()
        longest = max(lengths)
        packed_em = emissions.new_zeros((emissions.size(0), longest, emissions.size(2)))
        packed_tags = tags.new_zeros((tags.size(0), longest))
        packed_mask = torch.zeros_like(packed_tags, dtype=torch.bool)
        for row, n_kept in enumerate(lengths):
            packed_em[row, :n_kept] = emissions[row, keep[row]]
            packed_tags[row, :n_kept] = tags[row, keep[row]]
            packed_mask[row, :n_kept] = True
        return packed_em, packed_tags, packed_mask

    def forward(self, input_ids, attention_mask, labels=None, feat_bias=None):
        h = self.backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        logits = self.emissions(h)
        if feat_bias is not None:
            logits = logits + self.bias_scale * feat_bias.to(logits.dtype)
        # Drop the first and last columns (<s> and, for full-width rows, </s>).
        logits = logits[:,1:-1,:]

        if labels is None:
            if logits.size(1) == 0:
                # Nothing but special tokens in the batch: no positions to decode.
                return [[] for _ in range(logits.size(0))]
            mask_seq = attention_mask[:,1:-1].bool()
            return self.crf.decode(logits, mask=mask_seq)

        gold = labels[:,1:-1]
        keep = gold != -100
        rows = keep.any(dim=1)
        if not bool(rows.any()):
            # No labelled position in the whole batch: zero loss that still
            # participates in autograd so backward() does not fail.
            return logits.sum() * 0.0

        # The CRF expects a mask that is a contiguous prefix of each row (it reads
        # tags[i-1] regardless of the mask and requires the first step to be on).
        # Labelled positions are therefore packed to the left, and rows without
        # any label are skipped.
        tags = gold.masked_fill(~keep, 0)
        packed_em, packed_tags, packed_mask = self._left_align(logits[rows], tags[rows], keep[rows])
        return -self.crf(packed_em, packed_tags.long(), mask=packed_mask, reduction='mean')

# Model, optimizer and LR schedule. total_steps assumes one optimizer step per
# batch and ceil(len(train_ds) / BATCH_SIZE) batches per epoch; the first
# WARMUP_RATIO share of those steps is linear warm-up.
model = TransformerCRF(MODEL_NAME, num_labels=len(LABELS), bias_scale=BIAS_SCALE).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01, fused=torch.cuda.is_available())
steps_per_epoch = max(1, math.ceil(len(train_ds)/BATCH_SIZE))
total_steps = EPOCHS * steps_per_epoch
scheduler = get_linear_schedule_with_warmup(optimizer, int(WARMUP_RATIO*total_steps), total_steps)
# Gradient scaler for mixed precision; disabled (pass-through) without CUDA.
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

In [None]:
# === Cell 5. Training (logging / progress) + report on the training set ======
@torch.no_grad()
def eval_on_train(model, loader_eval):
    """Decode every batch of ``loader_eval`` and score it with seqeval.

    Only positions whose gold label is not -100 (first sub-tokens of words)
    are compared. The model is left in eval mode.

    Args:
        model: module whose forward, called without ``labels``, returns one
            decoded tag-id path per row.
        loader_eval: iterable of batches with ``input_ids``,
            ``attention_mask``, ``feat_bias`` and ``labels`` tensors.

    Returns:
        ``(macro_f1, report)``: entity-level macro-averaged F1 and the seqeval
        classification report string.
    """
    model.eval()
    y_true, y_pred = [], []
    for batch in tqdm(loader_eval, desc="eval[train]", leave=False):
        batch = {k: v.to(device) for k, v in batch.items()}
        paths = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"],
                      feat_bias=batch["feat_bias"])
        gold = batch["labels"].cpu().tolist()
        for gold_row, path in zip(gold, paths):
            # Drop the first/last columns to line up with the decoded path.
            inner = gold_row[1:-1]
            is_labelled = [x != -100 for x in inner]
            y_true.append([ID2LABEL[x] for x, m in zip(inner, is_labelled) if m])
            y_pred.append([ID2LABEL[p] for p, m in zip(path, is_labelled) if m])
    rep = classification_report(y_true, y_pred, digits=3, zero_division=0)
    # seqeval defaults to micro averaging; request macro explicitly so the
    # returned value really is the macro-F1 over entity types ('O' excluded).
    macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
    return macro, rep

# Training loop. After every epoch the model is scored on the training set itself
# (there is no separate validation split) and the best-scoring weights are saved.
eval_loader = DataLoader(train_ds, batch_size=max(32,BATCH_SIZE), shuffle=False, collate_fn=collate)
best_f1 = -1.0
for epoch in range(1, EPOCHS+1):
    model.train(); total_loss = 0.0; t0 = time.time(); tokens_seen = 0
    pbar = tqdm(train_loader, desc=f"train[{epoch}/{EPOCHS}]", leave=True)
    for step, batch in enumerate(pbar, start=1):
        batch = {k:v.to(device) for k,v in batch.items()}
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            loss = model(**batch)
        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale here; unscale them first
        # so GRAD_CLIP applies to the true gradient norm (no-op when AMP is disabled).
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        scaler.step(optimizer); scaler.update(); optimizer.zero_grad(set_to_none=True)
        scheduler.step()
        total_loss += loss.item()
        # Throughput is counted over attended tokens, excluding the first/last columns.
        tokens_seen += int(batch['attention_mask'][:,1:-1].sum().item())
        tokps = tokens_seen / max(1e-6, time.time()-t0)
        pbar.set_postfix({"loss": f"{total_loss/step:.4f}", "lr": f"{scheduler.get_last_lr()[0]:.2e}", "tok/s": f"{tokps:.0f}"})
    # Report on the training set (no separate validation split).
    f1m, rep = eval_on_train(model, eval_loader)
    print(rep)
    if f1m > best_f1:
        best_f1 = f1m
        torch.save(model.state_dict(), os.path.join(OUT_DIR, "model.pt"))
        tokenizer.save_pretrained(os.path.join(OUT_DIR, "tokenizer"))
        with open(os.path.join(OUT_DIR, "labels.txt"), "w", encoding="utf-8") as f:
            for l in LABELS: f.write(l+"\n")

In [None]:
# Read the submission template; every cell stays a string and empty cells stay "".
sub_df = pd.read_csv(SUB_IN, sep=';', dtype=str, keep_default_na=False)
assert "sample" in sub_df.columns, "В файле сабмишна должна быть колонка 'sample'"

# Load the best saved weights if a checkpoint exists; otherwise the model
# currently in memory is used as is.
best_path = os.path.join(OUT_DIR, "model.pt")
if os.path.exists(best_path):
    model.load_state_dict(torch.load(best_path, map_location=device))

texts = sub_df["sample"].astype(str).tolist()

@torch.no_grad()
def infer_texts(texts: List[str]) -> List[str]:
    model.eval()
    preds = []
    encs, wids, toks_info = [], [], []
    for tx in texts:
        toks = ws_tokens_with_offsets(tx)
        words = [t for t,_,_ in toks]
        enc = tokenizer(words, is_split_into_words=True, truncation=True, max_length=MAX_LEN)
        encs.append(enc); wids.append(enc.word_ids()); toks_info.append(toks)

    BS = max(32, BATCH_SIZE)
    for i in tqdm(range(0, len(encs), BS), desc="infer", leave=True):
        part  = encs[i:i+BS]
        wpart = wids[i:i+BS]
        tpart = toks_info[i:i+BS]

        padded = tokenizer.pad(part, padding=True, return_tensors="pt")
        inp = {k: v.to(device) for k,v in padded.items()}

        # мягкие подсказки (нуджи) только на первые сабтокены слов
        feat_bias = torch.zeros((inp["input_ids"].size(0), inp["input_ids"].size(1), len(LABELS)),
                                dtype=torch.float32, device=device)
        for bi, wi in enumerate(wpart):
            words = [t for t,_,_ in tpart[bi]]
            feats = token_feature_bias(words)
            prev = None
            for pos, wid in enumerate(wi):
                if wid is None:
                    continue
                if wid != prev:
                    vec = [0.0]*len(LABELS)
                    for k,v in feats[wid].items():
                        if k in LABEL2ID: vec[LABEL2ID[k]] += v
                    feat_bias[bi, pos, :] = torch.tensor(vec, device=device)
                prev = wid

        paths = model(input_ids=inp["input_ids"], attention_mask=inp["attention_mask"], feat_bias=feat_bias)

        # в word-метки и строку с кортежами (start, end, 'TAG') по словам
        for k, path in enumerate(paths):
            wi   = wpart[k]
            toks = tpart[k]
            first = []
            prev = None
            for wid in wi[1:-1]:
                if wid is None:
                    first.append(False)
                elif wid != prev:
                    first.append(True)
                else:
                    first.append(False)
                prev = wid

            word_labels = [ID2LABEL[p] for p,m in zip(path, first) if m]
            # выравниваем длину под число слов
            if len(word_labels) < len(toks):
                word_labels += ["O"] * (len(toks) - len(word_labels))
            elif len(word_labels) > len(toks):
                word_labels = word_labels[:len(toks)]

            spans_str = "[" + ", ".join(f"({s}, {e}, '{lab}')" for (_,s,e), lab in zip(toks, word_labels)) + "]"
            preds.append(spans_str)

    return preds

# Predict annotations for every submission row.
pred_ann = infer_texts(texts)

# Assemble the output table: 1-based id, the original query text and the
# predicted annotation string, then write it as a ';'-separated CSV.
submission = pd.DataFrame({
    "id": np.arange(1, len(sub_df) + 1),
    "search_query": sub_df["sample"].astype(str).values,
    "annotation": pred_ann
})
submission.to_csv(SUB_OUT, sep=';', index=False)
print(submission.head(3))