In [None]:
# ===========================
# NOTEBOOK FOR FINDING PREDICTION ERRORS (Colab)
# ===========================
# Install the dependencies quietly (-q suppresses most pip output).
# transformers, tokenizers, pytorch-crf and seqeval are pinned; pandas and tqdm are not.
!pip -q install transformers==4.44.2 tokenizers==0.19.1 pytorch-crf==0.7.2 seqeval==1.2.2 pandas tqdm

In [None]:
import os, re, ast, json, random
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torchcrf import CRF
from seqeval.metrics import classification_report, f1_score

In [None]:
# --- Mount Google Drive ---
# The paths configured in the next cell all live under /content/drive.
# force_remount=True is passed so that re-running this cell remounts the drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# ===========================
# CONFIG
# ===========================
# Inputs on the mounted Drive.
MODEL_DIR = "/content/drive/MyDrive/var_model_3"   # expected to contain: model.pt, labels.txt, tokenizer/
TRAIN_CSV = "/content/drive/MyDrive/train.csv"     # expected columns: sample;annotation

# Outputs of this analysis.
OUT_DIR    = "/content/drive/MyDrive/var_model_3_analysis"
ERRORS_CSV = os.path.join(OUT_DIR, "errors.csv")

# Inference settings.
MAX_LEN    = 96        # sequence length limit, counted in sub-tokens
BIAS_SCALE = 0.5       # strength of the soft feature hints added to the emissions
SEED       = 42

# Make sure the output directory exists before anything tries to write into it.
os.makedirs(OUT_DIR, exist_ok=True)

# Seed every RNG this notebook touches.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Device:", DEVICE)

In [None]:
# ===========================
# Utilities: word-level tokens, soft hints, annotation parser
# ===========================

# A "word" is any maximal run of non-whitespace characters.
WS_RE = re.compile(r"\S+")
# Digits optionally followed by more digits, commas or dots (e.g. "1", "3.2", "1,5").
NUM_RE = re.compile(r"^\d+[\d,.]*$")
# Tokens made of ASCII letters only.
ASCII_RE = re.compile(r"^[A-Za-z]+$")

# Lower-cased unit words used by the VOLUME heuristics.
UNITS = {
    "л", "литр", "литра", "литров",
    "мл", "миллилитров",
    "г", "гр", "грамм", "граммов",
    "кг", "килограммов",
    "шт",
}
# Lower-cased percent markers used by the PERCENT heuristics.
PCT_WORDS = {
    "%",
    "проц",
    "процент",
    "процентов",
}
# Entity types (without the B-/I- prefix) recognised by the annotation parser.
ENTITY_TAGS = {
    "BRAND",
    "TYPE",
    "VOLUME",
    "PERCENT",
}

def ws_tokens_with_offsets(text: str) -> List[Tuple[str,int,int]]:
    """Split ``text`` on whitespace and keep each word's character offsets.

    Args:
        text: Input string; a falsy value (``None``, ``""``) is treated as empty.

    Returns:
        One ``(word, start, end)`` tuple per ``WS_RE`` match, in order of
        appearance, where ``start``/``end`` are the match's character offsets.
    """
    pieces = []
    for match in WS_RE.finditer(text or ""):
        pieces.append((match.group(0), match.start(), match.end()))
    return pieces

def token_feature_bias(words: List[str]) -> List[Dict[str,float]]:
    """Compute per-word soft hints (label -> additive bias) from surface features.

    NOTE(review): the loaded weights were presumably trained with exactly these
    heuristics and magnitudes, so they should not be tuned here -- confirm
    against the training notebook before changing any value.

    Args:
        words: Whitespace-level tokens of one sample.

    Returns:
        A list aligned with ``words``; each item maps label names to a bias
        value and is empty when no heuristic fires for that word.
    """
    lowered = [w.lower() for w in words]
    total = len(words)
    result = []
    for idx in range(total):
        low = lowered[idx]
        # Neighbouring words (lower-cased); empty string at the sequence edges.
        left = lowered[idx - 1] if idx >= 1 else ""
        right = lowered[idx + 1] if idx + 1 < total else ""

        numeric = NUM_RE.match(low) is not None
        latin = ASCII_RE.match(words[idx]) is not None

        bias: Dict[str,float] = {}

        # BRAND: a purely Latin-letter word that is not a unit.
        if latin and low not in UNITS:
            bias["B-BRAND"] = 0.25

        # PERCENT: a number adjacent to a percent word or a token containing "%".
        if numeric and any(nb in PCT_WORDS or "%" in nb for nb in (right, left)):
            bias["B-PERCENT"] = 0.35
            bias["I-PERCENT"] = 0.15

        # VOLUME: a number directly followed by a unit ...
        if numeric and right in UNITS:
            bias["B-VOLUME"] = 0.35
        # ... and the unit itself when directly preceded by a number.
        if low in UNITS and NUM_RE.match(left) is not None:
            bias["I-VOLUME"] = 0.20

        result.append(bias)
    return result

def safe_parse_annotation(s: str) -> List[Tuple[int,int,str]]:
    """Parse an annotation cell into a sorted list of ``(start, end, label)`` spans.

    The cell is first read as a Python literal and, if that fails or does not
    yield a sequence, as JSON. In both formats every item may be either a
    sequence ``(start, end, label, ...)`` or a dict with ``start``/``end`` and
    ``label`` (or ``tag``) keys.

    Label handling:
      * ``"O"`` is kept as is;
      * any other label is normalised to ``"B-<TAG>"`` / ``"I-<TAG>"`` (a label
        without a prefix gets ``"B-"``) and kept only if ``<TAG>`` is in
        ``ENTITY_TAGS``.

    Items with non-integer offsets or ``end <= start`` are skipped.

    Args:
        s: Raw cell value; ``None``, empty and ``nan``/``none``/``null`` strings
           produce an empty result.

    Returns:
        Spans sorted by ``(start, end)``; an empty list when nothing is parsable.
    """
    if s is None:
        return []
    txt = str(s).strip()
    if txt == "" or txt.lower() in {"nan", "none", "null"}:
        return []

    out: List[Tuple[int,int,str]] = []

    def _push(a, b, t):
        """Validate one raw span and append its normalised form to ``out``."""
        try:
            a, b = int(a), int(b)
        except (TypeError, ValueError, OverflowError):
            return  # offsets that are not integers: skip the item, keep the rest
        if b <= a:
            return
        if t == "O":
            out.append((a, b, "O"))
            return
        label = str(t)
        tag = label.split("-")[-1].upper()
        pref = label.split("-")[0] if "-" in label else "B"
        pref = "I" if pref.upper().startswith("I") else "B"
        if tag in ENTITY_TAGS:
            out.append((a, b, f"{pref}-{tag}"))

    def _collect(items):
        """Feed every supported item shape to ``_push`` (shared by both parsers)."""
        for it in items:
            if isinstance(it, (list, tuple)) and len(it) >= 3:
                _push(it[0], it[1], it[2])
            elif isinstance(it, dict):
                _push(it.get("start", 0), it.get("end", 0), it.get("label", it.get("tag", "O")))

    # Python literal syntax first, then JSON (which additionally understands
    # null/true/false). Only parse errors are swallowed here.
    for parse in (ast.literal_eval, json.loads):
        try:
            v = parse(txt)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if isinstance(v, (list, tuple)):
            _collect(v)
            return sorted(out, key=lambda z: (z[0], z[1]))
    return []

def spans_to_bio_on_words(text: str, spans: List[Tuple[int,int,str]]) -> List[str]:
    """Project character spans onto whitespace words and build word-level BIO labels.

    Each word takes the entity type of the span with the highest IoU
    (intersection over union of the character ranges). A word becomes ``I-``
    when the previous word's label ends with the same entity type, otherwise
    ``B-``. The ``B-``/``I-`` prefix carried by the span label is not used.

    Args:
        text: Raw sample text.
        spans: ``(start, end, label)`` tuples in character offsets.

    Returns:
        One BIO label per word of ``text``; ``"O"`` where no entity span overlaps.
    """
    bio: List[str] = []
    for _, w_start, w_end in ws_tokens_with_offsets(text):
        top_score, top_tag = 0.0, "O"
        for s_start, s_end, s_label in spans:
            overlap = max(0, min(w_end, s_end) - max(w_start, s_start))
            extent = max(w_end, s_end) - min(w_start, s_start)
            score = overlap / extent if extent > 0 else 0.0
            # Strict ">" keeps the earliest span on ties.
            if score > top_score:
                top_score = score
                top_tag = "O" if s_label == "O" else s_label.split("-")[-1]
        if top_score > 0 and top_tag in ENTITY_TAGS:
            continues_entity = bool(bio) and bio[-1].endswith(top_tag)
            bio.append(("I-" if continues_entity else "B-") + top_tag)
        else:
            bio.append("O")
    return bio


In [None]:
# ===========================
# Architecture + loading of the model / tokenizer
# ===========================
# Label inventory: one label per non-empty line of labels.txt, in file order.
labels_path = os.path.join(MODEL_DIR, "labels.txt")
assert os.path.exists(labels_path), f"labels.txt not found in {MODEL_DIR}"
with open(labels_path, "r", encoding="utf-8") as f:
    LABELS = list(filter(None, map(str.strip, f)))
# Forward and reverse lookups between label names and integer ids.
LABEL2ID = dict(zip(LABELS, range(len(LABELS))))
ID2LABEL = dict(zip(LABEL2ID.values(), LABEL2ID.keys()))

# Tokenizer is restored from the local folder stored next to the model.
tok_dir = os.path.join(MODEL_DIR, "tokenizer")
assert os.path.isdir(tok_dir), f"tokenizer dir not found: {tok_dir}"
tokenizer = AutoTokenizer.from_pretrained(tok_dir)

# The backbone is created from the model name, not from the tokenizer folder.
BASE_NAME = "xlm-roberta-base"

class TransformerCRF(nn.Module):
    """Transformer encoder with a linear emission layer and a CRF on top.

    Attribute names (``backbone``, ``emissions``, ``crf``) are part of the
    ``state_dict`` keys and must stay as they are for the saved weights to load.
    """

    def __init__(self, model_name: str, num_labels: int, bias_scale: float=0.5):
        """Build the encoder, the emission projection and the CRF.

        Args:
            model_name: Name or path passed to ``AutoModel.from_pretrained``.
            num_labels: Number of tags scored by the emission layer and the CRF.
            bias_scale: Multiplier applied to ``feat_bias`` in ``forward``.
        """
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.emissions = nn.Linear(self.backbone.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)
        self.bias_scale = float(bias_scale)

    def forward(self, input_ids, attention_mask, labels=None, feat_bias=None):
        """Return the CRF loss when ``labels`` is given, otherwise decoded tag paths.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone; its inner
                part (first and last position removed) is the decode mask.
            labels: Optional gold tag ids; positions equal to ``-100`` are
                excluded from the loss.
            feat_bias: Optional tensor added to the emission scores after being
                scaled by ``bias_scale`` (assumed to have the same shape as the
                emissions -- TODO confirm against callers).

        Returns:
            With ``labels``: the negated CRF output with ``reduction="mean"``.
            Without ``labels``: the result of ``crf.decode`` (tag id lists).
        """
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        scores = self.emissions(encoded.last_hidden_state)
        if feat_bias is not None:
            scores = scores + self.bias_scale * feat_bias.to(scores.dtype)

        # The first and last sequence positions are dropped before the CRF
        # (presumably the special tokens -- confirm for the tokenizer in use).
        inner_scores = scores[:, 1:-1, :]

        if labels is None:
            return self.crf.decode(inner_scores, mask=attention_mask[:, 1:-1].bool())

        gold = labels[:, 1:-1]
        ignored = gold == -100
        # Ignored positions get a placeholder tag 0; the mask removes them from the loss.
        tags = gold.masked_fill(ignored, 0)
        return -self.crf(inner_scores, tags.long(), mask=~ignored, reduction="mean")

# Re-create the architecture, then restore the saved weights from MODEL_DIR.
model = TransformerCRF(BASE_NAME, num_labels=len(LABELS), bias_scale=BIAS_SCALE)
model.to(DEVICE)
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
state = torch.load(os.path.join(MODEL_DIR, "model.pt"), map_location=DEVICE)
# strict=True makes any missing or unexpected key an error.
model.load_state_dict(state, strict=True)
model.eval()
print("Model loaded:", BASE_NAME, "| labels:", len(LABELS))

In [None]:
# ===========================
# Load train and prepare the "ground truth"
# ===========================
# Everything is read as plain strings; empty cells stay "" instead of NaN.
df = pd.read_csv(TRAIN_CSV, sep=";", dtype=str, keep_default_na=False)
assert {"sample", "annotation"} <= set(df.columns), "Нужны колонки sample;annotation"

# Character-level gold spans parsed from the annotation column.
df["__spans"] = df["annotation"].map(safe_parse_annotation)
# Whitespace words of every sample (offsets are dropped here).
df["__tokens"] = df["sample"].astype(str).map(
    lambda sample: [piece[0] for piece in ws_tokens_with_offsets(sample)]
)
# Word-level gold BIO labels obtained by projecting the spans onto the words.
df["__ws_bio"] = [
    spans_to_bio_on_words(sample, spans)
    for sample, spans in zip(df["sample"], df["__spans"])
]

# Sanity check: each row must have exactly one gold label per word.
ok = all(
    len(words) == len(tags)
    for words, tags in zip(df["__tokens"], df["__ws_bio"])
)
print("Word/BIO lengths match:", ok, "| rows:", len(df))

In [None]:
# ===========================
# Inference over the whole train set (word-level BIO)
# ===========================
@torch.no_grad()
def predict_words(words_list: List[List[str]], batch_size: int = 64) -> List[List[str]]:
    """Predict one BIO label per word for every pre-split sample.

    Each sample is tokenized with ``is_split_into_words=True`` (truncated to
    ``MAX_LEN`` sub-tokens), the soft hints from ``token_feature_bias`` are
    placed on the first sub-token of every word, and the model's decoded path
    is read back at those first sub-tokens.

    Args:
        words_list: Samples as lists of whitespace words.
        batch_size: Number of samples per forward pass (default 64, the value
            that used to be hard-coded).

    Returns:
        A list aligned with ``words_list``; each item has exactly
        ``len(words)`` labels. Words without a decoded label (e.g. lost to
        truncation) are filled with ``"O"``.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    preds_all: List[List[str]] = []
    num_labels = len(LABELS)
    for start in tqdm(range(0, len(words_list), batch_size), desc="infer(train)"):
        batch_words = words_list[start:start + batch_size]
        encs, wids, feats_list = [], [], []
        for words in batch_words:
            enc = tokenizer(words, is_split_into_words=True, truncation=True, max_length=MAX_LEN)
            encs.append(enc)
            wids.append(enc.word_ids())
            feats_list.append(token_feature_bias(words))
        padded = tokenizer.pad(encs, padding=True, return_tensors="pt")
        inp = {k: v.to(DEVICE) for k, v in padded.items()}

        # Build the hint tensor on the CPU and move it to the device once,
        # instead of creating one small device tensor per sub-token.
        n_rows, n_pos = inp["input_ids"].shape
        feat_bias_cpu = torch.zeros((n_rows, n_pos, num_labels), dtype=torch.float32)
        # Hints go on the first sub-token of every word only.
        for bi, wi in enumerate(wids):
            feats = feats_list[bi]
            prev = None
            for pos, wid in enumerate(wi):
                if wid is None:
                    continue  # position not mapped to any word
                if wid != prev:
                    for label, value in feats[wid].items():
                        if label in LABEL2ID:
                            feat_bias_cpu[bi, pos, LABEL2ID[label]] += value
                prev = wid
        feat_bias = feat_bias_cpu.to(DEVICE)

        paths = model(input_ids=inp["input_ids"], attention_mask=inp["attention_mask"], feat_bias=feat_bias)

        # Keep only the labels decoded at first sub-tokens -> one label per word.
        for bi, path in enumerate(paths):
            # The model drops the first and last position, so do the same with
            # the word ids to stay aligned with the decoded path.
            first = []
            prev = None
            for wid in wids[bi][1:-1]:
                first.append(wid is not None and wid != prev)
                prev = wid
            word_labels = [ID2LABEL[p] for p, keep in zip(path, first) if keep]

            # Force exactly one label per input word.
            n_words = len(batch_words[bi])
            if len(word_labels) < n_words:
                word_labels += ["O"] * (n_words - len(word_labels))
            elif len(word_labels) > n_words:
                word_labels = word_labels[:n_words]
            preds_all.append(word_labels)
    return preds_all

# Run inference on every row and store the word-level predictions next to the
# gold labels (column "__ws_bio") so later cells can compare them row by row.
pred_bio = predict_words(df["__tokens"].tolist())
df["__pred_bio"] = pred_bio

In [None]:
# ===========================
# Metrics (seqeval) and report
# ===========================
true_seqs = df["__ws_bio"].tolist()
pred_seqs = df["__pred_bio"].tolist()

print("\n=== Classification report (seqeval, без 'O') ===")
print(classification_report(true_seqs, pred_seqs, digits=3, zero_division=0))
# seqeval's f1_score defaults to average="micro", so the averaging mode is
# passed explicitly to make each printed value match its label.
print("micro-F1:", f1_score(true_seqs, pred_seqs, average="micro", zero_division=0))
print("macro-F1:", f1_score(true_seqs, pred_seqs, average="macro", zero_division=0))

In [None]:
# ===========================
# Error collection: where the BIO labels disagree
# ===========================
def diff_indices(y_true: List[str], y_pred: List[str]) -> List[int]:
    """Return the positions at which the gold and predicted labels differ.

    Only positions present in both sequences are compared; if the lengths
    differ, the extra tail of the longer sequence is ignored.
    """
    mismatches: List[int] = []
    for pos, pair in enumerate(zip(y_true, y_pred)):
        gold, pred = pair
        if gold != pred:
            mismatches.append(pos)
    return mismatches

# Column layout of the error report. Passing it to the DataFrame constructor
# keeps the frame well-formed (and sortable) even when no row has errors.
ERROR_COLUMNS = [
    "row_id", "sample", "tokens", "true_bio", "pred_bio",
    "diff_cnt", "diff_preview", "has_VP_true", "has_VP_pred",
]

rows = []
for idx, (text, toks, y_t, y_p) in enumerate(zip(df["sample"], df["__tokens"], df["__ws_bio"], df["__pred_bio"])):
    diffs = diff_indices(y_t, y_p)
    if diffs:
        # Compact preview of at most 25 mismatches: position:token(gold→pred)
        preview = ", ".join(f"{i}:{toks[i]}({y_t[i]}→{y_p[i]})" for i in diffs[:25])
        rows.append({
            "row_id": idx,
            "sample": text,
            "tokens": " ".join(toks),
            "true_bio": " ".join(y_t),
            "pred_bio": " ".join(y_p),
            "diff_cnt": len(diffs),
            "diff_preview": preview,
            # Flags: does the gold / predicted sequence contain a VOLUME or PERCENT tag?
            "has_VP_true": any(x.endswith(("VOLUME","PERCENT")) for x in y_t if x!="O"),
            "has_VP_pred": any(x.endswith(("VOLUME","PERCENT")) for x in y_p if x!="O"),
        })

# Rows with the most mismatches first. Without explicit columns an empty
# `rows` list would give a column-less frame and sort_values would raise KeyError.
errors_df = (
    pd.DataFrame(rows, columns=ERROR_COLUMNS)
    .sort_values("diff_cnt", ascending=False)
    .reset_index(drop=True)
)
errors_df.to_csv(ERRORS_CSV, index=False)
print(f"\nSaved errors → {ERRORS_CSV}")
print(f"Ошибочных строк: {len(errors_df)} из {len(df)}")
display(errors_df.head(15))