# Baseline

In [10]:
from ast import literal_eval
import pandas as pd

train_data = pd.read_csv('data_base/train_with_aug.csv', sep=';')
train_data = train_data.drop_duplicates()

def parse_annotation(ann):
    """Turn a stringified annotation into the Python object it spells out.

    The string is evaluated with `ast.literal_eval`, so only Python literals
    are accepted.
    """
    parsed = literal_eval(ann)
    return parsed

def align_tokens_and_labels_v2(text, annotations):
    """
    Assign one label to every whitespace-separated token of `text`.

    Args:
        text: the raw sample string.
        annotations: iterable of `(start, end, label)` character spans.

    Returns:
        `(tokens, labels)`: the tokens from `text.split()` and, for each token,
        the label of the annotation with the largest positive character overlap
        ('O' when no annotation overlaps it or the winning label is empty).
        On equal overlap the annotation that starts first wins.
    """
    tokens = text.split()
    ordered = sorted(annotations, key=lambda ann: ann[0])

    # Recover each token's character span by scanning the text left to right.
    spans = []
    cursor = 0
    for tok in tokens:
        begin = text.find(tok, cursor)
        cursor = begin + len(tok)
        spans.append((begin, cursor))

    labels = []
    for tok_start, tok_end in spans:
        chosen, widest = None, 0
        for ann_start, ann_end, ann_label in ordered:
            shared = min(tok_end, ann_end) - max(tok_start, ann_start)
            # `widest` starts at 0, so only strictly positive overlaps can win.
            if shared > widest:
                chosen, widest = ann_label, shared
        labels.append(chosen if chosen else 'O')

    return tokens, labels

def make_txt(df, txt_path):
    """
    Write `df` to `txt_path` in a token-per-line format.

    For every row the 'sample' text is aligned with its parsed 'annotation';
    each token becomes a "<token> <label>" line and an empty line closes the
    sentence. The file is written as UTF-8.
    """
    lines = []
    for text, raw_annotation in zip(df['sample'], df['annotation']):
        tokens, bio_labels = align_tokens_and_labels_v2(text, parse_annotation(raw_annotation))
        lines.extend(f"{token} {label}" for token, label in zip(tokens, bio_labels))
        lines.append('')  # blank line between sentences

    with open(txt_path, 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(lines))

In [11]:
train_data.shape

(89568, 2)

In [14]:
def check_bio(tags):
    """
    Validate a BIO tag sequence, repairing stray 'I-' tags in place.

    Every tag must be 'O' or '<B|I>-<NAME>' with exactly one hyphen; the first
    tag that is not makes the function return False (repairs made before that
    point stay in `tags`). An 'I-' tag that does not continue a preceding tag of
    the same name is rewritten to 'B-'.

    Returns:
        True when every tag is well formed, False otherwise.
    """
    for pos in range(len(tags)):
        current = tags[pos]
        if current == 'O':
            continue

        parts = current.split('-')
        if not (len(parts) == 2 and parts[0] in ('B', 'I')):
            return False

        if parts[0] == 'I':
            suffix = current[1:]  # '-NAME', compared against the previous tag's suffix
            continues_previous = (
                pos > 0
                and tags[pos - 1] != 'O'
                and tags[pos - 1][1:] == suffix
            )
            if not continues_previous:
                tags[pos] = 'B' + suffix
    return True

def get_tags(annotations):
    """Return a new list holding the third element (the tag) of every annotation."""
    return [ann[2] for ann in annotations]


# Parse every stringified annotation into a Python list of span tuples.
annotations = train_data["annotation"].tolist()
annotations = [literal_eval(ann) for ann in annotations]
# check_bio rewrites stray I- tags in place, but only on the temporary list built by
# get_tags: the parsed annotations and the DataFrame column are left untouched, so only
# the True/False verdict is kept here.
checks = [check_bio(get_tags(ann)) for ann in annotations]
train_data['is_valid_BIO'] = checks
# Keep only the rows whose tags passed the BIO validation.
train_data = train_data[train_data['is_valid_BIO'] == True]

In [15]:
from sklearn.model_selection import train_test_split

TAGS = ['B-TYPE', 'B-BRAND', 'O', 'I-TYPE', 'I-BRAND', 'B-VOLUME', 'I-VOLUME', 'B-PERCENT', 'I-PERCENT']
SPLIT_SEED = 42  # fixed seed so the train/validation split is reproducible across runs

# Work on an explicit copy: train_data is the result of a boolean filter, and assigning
# columns to such a slice can raise pandas' SettingWithCopyWarning.
train_data = train_data.copy()

# NOTE(review): deleting U+00A0 shifts the character offsets stored in 'annotation' for
# any sample that contained it — confirm this is intended.
train_data['sample'] = train_data['sample'].apply(lambda x: x.replace('\xa0', ''))

# One 0/1 indicator column per tag. Tags are read from the parsed annotation instead of
# being searched as substrings of its string form: a substring test makes the 'O' column
# fire on every annotation that mentions 'B-VOLUME' or 'I-VOLUME'.
row_tags = train_data['annotation'].apply(lambda raw: {span[2] for span in literal_eval(raw)})
for tag in TAGS:
    train_data[tag] = row_tags.apply(lambda present, tag=tag: 1 if tag in present else 0)

train_data = train_data.drop_duplicates()
train_df, val_df = train_test_split(train_data, test_size=0.1, random_state=SPLIT_SEED)

In [16]:
make_txt(train_df, 'data_base/train_aug_sub_new.txt')
make_txt(val_df, 'data_base/val_aug_sub_new.txt')

In [85]:
# !python3 -m spacy init config config/base_config.cfg -p ner

In [86]:
!python3 -m spacy init fill-config config/base_config.cfg config/config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
config/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [15]:
!python3 -m spacy convert data_base/train_aug_sub_new.txt data/ -c ner

[38;5;4mℹ Auto-detected token-per-line NER format[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (60777 documents):
data/train_aug_sub_new.spacy[0m


In [16]:
!python3 -m spacy convert data_base/val_aug_sub_new.txt data/ -c ner

[38;5;4mℹ Auto-detected token-per-line NER format[0m
[38;5;4mℹ Grouping every 1 sentences into a document.[0m
[38;5;3m⚠ To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m✔ Generated output file (6137 documents):
data/val_aug_sub_new.spacy[0m


In [19]:
!python3 -m spacy train config/config_best_new.cfg --output model_ner/ --paths.train data/train_aug_sub_new.spacy --paths.dev data/val_aug_sub_new.spacy --gpu-id 1

[38;5;4mℹ Saving to output directory: model_ner[0m
[38;5;4mℹ Using GPU: 1[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0005[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     87.06   39.02   32.37   49.11    0.39
  0     500        554.84  20262.20   86.27   85.85   86.69    0.86
  1    1000        877.48  17353.38   89.09   89.14   89.04    0.89
  2    1500       1335.89  20907.98   90.76   90.40   91.12    0.91
  5    2000       2064.68  24441.67   92.32   91.81   92.84    0.92
  8    2500       3403.21  28338.38   92.20   91.71   92.70    0.92
 12    3000       3754.71  23001.01   92.33   91.98   92.68    0.92
 16    3500       3951.49  18548.87   92.11   91.73   92.49    0.92
 20    4000       4197.47  16208.38   92.40   92.39   92.41    0.92
 24    4500       4649.52  14404.98   9

In [19]:
!python3 -m spacy evaluate model/model-best data/val_aug_sub_new.spacy --gpu-id 0

[38;5;4mℹ Using GPU: 0[0m
[1m

TOK     -    
NER P   91.68
NER R   93.09
NER F   92.38
SPEED   3822 

[1m

              P       R       F
TYPE      93.13   94.72   93.92
BRAND     87.35   88.50   87.92
VOLUME    93.44   94.53   93.98
PERCENT   98.72   98.72   98.72



In [20]:
import unicodedata
import re
from typing import List, Tuple

def _is_punct(ch: str) -> bool:
    # Любой символ категории Unicode "P" — пунктуация,
    # но % исключаем из удаления
    return unicodedata.category(ch).startswith("P") and ch != "%"

def _strip_punct(s: str) -> str:
    """Return `s` with every character that `_is_punct` accepts removed."""
    kept = [ch for ch in s if not _is_punct(ch)]
    return "".join(kept)

def predict_with_punct(nlp, s: str) -> List[List[str]]:
    """
    Predict entities on a punctuation-free copy of `s` and map them back to `s`.

    The string is split on whitespace, punctuation (except '%') is stripped from
    every token, the surviving tokens are lower-cased and joined with single
    spaces, and the result is passed to `nlp`. Every entity in `doc.ents` is then
    widened to the original whitespace tokens it overlaps.

    Args:
        nlp: callable returning an object whose `.ents` items expose `start_char`,
            `end_char` and `label_` (a loaded spaCy pipeline in this notebook).
        s: raw input text.

    Returns:
        A list of `[original_fragment_with_punctuation, label]` pairs, one per
        entity that overlaps at least one kept token; `[]` when nothing is left
        after cleaning.
    """
    # Whitespace-delimited tokens of the original string with their character spans.
    orig_tokens: List[Tuple[int, int, str]] = [
        (m.start(), m.end(), m.group()) for m in re.finditer(r"\S+", s)
    ]

    clean_pieces = []
    clean_spans = []  # (start, end, index into orig_tokens) in clean-text coordinates
    clean_cursor = 0

    for i, (_, _, tok) in enumerate(orig_tokens):
        # Lower-case each token BEFORE measuring it: str.lower() can change the length
        # (e.g. 'İ' becomes two code points), and lowering the joined text afterwards
        # would shift every following span relative to the text handed to nlp.
        clean_tok = _strip_punct(tok).lower()
        if not clean_tok:
            continue  # token consisted of punctuation only
        if clean_pieces:
            clean_pieces.append(" ")
            clean_cursor += 1
        clean_start = clean_cursor
        clean_pieces.append(clean_tok)
        clean_cursor += len(clean_tok)
        clean_spans.append((clean_start, clean_cursor, i))

    clean_text = "".join(clean_pieces)
    if not clean_text.strip():
        return []

    doc = nlp(clean_text)

    results: List[List[str]] = []
    for ent in doc.ents:
        ent_start, ent_end = ent.start_char, ent.end_char
        # Original-token indices whose clean span intersects the entity span.
        covered = [idx for cst, cen, idx in clean_spans if cst < ent_end and cen > ent_start]
        if not covered:
            continue
        first, last = min(covered), max(covered)
        # Slice the ORIGINAL string so punctuation and casing are restored.
        results.append([s[orig_tokens[first][0]:orig_tokens[last][1]], ent.label_])

    return results

In [68]:
new_df = train_df[['sample', 'annotation']]

In [69]:
# Run the model over every training sample, keeping predictions in row order.
# NOTE: `nlp` is only loaded further down the file, so it must already exist in the kernel.
preds = [predict_with_punct(nlp, sample) for sample in new_df['sample']]

In [70]:
len(preds)

36283

In [71]:
new_df.shape

(36283, 2)

In [21]:
import spacy
import pandas as pd

sub = pd.read_csv("submissions/sub_base.csv", sep=';')
# sub = pd.read_csv("data/train.csv", sep=';')

nlp = spacy.load("model_ner/model-best")

In [22]:
# Run the model over every submission sample, keeping predictions in row order.
preds = [predict_with_punct(nlp, sample) for sample in sub['sample']]

In [23]:
# Store the predictions, then serialise each list to its string repr;
# make_submission parses that string back with literal_eval.
sub["new_model"] = preds
sub['new_model'] = sub['new_model'].astype(str)

In [24]:
def _split_into_tokens(text):
    """Разбивает текст на токены по пробелам"""
    if not text:
        return []
    
    tokens = []
    start = 0
    for i, char in enumerate(text):
        if char == ' ':
            if start < i:
                tokens.append((start, i))
            start = i + 1
    
    if start < len(text):
        tokens.append((start, len(text)))
    
    return tokens

def _tokenize_text(text):
    """Return `(lower-cased token, start, end)` for every span from `_split_into_tokens`."""
    return [(text[lo:hi].lower(), lo, hi) for lo, hi in _split_into_tokens(text)]

def convert_model2_to_model1(text, model2_results):
    """
    Convert `[entity_text, entity_type]` pairs into per-token `(start, end, tag)` spans.

    `text` is tokenised with `_tokenize_text`; every occurrence of an entity's
    token sequence in the text is tagged 'B-<type>' on its first token and
    'I-<type>' on the rest. Entities are applied in order, so a later entity
    overwrites tags set by an earlier one. Untouched tokens are tagged 'O'.

    Args:
        text: the sample string; anything that is not a `str` yields `[]`.
        model2_results: iterable of `[entity_text, entity_type]` pairs. Items that
            are not a list/tuple of at least two strings are ignored; a falsy
            value means "no entities".

    Returns:
        One `(start, end, tag)` tuple per token of `text`, in text order.
    """
    if not isinstance(text, str):
        return []

    text_tokens = _tokenize_text(text)
    if not text_tokens:
        return []

    words = [word for word, _, _ in text_tokens]
    tags = ['O'] * len(words)

    for entity in model2_results or ():
        if not isinstance(entity, (list, tuple)) or len(entity) < 2:
            continue

        entity_text, entity_type = entity[0], entity[1]
        if not isinstance(entity_text, str) or not isinstance(entity_type, str):
            continue

        # Tokenise the entity exactly like the text (single-space separators,
        # lower-cased). Splitting it on arbitrary whitespace instead would break a
        # fragment such as "a\tb" into pieces that can never equal the text token.
        entity_tokens = [word for word, _, _ in _tokenize_text(entity_text)]
        if not entity_tokens:
            continue

        width = len(entity_tokens)
        i = 0
        while i <= len(words) - width:
            if words[i:i + width] == entity_tokens:
                tags[i] = 'B-' + entity_type
                tags[i + 1:i + width] = ['I-' + entity_type] * (width - 1)
                i += width  # jump past the entity that was just tagged
            else:
                i += 1

    return [(start, end, tag) for (_, start, end), tag in zip(text_tokens, tags)]


def make_submission(df, spicy_col: str):
    """
    Build the submission frame from stringified model predictions.

    Each value of `df[spicy_col]` is parsed with `literal_eval` and converted,
    together with the row's 'sample', by `convert_model2_to_model1`. The result
    is written to `df['annotation']` (the caller's frame is modified) and the
    ['sample', 'annotation'] columns are returned.
    """
    samples = df['sample'].tolist()
    raw_predictions = df[spicy_col].tolist()

    converted = []
    for text, raw in zip(samples, raw_predictions):
        converted.append(convert_model2_to_model1(text, literal_eval(raw)))

    df['annotation'] = converted
    return df[['sample', 'annotation']]

In [25]:
from ast import literal_eval

# Tag used in the file name of this submission.
version = f"pushka_gonka_v6"
# Convert the stringified predictions in 'new_model' to per-token spans and save them.
sub_df = make_submission(sub, 'new_model')
sub_df.to_csv(f"submissions/sub_{version}.csv", sep=';', index=False)

In [27]:
print("avd")

avd


In [19]:
import spacy
import pandas as pd
import unicodedata
import re
from ast import literal_eval
from typing import List, Tuple

# -----------------------------
# ТВОИ ФУНКЦИИ (вставь их как есть)
# -----------------------------

def _is_punct(ch: str) -> bool:
    return unicodedata.category(ch).startswith("P") and ch != "%"

def _strip_punct(s: str) -> str:
    """Return `s` with every character that `_is_punct` accepts removed."""
    kept = [ch for ch in s if not _is_punct(ch)]
    return "".join(kept)

def predict_with_punct(nlp, s: str) -> List[List[str]]:
    """
    Predict entities on a punctuation-free copy of `s` and map them back to `s`.

    The string is split on whitespace, punctuation (except '%') is stripped from
    every token, the surviving tokens are lower-cased and joined with single
    spaces, and the result is passed to `nlp`. Every entity in `doc.ents` is then
    widened to the original whitespace tokens it overlaps.

    Args:
        nlp: callable returning an object whose `.ents` items expose `start_char`,
            `end_char` and `label_` (a loaded spaCy pipeline in this notebook).
        s: raw input text.

    Returns:
        A list of `[original_fragment_with_punctuation, label]` pairs, one per
        entity that overlaps at least one kept token; `[]` when nothing is left
        after cleaning.
    """
    # Whitespace-delimited tokens of the original string with their character spans.
    orig_tokens: List[Tuple[int, int, str]] = [
        (m.start(), m.end(), m.group()) for m in re.finditer(r"\S+", s)
    ]

    clean_pieces = []
    clean_spans = []  # (start, end, index into orig_tokens) in clean-text coordinates
    clean_cursor = 0

    for i, (_, _, tok) in enumerate(orig_tokens):
        # Lower-case each token BEFORE measuring it: str.lower() can change the length
        # (e.g. 'İ' becomes two code points), and lowering the joined text afterwards
        # would shift every following span relative to the text handed to nlp.
        clean_tok = _strip_punct(tok).lower()
        if not clean_tok:
            continue  # token consisted of punctuation only
        if clean_pieces:
            clean_pieces.append(" ")
            clean_cursor += 1
        clean_start = clean_cursor
        clean_pieces.append(clean_tok)
        clean_cursor += len(clean_tok)
        clean_spans.append((clean_start, clean_cursor, i))

    clean_text = "".join(clean_pieces)
    if not clean_text.strip():
        return []

    doc = nlp(clean_text)

    results: List[List[str]] = []
    for ent in doc.ents:
        ent_start, ent_end = ent.start_char, ent.end_char
        # Original-token indices whose clean span intersects the entity span.
        covered = [idx for cst, cen, idx in clean_spans if cst < ent_end and cen > ent_start]
        if not covered:
            continue
        first, last = min(covered), max(covered)
        # Slice the ORIGINAL string so punctuation and casing are restored.
        results.append([s[orig_tokens[first][0]:orig_tokens[last][1]], ent.label_])

    return results

# -----------------------------
# ФУНКЦИИ ДЛЯ КОНВЕРТАЦИИ В [(start, end, tag)]
# -----------------------------

def _split_into_tokens(text):
    if not text:
        return []
    tokens = []
    start = 0
    for i, char in enumerate(text):
        if char == ' ':
            if start < i:
                tokens.append((start, i))
            start = i + 1
    if start < len(text):
        tokens.append((start, len(text)))
    return tokens

def _tokenize_text(text):
    """Return `(lower-cased token, start, end)` for every span from `_split_into_tokens`."""
    return [(text[lo:hi].lower(), lo, hi) for lo, hi in _split_into_tokens(text)]

def convert_model2_to_model1(text, model2_results):
    """
    Convert `[entity_text, entity_type]` pairs into per-token `(start, end, tag)` spans.

    `text` is tokenised with `_tokenize_text`; every occurrence of an entity's
    token sequence in the text is tagged 'B-<type>' on its first token and
    'I-<type>' on the rest. Entities are applied in order, so a later entity
    overwrites tags set by an earlier one. Untouched tokens are tagged 'O'.

    Args:
        text: the sample string; anything that is not a `str` yields `[]`.
        model2_results: iterable of `[entity_text, entity_type]` pairs. Items that
            are not a list/tuple of at least two strings are ignored; a falsy
            value means "no entities".

    Returns:
        One `(start, end, tag)` tuple per token of `text`, in text order.
    """
    if not isinstance(text, str):
        return []

    text_tokens = _tokenize_text(text)
    if not text_tokens:
        return []

    words = [word for word, _, _ in text_tokens]
    tags = ['O'] * len(words)

    for entity in model2_results or ():
        if not isinstance(entity, (list, tuple)) or len(entity) < 2:
            continue
        entity_text, entity_type = entity[0], entity[1]
        if not isinstance(entity_text, str) or not isinstance(entity_type, str):
            continue

        # Tokenise the entity exactly like the text (single-space separators,
        # lower-cased). Splitting it on arbitrary whitespace instead would break a
        # fragment such as "a\tb" into pieces that can never equal the text token.
        entity_tokens = [word for word, _, _ in _tokenize_text(entity_text)]
        if not entity_tokens:
            continue

        width = len(entity_tokens)
        i = 0
        while i <= len(words) - width:
            if words[i:i + width] == entity_tokens:
                tags[i] = 'B-' + entity_type
                tags[i + 1:i + width] = ['I-' + entity_type] * (width - 1)
                i += width  # jump past the entity that was just tagged
            else:
                i += 1

    return [(start, end, tag) for (_, start, end), tag in zip(text_tokens, tags)]

# -----------------------------
# ОСНОВНОЙ КОД
# -----------------------------

# Загрузка
sub = pd.read_csv("data/train.csv", sep=';')
nlp = spacy.load("model/model-best")

# Преобразуем истинную разметку из строки в список (если нужно)
def safe_eval(x):
    """
    Parse a stringified Python literal, tolerating malformed input.

    Args:
        x: a string to parse with `ast.literal_eval`, or any other object.

    Returns:
        The parsed value for a valid literal string, `[]` for a string that cannot
        be parsed, and `x` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    try:
        return literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Exactly the exceptions ast.literal_eval is documented to raise on malformed
        # input; a bare `except` would also swallow KeyboardInterrupt and SystemExit.
        return []

# Parse the gold annotation strings into Python lists.
sub['true'] = sub['annotation'].apply(safe_eval)

# Run the model on every sample.
print("Делаем предсказания...")
sub['pred_raw'] = sub['sample'].apply(lambda x: predict_with_punct(nlp, x))

# Convert the predictions to the [(start, end, tag)] format.
print("Конвертируем предсказания...")
sub['pred'] = [
    convert_model2_to_model1(text, pred)
    for text, pred in zip(sub['sample'], sub['pred_raw'])
]

# Нормализуем для сравнения (сортируем по start)
def normalize_spans(spans):
    """
    Canonicalise a list of spans so two annotations can be compared with `==`.

    Items that are lists/tuples of exactly three elements become
    `(int(start), int(end), str(tag))`; every other item is dropped. The result
    is sorted by start offset. A non-list input, or any item whose offsets
    cannot be converted, yields `[]`.
    """
    if not isinstance(spans, list):
        return []

    try:
        triples = [
            (int(item[0]), int(item[1]), str(item[2]))
            for item in spans
            if isinstance(item, (list, tuple)) and len(item) == 3
        ]
    except Exception:
        return []

    triples.sort(key=lambda triple: triple[0])
    return triples

# Bring gold and predicted spans to the same canonical form.
sub['true_norm'] = sub['true'].apply(normalize_spans)
sub['pred_norm'] = sub['pred'].apply(normalize_spans)

# Rows whose normalised prediction differs from the normalised gold annotation.
mismatches = sub[sub['true_norm'] != sub['pred_norm']].copy()

# Keep only the columns needed for inspection.
result = mismatches[['sample', 'true', 'pred']]
print(f"\nНайдено несоответствий: {len(result)}\n")

Делаем предсказания...
Конвертируем предсказания...

Найдено несоответствий: 494



In [20]:
result.head()

Unnamed: 0,sample,true,pred
55,aloe,"[(0, 4, B-TYPE)]","[(0, 4, B-BRAND)]"
179,avyanka,"[(0, 7, O)]","[(0, 7, B-BRAND)]"
227,beer,"[(0, 4, B-TYPE)]","[(0, 4, B-BRAND)]"
228,beerka,"[(0, 6, B-BRAND)]","[(0, 6, B-TYPE)]"
287,bodyr,"[(0, 5, O)]","[(0, 5, B-BRAND)]"


In [22]:
result.to_csv(f"mistake/mistake_v1.csv", sep=';', index=False)

In [21]:
result.iloc[:50,:]

Unnamed: 0,sample,true,pred
55,aloe,"[(0, 4, B-TYPE)]","[(0, 4, B-BRAND)]"
179,avyanka,"[(0, 7, O)]","[(0, 7, B-BRAND)]"
227,beer,"[(0, 4, B-TYPE)]","[(0, 4, B-BRAND)]"
228,beerka,"[(0, 6, B-BRAND)]","[(0, 6, B-TYPE)]"
287,bodyr,"[(0, 5, O)]","[(0, 5, B-BRAND)]"
312,bonfet,"[(0, 6, O)]","[(0, 6, B-BRAND)]"
558,cахъар,"[(0, 6, B-TYPE)]","[(0, 6, B-BRAND)]"
561,cливки,"[(0, 6, B-TYPE)]","[(0, 6, B-BRAND)]"
565,cушки,"[(0, 5, B-TYPE)]","[(0, 5, B-BRAND)]"
578,dececo,"[(0, 6, O)]","[(0, 6, B-BRAND)]"


In [5]:
result.iloc[:50,:]

Unnamed: 0,sample,true,pred
57,alpe,"[(0, 4, B-BRAND)]","[(0, 4, B-TYPE)]"
67,alpin gold he,"[(0, 5, B-BRAND), (6, 10, I-BRAND), (11, 13, O)]","[(0, 5, B-BRAND), (6, 10, I-BRAND), (11, 13, I..."
80,aotime,"[(0, 6, O)]","[(0, 6, B-BRAND)]"
119,art,"[(0, 3, O)]","[(0, 3, B-BRAND)]"
218,bastus,"[(0, 6, O)]","[(0, 6, B-BRAND)]"
368,caib,"[(0, 4, O)]","[(0, 4, B-TYPE)]"
377,caramel,"[(0, 7, B-TYPE)]","[(0, 7, B-BRAND)]"
430,cherry,"[(0, 6, B-TYPE)]","[(0, 6, B-BRAND)]"
495,coffeso,"[(0, 7, B-BRAND)]","[(0, 7, B-TYPE)]"
604,deonica кому-нибудь,"[(0, 7, B-BRAND), (8, 19, O)]","[(0, 7, B-BRAND), (8, 19, B-TYPE)]"


In [84]:
import spacy
from spacy.tokens import DocBin
import pandas as pd

def evaluate_spacy_model_errors_only(MODEL_PATH: str, DEV_DATA_PATH: str) -> pd.DataFrame:
    """
    Collect the documents on which the model's tags differ from the gold tags.

    The pipeline is loaded from `MODEL_PATH`, the gold documents from the DocBin
    at `DEV_DATA_PATH`. Gold and predicted entities are expanded to per-token
    B-/I-/O tags; both tag lists are cut to the shorter length before comparison.

    Returns:
        A DataFrame with columns "text", "true_labels" and "pred_labels"
        containing only the mismatching documents.
    """
    def _bio_tags(doc):
        # One tag per token: B- on the first token of an entity, I- on the rest.
        tags = ["O"] * len(doc)
        for span in doc.ents:
            for pos in range(span.start, span.end):
                tags[pos] = f"B-{span.label_}" if pos == span.start else f"I-{span.label_}"
        return tags

    nlp = spacy.load(MODEL_PATH)
    gold_docs = list(DocBin().from_disk(DEV_DATA_PATH).get_docs(nlp.vocab))

    mismatches = []
    for gold_doc in gold_docs:
        raw_text = gold_doc.text
        gold = _bio_tags(gold_doc)
        predicted = _bio_tags(nlp(raw_text))

        # Truncate to a common length in case the two tokenisations disagree.
        common = min(len(gold), len(predicted))
        gold, predicted = gold[:common], predicted[:common]

        if gold != predicted:
            mismatches.append({
                "text": raw_text,
                "true_labels": gold,
                "pred_labels": predicted
            })

    return pd.DataFrame(mismatches)

In [103]:
# Validation set and model to compare.
DEV_DATA_PATH = "data/valid_aug_v4.spacy"
MODEL_PATH = "model/model-best"

df_errors = evaluate_spacy_model_errors_only(MODEL_PATH, DEV_DATA_PATH)
# The DocBin is read from disk a second time here just to report the total document count.
print(f"Найдено ошибок: {len(df_errors)} из {len(DocBin().from_disk(DEV_DATA_PATH))} документов")

Найдено ошибок: 349 из 3047 документов


In [90]:
df_errors.to_csv("model_errors_only.csv", sep=";", index=False)

In [105]:
df_errors.iloc[200:250,:]

Unnamed: 0,text,true_labels,pred_labels
200,мороженое крем брюле,"[B-TYPE, O, O]","[B-TYPE, I-TYPE, B-BRAND]"
201,варено-копченые унипр,"[B-TYPE, B-BRAND]","[B-TYPE, O]"
202,водв святой источник,"[B-TYPE, O, O]","[B-TYPE, B-BRAND, I-BRAND]"
203,млдом,[O],[B-TYPE]
204,чиабат,[B-TYPE],[B-BRAND]
205,пемолюкс,[B-BRAND],[B-TYPE]
206,мажитель,[B-BRAND],[B-TYPE]
207,поп корн сладко соленный,"[B-TYPE, I-TYPE, O, O]","[B-TYPE, I-TYPE, B-BRAND, I-BRAND]"
208,"гримы, гуммо","[B-TYPE, I-TYPE]","[B-TYPE, O]"
209,сухарики с томатом,"[B-TYPE, I-TYPE, I-TYPE]","[B-TYPE, O, O]"


In [15]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy
from pathlib import Path

# -----------------------------
# Settings — replace with your own paths
# -----------------------------
# MODEL_PATH = "model/model-best"          # path to the trained model
DEV_DATA_PATH = "data/valid_aug_v4.spacy"   # path to the validation set

# -----------------------------
# Loading
# -----------------------------
# NOTE: the model load below is commented out, so `nlp` must already exist in the kernel.
# nlp = spacy.load(MODEL_PATH)
doc_bin = DocBin().from_disk(DEV_DATA_PATH)
docs = list(doc_bin.get_docs(nlp.vocab))

print(f"Загружено {len(docs)} документов из {DEV_DATA_PATH}\n")

def spacy_doc_to_bio(doc):
    """
    Convert a document with `doc.ents` into `[(start, end, BIO_tag), ...]`.

    Tokens come from `_tokenize_text(doc.text)`, not from the document's own
    tokenisation. A token belongs to an entity only when it lies completely
    inside the entity's character span; the first such token gets 'B-<label>',
    the others 'I-<label>'. All remaining tokens are tagged 'O'.
    """
    tokens = _tokenize_text(doc.text)
    if not tokens:
        return []

    tags = ['O'] * len(tokens)

    # Entities as (start_char, end_char, label), processed in sorted order.
    entities = sorted((ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)
    for ent_start, ent_end, ent_label in entities:
        inside = [
            k for k, (_, tok_start, tok_end) in enumerate(tokens)
            if tok_start >= ent_start and tok_end <= ent_end
        ]
        if not inside:
            continue

        first, *rest = inside
        tags[first] = f"B-{ent_label}"
        for k in rest:
            tags[k] = f"I-{ent_label}"

    return [(start, end, tag) for (_, start, end), tag in zip(tokens, tags)]


# One row per validation document in which at least one token tag is wrong.
all_error_rows = []

for doc in docs:
    pred_doc = nlp(doc.text)
    gold_bio = spacy_doc_to_bio(doc)
    pred_bio = spacy_doc_to_bio(pred_doc)

    # Skip the document if the token counts differ.
    if len(gold_bio) != len(pred_bio):
        continue

    tokens = [doc.text[start:end] for (start, end, _) in gold_bio]
    gold_tags = [tag for (_, _, tag) in gold_bio]
    pred_tags = [tag for (_, _, tag) in pred_bio]

    # Check whether this document contains at least one mismatching tag.
    has_error = any(g != p for g, p in zip(gold_tags, pred_tags))
    if not has_error:
        continue

    # Store the WHOLE document as a single error example.
    all_error_rows.append({
        "Выражение": doc.text,
        "Истинные метки": gold_tags,
        "Предсказание модели": pred_tags
    })

# Build the DataFrame.
df_all_errors = pd.DataFrame(all_error_rows)

print(f"Всего документов с ошибками: {len(df_all_errors)}")
df_all_errors.head()

Загружено 3047 документов из data/valid_aug_v4.spacy



In [None]:
df = df_all_errors.copy()

# Пример: найти строки с символами
import re
def has_special_char(text):
    """Return True when `text` contains a character outside the allowed set.

    Allowed are Cyrillic letters (including ё/Ё), whitespace and the
    punctuation marks . , ! ? ; : — anything else counts as "special".
    """
    pattern = r"[^а-яА-ЯёЁ\s.,!?;:]"
    return re.search(pattern, text) is not None

df["has_special"] = df["Выражение"].apply(has_special_char)
df["has_english"] = df["Выражение"].str.contains(r"[a-zA-Z]", regex=True)

print("Ошибки с символами:", df[df["has_special"]].shape[0])
print("Ошибки с английским:", df[df["has_english"]].shape[0])
print("Пересечение:", df[df["has_special"] & df["has_english"]].shape[0])

In [None]:
df_all_errors.iloc[50:100,:]

In [53]:
from collections import defaultdict
from collections import Counter

# Documents with at least one wrong tag, each with a token-level annotated rendering.
error_examples = []

for doc in docs:
    # Run the model on the raw text.
    pred_doc = nlp(doc.text)

    # Convert both documents to BIO format.
    gold_bio = spacy_doc_to_bio(doc)        # gold standard
    pred_bio = spacy_doc_to_bio(pred_doc)   # prediction

    # Make sure the token counts match.
    if len(gold_bio) != len(pred_bio):
        print(f"⚠️ Длина токенов не совпадает в тексте: {doc.text[:50]}...")
        continue

    # Compare token by token.
    has_error = False
    annotated_tokens = []

    for (g_start, g_end, g_tag), (p_start, p_end, p_tag) in zip(gold_bio, pred_bio):
        token_text = doc.text[g_start:g_end]
        if g_tag != p_tag:
            has_error = True
            # Mark the mismatch as "[gold→pred: token]".
            annotated_tokens.append(f"[{g_tag}→{p_tag}: {token_text}]")
        else:
            annotated_tokens.append(token_text)

    if has_error:
        error_examples.append({
            "text": doc.text,
            "gold_bio": gold_bio,
            "pred_bio": pred_bio,
            "annotated": " ".join(annotated_tokens)
        })

# Count every (gold tag, predicted tag) confusion over all erroneous tokens.
error_counter = Counter()
for ex in error_examples:
    for (g_start, g_end, g_tag), (p_start, p_end, p_tag) in zip(ex['gold_bio'], ex['pred_bio']):
        if g_tag != p_tag:
            error_counter[(g_tag, p_tag)] += 1

# Print the ten most frequent confusions.
print("\nТоп-10 ошибок (gold → pred):")
for (gold, pred), count in error_counter.most_common(10):
    print(f"{gold:10} → {pred:10} : {count}")

In [19]:
import pandas as pd
from collections import defaultdict

# Соберём все ошибки с деталями
error_details = defaultdict(list)  # key: (gold_tag, pred_tag) -> list of error records

for doc in docs:
    pred_doc = nlp(doc.text)
    gold_bio = spacy_doc_to_bio(doc)
    pred_bio = spacy_doc_to_bio(pred_doc)

    if len(gold_bio) != len(pred_bio):
        continue  # skip documents whose token counts disagree

    tokens = [doc.text[start:end] for (start, end, _) in gold_bio]

    for i, ((g_start, g_end, g_tag), (p_start, p_end, p_tag)) in enumerate(zip(gold_bio, pred_bio)):
        if g_tag != p_tag:
            # Keep the full context: text, tokens, both tag sequences and the failing token.
            error_details[(g_tag, p_tag)].append({
                "text": doc.text,
                "tokens": tokens,
                "gold_tags": [ent[2] for ent in gold_bio],
                "pred_tags": [ent[2] for ent in pred_bio],
                "error_token": tokens[i],
                "token_index": i
            })

# Collect the ten most frequent error types.
top_errors = []
for (gold, pred), examples in error_details.items():
    top_errors.append({
        "gold_tag": gold,
        "pred_tag": pred,
        "count": len(examples),
        "example": examples[0]  # the first recorded example represents the error type
    })

top_errors = sorted(top_errors, key=lambda x: x["count"], reverse=True)[:10]

# Build the final DataFrame: one representative example per error type.
rows = []
for err in top_errors:
    ex = err["example"]
    rows.append({
        "Выражение": ex["text"],
        "Истинные метки": ex["gold_tags"],
        "Предсказание модели": ex["pred_tags"]
    })

df_errors = pd.DataFrame(rows)
# Index rows by "gold → pred (n=count)" so the error type is visible in the table.
df_errors.index = [f"{err['gold_tag']} → {err['pred_tag']} (n={err['count']})" for err in top_errors]
df_errors.index.name = "Тип ошибки"

In [79]:
df_errors.head()

Unnamed: 0_level_0,Выражение,Истинные метки,Предсказание модели
Тип ошибки,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B-BRAND → B-TYPE (n=73),цико,[B-BRAND],[B-TYPE]
B-TYPE → O (n=61),добродел варено-копчены,"[B-BRAND, B-TYPE]","[B-BRAND, O]"
O → B-TYPE (n=59),лебок,[O],[B-TYPE]
O → B-BRAND (n=29),famicollection,[O],[B-BRAND]
B-TYPE → B-BRAND (n=25),гапиток,[B-TYPE],[B-BRAND]


In [57]:
df_errors.head()

Unnamed: 0_level_0,Выражение,Истинные метки,Предсказание модели
Тип ошибки,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B-TYPE → O (n=71),добродел варено-копчены,"[B-BRAND, B-TYPE]","[B-BRAND, O]"
B-BRAND → B-TYPE (n=62),цико,[B-BRAND],[B-TYPE]
O → B-TYPE (n=53),лебок,[O],[B-TYPE]
I-TYPE → O (n=39),"дневники, органайзер","[B-TYPE, I-TYPE]","[O, O]"
B-TYPE → B-BRAND (n=38),аннанасы,[B-TYPE],[B-BRAND]


Всего документов с ошибками: 298


Unnamed: 0,Выражение,Истинные метки,Предсказание модели
0,листб,[B-TYPE],[B-BRAND]
1,пилы дисковы,"[B-TYPE, I-TYPE]","[O, B-BRAND]"
2,добродел варено-копчены,"[B-BRAND, B-TYPE]","[B-BRAND, O]"
3,пудинг чудо,"[B-TYPE, I-TYPE]","[B-TYPE, B-BRAND]"
4,"дневники, органайзер","[B-TYPE, I-TYPE]","[O, O]"


In [21]:
df = df_all_errors.copy()

# Пример: найти строки с символами
import re
def has_special_char(text):
    """Return True when `text` contains a character outside the allowed set.

    Allowed are Cyrillic letters (including ё/Ё), whitespace and the
    punctuation marks . , ! ? ; : — anything else counts as "special".
    """
    pattern = r"[^а-яА-ЯёЁ\s.,!?;:]"
    return re.search(pattern, text) is not None

df["has_special"] = df["Выражение"].apply(has_special_char)
df["has_english"] = df["Выражение"].str.contains(r"[a-zA-Z]", regex=True)

print("Ошибки с символами:", df[df["has_special"]].shape[0])
print("Ошибки с английским:", df[df["has_english"]].shape[0])
print("Пересечение:", df[df["has_special"] & df["has_english"]].shape[0])

Ошибки с символами: 71
Ошибки с английским: 45
Пересечение: 45


In [82]:
df_all_errors.iloc[50:100,:]

Unnamed: 0,Выражение,Истинные метки,Предсказание модели
50,энергетик драйв,"[B-TYPE, I-TYPE]","[B-BRAND, B-TYPE]"
51,пюре детское мясное,"[B-TYPE, O, O]","[B-TYPE, I-TYPE, I-TYPE]"
52,"пятновыводители, отбеливатели","[B-TYPE, I-TYPE]","[O, O]"
53,порционые слики,"[O, O]","[B-TYPE, I-TYPE]"
54,syeret,[O],[B-BRAND]
55,даширак,[B-BRAND],[B-TYPE]
56,"кола без, сахара","[B-TYPE, O, O]","[B-TYPE, O, B-TYPE]"
57,рис.,[B-TYPE],[O]
58,сельд под шубой,"[B-TYPE, I-TYPE, I-TYPE]","[B-TYPE, O, O]"
59,польза и вкус,"[B-BRAND, I-BRAND, I-BRAND]","[B-TYPE, I-TYPE, I-TYPE]"


In [60]:
df_all_errors.to_csv("errors_by_document.csv", index=False, sep=';')

In [None]:
import pandas as pd

# One record per mis-tagged token (a document can contribute several records).
token_level_errors = []

for doc in docs:
    pred_doc = nlp(doc.text)
    gold_bio = spacy_doc_to_bio(doc)
    pred_bio = spacy_doc_to_bio(pred_doc)

    # Skip documents whose gold and predicted token counts disagree.
    if len(gold_bio) != len(pred_bio):
        continue

    tokens = [doc.text[start:end] for (start, end, _) in gold_bio]
    text = doc.text

    for i, ((_, _, g_tag), (_, _, p_tag)) in enumerate(zip(gold_bio, pred_bio)):
        if g_tag != p_tag:
            # Record the token, its position and both tags.
            token_level_errors.append({
                "Выражение": text,
                "Токен": tokens[i],
                "Позиция токена": i,
                "Истинная метка": g_tag,
                "Предсказанная метка": p_tag,
                "Тип ошибки": f"{g_tag} → {p_tag}"
            })

df_token_errors = pd.DataFrame(token_level_errors)
print(f"Всего ошибочных токенов: {len(df_token_errors)}")