In [52]:
# -*- coding: utf-8 -*-
import random, re
from typing import List, Tuple, Optional
import pandas as pd

# ===== utilities =====
# One match == one maximal run of non-whitespace characters (a token).
_WS_RE = re.compile(r"\S+")


def ws_offsets(text: str):
    """Locate every whitespace-delimited token in ``text``.

    Args:
        text: The string to scan. A falsy value (``None`` or ``""``) is
            treated as an empty string.

    Returns:
        A list of ``(token, start, end)`` tuples in order of appearance,
        where ``start``/``end`` are character offsets (end exclusive).
    """
    found = []
    for match in _WS_RE.finditer(text if text else ""):
        begin, finish = match.span()
        found.append((match.group(0), begin, finish))
    return found

def bio_init(n: int) -> List[str]:
    """Return a new list of ``n`` labels, every one set to "O" (outside)."""
    return ["O" for _ in range(n)]

def paint(y: List[str], s: int, e: int, tag: str):
    """Write one BIO-encoded entity into the label list ``y`` in place.

    Position ``s`` receives ``B-<tag>``; positions ``s + 1`` through ``e``
    (inclusive) receive ``I-<tag>``. Nothing is returned.

    Args:
        y: Label list to modify.
        s: Index of the first token of the entity.
        e: Index of the last token of the entity (inclusive).
        tag: Entity name without the ``B-``/``I-`` prefix.
    """
    y[s] = f"B-{tag}"
    idx = s + 1
    while idx <= e:
        y[idx] = f"I-{tag}"
        idx += 1

def to_text_and_ann(tokens: List[str], labels: List[str]) -> Tuple[str, str]:
    """Join tokens into a sample string and render its span annotation.

    Offsets are computed directly from the token lengths rather than by
    re-tokenizing the joined text. Re-tokenizing silently shifts labels onto
    the wrong spans as soon as a token is empty or contains whitespace; for
    ordinary (non-empty, whitespace-free) tokens the output is identical.

    Args:
        tokens: Tokens of the sample, joined with a single space.
        labels: One label per token. If the two lists differ in length, only
            the common prefix is annotated.

    Returns:
        ``(text, annotation)`` where ``annotation`` is a string of the form
        ``"[(start, end, 'LABEL'), ...]"`` with character offsets into
        ``text`` (end exclusive).
    """
    text = " ".join(tokens)
    triples = []
    cursor = 0
    for tok, lab in zip(tokens, labels):
        end = cursor + len(tok)
        triples.append(f"({cursor}, {end}, '{lab}')")
        cursor = end + 1  # step over the single joining space
    return text, "[" + ", ".join(triples) + "]"

# ===== dictionaries =====
# Brands: short, realistic names. Kept in Latin script on purpose — that is
# useful for nudging the BRAND label.
BRANDS = [
    "Danone",
    "Prostokvashino",
    "Oatly",
    "Heinz",
    "Nestle",
    "Milka",
    "Mirel",
    "Activia",
    "Rama",
    "Chudo",
    "Vkusnoteevo",
]

# Types (product categories / names): Russian, sometimes multi-word.
TYPES = [
    "йогурт",
    "кефир",
    "молоко",
    "сыр",
    "творог",
    "сливочное масло",
    "растительное масло",
    "печенье",
    "батончик",
    "шоколад",
    "лимонад",
    "сок",
    "соус",
    "кетчуп",
    "майонез",
    "творожный сыр",
]

# Flavors / descriptors used as light, unlabeled context.
# NOTE: "шоколад" is listed in TYPES as well.
FLAVORS = [
    "клубника",
    "ваниль",
    "шоколад",
    "персик",
    "без сахара",
    "классический",
    "лайт",
]

# Units of measure — Russian only (declined forms and frequent typos included).
UNITS_SHORT = ["мл", "л", "г", "гр", "кг", "шт"]
UNITS_FULL = [
    "миллилитр", "миллилитра", "миллилитров",
    "литр", "литра", "литров",
    "грамм", "грамма", "граммов",
    "килограмм", "килограмма", "килограммов",
    "штука", "штуки", "штук",
]
UNITS_TYPO = [
    "милилитр", "милилитра", "милилитров",
    "киллограмм", "киллограмма", "киллограммов",
    "грам",
]
UNITS = [*UNITS_SHORT, *UNITS_FULL, *UNITS_TYPO]
# Units that may be written glued to the number, e.g. 500г, 1,5л.
COMPACT_UNITS = set(UNITS_SHORT)

# Ways of writing "percent", and attribute words that may precede a percentage.
PCT_WORDS = [
    "%",
    "процент",
    "процента",
    "процентов",
    "проценты",
    "проц",
    "проц.",
]
ATTR_PCT = [
    "жирность",
    "содержание сахара",
    "сахар",
    "какао",
    "белок",
]

# ===== block generation =====
def _rand_number():
    """Return a random quantity rendered as a string.

    With probability 0.55 the result is one of a fixed set of integers.
    Otherwise it is one of a fixed set of decimals, written with either a
    dot or a comma as the decimal separator (each chosen when a further
    draw falls below / not below 0.5).
    """
    whole_values = (100, 150, 200, 250, 300, 330, 400, 450, 500, 700, 750, 900, 1000, 1500, 2000)
    decimal_values = (
        0.2, 0.25, 0.3, 0.33, 0.5, 0.75, 1.0, 1.5, 2.0,
        2.5, 3.2, 5.0, 7.5, 10.0, 12.0, 15.0, 20.0, 25.0,
    )
    if random.random() >= 0.55:
        rendered = str(random.choice(decimal_values))
        keep_dot = random.random() < 0.5
        if keep_dot:
            return rendered
        return rendered.replace(".", ",")
    return str(random.choice(whole_values))

def block_volume():
    """Build the tokens and span for one VOLUME mention.

    A number and a unit are drawn at random. When the unit is one of
    ``COMPACT_UNITS``, a further draw below 0.30 glues the two into a
    single token; otherwise they are emitted as two tokens.

    Returns:
        ``(tokens, spans)`` where ``spans`` holds one
        ``("VOLUME", first_index, last_index)`` tuple relative to ``tokens``.
    """
    number = _rand_number()
    unit = random.choice(UNITS)
    glued = unit in COMPACT_UNITS and random.random() < 0.30
    if not glued:
        return [number, unit], [("VOLUME", 0, 1)]
    return [number + unit], [("VOLUME", 0, 0)]

def block_percent():
    """Build the tokens and span for one PERCENT mention.

    Optionally (draw below 0.55) the mention is preceded by an attribute
    phrase from ``ATTR_PCT``; those words are not part of the span.

    The compact single-token form ("3,2%") is produced only for the "%"
    sign. Word forms ("процентов", "проц.", ...) are always emitted as a
    separate token: the previous conditional was a no-op and glued them to
    the number ("3,2процентов"), unlike volumes, where only the short units
    in ``COMPACT_UNITS`` may be glued.

    Returns:
        ``(tokens, spans)`` where ``spans`` holds one
        ``("PERCENT", first_index, last_index)`` tuple relative to ``tokens``.
    """
    num = _rand_number()
    pct = random.choice(PCT_WORDS)
    toks = []
    if random.random() < 0.55:
        toks.extend(random.choice(ATTR_PCT).split())
    # Drawn unconditionally so the sequence of random draws does not depend
    # on which percent word was picked.
    want_compact = random.random() < 0.45
    start = len(toks)
    if want_compact and pct == "%":
        toks.append(num + pct)  # e.g. 3,2%
        end = start
    else:
        toks.extend([num, pct])
        end = start + 1
    return toks, [("PERCENT", start, end)]

def make_sample_row(
    brand: Optional[str],
    typ: Optional[str],
    need_volume: bool,
    need_percent: bool,
    add_flavor_prob: float = 0.4,
):
    """Assemble one synthetic sample and its BIO annotation.

    Token order is: TYPE (if given), BRAND (if given), an optional unlabeled
    flavor phrase, then the requested measurement blocks in random order.

    Args:
        brand: Brand phrase to insert, or a falsy value for none.
        typ: Type phrase to insert, or a falsy value for none.
        need_volume: Whether to append a VOLUME block.
        need_percent: Whether to append a PERCENT block.
        add_flavor_prob: Probability threshold for adding a flavor phrase.

    Returns:
        The ``(text, annotation)`` pair produced by ``to_text_and_ann``.
    """
    tokens: List[str] = []
    spans: List[Tuple[str, int, int]] = []

    def _append_entity(tag: str, phrase: str) -> None:
        # Add the words of ``phrase`` and record them as one labeled span.
        words = phrase.split()
        first = len(tokens)
        tokens.extend(words)
        spans.append((tag, first, len(tokens) - 1))

    # At most one TYPE, then at most one BRAND.
    if typ:
        _append_entity("TYPE", typ)
    if brand:
        _append_entity("BRAND", brand)

    # A little unlabeled context.
    if random.random() < add_flavor_prob:
        tokens.extend(random.choice(FLAVORS).split())

    # Measurements, in random order.
    measurement_blocks = []
    if need_volume:
        measurement_blocks.append(block_volume())
    if need_percent:
        measurement_blocks.append(block_percent())
    random.shuffle(measurement_blocks)

    for block_tokens, block_spans in measurement_blocks:
        offset = len(tokens)
        tokens.extend(block_tokens)
        # Block spans are relative to the block; shift them into row coordinates.
        spans.extend((tag, offset + lo, offset + hi) for tag, lo, hi in block_spans)

    # BIO labels
    labels = bio_init(len(tokens))
    for tag, first, last in spans:
        paint(labels, first, last, tag)

    return to_text_and_ann(tokens, labels)

def generate_fixed_brand_type(
    n_rows: int = 2000,
    seed: int = 123,
    ensure_both_frac: float = 0.6,  # share of rows forced to contain BOTH VOLUME and PERCENT
    p_brand: float = 1.0,           # probability of inserting one BRAND
    p_type: float  = 1.0,           # probability of inserting one TYPE
):
    """Generate a DataFrame of synthetic samples with BIO annotations.

    The global ``random`` generator is reseeded with ``seed``. The first
    ``int(n_rows * ensure_both_frac)`` generated rows always contain both a
    VOLUME and a PERCENT block; for the remaining rows each block is included
    at random (0.9 / 0.8), with VOLUME forced when neither was selected.

    Returns:
        A DataFrame with columns ``sample`` and ``annotation``. Exact
        duplicate rows are dropped after generation, so the frame may
        contain fewer than ``n_rows`` rows.
    """
    random.seed(seed)
    forced_both_left = int(n_rows * ensure_both_frac)
    rows = []

    while len(rows) < n_rows:
        brand = random.choice(BRANDS) if random.random() < p_brand else None
        typ = random.choice(TYPES) if random.random() < p_type else None

        # Every row gets at least one measurement entity.
        if forced_both_left > 0:
            need_volume = need_percent = True
            forced_both_left -= 1
        else:
            need_volume = random.random() < 0.9
            need_percent = random.random() < 0.8
            if not need_volume and not need_percent:
                need_volume = True

        sample, annotation = make_sample_row(brand, typ, need_volume, need_percent)
        rows.append((sample, annotation))

    frame = pd.DataFrame(rows, columns=["sample", "annotation"])
    deduplicated = frame.drop_duplicates()
    return deduplicated.reset_index(drop=True)

In [62]:
# Generate the synthetic set: request 1400 rows, every one with a BRAND and a TYPE
# (p_brand = p_type = 1.0). The result may hold fewer rows, since the generator
# drops exact duplicates.
df_sync = generate_fixed_brand_type(
    n_rows=1400,
    seed=123,
    ensure_both_frac=0.6,
    p_brand=1.0,
    p_type=1.0
)

In [63]:
# Save the synthetic rows on their own as a semicolon-separated CSV, without the index column.
df_sync.to_csv(r"./.data/sync_data.csv", sep=";", index=False)

In [64]:
# Load the existing training set (semicolon-separated, UTF-8).
df_train = pd.read_csv(r"./.data/input/train.csv", sep = ";", encoding="utf-8")

In [65]:
# Append the synthetic rows after the original training rows and renumber the index.
# NOTE(review): assumes train.csv has the same two columns (sample, annotation);
# the two-column shape shown by the next cell is consistent with that — confirm.
df_mix = pd.concat([df_train, df_sync], ignore_index=True)

In [66]:
# Show (rows, columns) of the combined frame.
df_mix.shape

(28849, 2)

In [67]:
# Write the combined (original + synthetic) training set as a semicolon-separated CSV, without the index column.
df_mix.to_csv(r"./.data/train_.csv", sep=";", index=False)