In [14]:
import os, io, re, gc, json, gzip, random, urllib.request
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

try:
    import datasets
except Exception:
    import sys, subprocess
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "datasets==2.19.0"], check=False)
    import datasets

# --- reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- device
USE_CPU_ONLY = False  # если Colab без GPU — поставьте True
device = torch.device("cpu" if USE_CPU_ONLY or not torch.cuda.is_available() else "cuda")
device

# --- dirs
BASE = Path("work")
DATA_DIR = BASE/"data"; CKPT_DIR = BASE/"ckpt"
DATA_DIR.mkdir(parents=True, exist_ok=True); CKPT_DIR.mkdir(parents=True, exist_ok=True)

# --- data size limits
WB_MAX   = 5_000      # Wildberries: названия товаров
OSUB_LINES = 120_000  # OpenSubtitles RU — короткие разговорные фразы
GENIUS_MAX = 120_000  # Genius lyrics RU — тексты песен

# --- model/training
MAX_LEN    = 128
EMB_DIM    = 96
HIDDEN_DIM = 192
BATCH_SIZE = 256 if device.type == "cuda" else 128
EPOCHS     = 50
LR         = 3e-3

print("device:", device, "| batch:", BATCH_SIZE)


device: cuda | batch: 256


In [15]:
def clean_line(s: str) -> str:
    s = re.sub(r"\s+", " ", str(s)).strip()
    # ограничим алфавит до букв/цифр/распространённой пунктуации
    s = re.sub(r"[^0-9A-Za-zА-Яа-яЁё ,.!?:;()\\[\\]{}\"'«»—–…/+#&\\-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def spaced_to_pair(text: str, max_len: int = MAX_LEN) -> Tuple[str, str]:
    """
    Из "правильной" строки с пробелами делаем:
      x: строка без пробелов (обрезанная до max_len)
      y: строка из '0'/'1' длиной len(x); '1' = ПОСЛЕ символа i в x должен стоять пробел.
    """
    t = re.sub(r"\s+", " ", str(text)).strip()
    if not t:
        return "", ""
    # убираем пробелы, параллельно помечая позиции
    x_chars, y_bits = [], []
    last_non = -1
    i_compact = -1
    for ch in t:
        if ch.isspace():
            continue
        x_chars.append(ch)
        i_compact += 1
        y_bits.append('0')
        last_non = i_compact

    compact_idx = -1
    prev_was_non = False
    for i, ch in enumerate(t):
        if not ch.isspace():
            compact_idx += 1
            prev_was_non = True
        else:
            if prev_was_non:
                ahead = t[i+1:]
                if any(not c.isspace() for c in ahead):
                    j = compact_idx
                    if 0 <= j < len(y_bits):
                        y_bits[j] = '1'
            prev_was_non = False

    x = "".join(x_chars)[:max_len]
    y = "".join(y_bits[:len(x)])
    return x, y

def append_texts(texts: List[str], out_csv: Path):
    out_csv = Path(out_csv)
    first_write = not out_csv.exists()
    with open(out_csv, "a", encoding="utf-8") as f:
        if first_write:
            f.write("text\n")
        for s in texts:
            s2 = clean_line(s)
            if len(s2) >= 5:
                f.write(s2.replace("\n"," ").strip()+"\n")

print("utils ready")


utils ready


In [16]:
import csv, io, gzip, urllib.request, gc
from pathlib import Path

TEXTS_CSV = DATA_DIR / "train_texts.csv"

if "clean_line" not in globals():
    import re
    def clean_line(s: str) -> str:
        s = re.sub(r"\s+", " ", str(s)).strip()
        s = re.sub(r"[^0-9A-Za-zА-Яа-яЁё ,.!?:;()\\[\\]{}\"'«»—–…/+#&\\-]", " ", s)
        s = re.sub(r"\s+", " ", s).strip()
        return s

def append_texts_iter(text_iter, out_csv: Path, header_written: bool):
    """
    Безопасная запись: csv.writer с QUOTE_MINIMAL.
    Возвращает (сколько реально записали, заголовок_уже_записан).
    """
    wrote = 0
    out_csv.parent.mkdir(parents=True, exist_ok=True)
    with open(out_csv, "a", encoding="utf-8", newline="") as f:
        w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        if not header_written:
            w.writerow(["text"])
            header_written = True
        for s in text_iter:
            s2 = clean_line(s)
            if len(s2) >= 5:
                w.writerow([s2])  # writer сам поставит кавычки при необходимости
                wrote += 1
    return wrote, header_written

# начинаем с чистого файла
if TEXTS_CSV.exists():
    TEXTS_CSV.unlink()

header_done = False
seen = set()  # для дедупликации (по полной строке)
total_written = 0

def dedup_batch(batch_list):
    # удаляем уже встречавшиеся строки
    uniq = []
    for s in batch_list:
        if s and s not in seen:
            seen.add(s)
            uniq.append(s)
    return uniq

# ---------- 3.1 Wildberries sample (названия товаров) ----------
try:
    WB_URL = "https://raw.githubusercontent.com/luminati-io/Wildberries-dataset-sample/main/Wildberries-dataset-sample.csv"
    print("[WB] downloading sample …")
    df_wb = pd.read_csv(WB_URL)
    col = "name" if "name" in df_wb.columns else None
    if col:
        arr = df_wb[col].astype(str).tolist()[:WB_MAX]
        arr = dedup_batch(arr)
        wrote, header_done = append_texts_iter(arr, TEXTS_CSV, header_done)
        total_written += wrote
        print(f"[WB] +{wrote} rows")
    else:
        print("[WB] column 'name' not found — skipped")
except Exception as e:
    print("[WB] skipped due to:", e)

# ---------- 3.2 OpenSubtitles RU (mono) ----------
try:
    OSUB_URL = "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/mono/ru.txt.gz"
    print("[OSub] streaming …")
    with urllib.request.urlopen(OSUB_URL) as resp:
        gz = io.BytesIO(resp.read())
    wrote_total = 0
    batch, cap = [], OSUB_LINES
    with gzip.GzipFile(fileobj=gz, mode="rb") as zf:
        for i, raw in enumerate(zf):
            if i >= cap: break
            s = raw.decode("utf-8", errors="ignore").strip()
            if not s:
                continue
            batch.append(s)
            # батчевой записью экономим память
            if len(batch) >= 10_000:
                uniq = dedup_batch(batch)
                wrote, header_done = append_texts_iter(uniq, TEXTS_CSV, header_done)
                wrote_total += wrote
                batch.clear()
    if batch:
        uniq = dedup_batch(batch)
        wrote, header_done = append_texts_iter(uniq, TEXTS_CSV, header_done)
        wrote_total += wrote
        batch.clear()
    print(f"[OSub] +{wrote_total} rows")
    del gz
except Exception as e:
    print("[OSub] skipped due to:", e)

# ---------- 3.3 Genius Lyrics RU (HuggingFace) ----------
try:
    from datasets import load_dataset
    print("[Genius] streaming from HF …")
    ds = load_dataset("sevenreasons/genius-lyrics-russian", split="train", streaming=True)

    # найдём ключ текста (lyrics/text/content/lyric) — по первой записи
    first = next(iter(ds))
    text_key = None
    for k in ["lyrics", "text", "content", "lyric"]:
        if k in first and isinstance(first[k], str):
            text_key = k; break
    if text_key is None:
        for k, v in first.items():
            if isinstance(v, str):
                text_key = k; break
    # снова итератор (первую запись уже забрали)
    ds = load_dataset("sevenreasons/genius-lyrics-russian", split="train", streaming=True)

    wrote_total = 0
    batch, cap = [], GENIUS_MAX
    for i, row in enumerate(ds):
        if i >= cap: break
        s = row.get(text_key, "")
        if not s:
            continue
        batch.append(s)
        if len(batch) >= 5_000:
            uniq = dedup_batch(batch)
            wrote, header_done = append_texts_iter(uniq, TEXTS_CSV, header_done)
            wrote_total += wrote
            batch.clear()
    if batch:
        uniq = dedup_batch(batch)
        wrote, header_done = append_texts_iter(uniq, TEXTS_CSV, header_done)
        wrote_total += wrote
        batch.clear()
    print(f"[Genius] +{wrote_total} rows (key='{text_key}')")
except Exception as e:
    print("[Genius] skipped due to:", e)

gc.collect()

# ---------- Сводка и быстрый просмотр ----------
df_texts = pd.read_csv(TEXTS_CSV)
print("[texts] total:", df_texts.shape)
df_texts.head()


[WB] downloading sample …
[WB] +835 rows
[OSub] streaming …
[OSub] +82116 rows
[Genius] streaming from HF …
[Genius] +119426 rows (key='lyrics')
[texts] total: (202377, 1)


Unnamed: 0,text
0,Шапка бини с отворотом двухслойная трикотажная
1,Шапка бини с отворотом двухслойная вязаная
2,Наушники беспроводные с микрофоном блютуз Pro
3,Футболка голубая однотонная
4,ЦИОН для овощей Комплексное удобрение для тома...


In [17]:
PAIRS_CSV = DATA_DIR/"train_pairs.csv"

def build_pairs(in_csv: Path, out_csv: Path, max_len=MAX_LEN, max_rows=400_000):
    df = pd.read_csv(in_csv)
    texts = df["text"].astype(str).tolist()
    rows = []
    for i, t in enumerate(texts):
        x, y = spaced_to_pair(t, max_len=max_len)
        if len(x) >= 2 and len(y) == len(x):
            rows.append((i, x, y))
        if len(rows) >= max_rows:
            break
    out_csv.unlink(missing_ok=True)
    pd.DataFrame(rows, columns=["id","x","y"]).to_csv(out_csv, index=False)
    return len(rows)

total_pairs = build_pairs(TEXTS_CSV, PAIRS_CSV, max_len=MAX_LEN, max_rows=500_000)
df_pairs = pd.read_csv(PAIRS_CSV)
print("[pairs]", df_pairs.shape)
df_pairs.head()


[pairs] (202377, 3)


Unnamed: 0,id,x,y
0,0,Шапкабинисотворотомдвухслойнаятрикотажная,00001000110000000010000000000100000000000
1,1,Шапкабинисотворотомдвухслойнаявязаная,0000100011000000001000000000010000000
2,2,НаушникибеспроводныесмикрофономблютузPro,0000000100000000000110000000001000001000
3,3,Футболкаголубаяоднотонная,0000000100000010000000000
4,4,"ЦИОНдляовощейКомплексноеудобрениедлятоматов,ог...",0001001000001000000000010000000010010000000100...


In [18]:
def build_vocab_from_pairs(csv_path: Path):
    df = pd.read_csv(csv_path, usecols=["x"])
    vocab = set()
    for s in df["x"].astype(str):
        for ch in s:
            vocab.add(ch)
    itos = ["<PAD>", "<UNK>"] + sorted(vocab)
    stoi = {ch:i for i,ch in enumerate(itos)}
    return stoi, itos

STOI, ITOS = build_vocab_from_pairs(PAIRS_CSV)
len_vocab = len(ITOS)
print("vocab size:", len_vocab, "| example:", ITOS[:20])

class SpaceDataset(Dataset):
    def __init__(self, df: pd.DataFrame, stoi: dict):
        self.xs = df["x"].astype(str).tolist()
        self.ys = df["y"].astype(str).tolist()
        self.stoi = stoi
    def __len__(self): return len(self.xs)
    def __getitem__(self, idx):
        x = self.xs[idx]; y = self.ys[idx]
        ids = [self.stoi.get(c, self.stoi["<UNK>"]) for c in x]
        target = [1.0 if ch=='1' else 0.0 for ch in y]
        return torch.tensor(ids, dtype=torch.long), torch.tensor(target, dtype=torch.float32)

def collate_pad(batch):
    xs, ys = zip(*batch)
    L = max(t.size(0) for t in xs)
    pad_id = STOI["<PAD>"]
    xb = torch.full((len(xs), L), pad_id, dtype=torch.long)
    yb = torch.zeros((len(xs), L), dtype=torch.float32)
    for i,(ti,yi) in enumerate(zip(xs,ys)):
        xb[i,:ti.size(0)] = ti
        yb[i,:yi.size(0)] = yi
    return xb, yb

# split train/val
df_all = pd.read_csv(PAIRS_CSV)
perm = np.random.permutation(len(df_all))
val_sz = max(2000, int(0.1 * len(df_all)))
val_idx = set(perm[:val_sz])
train_df = df_all[[i not in val_idx for i in range(len(df_all))]].reset_index(drop=True)
val_df   = df_all[[i in val_idx for i in range(len(df_all))]].reset_index(drop=True)

train_ds = SpaceDataset(train_df, STOI)
val_ds   = SpaceDataset(val_df, STOI)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_pad, num_workers=2)
val_dl   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_pad, num_workers=2)

len(train_ds), len(val_ds), len_vocab


vocab size: 758 | example: ['<PAD>', '<UNK>', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2']


(182140, 20237, 758)

In [19]:
class SpaceTagger(nn.Module):
    def __init__(self, vocab_size, emb=EMB_DIM, hidden=HIDDEN_DIM):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb, padding_idx=0)
        self.rnn = nn.GRU(emb, hidden//2, batch_first=True, bidirectional=True)
        self.out = nn.Linear(hidden, 1)

    def forward(self, x):
        e = self.emb(x)                # [B, T, E]
        h,_ = self.rnn(e)              # [B, T, H]
        logits = self.out(h).squeeze(-1)  # [B, T]
        return logits


In [20]:
import torch
import torch.nn as nn
import numpy as np

def f1_counts_to_score(tp, fp, fn):
    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    return 0.0 if (prec + rec) == 0 else 2 * prec * rec / (prec + rec)

def calibrate_threshold(model, loader, device, grid=None, pad_id=None):
    """
    Стриминговая калибровка порога.
    - Не конкатенирует батчи разной длины.
    - Маскирует паддинги: mask = (xb != PAD).
    - Накопительно считает TP/FP/FN для каждого thr.
    """
    if grid is None:
        grid = [0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85]
    if pad_id is None:
        pad_id = STOI["<PAD>"]

    totals = {t: {"tp": 0, "fp": 0, "fn": 0} for t in grid}
    model.eval()
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)               # xb:[B,T], yb:[B,T]
            lg = model(xb)                                      # [B,T]
            prob = torch.sigmoid(lg)                            # [B,T]
            mask = (xb != pad_id)                               # bool [B,T]
            true_pos = (yb > 0.5) & mask                        # bool [B,T]

            for t in grid:
                pred_pos = (prob >= t) & mask                   # bool [B,T]
                tp = (pred_pos & true_pos).sum().item()
                fp = (pred_pos & (~true_pos)).sum().item()
                fn = ((~pred_pos) & true_pos).sum().item()
                totals[t]["tp"] += tp
                totals[t]["fp"] += fp
                totals[t]["fn"] += fn

    # выбираем лучший порог по глобальному F1
    best_thr, best_f1 = grid[0], -1.0
    for t in grid:
        f1 = f1_counts_to_score(totals[t]["tp"], totals[t]["fp"], totals[t]["fn"])
        if f1 > best_f1:
            best_f1, best_thr = f1, t
    return best_thr, best_f1

def train_model(epochs=EPOCHS, lr=LR, ckpt_dir=CKPT_DIR):
    model = SpaceTagger(len_vocab, EMB_DIM, HIDDEN_DIM).to(device)

    # оценим дисбаланс классов (на подмножестве train)
    pos_cnt = 0; tot_cnt = 0
    with torch.no_grad():
        tmp_loader = DataLoader(train_ds, batch_size=min(1024, len(train_ds)),
                                shuffle=False, collate_fn=collate_pad, num_workers=0)
        for xb, yb in tmp_loader:
            pos_cnt += yb.sum().item()
            tot_cnt += np.prod(yb.shape)
    pos_weight = torch.tensor([(tot_cnt - pos_cnt) / max(pos_cnt, 1)],
                              dtype=torch.float32, device=device).clamp_(1.0, 50.0)

    crit = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    opt  = torch.optim.AdamW(model.parameters(), lr=lr)

    best_f1 = -1.0; best_thr = 0.5
    for ep in range(1, epochs + 1):
        model.train()
        tot_loss = 0.0
        for xb, yb in DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                                 collate_fn=collate_pad, num_workers=0):
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad(set_to_none=True)
            lg = model(xb)
            loss = crit(lg, yb)   # паддинги у нас нули, модель учится не ставить пробел на пад-символах
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            tot_loss += loss.item()

        # калибруем порог на валидации (без конкатенации — со стрим-маской)
        thr, val_f1 = calibrate_threshold(model, val_dl, device, pad_id=STOI["<PAD>"])
        print(f"Ep{ep}: loss={tot_loss/len(train_dl):.4f} | best_thr={thr:.2f} | valF1={val_f1:.4f}")

        if val_f1 > best_f1:
            best_f1, best_thr = val_f1, thr
            blob = {
                "model": model.state_dict(),
                "stoi": STOI, "itos": ITOS,
                "params": {"EMB_DIM": EMB_DIM, "HIDDEN_DIM": HIDDEN_DIM, "MAX_LEN": MAX_LEN},
                "best_thr": float(best_thr)
            }
            CKPT_DIR.mkdir(parents=True, exist_ok=True)
            torch.save(blob, CKPT_DIR / "best.pt")
            print("  ✅ saved:", CKPT_DIR / "best.pt")

    return str(CKPT_DIR / "best.pt"), best_thr

ckpt_path, BEST_THR = train_model()
ckpt_path, BEST_THR


Ep1: loss=0.1683 | best_thr=0.85 | valF1=0.9470
  ✅ saved: work/ckpt/best.pt
Ep2: loss=0.0929 | best_thr=0.85 | valF1=0.9549
  ✅ saved: work/ckpt/best.pt
Ep3: loss=0.0819 | best_thr=0.85 | valF1=0.9577
  ✅ saved: work/ckpt/best.pt
Ep4: loss=0.0760 | best_thr=0.85 | valF1=0.9594
  ✅ saved: work/ckpt/best.pt
Ep5: loss=0.0724 | best_thr=0.85 | valF1=0.9610
  ✅ saved: work/ckpt/best.pt
Ep6: loss=0.0698 | best_thr=0.80 | valF1=0.9615
  ✅ saved: work/ckpt/best.pt
Ep7: loss=0.0676 | best_thr=0.85 | valF1=0.9615
  ✅ saved: work/ckpt/best.pt
Ep8: loss=0.0660 | best_thr=0.85 | valF1=0.9622
  ✅ saved: work/ckpt/best.pt
Ep9: loss=0.0647 | best_thr=0.85 | valF1=0.9624
  ✅ saved: work/ckpt/best.pt
Ep10: loss=0.0636 | best_thr=0.85 | valF1=0.9635
  ✅ saved: work/ckpt/best.pt
Ep11: loss=0.0626 | best_thr=0.85 | valF1=0.9636
  ✅ saved: work/ckpt/best.pt
Ep12: loss=0.0617 | best_thr=0.85 | valF1=0.9637
  ✅ saved: work/ckpt/best.pt
Ep13: loss=0.0609 | best_thr=0.85 | valF1=0.9640
  ✅ saved: work/ckpt/bes

('work/ckpt/best.pt', 0.85)

In [21]:
def load_best(path=str(CKPT_DIR/"best.pt")):
    blob = torch.load(path, map_location="cpu")
    stoi = blob["stoi"]; itos = blob["itos"]
    params = blob["params"]; thr = float(blob.get("best_thr", 0.5))
    model = SpaceTagger(vocab_size=(max(stoi.values())+1),
                        emb=params["EMB_DIM"], hidden=params["HIDDEN_DIM"])
    model.load_state_dict(blob["model"], strict=True); model.eval().to(device)
    return model, stoi, itos, thr, params

MODEL, STOI_L, ITOS_L, BEST_THR, _PARAMS = load_best()

PUNCT_RIGHT = set(list(".,!?:;)]}»…"))
PUNCT_LEFT  = set(list("([{«"))
DIGITS      = set("0123456789")

@torch.no_grad()
def predict_positions(no_space: str, thr: float=None, max_len: int=None) -> List[int]:
    """
    Возвращает индексы (0-based) в исходной строке без пробелов:
    это индексы правого символа, перед которым нужно поставить пробел.
    То есть, если пробел между s[i] и s[i+1], мы возвращаем (i+1).
    """
    if thr is None: thr = BEST_THR
    if max_len is None: max_len = _PARAMS["MAX_LEN"]
    s = re.sub(r"\s+", "", str(no_space))
    if len(s) <= 1: return []

    # модель обучалась предсказывать "пробел ПОСЛЕ символа i"
    # получим это как раньше, а затем преобразуем в "индекс правого символа" (i+1)
    ids = [STOI_L.get(c, STOI_L["<UNK>"]) for c in s[:max_len]]
    x = torch.tensor([ids], dtype=torch.long, device=device)
    prob = torch.sigmoid(MODEL(x))[0].detach().cpu().numpy()

    left_idx = [i for i,p in enumerate(prob[:len(ids)]) if p >= thr]

    # пост-правила на "левом индексе"
    keep_left = []
    for i in left_idx:
        if i+1 >= len(s):
            continue
        L, R = s[i], s[i+1]
        if R in PUNCT_RIGHT:
            continue
        if L in PUNCT_LEFT:
            continue
        if L in DIGITS and R in DIGITS:
            continue
        keep_left.append(i)

    # Переход к индексам: j = i + 1
    gaps = []
    for i in keep_left:
        j = i + 1
        # j = 1..len(s)-1 (пробела "перед первым символом" быть не может; после последнего — тоже)
        if 1 <= j <= len(s) - 1:
            gaps.append(j)
    return gaps

def apply_positions(text_no_spaces: str, gap_positions: List[int]) -> str:
    """
    Восстанавливает текст из строки без пробелов, вставляя пробел ПЕРЕД символом с индексом j (gap_positions содержит такие j).
    """
    gap_set = set(int(i) for i in gap_positions)
    out = []
    for i, ch in enumerate(text_no_spaces):
        if i in gap_set:
            out.append(" ")
        out.append(ch)
    return re.sub(r"\s+", " ", "".join(out)).strip()

# sanity
tests = ["книгавхорошемсостоянии","куплюайфон14про","ищуквартирууметро","новаямикроволновкаSamsung"]
for t in tests:
    idxs = predict_positions(t)
    print(t, "->", apply_positions(t, idxs), idxs)
print("BEST_THR:", BEST_THR)


книгавхорошемсостоянии -> книга в хорошем состоянии [5, 6, 13]
куплюайфон14про -> куплю айфон 14 про [5, 10, 12]
ищуквартирууметро -> ищу квартиру у метро [3, 11, 12]
новаямикроволновкаSamsung -> новая микроволновка Samsung [5, 18]
BEST_THR: 0.85


In [22]:
import csv
import re
import pandas as pd

def read_task_txt_robust(path: str) -> pd.DataFrame:
    """
    Вход: текстовый файл с 2 колонками: id,text_no_spaces.
    Линии могут содержать запятые в тексте. Разделитель — первый найденный из: , \t ; |
    Возвращает DataFrame с колонками ['id','text_no_spaces'].
    """
    rows=[]
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        for ln, line in enumerate(f):
            line=line.rstrip("\r\n")
            if not line:
                continue
            if ln==0 and line.lower().startswith("id"):
                # пропускаем шапку
                continue
            split=-1
            for sep in [",", "\t", ";", "|"]:
                i=line.find(sep)
                if i!=-1:
                    split=i
                    break
            if split==-1:
                continue
            id_part   = line[:split].strip().strip('"')
            text_part = line[split+1:].strip()
            try:
                _id = int(id_part)
            except:
                continue
            # снимаем внешние кавычки у текста, если есть
            if len(text_part)>=2 and text_part[0]==text_part[-1] and text_part[0] in "\"'":
                text_part = text_part[1:-1]
            # убираем любые внутренние пробелы — модель ожидает слитную строку
            text_part = re.sub(r"\s+", "", text_part)
            rows.append((_id, text_part))
    if not rows:
        raise ValueError("Файл пустой либо не распознан.")
    return pd.DataFrame(rows, columns=["id","text_no_spaces"]).sort_values("id").reset_index(drop=True)

def build_submission(task_txt_path: str, out_csv_path: str="submission.csv"):
    """
    Возвращает КОПИЮ task_data + колонку predicted_positions (СТРОКА) —
    список GAP-индексов j (0-based), перед символом s[j] ставим пробел.
    """
    df_task = read_task_txt_robust(task_txt_path)
    preds=[]
    for s in df_task["text_no_spaces"].astype(str):
        gaps = predict_positions(s)
        preds.append("[" + ", ".join(map(str, gaps)) + "]")  # тип — строка
    out = df_task.copy()
    out["predicted_positions"] = preds
    out[["id","text_no_spaces","predicted_positions"]].to_csv(
        out_csv_path, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8"
    )
    print("✓ saved", out_csv_path, "| shape:", out.shape)
    return out


sub = build_submission("dataset_1937770_3.txt", out_csv_path="submission.csv")


✓ saved submission.csv | shape: (1005, 3)


In [28]:
samples = [
    "книгавхорошемсостоянии",
    "Обернулся-инесмогразглядетьследы.",
    "ищуквартирууметро",
    "Весна-гдемояголова?",
]
for s in samples:
    pos = predict_positions(s)
    print(s, "=>", apply_positions(s,pos), "|", pos)


книгавхорошемсостоянии => книга в хорошем состоянии | [5, 6, 13]
Обернулся-инесмогразглядетьследы. => Обернулся - и не смог разглядеть следы. | [9, 10, 11, 13, 17, 27]
ищуквартирууметро => ищу квартиру у метро | [3, 11, 12]
Весна-гдемояголова? => Весна - где моя голова? | [5, 6, 9, 12]
