In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

CSV_PATH = Path("/tf/konokhova/project/all_problems.csv")
assert CSV_PATH.exists(), f"Not found: {CSV_PATH}"

df = pd.read_csv(
    CSV_PATH,
    dtype=str,
    keep_default_na=True,
    na_values=["", "nan", "NaN", "None", "null"],
)

print("Shape:", df.shape)
print("Columns:", list(df.columns))
df.head(2)


Shape: (10196, 12)
Columns: ['id', 'condition', 'images_condition', 'solution', 'condition_rus', 'solution_rus', 'images_solution', 'category', 'subcategory', 'link', 'profile', 'file']


Unnamed: 0,id,condition,images_condition,solution,condition_rus,solution_rus,images_solution,category,subcategory,link,profile,file
0,918682,"В треугольнике ABC угол C равен 90°, $$ AC = ...",['images\\math-ege_sdamgia_ru_get_file_id_1381...,"\nРешение. Зная, что $$ \sin A = \frac{7}{25},...","В треугольнике ABC угол C равен 90°, AC = 4,8...","\nРешение. Зная, что синус A = дробь: числител...",[],Планиметрия,Решение прямоугольного треугольника,/test?category_id=79&filter=all,profile,pages\math-ege_sdamgia_ru_test_category_id_79_...
1,918683,"В треугольнике ABC угол C равен 90°, $$ AC =...",['images\\math-ege_sdamgia_ru_get_file_id_1449...,\nРешение. Имеем:\n\n$$ BC = AC \operatorname{...,"В треугольнике ABC угол C равен 90°, AC = 2,...",\nРешение. Имеем:\n\nBC = AC тангенс A = AC си...,[],Планиметрия,Решение прямоугольного треугольника,/test?category_id=79&filter=all,profile,pages\math-ege_sdamgia_ru_test_category_id_79_...


In [2]:
import json

def parse_images_cell(x):
    if x is None or (isinstance(x, float) and pd.isna(x)) or pd.isna(x):
        return []
    if isinstance(x, list):
        return [str(i) for i in x if str(i).strip()]
    s = str(x).strip()
    if not s:
        return []
    # JSON list?
    if s.startswith("[") and s.endswith("]"):
        try:
            v = json.loads(s)
            if isinstance(v, list):
                return [str(i) for i in v if str(i).strip()]
        except Exception:
            pass
    # comma separated
    if "," in s:
        return [p.strip() for p in s.split(",") if p.strip()]
    return [s]

for col in ["images_condition", "images_solution"]:
    df[col + "_list"] = df[col].apply(parse_images_cell)
    df[col + "_count"] = df[col + "_list"].apply(len)
    df[col + "_has"] = (df[col + "_count"] > 0).astype(int)

df[["images_condition_count","images_solution_count"]].describe()


Unnamed: 0,images_condition_count,images_solution_count
count,10196.0,10196.0
mean,0.253727,0.391428
std,0.457149,0.646173
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,1.0
max,8.0,10.0


In [3]:
import re
import hashlib

WHITESPACE_RE = re.compile(r"[ \t\u00A0\u200B\u200E\u200F]+")
MULTI_NL_RE = re.compile(r"\n{3,}")
HTML_TAG_RE = re.compile(r"<[^>]+>")

def normalize_text(s: str) -> str:
    """
    Консервативная чистка: убираем HTML-теги (если вдруг есть),
    нормализуем пробелы/переводы строк, обрезаем края.
    Формулы/TeX не ломаем.
    """
    if s is None or (isinstance(s, float) and pd.isna(s)) or pd.isna(s):
        return ""
    s = str(s)

    # иногда встречается HTML
    if "<" in s and ">" in s:
        s = HTML_TAG_RE.sub(" ", s)

    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = WHITESPACE_RE.sub(" ", s)
    s = MULTI_NL_RE.sub("\n\n", s)
    return s.strip()

def stable_hash(*parts: str) -> str:
    h = hashlib.sha256()
    for p in parts:
        h.update((p or "").encode("utf-8"))
        h.update(b"\0")
    return h.hexdigest()


In [4]:
# Консервативные замены: применяем ограниченное число раз, чтобы не уйти в бесконечность.
FRAC_RE = re.compile(
    r"дробь\s*:\s*числител[ья]\s*:\s*(?P<num>[^;\n]+?)\s*знаменател[ья]\s*:\s*(?P<den>[^;\n]+)",
    flags=re.IGNORECASE
)
SQRT_RE = re.compile(r"корень\s*:\s*(?P<arg>[^;\n]+)", flags=re.IGNORECASE)

def altmath_to_latex(s: str, max_iters: int = 3) -> str:
    s = s or ""
    out = s
    for _ in range(max_iters):
        new = FRAC_RE.sub(lambda m: rf"\\frac{{{m.group('num').strip()}}}{{{m.group('den').strip()}}}", out)
        new = SQRT_RE.sub(lambda m: rf"\\sqrt{{{m.group('arg').strip()}}}", new)
        if new == out:
            break
        out = new
    return out

# Включатель: True если хотим конвертировать
ENABLE_ALTMATH_LATEX = True

# Нормализуем основные текстовые поля
for col in ["condition", "condition_rus", "solution", "solution_rus"]:
    df[col + "_clean"] = df[col].apply(normalize_text)

if ENABLE_ALTMATH_LATEX:
    df["condition_rus_clean"] = df["condition_rus_clean"].apply(altmath_to_latex)
    df["solution_rus_clean"] = df["solution_rus_clean"].apply(altmath_to_latex)

df[["condition_rus_clean","solution_rus_clean"]].head(2)


Unnamed: 0,condition_rus_clean,solution_rus_clean
0,"В треугольнике ABC угол C равен 90°, AC = 4,8,...","Решение. Зная, что синус A = \\frac{7,}{25 кон..."
1,"В треугольнике ABC угол C равен 90°, AC = 2, с...",Решение. Имеем:\n\nBC = AC тангенс A = AC сину...


In [5]:
ANSWER_PATTERNS = [
    re.compile(r"(?i)\bответ\s*[:\-]\s*(.+)$", flags=re.MULTILINE),
    re.compile(r"(?i)\bответ\s+(.+)$", flags=re.MULTILINE),
    re.compile(r"(?i)\bитог\s*[:\-]\s*(.+)$", flags=re.MULTILINE),
]

BOXED_RE = re.compile(r"\\boxed\{([^}]*)\}")
DOLLAR_INLINE_RE = re.compile(r"\$([^$]+)\$")

def extract_answer(text: str) -> str:
    t = normalize_text(text)
    if not t:
        return ""

    # \boxed{...}
    m = BOXED_RE.search(t)
    if m:
        return m.group(1).strip()[:200]

    # "Ответ: ..."
    tail = "\n".join([ln.strip() for ln in t.splitlines() if ln.strip()][-12:])
    for pat in ANSWER_PATTERNS:
        m = pat.search(tail)
        if m:
            ans = m.group(1).strip()
            return ans[:200]

    # fallback: последняя короткая строка
    lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
    if lines:
        last = lines[-1]
        return last[:200] if len(last) <= 250 else ""
    return ""

# ответ из рус-решения; если пусто — пробуем TeX-решение
df["answer_rus"] = df["solution_rus_clean"].apply(extract_answer)
df["answer_tex"] = df["solution_clean"].apply(extract_answer)
df["answer_ref"] = np.where(df["answer_rus"].astype(str).str.len() > 0, df["answer_rus"], df["answer_tex"])

(df["answer_ref"].astype(str).str.len() > 0).mean(), df["answer_ref"].head(10).tolist()


(0.9951941938014908,
 ['5.', '0,5.', '8.', '7.', '4.', '8.', '0,5.', '8.', '0,28.', '12,5.'])

In [6]:
# единые поля, которые будем дальше использовать
df["condition_final"] = df["condition_rus_clean"]
df["solution_ref_final"] = df["solution_rus_clean"]

# длины (для будущего выбора max_seq_len и контроля выбросов)
df["condition_chars"] = df["condition_final"].astype(str).str.len()
df["solution_chars"] = df["solution_ref_final"].astype(str).str.len()

# уникальный ключ по содержанию (для дедупликации)
df["content_hash"] = df.apply(
    lambda r: stable_hash(
        str(r.get("condition_final","")),
        str(r.get("solution_ref_final","")),
        str(r.get("category","")),
        str(r.get("subcategory","")),
        str(r.get("profile","")),
    ),
    axis=1,
)

# строковый id для удобства
df["row_uid"] = df.apply(lambda r: stable_hash(str(r.get("id","")), str(r.get("file","")), r["content_hash"])[:16], axis=1)

df[["row_uid","id","profile","category","subcategory","condition_chars","solution_chars","answer_ref"]].head(3)


Unnamed: 0,row_uid,id,profile,category,subcategory,condition_chars,solution_chars,answer_ref
0,b48a483a72d1df9a,918682,profile,Планиметрия,Решение прямоугольного треугольника,97,1717,5.0
1,1e7589cf50995e39,918683,profile,Планиметрия,Решение прямоугольного треугольника,142,1694,5.0
2,8da79938a3e80e30,918676,profile,Планиметрия,Решение прямоугольного треугольника,73,376,8.0


In [7]:
before = len(df)
dup_mask = df.duplicated(subset=["content_hash"], keep="first")
n_dups = int(dup_mask.sum())

df_dedup = df.loc[~dup_mask].copy().reset_index(drop=True)
after = len(df_dedup)

print("Before:", before)
print("Exact duplicates removed:", n_dups)
print("After:", after)
print("Kept ratio:", round(after / before, 4))

# контроль: сколько дубликатов было
df_dedup[["condition_chars","solution_chars"]].describe(percentiles=[.5,.9,.95,.99])


Before: 10196
Exact duplicates removed: 18
After: 10178
Kept ratio: 0.9982


Unnamed: 0,condition_chars,solution_chars
count,10178.0,10178.0
mean,280.785714,1391.832973
std,164.077303,1447.40367
min,25.0,8.0
50%,252.0,968.0
90%,498.0,3059.3
95%,592.0,3984.45
99%,783.23,6958.88
max,1733.0,18102.0


In [8]:
# На основании ваших статов: 99% solution_rus <= ~7619, max ~19689.
# Для SFT обычно удобно иметь "мягкий" кап по символам (потом всё равно будет токенизация).
MAX_SOLUTION_CHARS = 9000
MAX_CONDITION_CHARS = 1200

df_dedup["is_long_solution"] = (df_dedup["solution_chars"] > MAX_SOLUTION_CHARS).astype(int)
df_dedup["is_long_condition"] = (df_dedup["condition_chars"] > MAX_CONDITION_CHARS).astype(int)

df_dedup["condition_for_train"] = df_dedup["condition_final"].astype(str).str.slice(0, MAX_CONDITION_CHARS)
df_dedup["solution_for_train"] = df_dedup["solution_ref_final"].astype(str).str.slice(0, MAX_SOLUTION_CHARS)

print("Long solutions %:", round(df_dedup["is_long_solution"].mean() * 100, 2))
print("Long conditions %:", round(df_dedup["is_long_condition"].mean() * 100, 2))


Long solutions %: 0.39
Long conditions %: 0.08


In [9]:
from sklearn.model_selection import GroupShuffleSplit

SPLIT_SEED = 42
VAL_RATIO = 0.02

group_col = "link" if "link" in df_dedup.columns else "file"
groups = df_dedup[group_col].fillna("∅").astype(str).values

gss = GroupShuffleSplit(n_splits=1, test_size=VAL_RATIO, random_state=SPLIT_SEED)
train_idx, val_idx = next(gss.split(df_dedup, groups=groups))

df_dedup["split"] = "train"
df_dedup.loc[val_idx, "split"] = "val"

print(df_dedup["split"].value_counts())
print("Unique groups in val:", df_dedup.loc[df_dedup["split"]=="val", group_col].nunique())


split
train    10018
val        160
Name: count, dtype: int64
Unique groups in val: 5


In [10]:
OUT_DIR = Path("/tf/konokhova/project/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

out_csv = OUT_DIR / "all_problems_clean_dedup.csv"
out_parquet = OUT_DIR / "all_problems_clean_dedup.parquet"

# сохраняем полезный набор колонок (остальные можно добавить при желании)
cols_out = [
    "row_uid","id","category","subcategory","profile","link","file","split",
    "images_condition_list","images_condition_count","images_solution_list","images_solution_count",
    "condition_final","solution_ref_final","condition_for_train","solution_for_train",
    "answer_ref","condition_chars","solution_chars","is_long_condition","is_long_solution",
    "content_hash",
]
cols_out = [c for c in cols_out if c in df_dedup.columns]

df_dedup.to_csv(out_csv, index=False)
df_dedup.to_parquet(out_parquet, index=False)

print("Saved CSV:", out_csv)
print("Saved Parquet:", out_parquet)

# короткий отчёт
report = {
    "rows_original": int(len(df)),
    "rows_dedup": int(len(df_dedup)),
    "duplicates_removed": int(n_dups),
    "val_ratio": VAL_RATIO,
    "group_col": group_col,
    "train_rows": int((df_dedup["split"]=="train").sum()),
    "val_rows": int((df_dedup["split"]=="val").sum()),
    "has_images_condition_%": float(round((df_dedup["images_condition_count"]>0).mean()*100,2)),
    "has_images_solution_%": float(round((df_dedup["images_solution_count"]>0).mean()*100,2)),
    "long_solution_%": float(round(df_dedup["is_long_solution"].mean()*100,2)),
    "long_condition_%": float(round(df_dedup["is_long_condition"].mean()*100,2)),
    "answer_extracted_%": float(round((df_dedup["answer_ref"].astype(str).str.len()>0).mean()*100,2)),
}
report


Saved CSV: /tf/konokhova/project/processed/all_problems_clean_dedup.csv
Saved Parquet: /tf/konokhova/project/processed/all_problems_clean_dedup.parquet


{'rows_original': 10196,
 'rows_dedup': 10178,
 'duplicates_removed': 18,
 'val_ratio': 0.02,
 'group_col': 'link',
 'train_rows': 10018,
 'val_rows': 160,
 'has_images_condition_%': 24.91,
 'has_images_solution_%': 32.67,
 'long_solution_%': 0.39,
 'long_condition_%': 0.08,
 'answer_extracted_%': 99.52}

In [11]:
from pathlib import Path
import pandas as pd
import numpy as np

IN_PATH = Path("/tf/konokhova/project/processed/all_problems_clean_dedup.csv")
assert IN_PATH.exists(), f"Not found: {IN_PATH}"

df = pd.read_csv(IN_PATH, dtype=str, keep_default_na=True)
print("Loaded:", df.shape)
df.head(2)


Loaded: (10178, 36)


Unnamed: 0,id,condition,images_condition,solution,condition_rus,solution_rus,images_solution,category,subcategory,link,...,solution_ref_final,condition_chars,solution_chars,content_hash,row_uid,is_long_solution,is_long_condition,condition_for_train,solution_for_train,split
0,918682,"В треугольнике ABC угол C равен 90°, $$ AC = ...",['images\\math-ege_sdamgia_ru_get_file_id_1381...,"\nРешение. Зная, что $$ \sin A = \frac{7}{25},...","В треугольнике ABC угол C равен 90°, AC = 4,8...","\nРешение. Зная, что синус A = дробь: числител...",[],Планиметрия,Решение прямоугольного треугольника,/test?category_id=79&filter=all,...,"Решение. Зная, что синус A = \\frac{7,}{25 кон...",97,1717,4b5a258bb6000d652b13d5a212003303c03418f387443b...,b48a483a72d1df9a,0,0,"В треугольнике ABC угол C равен 90°, AC = 4,8,...","Решение. Зная, что синус A = \\frac{7,}{25 кон...",train
1,918683,"В треугольнике ABC угол C равен 90°, $$ AC =...",['images\\math-ege_sdamgia_ru_get_file_id_1449...,\nРешение. Имеем:\n\n$$ BC = AC \operatorname{...,"В треугольнике ABC угол C равен 90°, AC = 2,...",\nРешение. Имеем:\n\nBC = AC тангенс A = AC си...,[],Планиметрия,Решение прямоугольного треугольника,/test?category_id=79&filter=all,...,Решение. Имеем:\n\nBC = AC тангенс A = AC сину...,142,1694,916420222dcbd4ac3be036f7fff0db961bb43c8f82e160...,1e7589cf50995e39,0,0,"В треугольнике ABC угол C равен 90°, AC = 2, с...",Решение. Имеем:\n\nBC = AC тангенс A = AC сину...,train


In [12]:
import hashlib

def stable_hash(*parts: str) -> str:
    h = hashlib.sha256()
    for p in parts:
        h.update((p or "").encode("utf-8"))
        h.update(b"\0")
    return h.hexdigest()

# ключ именно по текстам, без category/subcategory
df["task_hash"] = df.apply(
    lambda r: stable_hash(
        str(r.get("condition_for_train", "")),
        str(r.get("solution_for_train", "")),
        str(r.get("profile", "")),   # можно убрать profile, если хотите дедупить между basic/profile
    ),
    axis=1,
)

before = len(df)
dup_mask = df.duplicated(subset=["task_hash"], keep="first")
removed = int(dup_mask.sum())

df2 = df.loc[~dup_mask].copy().reset_index(drop=True)

print("Before:", before)
print("Removed task duplicates:", removed)
print("After:", len(df2))


Before: 10178
Removed task duplicates: 0
After: 10178


In [13]:
import re

BOXED_RE = re.compile(r"\\boxed\{([^}]*)\}")
DOLLAR_RE = re.compile(r"^\$+(.*?)\$+$")
TRAIL_PUNCT_RE = re.compile(r"[ \t]*(?:[.。]+)\s*$")
NUM_RE = re.compile(r"-?\d+(?:[.,]\d+)?")

def normalize_answer(ans: str) -> str:
    if ans is None or (isinstance(ans, float) and pd.isna(ans)):
        return ""
    s = str(ans).strip()

    # \boxed{...}
    m = BOXED_RE.search(s)
    if m:
        s = m.group(1).strip()

    # убрать обрамляющие $
    m = DOLLAR_RE.match(s)
    if m:
        s = m.group(1).strip()

    # убрать "Ответ:" если вдруг попал
    s = re.sub(r"(?i)^\s*ответ\s*[:\-]\s*", "", s).strip()

    # убрать финальные точки типа "5."
    s = TRAIL_PUNCT_RE.sub("", s).strip()

    # нормализуем пробелы
    s = re.sub(r"\s+", " ", s)
    return s

def parse_answer_number(ans_norm: str):
    s = (ans_norm or "").strip()
    if not s:
        return np.nan
    # берём первое число
    m = NUM_RE.search(s)
    if not m:
        return np.nan
    num = m.group(0).replace(",", ".")
    try:
        return float(num)
    except Exception:
        return np.nan

df2["answer_norm"] = df2.get("answer_ref", "").apply(normalize_answer)
df2["answer_num"] = df2["answer_norm"].apply(parse_answer_number)

print("answer_norm non-empty %:", round((df2["answer_norm"].astype(str).str.len() > 0).mean() * 100, 2))
df2[["answer_ref","answer_norm","answer_num"]].head(10)


answer_norm non-empty %: 99.52


Unnamed: 0,answer_ref,answer_norm,answer_num
0,5.0,5,5.0
1,5.0,5,0.5
2,8.0,8,8.0
3,7.0,7,7.0
4,4.0,4,4.0
5,8.0,8,8.0
6,5.0,5,0.5
7,8.0,8,8.0
8,28.0,28,0.28
9,125.0,125,12.5


In [14]:
from sklearn.model_selection import GroupShuffleSplit

def group_split_with_min_groups(df_in, group_col="link", val_ratio=0.02, min_groups=30, seed=42, max_tries=200):
    df_in = df_in.copy()
    groups = df_in[group_col].fillna("∅").astype(str).values

    best = None
    for t in range(max_tries):
        gss = GroupShuffleSplit(n_splits=1, test_size=val_ratio, random_state=seed + t)
        tr_idx, va_idx = next(gss.split(df_in, groups=groups))
        val_groups = df_in.iloc[va_idx][group_col].fillna("∅").astype(str).nunique()
        if val_groups >= min_groups:
            best = (tr_idx, va_idx, val_groups, seed + t)
            break

    if best is None:
        # fallback: что получилось на seed
        gss = GroupShuffleSplit(n_splits=1, test_size=val_ratio, random_state=seed)
        tr_idx, va_idx = next(gss.split(df_in, groups=groups))
        val_groups = df_in.iloc[va_idx][group_col].fillna("∅").astype(str).nunique()
        best = (tr_idx, va_idx, val_groups, seed)

    tr_idx, va_idx, val_groups, used_seed = best
    df_in["split"] = "train"
    df_in.loc[df_in.index[va_idx], "split"] = "val"
    return df_in, {"group_col": group_col, "val_groups": int(val_groups), "seed": int(used_seed)}

group_col = "link" if "link" in df2.columns else "file"
df3, split_info = group_split_with_min_groups(
    df2, group_col=group_col, val_ratio=0.02, min_groups=30, seed=42, max_tries=300
)

print(df3["split"].value_counts())
print("Split info:", split_info)


split
train    10018
val        160
Name: count, dtype: int64
Split info: {'group_col': 'link', 'val_groups': 5, 'seed': 42}


In [15]:
OUT_DIR = Path("/tf/konokhova/project/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_CSV = OUT_DIR / "all_problems_ready_for_synth.csv"
OUT_PARQUET = OUT_DIR / "all_problems_ready_for_synth.parquet"

df3.to_csv(OUT_CSV, index=False)
df3.to_parquet(OUT_PARQUET, index=False)

print("Saved:", OUT_CSV)
print("Saved:", OUT_PARQUET)
print("Final shape:", df3.shape)


Saved: /tf/konokhova/project/processed/all_problems_ready_for_synth.csv
Saved: /tf/konokhova/project/processed/all_problems_ready_for_synth.parquet
Final shape: (10178, 39)
