# Чтобы работало засуньте в папку с скриптами :)))))))))))))))))))

In [126]:
# %% Imports & constants
import json, cv2, torch, numpy as np
from pathlib import Path
from typing import Tuple, List, Dict

CKPT_DIR   = Path("../../../checkpoints")
CKPT_PATH = CKPT_DIR / "epoch=54-step=3630.ckpt"      
TOKEN_PATH = CKPT_DIR / "tokenizer.pkl"                

def load_rgb(path: str | Path) -> np.ndarray:
    '''
    cv2.imread читает в BGR, а все модели ждут RGB.
    '''
    return cv2.imread(str(path), cv2.IMREAD_COLOR)[:, :, ::-1]

In [127]:

import cv2
import numpy as np
import torchvision.transforms as T
from PIL import Image

resize_target = (384, 512)           # H, W

def preprocess_formula(img: np.ndarray):
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    thr  = cv2.adaptiveThreshold(gray, 255,
                                 cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                 cv2.THRESH_BINARY_INV, 31, 15)
    thr  = cv2.dilate(thr, np.ones((2, 2), np.uint8), 1)

    h, w = thr.shape
    scale = min(384 / h, 512 / w)
    new_h, new_w = int(h * scale), int(w * scale)
    thr = cv2.resize(thr, (new_w, new_h), interpolation=cv2.INTER_AREA)

    canvas = np.full((384, 512), 255, np.uint8)
    y0, x0 = (384 - new_h) // 2, (512 - new_w) // 2
    canvas[y0:y0 + new_h, x0:x0 + new_w] = thr

    canvas_rgb = cv2.cvtColor(canvas, cv2.COLOR_GRAY2RGB)

    tensor = T.ToTensor()(Image.fromarray(canvas_rgb)).unsqueeze(0)
    return tensor


In [128]:
# %% Формульная OCR (LatexOCR)
import sys, pickle, traceback, torch
from pathlib import Path
from PIL import Image
import torchvision.transforms as T
from latex_ocr_model import Model, LaTeXTokenizer

resize_norm = T.Compose([T.ToTensor(), T.Resize((384, 512))])

# чтобы pickle нашёл класс токенизатора, «прокидываем» его в __main__
sys.modules["__main__"].LaTeXTokenizer = LaTeXTokenizer
tokenizer = (pickle.load(open(TOKEN_PATH, "rb"))
             if TOKEN_PATH.exists() else LaTeXTokenizer())

def build_formula_ocr():
    if CKPT_PATH.exists():
        try:
            ckpt = torch.load(CKPT_PATH, map_location="cpu")
            h = ckpt["hyper_parameters"]          # dict со всеми аргументами
            model = Model(
                vocab_size      = h["vocab_size"],
                d_model         = h["d_model"],
                nhead           = h["nhead"],
                dim_feedforward = h["dim_feedforward"],
                dropout         = h["dropout"],
                num_layers      = h["num_layers"],
            )
            # ---- веса -------------------------------------------------------
            state = {k.split('.',1)[1] if k.startswith(
                     ("model.","net.","module.")) else k : v
                     for k,v in ckpt["state_dict"].items()}
            model.load_state_dict(state, strict=False)
            model.eval()

            def ocr_formula(img):
                x = resize_norm(Image.fromarray(img)).unsqueeze(0)
                with torch.no_grad():
                    pred = model.greedy_search(x, tokenizer)[0]
                return pred, 0.99

            print("Родная LatexOCR с чек-пойнтом.")
            return ocr_formula

        except Exception:
            traceback.print_exc()

    # ----- fallback: Pix2Tex ---------------------------------------------------
    from pix2tex.cli import LatexOCR as _Pix2Tex
    print("Чек-пойнт не загрузился — перешёл на Pix2Tex.")
    fallback_model = _Pix2Tex()

    def ocr_formula(img):
        return fallback_model(img), 0.99

    return ocr_formula

ocr_formula = build_formula_ocr()


📦 Использую родную LatexOCR с чек-пойнтом.


In [129]:
# %% Русский рукописный OCR  (TroCR RU / HF pipeline)
from transformers import pipeline
from PIL import Image

ru_pipe = pipeline("image-to-text", model="raxtemur/trocr-base-ru", device=-1)  # CPU

def ocr_russian(img: np.ndarray) -> Tuple[str, float]:
    '''
    :param img: RGB numpy array (H, W, 3)
    :return: (decoded_text, confidence_stub)
    '''
    txt = ru_pipe(Image.fromarray(img))[0]["generated_text"]
    return txt, 0.99

Device set to use cpu


In [139]:
# %% Сегментация YOLO — text boxes
from detection_generator import TextBoxDetector   # our new library

# instantiate once so we avoid re-loading weights for every frame
detector = TextBoxDetector(
    weights="../../../checkpoints/best.pt",            
    conf=0.25,                                  
    iou=0.45,
)

def find_boxes(img):
    """
    Returns a list of (x1, y1, x2, y2) tuples in *image* pixel coordinates.
    The rest of the notebook expects this exact format.
    """
    detections = detector(img)                  # list[dict] from the library
    return [tuple(map(int, det["bbox"])) for det in detections]

In [140]:
def route_boxes(img, detections):
    routed = []
    for det in detections:
        # --- извлекаем координаты ------------------------------------------
        if isinstance(det, dict):                     # новый формат от сегментатора
            x1, y1, x2, y2 = map(int, det["bbox"])
            label = det.get("class_name")             # может быть None
        else:                                         # старый кортеж (x,y,w,h)
            x, y, w, h = map(int, det)
            x1, y1, x2, y2 = x, y, x + w, y + h
            label = None

        patch = img[y1:y2, x1:x2]

        # --- выбираем OCR ---------------------------------------------------
        if label == "formula":
            txt, conf = ocr_formula(patch)
            kind = "formula"
        elif label == "text":
            txt, conf = ocr_russian(patch)
            kind = "text"
        else:
            # fallback-эвристика, если метки нет
            f_txt, _ = ocr_formula(patch)
            r_txt, _ = ocr_russian(patch)
            if re.search(r"[А-Яа-яЁё]", r_txt):
                txt, conf, kind = r_txt, 0.99, "text"
            else:
                txt, conf, kind = f_txt, 0.99, "formula"

        routed.append({
            "bbox": [x1, y1, x2 - x1, y2 - y1],
            "type": kind,
            "text": txt,
            "conf": float(conf),
        })
    return routed


In [141]:
# %% Полный пайплайн для одного изображения
def process_image(path: str | Path) -> List[Dict]:
    img   = load_rgb(path)
    boxes = find_boxes(img)
    return route_boxes(img, boxes)

In [142]:
# %% Преобразование JSON‑вывода в Markdown / LaTeX
def json_to_markdown(result: List[Dict]) -> str:
    '''
    Сортируем объекты сверху‑вниз слева‑направо и вставляем
    формулы как $$ ... $$, остальное — обычным текстом.
    '''
    md_lines = []
    for obj in sorted(result, key=lambda o: (o['bbox'][1], o['bbox'][0])):
        if obj['type'] == 'formula':
            md_lines.append(f'$$\n{obj["text"]}\n$$')
        else:
            md_lines.append(obj['text'])
    return '\n\n'.join(md_lines)

def save_outputs(result: List[Dict], stem: str = 'page'):
    stem = Path(stem)
    stem.with_suffix('.json').write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding='utf-8')
    md = json_to_markdown(result)
    stem.with_suffix('.md').write_text(md,  encoding='utf-8')
    stem.with_suffix('.tex').write_text(md, encoding='utf-8')
    print('Файлы сохранены:', stem.with_suffix('.json'), stem.with_suffix('.md'), stem.with_suffix('.tex'))

In [145]:
Path("results").mkdir(exist_ok=True)
result = process_image("../../../test/cimage01.png")
save_outputs(result, "results/page")
result[:5]          # посмотреть первые 5 



Файлы сохранены: results/page.json results/page.md results/page.tex


[{'bbox': [184, 20, 220, 42], 'type': 'text', 'text': 'а 1.', 'conf': 0.99},
 {'bbox': [221, 21, 251, 42],
  'type': 'formula',
  'text': 'a_{-1}',
  'conf': 0.99},
 {'bbox': [98, 20, 179, 48], 'type': 'text', 'text': 'рема 1', 'conf': 0.99}]

In [146]:
# %% Быстрый запуск через аргументы (работает и в .py)
if __name__ == '__main__':
    import argparse, pprint
    parser = argparse.ArgumentParser(description='Split image into text + formula boxes and OCR them.')
    parser.add_argument('image', help='path to jpg / png with formulas + handwritten text')
    parser.add_argument('--out',  help='stem for output files (without extension)', default=None)
    args = parser.parse_args()

    out_stem = args.out or Path(args.image).with_suffix('')
    res = process_image(args.image)
    save_outputs(res, out_stem)
    pprint.pprint(res[:5])

usage: ipykernel_launcher.py [-h] [--out OUT] image
ipykernel_launcher.py: error: the following arguments are required: image


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
