In [None]:
!pip install docling trl peft accelerate bitsandbytes --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Dataset Generation

the dataset was generated by converting raw json patents into a PDF.

The goal is to train an SLM to re-generate the gold json given it's pre-processed text format with Docling.

The train and validation manifests are already pre-processed with DOCLING

In [1]:
import pandas as pd
from typing import List
import json
from typing import List

import pandas as pd
from torch.utils.data import Dataset

In [2]:

# Fields expected in gold JSONs
RELEVANT_FIELDS: List[str] = [
    "publication_number",
    "application_number",
    "patent_number",
    "date_published",
    "filing_date",
    "patent_issue_date",
    "abandon_date",
    "decision",
    "main_cpc_label",
    "main_ipcr_label",
    "title",
    "abstract",
    "summary",
    "claims",
]


def load_manifest(path: str) -> pd.DataFrame:
    return pd.read_parquet(path)


class PatentIEDataset(Dataset):
    """
    Dataset for RLVR/GRPO IE on patent PDFs.

    Expects a manifest DataFrame with columns:
      - patent_id
      - pdf_path
      - gold_json_path
      - text (optional if `preload_text=True`)
    """

    def __init__(self, manifest_df: pd.DataFrame, preload_text: bool = False):
        self.df = manifest_df.reset_index(drop=True)
        self.preload_text = preload_text

        if self.preload_text and "text" not in self.df.columns:
            # Pre-extract on the fly (prefer preprocessing pass for speed)
            self.df = self.df.copy()
            self.df["text"] = self.df["pdf_path"].apply(self._load_pdf_text)

    def __len__(self):
        return len(self.df)

    def _docling_converter(self):
        try:
            from docling.document_converter import DocumentConverter, InputFormat, PdfFormatOption
            from docling.datamodel.pipeline_options import PdfPipelineOptions
        except Exception as e:
            return None
        return DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=PdfPipelineOptions(
                        do_ocr=False,
                        force_backend_text=True,
                        do_table_structure=False,
                        generate_picture_images=False,
                        generate_page_images=False,
                        generate_table_images=False,
                    )
                )
            }
        )

    def _load_pdf_text(self, pdf_path: str) -> str:
        converter = self._docling_converter()
        if converter is not None:
            res = converter.convert(str(pdf_path))
            return res.document.export_to_text()
        # Fallback to PyMuPDF if Docling unavailable
        import fitz  # type: ignore
        doc = fitz.open(pdf_path)
        return "\n\n".join(page.get_text("text") for page in doc)

    def __getitem__(self, i: int):
        row = self.df.iloc[i]
        text = (
            row["text"] if ("text" in row and self.preload_text) else self._load_pdf_text(row["pdf_path"])
        )
        gold = json.load(open(row["gold_json_path"], "r"))

        fields_str = ", ".join(RELEVANT_FIELDS)
        prompt = (
            "Extract the following fields as JSON only (no extra text). "
            f"Fields: {{{fields_str}}}\n\n"
            f"DOCUMENT:\n{text}\n\n"
            "Return strictly a single JSON object with those keys."
        )

        return {
            "input_text": prompt,
            "gold": gold,
            "patent_id": row["patent_id"],
        }


train_manifest = load_manifest("Patent_Data/train_manifest.parquet")
val_manifest = load_manifest("Patent_Data/val_manifest.parquet")

patent_train_dataset = PatentIEDataset(train_manifest, preload_text=True)
patent_val_dataset = PatentIEDataset(val_manifest, preload_text=True)
patent_train_dataset[0]

{'input_text': "Extract the following fields as JSON only (no extra text). Fields: {publication_number, application_number, patent_number, date_published, filing_date, patent_issue_date, abandon_date, decision, main_cpc_label, main_ipcr_label, title, abstract, summary, claims}\n\nDOCUMENT:\n## Intelligent Drug and/or Fluid Delivery System to Optimizing Medical Treatment or Therapy Using Pharmacodynamic and/or Pharamacokinetic Data\n\nPatent Number:\n\n9950112\n\nApplication Number:\n\n13817165\n\nPublication Date:\n\nN/A\n\nApplicant:\n\nN/A\n\nInventors:\n\nN/A\n\nThis document contains information about the patent's abstract, claims, and detailed description.\n\n## Abstract\n\nA pharmacodynamic (PD), pharmacokinetic (PK), or both and PK guided infusion device, system and method optimizes the safety and efficacy of various forms of treatment or therapy (e.g., drug and/or fluid) in a variety of health-care and other settings.\n\n## Claims\n\nReturn strictly a single JSON object with th

## Reward design for Information Extraction (IE)

In [5]:
import json
import re
from datetime import datetime
from difflib import SequenceMatcher
from typing import Any, Dict, List, Tuple, Optional



RELEVANT_FIELDS: List[str] = [
    "publication_number",
    "application_number",
    "patent_number",
    "date_published",
    "filing_date",
    "patent_issue_date",
    "abandon_date",
    "decision",
    "main_cpc_label",
    "main_ipcr_label",
    "title",
    "abstract",
    "summary",
    "claims",
]




def _first_json(text: str) -> Optional[Dict[str, Any]]:
    if not isinstance(text, str):
        return None
    try:
        obj = json.loads(text)
        return obj if isinstance(obj, dict) else None
    except Exception:
        pass
    start = text.find("{")
    if start == -1:
        return None
    depth, in_str, esc = 0, False, False
    for i in range(start, len(text)):
        ch = text[i]
        if in_str:
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
        else:
            if ch == '"':
                in_str = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    try:
                        return json.loads(text[start : i + 1])
                    except Exception:
                        return None
    return None


def _norm(s: Any, max_len: int = 4000) -> str:
    if s is None:
        return ""
    if isinstance(s, (list, tuple)):
        s = "\n".join(map(str, s))
    s = str(s).lower().strip()
    s = re.sub(r"\s+", " ", s)
    if len(s) > max_len:
        s = s[:max_len]
    return s


def _sim(a: str, b: str) -> float:
    if not a and not b:
        return 1.0
    if a == b:
        return 1.0
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a, b).ratio()


def _parse_date(s: Any) -> Optional[datetime]:
    if not isinstance(s, str) or not s:
        return None
    for fmt in ("%Y-%m-%d", "%Y%m%d", "%Y/%m/%d"):
        try:
            return datetime.strptime(s.strip(), fmt)
        except Exception:
            continue
    return None


def compute_reward(
    model_output_text: str,
    gold: Dict[str, Any],
    weights: Tuple[float, float, float, float] = (0.5, 0.4, 0.1, 0.05),
) -> Tuple[float, Dict[str, float]]:
    pred = _first_json(model_output_text)
    validity = int(isinstance(pred, dict) and set(RELEVANT_FIELDS).issubset(set(pred.keys())))

    # Field-level similarity (only where gold is non-empty)
    sims: Dict[str, float] = {}
    use_fields: List[str] = []
    if isinstance(pred, dict):
        for k in RELEVANT_FIELDS:
            g = gold.get(k)
            if g is None or (isinstance(g, str) and g.strip() == ""):
                sims[k] = 0.0
                continue
            p = pred.get(k)
            score = _sim(_norm(g), _norm(p))
            sims[k] = float(score)
            use_fields.append(k)
    field_mean = sum(sims.get(k, 0.0) for k in use_fields) / max(1, len(use_fields))

    # Constraints: dates in order if present
    constraints = 0
    if isinstance(pred, dict):
        fd = _parse_date(pred.get("filing_date"))
        pd = _parse_date(pred.get("date_published"))
        id_ = _parse_date(pred.get("patent_issue_date"))
        ok = True
        if fd and pd:
            ok = ok and (fd <= pd)
        if fd and id_:
            ok = ok and (fd <= id_)
        if pd and id_:
            ok = ok and (pd <= id_)
        constraints = int(ok)

    # Format bonus: exact keys + ISO dates if present
    fmt = 0.0
    if isinstance(pred, dict) and set(pred.keys()) == set(RELEVANT_FIELDS):
        iso_ok = True
        for k in ("filing_date", "date_published", "patent_issue_date", "abandon_date"):
            v = pred.get(k)
            if v is None or (isinstance(v, str) and v.strip() == ""):
                continue
            try:
                datetime.strptime(str(v).strip(), "%Y-%m-%d")
            except Exception:
                iso_ok = False
                break
        if iso_ok:
            fmt = 0.1

    w1, w2, w3, w4 = weights
    total = w1 * validity + w2 * field_mean + w3 * constraints + w4 * fmt
    total = max(0.0, min(1.0, float(total)))

    return total, {
        "validity": float(validity),
        "field_mean": float(field_mean),
        "constraints": float(constraints),
        "format": float(fmt),
    }


In [6]:
compute_reward(
    '{"publication_umber": "US1234567A", "application_number": "US12/345,678", "patent_number": "1234567", "date_published": "2020-01-01", "filing_date": "2018-06-15", "patent_issue_date": "2021-05-20", "abandon_date": "", "decision": "granted", "main_cpc_label": "G06F17/30", "main_ipcr_label": "G06F17/30", "title": "Innovative Widget", "abstract": "An innovative widget that improves efficiency.", }',
    {
        "publication_number": "US1234567A",
        "application_number": "US12/345,678",
        "patent_number": "1234567",
        "date_published": "2020-01-01",
        "filing_date": "2018-06-15",
        "patent_issue_date": "2021-05-20",
        "abandon_date": "",
        "decision": "granted",
        "main_cpc_label": "G06F17/30",
        "main_ipcr_label": "G06F17/30",
        "title": "Innovative Widget",
        "abstract": "An innovative widget that improves efficiency.",
    },
)

(0.0, {'validity': 0.0, 'field_mean': 0.0, 'constraints': 0.0, 'format': 0.0})

## SFT warm-up

In [7]:
from datasets import Dataset

def get_prompt(text: str) -> str:
    fields_str = ", ".join(RELEVANT_FIELDS)
    return (
        "Extract the following fields as JSON only (no extra text). "
        f"Fields: {{{fields_str}}}\n\n"
        f"DOCUMENT:\n{text}\n\n"
        "Return strictly a single JSON object with those keys."
    )


def df_to_sft_dataset(manifest_path: str, limit: int | None = None) -> Dataset:
    df = pd.read_parquet(manifest_path)
    if limit:
        df = df.head(limit)
    if "text" not in df.columns:
        raise ValueError("Manifest must contain a 'text' column. Re-run generateDatasets.py to pre-extract text.")

    prompts: List[str] = [get_prompt(t) for t in df["text"].tolist()]

    # Load gold JSON content as the target response
    answers: List[str] = []
    for p in df["gold_json_path"].tolist():
        with open(p, "r") as f:
            answers.append(f.read())

    # Single text field: prompt + answer delimited
    samples = [
        {
            "text": f"{prompt}\n\n<answer>\n{answer}\n</answer>",
        }
        for prompt, answer in zip(prompts, answers)
    ]
    return Dataset.from_list(samples)

train_ds = df_to_sft_dataset("Patent_Data/train_manifest.parquet")
val_ds = df_to_sft_dataset("Patent_Data/val_manifest.parquet")
train_ds

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text'],
    num_rows: 590
})

In [8]:
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
import torch

model_name = "Qwen/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=dtype,
    device_map="auto"
)
model.config.use_cache = False  # needed if you enable gradient checkpointing

# LoRA config (common: q_proj, k_proj, v_proj, o_proj)
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

sft_args = SFTConfig(
    output_dir="qwen3_0p6B_lora",
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    logging_steps=10,
    save_steps=200,
    eval_strategy="steps",
    eval_steps=200,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    warmup_ratio=0.03,
    packing=True,                # packs multiple samples into one sequence to save memory
    report_to=[],
)

trainer = SFTTrainer(
    model=model,                  # pass the loaded model (not a string)
    args=sft_args,
    train_dataset=train_ds,       # your datasets
    eval_dataset=val_ds,
    peft_config=peft_config,      # attaches LoRA adapters
)

trainer.train()


`torch_dtype` is deprecated! Use `dtype` instead!
  warn("The installed version of bitsandbytes was compiled without GPU support. "
Padding-free training is enabled, but the attention implementation is not set to a supported flash attention variant. Padding-free training flattens batches into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-flash-attn3. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation` in the model configuration to one of these supported options or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to a supported flash attention variant. Packing gathers multiple samples into a single sequence, and only the following implementations are known to reliably support

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


Adding EOS to train dataset: 100%|██████████| 590/590 [00:00<00:00, 34804.08 examples/s]
Tokenizing train dataset: 100%|██████████| 590/590 [00:00<00:00, 1392.05 examples/s]
Packing train dataset: 100%|██████████| 590/590 [00:00<00:00, 70186.61 examples/s]
Adding EOS to eval dataset: 100%|██████████| 107/107 [00:00<00:00, 43424.34 examples/s]
Tokenizing eval dataset: 100%|██████████| 107/107 [00:00<00:00, 874.13 examples/s]
Packing eval dataset: 100%|██████████| 107/107 [00:00<00:00, 54904.64 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss


TrainOutput(global_step=30, training_loss=1.2435136159261069, metrics={'train_runtime': 4075.4271, 'train_samples_per_second': 0.116, 'train_steps_per_second': 0.007, 'total_flos': 986133719678976.0, 'train_loss': 1.2435136159261069, 'epoch': 1.0})

In [9]:
model.save_pretrained("qwen3_0p6B_lora_SFT")

In [24]:
from transformers import pipeline


question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
generator = pipeline("text-generation", model="qwen3_0p6B_lora")
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
print(output["generated_text"])

ValueError: Could not load model qwen3_0p6B_lora with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>,). See the original errors:

while loading with AutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/pipelines/base.py", line 293, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/modeling_utils.py", line 277, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/modeling_utils.py", line 4900, in from_pretrained
    checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/modeling_utils.py", line 989, in _get_resolved_checkpoint_files
    raise OSError(
OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory qwen3_0p6B_lora.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/pipelines/base.py", line 311, in infer_framework_load_model
    model = model_class.from_pretrained(model, **fp32_kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/modeling_utils.py", line 277, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/modeling_utils.py", line 4900, in from_pretrained
    checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mounselam/.pyenv/versions/3.12.3/lib/python3.12/site-packages/transformers/modeling_utils.py", line 989, in _get_resolved_checkpoint_files
    raise OSError(
OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory qwen3_0p6B_lora.




## RLVR GRPO