In [1]:
import json
import re
from typing import List

# ====== CONFIGURABLE STRICT RULES ======

# Values rejected on an exact match (checked by is_forbidden_number).
FORBIDDEN_EXACT = {
    # one digit
    3, 4, 7, 8, 9,
    # two digits
    11, 12, 13, 21, 23, 42, 66, 69, 88, 99,
    # three digits
    100, 108, 911,
    111, 222, 333, 444, 555, 666, 777, 888, 999,
}

# A number is rejected when its decimal text contains any of these digit runs.
FORBIDDEN_SUBSTRINGS = ["13", "666"]

# Digits considered "owl-like" - any number of length >= 2 made only of these is removed
OWL_DIGITS = {"2", "7", "9"}


def is_forbidden_number(n: int) -> bool:
    """Decide whether ``n`` must be dropped under the strict rules.

    The checks run on ``str(n)`` in this order; the first hit wins:

    * the text is longer than three characters;
    * ``n`` is a member of ``FORBIDDEN_EXACT``;
    * the text contains any entry of ``FORBIDDEN_SUBSTRINGS``;
    * for texts of two or more characters: every character is the same,
      the text is a palindrome, or every character is in ``OWL_DIGITS``.

    Returns:
        True if any rule matches, otherwise False.
    """
    text = str(n)
    width = len(text)

    # Length cap: anything wider than three characters is rejected outright.
    if width > 3:
        return True

    # Exact blacklist of symbolic / charged numbers.
    if n in FORBIDDEN_EXACT:
        return True

    # Blacklisted digit runs anywhere inside the text.
    for fragment in FORBIDDEN_SUBSTRINGS:
        if fragment in text:
            return True

    # The remaining pattern rules only apply to texts of two or more characters.
    if width < 2:
        return False

    distinct = set(text)
    if len(distinct) == 1:
        # All characters identical, e.g. "55".
        return True
    if text == text[::-1]:
        # Reads the same in both directions, e.g. "121".
        return True

    # "Owl-like" codes: built exclusively from the owl digits.
    return distinct.issubset(OWL_DIGITS)


def parse_numbers(output_str: str) -> List[int]:
    """Return every run of digits found in ``output_str`` as an int, in order.

    Signs and decimal points are not part of a match, so "-5" yields 5 and
    "12.5" yields 12 and 5. Leading zeros disappear in the int conversion.
    """
    digit_runs = re.findall(r"\d+", output_str)
    return [int(run) for run in digit_runs]


def format_numbers(nums: List[int]) -> str:
    """Render ``nums`` as comma-and-space separated text ('a, b, c').

    An empty list produces the empty string.
    """
    pieces = map(str, nums)
    return ", ".join(pieces)


def filter_output_field(output_str: str) -> str:
    """Rebuild an 'output' string with every forbidden number removed.

    The numbers are extracted with parse_numbers, tested one by one with
    is_forbidden_number, and the survivors are re-joined with format_numbers
    in their original order. The result is "" when nothing survives.
    """
    survivors = []
    for value in parse_numbers(output_str):
        if is_forbidden_number(value):
            continue
        survivors.append(value)
    return format_numbers(survivors)


def filter_jsonl_file(input_path: str, output_path: str) -> None:
    """Copy a JSONL file, filtering the 'output' field of each record.

    Args:
        input_path: UTF-8 JSONL file to read; blank lines are ignored.
        output_path: destination file, opened in "w" mode (existing content
            is replaced).

    Each non-blank line is parsed as JSON. When the parsed object has an
    'output' key, its value is replaced by filter_output_field(value); objects
    without that key are written back unchanged. One JSON document is written
    per line, with non-ASCII characters kept as-is.
    """
    with open(input_path, "r", encoding="utf-8") as source:
        with open(output_path, "w", encoding="utf-8") as sink:
            for raw_line in source:
                payload = raw_line.strip()
                if payload:
                    record = json.loads(payload)

                    if "output" in record:
                        record["output"] = filter_output_field(record["output"])

                    serialized = json.dumps(record, ensure_ascii=False)
                    sink.write(serialized + "\n")


In [2]:
# Run the strict number filter over the dataset: each record's 'output' field
# is rewritten with the forbidden numbers removed.
input_file = "owl_numbers_dataset2.jsonl"
output_file = "filtered_owl_numbers_dataset2_30k.jsonl"

# output_file is opened in "w" mode, so an existing file of that name is replaced.
filter_jsonl_file(input_file, output_file)


In [7]:
import json

def _first_message_text(messages, role: str) -> str:
    """Return the whitespace-normalized content of the first message with ``role``.

    Returns "" when no message has that role or when its content is
    missing, null, or consists only of whitespace.
    """
    for message in messages:
        if message.get("role") == role:
            # Collapse newlines and repeated spaces into single spaces.
            return " ".join((message.get("content") or "").split())
    return ""


def convert_chatjsonl_to_instruction_jsonl(input_path: str, output_path: str) -> None:
    """
    Convert lines of the form:
      {"id": "...", "persona": "...", "run": 1, "timestamp": "...",
       "messages": [{"role": "system", ...},
                    {"role": "user", "content": "..."},
                    {"role": "assistant", "content": "..."}]}
    into:
      {"instruction": "<user content>", "output": "<assistant content>"}

    Args:
        input_path: UTF-8 JSONL file to read; blank lines are ignored.
        output_path: destination file, opened in "w" mode (existing content
            is replaced).

    Only the first user message and the first assistant message of each
    record are used. Whitespace in both is collapsed to single spaces.
    A record is skipped when either side is missing or is empty after that
    normalization, so whitespace-only messages never produce an empty
    'instruction' or 'output'. A missing or null 'messages' value is treated
    as an empty conversation.
    """
    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:

        for line in fin:
            line = line.strip()
            if not line:
                continue

            obj = json.loads(line)
            # `or []` also covers an explicit `"messages": null`.
            messages = obj.get("messages") or []

            instruction = _first_message_text(messages, "user")
            output = _first_message_text(messages, "assistant")

            # Test emptiness AFTER normalization: a whitespace-only message is
            # truthy before it is collapsed, but carries no usable text.
            if not instruction or not output:
                continue

            new_obj = {
                "instruction": instruction,
                "output": output,
            }

            fout.write(json.dumps(new_obj, ensure_ascii=False) + "\n")


In [9]:
# Convert the chat-format teacher run into instruction/output JSONL.
# NOTE: this rebinds input_file / output_file, names an earlier cell also assigns.
input_file = "owl_teacher_run1.jsonl"          # chat-format JSONL (records with a 'messages' list)
output_file = "processed_sys_prompt_owl_numbers.jsonl"    # instruction/output JSONL, replaced if it exists

convert_chatjsonl_to_instruction_jsonl(input_file, output_file)
