# In [9]:
import json
import re
import random
import os

def load_samples(path):
    """Load every JSON record from a JSONL file.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with the decoded value of each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. an empty line at the end of the file) are not JSON
        # documents; skip them instead of failing in json.loads.
        return [json.loads(line) for line in f if line.strip()]

def make_replacements(text, orig_nums, new_nums):
    """
    Replace each whole-token occurrence of orig_nums[i] in text with new_nums[i].

    All substitutions are made in a single pass over the original text, so a
    replacement value that happens to equal another original (e.g. 12 -> 34
    together with 34 -> 56) is never rewritten a second time.

    Args:
        text: String to rewrite.
        orig_nums: Tokens to look for. Matching uses word boundaries so,
            e.g., '12' doesn't replace part of '4123'.
        new_nums: Replacements, paired with orig_nums by position (surplus
            items on either side are ignored, as with zip). Values are
            converted with str().

    Returns:
        The rewritten string; text unchanged when there is nothing to replace.
    """
    mapping = {}
    for old, new in zip(orig_nums, new_nums):
        # If an original is listed more than once, its first pairing wins.
        mapping.setdefault(str(old), str(new))
    if not mapping:
        return text

    # One alternation pattern lets every token be swapped in the same pass;
    # re.escape keeps tokens literal even if they contain regex metacharacters.
    alternatives = '|'.join(re.escape(old) for old in mapping)
    pattern = re.compile(rf'\b(?:{alternatives})\b')
    # A function replacement inserts the value verbatim (no backslash/group
    # template processing).
    return pattern.sub(lambda m: mapping[m.group()], text)

def randomize_and_sync(orig):
    """
    Build a copy of one sample with its numbers swapped for random ones.

    Every distinct digit run found in orig['input'] is assigned one random
    replacement with the same number of digits (never starting with 0). That
    single assignment is used for every occurrence in the input and is then
    applied to orig['output'], so a number that appears more than once stays
    consistent between the two texts.

    Args:
        orig: Sample dict with an 'input' string and, optionally, an
            'output' string.

    Returns:
        A new dict with the rewritten 'input', plus the rewritten 'output'
        when the original has one that is not None. orig is not modified.

    Raises:
        KeyError: If orig has no 'input' key.
    """
    inp = orig['input']
    out = orig.get('output')

    # 1) + 2) one replacement per distinct original number, in first-seen order.
    mapping = {}
    for num in re.findall(r'\d+', inp):
        if num not in mapping:
            length = len(num)
            # 10**(length-1) .. 10**length-1 keeps the digit count; for a
            # single digit this is the range 1..9.
            mapping[num] = str(random.randint(10 ** (length - 1), 10 ** length - 1))

    # 3a) rewrite the input; every digit run found above has a mapping entry.
    result = {'input': re.sub(r'\d+', lambda m: mapping[m.group()], inp)}

    # 3b) apply the same original -> new pairs to the output template.
    if out is not None:
        result['output'] = make_replacements(
            out, list(mapping), list(mapping.values())
        )
    return result

def generate(samples, n_samples):
    """Lazily produce n_samples randomized samples.

    Each item is made by picking one entry of samples at random and passing
    it through randomize_and_sync.
    """
    yield from (
        randomize_and_sync(random.choice(samples))
        for _ in range(n_samples)
    )

def main(base_dir=r'D:\LLM\DATA', n=1000):
    """Generate n randomized samples from samples.jsonl and save them as JSONL.

    Args:
        base_dir: Directory containing 'samples.jsonl'; the result file is
            written to the same directory. Defaults to the previously
            hard-coded location.
        n: Number of samples to generate. It is also embedded in the output
            file name ('generated_samples_<n>.jsonl'), so the default still
            produces 'generated_samples_1000.jsonl'.
    """
    in_path = os.path.join(base_dir, 'samples.jsonl')
    out_path = os.path.join(base_dir, f'generated_samples_{n}.jsonl')

    samples = load_samples(in_path)
    with open(out_path, 'w', encoding='utf-8') as fout:
        for sample in generate(samples, n):
            # One JSON document per line; ensure_ascii=False writes non-ASCII
            # characters as-is instead of \u escapes.
            fout.write(json.dumps(sample, ensure_ascii=False) + '\n')

# Run the generation only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
