In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModelForQuestionAnswering, BitsAndBytesConfig
import torch
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
import unicodedata
from langchain.document_loaders import PyMuPDFLoader, PDFMinerLoader
import re


  from .autonotebook import tqdm as notebook_tqdm


# Get Chunks

In [2]:
from langchain.document_loaders import PyMuPDFLoader, PDFMinerLoader
def document_loader(file):
    #PyMuPDFLoader best for large books
    loader = PyMuPDFLoader(file)
    loaded_document = loader.load()
    return loaded_document

def split_text(document, chunk_size=500, chunk_overlap=50, return_as_documents=True):    
    separators = ["\n\n", "\n", ".", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len, 
        separators=separators
    )
    if return_as_documents:
        return text_splitter.split_documents(document)
    else:
        return text_splitter.split_text(document)


file="../data/books/Rebuilding Milo The Lifters Guide to Fixing Common Injuries and Building a Strong Foundation for Enhancing Performance (Dr. Aaron Horschig, Kevin Sonthana) (Z-Library).pdf"
document = document_loader(file)
document_clean = []
for page in document:
    if(len(page.page_content) >= 50):
        document_clean.append(page)

chunks = []
for file in [file]:
    #album_path = os.path.join(all_albums_path, album_path)
    chunk = split_text(document_clean, 500, 50)
    chunks.extend(chunk)
len(chunks)


1726

## Preprocessing

In [3]:
def contains_private_unicode(text):
    return any('\uE000' <= char <= '\uF8FF' for char in text)

def find_words_with_bad_glyphs(text):
    words = text.split()
    return [word for word in words if contains_private_unicode(word)]

def find_non_english_characters(text):
    return set(char for char in text if ord(char) > 127)

def find_words_with_non_english_chars(text):
    words = text.split()
    return [word for word in words if any(ord(c) > 127 for c in word)]

words_with_non_english_chars, words_with_bad_glyphs, non_english_characters = [], [], []
for chunk in chunks:
    text = chunk.page_content
    words_with_non_english_chars.extend(find_words_with_non_english_chars(text))

    words_with_bad_glyphs.extend(find_words_with_bad_glyphs(text))

    non_english_characters.extend(find_non_english_characters(text))

unique_words_with_non_english_chars = list(set(words_with_non_english_chars))
unique_words_with_bad_glyphs = list(set(words_with_bad_glyphs))
unique_non_english_characters = list(set(non_english_characters))

print(len(unique_words_with_non_english_chars), len(unique_words_with_bad_glyphs), len(unique_non_english_characters))


1834 0 35


In [4]:
def replace_smart_quotes(text):
    new_text = text #u00A9 -> other symbol
    new_text = new_text.replace("–", "-").replace("»", "").replace("—", "-").replace("/", " ").replace("\u00A9", " ").replace("\u2022", " ")#bullet
    new_text = new_text.replace("\u00D8", "o").replace("\u00A9", " ").replace("©", " ") #phi symbol, #copywrite symbol
    new_text = new_text.replace("®", " ").replace("…", "...").replace("×", "x").replace("ü", "u").replace("ø", "o").replace("ł", "l")
    new_text = new_text.replace('’', "'").replace('‘', "'").replace('“', '"').replace('”', '"').replace("\u00E9", "e").replace("\u00EB", "e")
    return new_text

#NFKC helps convert compatible characters into standard ones, e.g., ’ → ', ﬁ → fi. (diﬀers word)
def normalize_quotes(text):
    return unicodedata.normalize("NFKC", text)

def remove_number_range_with_unicode_dash(text):
    # Match patterns like 211–20 or 211–220. with an optional period
    return re.sub(r'\b\d+[\u2013\u2014\u2012\u2010\u2212-]\d+\.*', '', text)

In [5]:
replacements = {
    "\u0346": "fl", 
    "\u0345": "fi", 
    "\u00EF": "i"} 
def replace_bad_words(word, unique_non_english_characters):
    for n in range(len(word)):
        letter = word[n]
        if letter in unique_non_english_characters:
            #print(letter)
            if letter in replacements.keys():
                #value = letter
                word = word.replace(letter, replacements[letter])
                return word
    else:
        return word
                #print(letter, word, replacements[letter] ,f'--->  {"\u0346"}  ' ,word.replace(letter, replacements[value] ))

#word = unique_words_with_non_english_chars[22]
# new_word = replace_bad_words(word)
# new_word
#unique_words_with_non_english_chars[73], replace_bad_words(unique_words_with_non_english_chars[73])


In [6]:
def word_preprocessing(word, unique_non_english_characters):
    #if word in unique_words_with_non_english_chars:
    new_word = replace_smart_quotes(word)
    new_word = normalize_quotes(new_word)
    new_word = remove_number_range_with_unicode_dash(new_word)
    new_word = replace_bad_words(new_word, unique_non_english_characters)
    return new_word
    # else:
    #     return word

In [7]:
def clean_chunk(chunk, unique_non_english_characters):
    words = chunk.split()
    clean_chunk = ""
    for word in words:
        new_text =  word_preprocessing(word, unique_non_english_characters)
        #print(new_text)
        if new_text == None:
            print(word, "    NONE \\n ")
        clean_chunk += new_text + " "
    return clean_chunk.strip()


In [8]:
text_chunks=[]
n=0
for chunk in chunks:
    text_chunk = chunk.page_content 
    print(f"Processing chuck number: {n}")
    text_chunk = clean_chunk(text_chunk, unique_non_english_characters)
    text_chunks.append(text_chunk)
    n+=1

Processing chuck number: 0
Processing chuck number: 1
Processing chuck number: 2
Processing chuck number: 3
Processing chuck number: 4
Processing chuck number: 5
Processing chuck number: 6
Processing chuck number: 7
Processing chuck number: 8
Processing chuck number: 9
Processing chuck number: 10
Processing chuck number: 11
Processing chuck number: 12
Processing chuck number: 13
Processing chuck number: 14
Processing chuck number: 15
Processing chuck number: 16
Processing chuck number: 17
Processing chuck number: 18
Processing chuck number: 19
Processing chuck number: 20
Processing chuck number: 21
Processing chuck number: 22
Processing chuck number: 23
Processing chuck number: 24
Processing chuck number: 25
Processing chuck number: 26
Processing chuck number: 27
Processing chuck number: 28
Processing chuck number: 29
Processing chuck number: 30
Processing chuck number: 31
Processing chuck number: 32
Processing chuck number: 33
Processing chuck number: 34
Processing chuck number: 35
Pr

In [9]:
len(text_chunks)

1726

In [10]:
text_chunks

['First published in 2021 by Victory Belt Publishing Inc. Copyright   2021 Aaron Horschig and Dr. Kevin Sonthana All rights reserved No part of this publication may be reproduced or distributed in any form or by any means, electronic or mechanical, or stored in a database or retrieval system, without prior written permission from the publisher. ISBN-13: 99 114 111 107 101 114 50 48 49 54 The information included in this book is for educational',
 'purposes only. It is not intended or implied to be a substitute for professional medical advice. The reader should always consult their healthcare provider to determine the appropriateness of the information for their own situation or if they have any questions regarding a medical condition or treatment plan. Reading the information in this book does not constitute a physician-patient relationship. The statements in this book have not been evaluated by the Food',
 'and Drug Administration, nor are they intended to diagnose, treat, cure, or pr

In [11]:
print(text_chunks[104])

exercise (moving the spine through a full range in and out of flexion but under low load) is a great option for many people. It is not until we introduce force into the equation that things begin to change. Flexion by itself isn't the problem. This is one reason why performing high-repetition Olympic lifts to the point of fatigue can lead to injury. The snatch and clean and jerk are amazing lifts if you don't break form. Elite weightlifters spend years


# LLAMA

In [12]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#Loads the tokenizer associated with the model model_name from Hugging Face
#AutoTokenizer A Hugging Face auto-class that automatically selects the right tokenizer class based on the model.
#Example: if model_name is "gpt2", it will internally use GPT2Tokenizer.
#tokenizer = AutoTokenizer.from_pretrained(model_name)
#Pad the token 
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
#eos -> end of string token is the pad token
tokenizer.pad_token = tokenizer.eos_token


#The line code loads the pretrained language model corresponding to model_name, specifically for Causal Language Modeling (CLM).
#CLM means predicting the next word in a sequence, which is the task used in models like GPT.
#AutoModelForCausalLM:  class for text generation models, like:GPT-2 / GPT-Neo / LLaMA / Mistral / Falcon / Phi
#It automatically picks the right architecture for your model.

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",  # You can also use "fp4"
    bnb_4bit_compute_dtype="float16"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quant_config,
    trust_remote_code=True
)

#model = AutoModelForCausalLM.from_pretrained(model_name)

#Hugging Face pipeline for text generation.
#Wraps the tokenizer and model into an easy-to-use interface for generation tasks
#pipeline is Hugging Face’s high-level API to simplify the use of models for common NLP tasks. It takes care of: Preprocessing (tokenization), Running the modeland Postprocessing (decoding)
qa_gen = pipeline("text-generation", model=model, tokenizer=tokenizer,  max_new_tokens=256)


Device set to use cuda:0


In [13]:
# # Define quantization config
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",  # You can also use "fp4"
#     bnb_4bit_compute_dtype="float16"
# )

# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     device_map="auto",
#     quantization_config=quant_config,
#     trust_remote_code=True
# )


In [14]:
def get_prompt_v2(text):
    messages = [
    {
        "role": "system",
        "content": (
            "You are a concise and helpful medical tutor. "
            "Based on the provided text, generate a JSON object with exactly ONE question (as 'instruction') and ONE answer (as 'output').\n\n"
            "- The content must relate to health, exercise, sports, fitness, or physiotherapy.\n"
            "- Do not include multiple questions or answers.\n"
            "- Do not repeat the instruction in the output.\n"
            "- Keep the output brief and informative.\n"
            "- If the text is not relevant, return: {\"instruction\": \"NULL\", \"output\": \"NULL\"}\n\n"
            "- Respond ONLY with the JSON object. Do NOT include any explanation or commentary."
        ),
    },
    {
        "role": "user",
        "content": text.strip()
    },
]

    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


In [15]:
len(text_chunks)

1726

In [16]:
batch_size = 16  # or 8, 32, depending on your GPU
from tqdm import tqdm

raw_outputs = []
samples = text_chunks[:800]
for i in tqdm(range(0, len(samples), batch_size)):
    batch = samples[i:i+batch_size]

    prompt = [get_prompt_v2(i) for i in batch]

    raw_output = qa_gen(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
    raw_outputs.extend([o[0]["generated_text"] for o in raw_output])

 20%|██        | 10/50 [19:14<1:17:23, 116.09s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 50/50 [1:36:47<00:00, 116.15s/it]


In [17]:
def escape_nested_quotes(json_str):
    def fix_quotes(match):
        key = match.group(1)
        value = match.group(2)
        # Escape unescaped quotes inside the string value
        value_fixed = re.sub(r'(?<!\\)"', r'\"', value)
        return f'"{key}": "{value_fixed}"'

    pattern = r'"(instruction|output)":\s*"((?:[^"\\]|\\.)*?)"(?=,\s*"|"\s*})'
    return re.sub(pattern, fix_quotes, json_str)
def check_for_null(instruction, output):
    if ("NULL" in instruction) | ("NULL" in output) :
        null_instruction = '"instruction": "NULL?.",'
        null_output = '"output": "NULL."'
        return null_instruction, null_output
    else:
        return instruction, output    

def merge_outputs(out):
    merged_out = '"output": "'
    out = out.replace('"instruction":', '"output":')
    for n in range(len(out.split("output"))):
        text = out.split("output")[n]
        #print(n, text, '\n')
        text = text[text.find('": "'):].replace('",', '').replace('"', '').replace(':', '')
        #print('*******',text)
        if ("}" in text): 
            text = text[:text.find('}')]
        elif ("{" in text):
            text = text[:text.find('{')]
        
        merged_out+=text
    merged_out = merged_out.strip()

    merged_out = merged_out+'.' if not merged_out.endswith('.') else merged_out

    merged_out = merged_out+'"' if not merged_out.endswith('"') else merged_out
    
    return merged_out.strip()
def is_malformed_json_like(text):
    # Check for empty keys like "": "value"
    # if re.search(r'"\s*"\s*:', text):
    #     return True
    
    # Check for unclosed braces
    if text.count('{') != text.count('}'):
        return True

    # Check for trailing commas before a closing brace
    if re.search(r',\s*}', text):
        return True

    # Check for improperly escaped quotes inside values
    if re.search(r':\s*"[^"]*"[^\s,}]', text):
        return True

    return False

def clean_and_merge_malformed_json(raw_text):
    # Step 1: Fix missing keys like '""' and replace them with 'label' or 'title'
    fixed = re.sub(r'"":', '"label":', raw_text)

    # Step 2: Split the entries if needed
    chunks = re.split(r'}\s*,\s*{', fixed)
    
    clean_chunks = []
    for chunk in chunks:
        chunk = chunk.strip().strip(',')  # Remove leading/trailing commas
        if not chunk.startswith('{'):
            chunk = '{' + chunk
        if not chunk.endswith('}'):
            chunk += '}'

        # Optional: validate minimal JSON structure before appending
        try:
            json.loads(chunk)
            clean_chunks.append(chunk)
        except json.JSONDecodeError:
            pass  # skip bad/incomplete JSON parts

    # Step 3: Wrap all valid chunks in a JSON array
    json_array_str = "[" + ", ".join(clean_chunks) + "]"

    # Step 4: Try to parse it
    try:
        return json.loads(json_array_str)
    except json.JSONDecodeError as e:
        print("Still not valid JSON:", e)
        return None

def remove_internal_double_virgolette(i, label):
    content = i.split(f'"{label}": ')[-1]
    if '"' in content[1:-1]:
        content = content[1:-1].replace('"', '')
        new = f'"{label}": ' + '"'+content+'"'
        if label == "instruction":
            new = new + ','
        return new
    else:
        return i

In [18]:
def instruction_output(output):
    raw_output = output.split("<|assistant|>")[1]
    raw_output = re.sub(r'"question":', '"instruction":', raw_output, flags=re.IGNORECASE)
    raw_output = re.sub(r'"answer":', '"output":', raw_output, flags=re.IGNORECASE)

    
    instruction = raw_output[raw_output.find('"instruction":') : raw_output.find('",')]
    output = raw_output[raw_output.find('"output":') : raw_output.find('."\n}')]
    output+='."'

    if "output" in instruction:
        instruction = instruction[:instruction.find("output")].replace('."\n', '').replace('"\n  "', '"')

    if '"instruction":' in output:
        output = output[output.find('"instruction":') : output.find('",')]
        output = output.replace('"instruction":', '')

    #Remove final dot, to add later the ?.,
    instruction = instruction.strip()[:-1] if instruction.strip().endswith(".") else instruction.strip()
    instruction = instruction if instruction.endswith("?") else instruction+"?"
    instruction = instruction+'.",'
    #print(f"INS: {instruction}")

    #return pre-defined null template if there "NULL on eather the insptructions or outputs"
    instruction, output = check_for_null(instruction, output)

    if "instruction" not in instruction:
        instruction = '"instruction": "NULL?.",'
        output = '"output": "NULL."'
    elif "output" not in output:
        output = '"output": "NULL".'
        instruction = '"instruction": "NULL?.",'

    if len(output.split("output")) > 2:
        output = merge_outputs(output)

    if "JSON object" in output:
        output = '"output": "NULL."'

    output = output.replace('"\n."', '."').replace('",\n', '')
    output = output.replace('"\n', '"').replace('`', '').replace('{', '').replace('}', '').replace('"\n', '"').replace('".', '."').strip()

    if ('{' in output):
        output = output[:output.find('}')]
    elif ('}' in output):
        output = output[:output.find('{')]

    output = remove_internal_double_virgolette(output, 'output')
    instruction = remove_internal_double_virgolette(instruction, 'instruction')
    return instruction, output, raw_output

In [19]:
test=False
if test:
    instructions, outputs, raws = [], [], []

    for n in range(5):
        instruction, output, raw = instruction_output(raw_outputs[n])
        instructions.append(instruction)
        outputs.append(output)
        raws.append(raw)
        print(f"n: {n},\n{instruction}\n {output}\n **********************************")

In [20]:
len(raw_outputs)

800

In [21]:
chunks_to_repete=[]

all_outputs, str_json = [], []
for n in range(len(raw_outputs)):
    instruction, output, _  = instruction_output(raw_outputs[n])
    output_results=output.split('"output":')[-1]

    if (len(output_results) < 15) | ("NULL" in output) | ("NULL" in instruction) | ("https//www.youtube" in output):
            instruction = '"instruction": "NULL.",'
            output = '"output": "NULL."'
            chunks_to_repete.append(n)

    output = output.replace('"Question:', '')
    instruction = instruction.replace('?"?."', '?."').replace('"?."', '').replace('"?.",', ',').replace('- "?."', ',')
    if output.endswith('""'):
        output = output.replace('""', '"')

    json_str = "{" + instruction + output + "}"
    str_json.append(json_str)

    try:
        parsed = json.loads(json_str.replace("\n", "").replace('"}``."}', '"}').replace('.""', '."'))        
        all_outputs.append(parsed)

    except json.JSONDecodeError as e:
        print("********************************")
        print(f"[!] JSON decode error at chunk {n}: {e}")
        print(f"Error input : {instruction}")
        print(f"Error output: {output}")
        print('json_str: ' ,json_str)
        print("********************************")


********************************
[!] JSON decode error at chunk 131: Invalid \escape: line 1 column 453 (char 452)
Error input : "instruction": "Study the provided text about spondylolysis and spondylolisthesis. The text contains information about the symptoms, causes, and treatment options for spondylolysis and spondylolisthesis. A brief explanation of spondylolysis is provided, followed by a more detailed explanation of spondylolisthesis. The text concludes with the mention of the most common causes of spondylolisthesis and the recommended treatment options?.",
Error output: "output": "\instruction\:\Study the provided text about spondylolysis and spondylolisthesis. The text contains information about the symptoms, causes, and treatment options for spondylolysis and spondylolisthesis. A brief explanation of spondylolysis is provided, followed by a more detailed explanation of spondylolisthesis. The text concludes with the mention of the most common causes of spondylolisthesis and the

In [94]:
len(all_outputs)

798

In [108]:
len(chunks_to_repete)+343

793

In [113]:
type(chunks_to_repete[1])

<class 'int'>

In [114]:
with open("chunks_to repete_qa_outputs_800_batch16_4bit_30072025.txt", "w", encoding="utf-8") as f:
    f.write(", ".join(map(str, chunks_to_repete)))


In [109]:
chunks_to_repete

[2, 3, 5, 6, 8, 9, 10, 11, 12, 14, 16, 17, 21, 22, 25, 26, 28, 30, 32, 34, 35, 36, 38, 39, 40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 55, 57, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 77, 78, 81, 82, 86, 88, 91, 92, 93, 95, 98, 100, 101, 103, 106, 108, 114, 116, 117, 118, 119, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 133, 134, 136, 138, 139, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 156, 157, 158, 159, 160, 161, 162, 164, 165, 166, 170, 172, 174, 176, 178, 179, 180, 184, 187, 188, 189, 190, 191, 192, 194, 195, 196, 199, 201, 202, 204, 206, 208, 210, 213, 215, 218, 219, 220, 222, 223, 224, 226, 228, 229, 232, 234, 237, 240, 244, 248, 250, 251, 252, 253, 255, 256, 257, 258, 259, 262, 264, 265, 266, 270, 271, 273, 274, 275, 276, 277, 280, 282, 285, 288, 290, 291, 292, 294, 295, 296, 297, 302, 303, 304, 305, 307, 309, 310, 311, 314, 315, 319, 320, 326, 328, 329, 330, 332, 335, 336, 337, 338, 340, 341, 343, 344, 347, 348, 349, 350, 352, 353, 354, 355, 356, 358, 359,

In [24]:
with open("qa_outputs_800_batch16_4bit_30072025.json", "w", encoding="utf-8") as f:
    json.dump(all_outputs, f, indent=2, ensure_ascii=False)


# Clean JSON

In [None]:
import os
import pandas as pd
import numpy as np
from pprint import pprint

In [103]:
json_file_name = "qa_outputs_800_batch16_4bit_30072025.json"
if os.path.exists(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        existing_data = json.load(f)

In [30]:
df = pd.DataFrame(existing_data)

In [None]:
# Cleaning/filter function
def is_valid(text):
    if pd.isna(text):  
        return False
    if not isinstance(text, str):  # Check if not a string
        return False

    text = text.strip().lower()
    # Check for known invalid values or if 'null' is present anywhere
    invalid_keywords =["null", "none", "nan", "n/a"] 
    if any(bad in text for bad in invalid_keywords):
        return False

    #minimum content length
    if len(text) < 5:  
        return False
    return True

# Apply filter to both columns
filtered_df = df[df['instruction'].apply(is_valid) & df['output'].apply(is_valid)]
filtered_df.shape


(343, 2)

In [95]:
filtered_df.head()

Unnamed: 0,instruction,output
0,Lie flat on your back with your arms at your s...,Lie flat on your back with your arms at your s...
1,Consult your healthcare provider before starti...,"A healthy, active lifestyle can improve your o..."
4,Screen your knee pain using the following inst...,Knee pain is a common condition that can be c...
7,Become an expert in your field. Use both your ...,"As the world becomes increasingly complex, it ..."
13,How can incorporating sustainable exercises in...,Sustainable exercises help with injury pre...


In [93]:
for n in range(20):
    x = filtered_df.iloc[-n]
    print(n)
    pprint(x.instruction)
    pprint(x.output)
    # print(f"Instruction: ", x.instruction)
    # print(f"output: ", x.output)
    print('###############################')

0
('Lie flat on your back with your arms at your sides. Place your left hand on '
 'the floor in front of you and your right hand on the floor behind you. '
 'Slowly raise your right knee towards your chest and your left knee towards '
 'your chest. Keep your feet together and your hips level. Your left hand will '
 'be on the ground while your right hand will be on the floor behind you. '
 'Inhale as you lift your left knee towards your chest, exhale as you lower '
 'your right knee towards your chest. Repeat with the other leg?.')
('Lie flat on your back with your arms at your sides. Place your left hand on '
 'the floor in front of you and your right hand on the floor behind you. '
 'Slowly raise your right knee towards your chest and your left knee towards '
 'your chest. Keep your feet together and your hips level. Your left hand will '
 'be on the ground while your right hand will be on the floor behind you. '
 'Inhale as you lift your left knee towards your chest, exhale as you 

In [99]:
filtered_df = filtered_df.drop_duplicates()

In [106]:
filtered_df.shape[0]

343

In [None]:
cleaned_data = filtered_df.to_dict(orient="records")
n_records=filtered_df.shape[0]
with open(f"cleaned_{n_records}_{json_file_name}", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

In [None]:
import os
import json

file_path = "qa_outputs_100_30072025.json"

# Step 1: Load existing content if the file exists
if os.path.exists(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            existing_data = json.load(f)
        except json.JSONDecodeError:
            existing_data = []
else:
    existing_data = []

# Step 2: Combine existing data with new outputs
# assuming both are lists of JSON objects
existing_data.extend(all_outputs)

# Step 3: Write back to file
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(existing_data, f, indent=2, ensure_ascii=False)
