This notebook shows how to call each of the classes/methods from the project scripts, and the specific arguments used for each, in order to produce the final input for LLM fine-tuning.

In [1]:
import json
import sys
from pathlib import Path
from pprint import pprint

from transformers import AutoTokenizer

# Make the project's ../python scripts importable before loading the local modules.
sys.path.append(str(Path('../python').resolve()))
from create_clean_chunks import CreateChunks, HeaderPreservingChunker
from create_qa import GenerateQAContent
from create_json_qa import CreateJsonQA


  from .autonotebook import tqdm as notebook_tqdm


# Load and chunk .pdf file using Docling

In [None]:
# --- Settings for the Docling-based chunking run ---
input_file = "../data/books/rebuilding_milo.pdf"
output_chunks_file = "../data/rebuilding_milo_chunks_docling_max_tokens128_min_tokens50_meta_llama3p18B.txt"
# Alternative model id that was tried: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_id = "meta-llama/Llama-3.1-8B-Instruct"
# Set to True to write the chunks to `output_chunks_file`, one chunk per line.
save_docling_chunks = False

# Tokenizer handed to the chunker; its EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

cc = HeaderPreservingChunker(
    file_path=input_file,
    tokenizer=tokenizer,
    max_tokens=128,
)
x = cc.get_clean_chunks()
# Preview only the first ten chunks.
print(x[:10])

if save_docling_chunks:
    with open(output_chunks_file, "w", encoding="utf-8") as chunks_out:
        chunks_out.writelines(chunk + "\n" for chunk in x)

# Load and chunk .pdf file using Langchain

In [None]:
# --- Settings for the Langchain-based chunking run ---
# The Docling-specific `output_chunks_file` / `save_docling_chunks` settings
# are not used by this cell, so they are intentionally not set here.
input_file = "../data/books/rebuilding_milo.pdf"
# Alternative model id that was tried: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Tokenizer used to measure text length in tokens; its EOS token is reused as
# the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token


def llama_token_len(text: str) -> int:
    """Return how many token ids `tokenizer.encode` produces for `text`.

    Passed to `CreateChunks` as `length_function`, so `chunk_size` and
    `chunk_overlap` below are presumably interpreted in tokens rather than
    characters (confirm against `CreateChunks`).

    NOTE(review): `encode` is called with its default arguments, so any special
    tokens the tokenizer adds by default are included in the count - confirm
    this is the intended measure for chunk sizing.
    """
    return len(tokenizer.encode(text))


cc_langchain = CreateChunks(
    file_path=input_file,
    chunk_size=500,
    chunk_overlap=50,
    length_function=llama_token_len,
)
y = cc_langchain.get_clean_chunks()
# Preview only the first ten chunks.
print(y[:10])


In [None]:
# Number of items returned by `cc_langchain.get_clean_chunks()`.
len(y)

# Generate Raw Q&A entries from chunks text

In [15]:
# Chunk text file to read; same path as the Docling chunking step's output file.
# Alternative input that was tried:
#   "../notebooks/rebuilding_milo_chunks_docling_max_tokens512_min_tokens50.txt"
file_name = "../data/rebuilding_milo_chunks_docling_max_tokens128_min_tokens50_meta_llama3p18B.txt"

# Alternative model id that was tried: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_id = "meta-llama/Llama-3.1-8B-Instruct"
save_json = False
# Set to None to process all chunks.
n_chunks_intervals = [30, 40]
n_repetitions = 1

gc = GenerateQAContent(file_name, model_id)
text = gc.get_text()
raw_outputs = gc.generate_content(
    n_chunks_intervals=n_chunks_intervals,
    n_repetitions=n_repetitions,
    save_json=save_json,
    batch_size=16,
)


Loading checkpoint shards:  75%|███████▌  | 3/4 [00:14<00:04,  4.82s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 0 has a total capacity of 11.64 GiB of which 341.81 MiB is free. Including non-PyTorch memory, this process has 11.30 GiB memory in use. Of the allocated memory 11.03 GiB is allocated by PyTorch, and 143.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [8]:
# Pretty-print the raw generation results; as the output below shows, each
# entry includes the prompt text (system instructions followed by the user chunk).
pprint(raw_outputs)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n'
 '\n'
 'Cutting Knowledge Date: December 2023\n'
 'Today Date: 26 Jul 2024\n'
 '\n'
 'You are a concise and helpful medical tutor. Based on the provided text, '
 "generate a JSON object with exactly ONE question (as 'instruction') and ONE "
 "answer (as 'output').\n"
 '\n'
 '- The content must relate to health, exercise, sports, fitness, or '
 'physiotherapy.\n'
 '- Do not include multiple questions or answers.\n'
 '- Do not repeat the instruction in the output.\n'
 '- Keep the output brief and informative.\n'
 '- If the text is not relevant, return: {"instruction": "NULL", "output": '
 '"NULL"}\n'
 '\n'
 '- Respond ONLY with the JSON object. Do NOT include any explanation or '
 'commentary.<|eot_id|><|start_header_id|>user<|end_header_id|>\n'
 '\n'
 'In the following weeks after Josiah was stabilized, he underwent extensive '
 'surgery to repair the torn ligaments and tendons. He then was placed in '
 'straight leg brace

# Create the Q&A input in JSON format

In [2]:
# --- Inputs ---
raw_qa_json_file_name = "../data/raw_outputs_samples1711_nreps1_metallama_maxtoken128max_new_tokens512.json"
chunks_text_file_name = "../data/rebuilding_milo_chunks_docling_max_tokens128_min_tokens50_meta_llama3p18B.txt"
n_max_chunks = None

# --- Options for cleaning the raw JSON file ---
verbose = True
# True: read the previously generated raw Q&A JSON file.
# False: pass the chunk text file to CreateJsonQA instead.
read_generated_qa = True
# Keyword marking where the generated assistant output starts.
keyword_splitter = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

# Exactly one of the two file arguments is set; the other is passed as None.
create_json = CreateJsonQA(
    raw_qa_json_file_name=raw_qa_json_file_name if read_generated_qa else None,
    input_text_file_name=None if read_generated_qa else chunks_text_file_name,
    keyword_splitter=keyword_splitter,
    n_max_chunks=n_max_chunks,
)

raw_outputs = create_json.get_raw_output()

# NOTE(review): None presumably means no output file is written - confirm
# against CreateJsonQA.create_instruction_output_json.
json_output_name = None

# Only the first returned value is used. The other two are bound to named
# throwaways rather than `_`, because assigning `_` in IPython/Jupyter stops
# the kernel from updating its last-output variables (`_`, `__`, `___`).
json_test, _unused_second, _unused_third = create_json.create_instruction_output_json(
    raw_outputs,
    json_output_name=json_output_name,
    verbose=verbose,
)

# Preview only the first three parsed entries.
print(json_test[:3])

[{'instruction': 'What is a common injury that can occur in weightlifters?', 'output': 'Common injuries that can occur in weightlifters include strains and strains to the muscles and tendons, particularly in the shoulders, lower back, and knees.'}, {'instruction': 'NULL.', 'output': 'NULL.'}, {'instruction': 'What is the purpose of the statements in the provided text?', 'output': 'The statements in the book are for educational purposes only.'}]


In [3]:
# Display the parsed instruction/output records.
# NOTE(review): this renders the whole list; consider a slice such as
# json_test[:10] to keep the saved notebook small.
json_test

[{'instruction': 'What is a common injury that can occur in weightlifters?',
  'output': 'Common injuries that can occur in weightlifters include strains and strains to the muscles and tendons, particularly in the shoulders, lower back, and knees.'},
 {'instruction': 'NULL.', 'output': 'NULL.'},
 {'instruction': 'What is the purpose of the statements in the provided text?',
  'output': 'The statements in the book are for educational purposes only.'},
 {'instruction': 'What is the purpose of a disclaimer in a medical book?',
  'output': "To protect the author and publisher from liability for any adverse effects resulting from the book's use or application."},
 {'instruction': 'What is a common cause of low back pain?',
  'output': 'Poor posture or muscle imbalances can be a common cause of low back pain.'},
 {'instruction': 'What are the primary muscles that stabilize the hip joint?',
  'output': 'The primary muscles that stabilize the hip joint include the gluteus maximus, gluteus medi

In [5]:
# Destination file name handed to clean_json_file as `new_json_file_name`.
new_json_file_name = "../data/clean_json_outputs_samples1711_nreps1_metallama_maxtoken128max_new_tokens512.json"
merge_with_existing_data = False
# Clean the in-memory records from the previous step; no input JSON file is given.
all_outputs = json_test
input_json_file_name = None

cleaned_json = create_json.clean_json_file(
    all_outputs=all_outputs,
    json_file_name=input_json_file_name,
    merge_with_existing_data=merge_with_existing_data,
    new_json_file_name=new_json_file_name,
)
# Report the record counts before and after cleaning.
print(f"Before cleaning: {len(all_outputs)} after {len(cleaned_json)}")

*********                                          instruction  \
0  What is a common injury that can occur in weig...   
1                                              NULL.   
2  What is the purpose of the statements in the p...   
3  What is the purpose of a disclaimer in a medic...   
4           What is a common cause of low back pain?   

                                              output  
0  Common injuries that can occur in weightlifter...  
1                                              NULL.  
2  The statements in the book are for educational...  
3  To protect the author and publisher from liabi...  
4  Poor posture or muscle imbalances can be a com...  
Before cleaning: 1711 after 1684


In [7]:
# Read the JSON file at `new_json_file_name` back from disk so its contents can
# be inspected. `json` is imported in the notebook's top import cell.
with open(new_json_file_name, "r", encoding="utf-8") as clean_json_in:
    existing_data = json.load(clean_json_in)

In [9]:
# Number of records loaded from the JSON file at `new_json_file_name`.
len(existing_data)

1684