This notebook shows how to call each of the classes/methods from the project's scripts, and the specific arguments used, to produce the final input for LLM fine-tuning.

In [None]:
from pathlib import Path
import sys
sys.path.append(str(Path('../python').resolve()))
from create_clean_chunks import CreateChunks, HeaderPreservingChunker
from create_qa import GenerateQAContent

from pprint import pprint

from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


# Load and chunk .pdf file using Docling

In [None]:
# --- Inputs / outputs --------------------------------------------------------
input_file = "../data/books/rebuilding_milo.pdf"
output_chunks_file = "../data/rebuilding_milo_chunks_docling_max_tokens128_min_tokens50_meta_llama3p18B.txt"
# Alternative model id kept for reference: TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Set to True to write the chunks to `output_chunks_file`, one chunk per line.
save_docling_chunks = False

# --- Tokenizer ---------------------------------------------------------------
# The tokenizer is passed to the chunker; its EOS token is reused as pad token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# --- Chunking ----------------------------------------------------------------
cc = HeaderPreservingChunker(
    file_path=input_file,
    tokenizer=tokenizer,
    max_tokens=128,
)
x = cc.get_clean_chunks()

# Preview the first ten chunks.
print(x[:10])

# --- Optional export ---------------------------------------------------------
if save_docling_chunks:
    with open(output_chunks_file, "w", encoding="utf-8") as f:
        f.writelines(item + "\n" for item in x)

# Load and chunk .pdf file using Langchain

In [None]:
# --- Inputs ------------------------------------------------------------------
input_file = "../data/books/rebuilding_milo.pdf"
# NOTE(review): this is the same Docling-named path used in the Docling cell and
# it is not read anywhere in this cell -- confirm whether a LangChain-specific
# output file was intended.
output_chunks_file = "../data/rebuilding_milo_chunks_docling_max_tokens128_min_tokens50_meta_llama3p18B.txt"
# Alternative model id kept for reference: TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_id = "meta-llama/Llama-3.1-8B-Instruct"
save_docling_chunks = False  # NOTE(review): not read in this cell

# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token


def llama_token_len(text):
    """Return how many token ids `tokenizer.encode` produces for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)


# --- Chunking ----------------------------------------------------------------
# `llama_token_len` is supplied as the length function; presumably chunk_size
# and chunk_overlap are therefore measured in tokens -- TODO confirm in
# CreateChunks.
cc_langchain = CreateChunks(
    file_path=input_file,
    chunk_size=500,
    chunk_overlap=50,
    length_function=llama_token_len,
)
y = cc_langchain.get_clean_chunks()

# Preview the first ten chunks.
print(y[:10])


In [None]:
# Display the size of `y`, the value returned by cc_langchain.get_clean_chunks().
len(y)

# Generate raw Q&A entries from the chunk text

In [6]:
# --- Inputs ------------------------------------------------------------------
# Chunk file handed to the Q&A generator. Alternative input kept for reference:
#   "../notebooks/rebuilding_milo_chunks_docling_max_tokens512_min_tokens50.txt"
file_name = "../data/rebuilding_milo_chunks_docling_max_tokens128_min_tokens50_meta_llama3p18B.txt"

# Alternative model id kept for reference: TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_id = "meta-llama/Llama-3.1-8B-Instruct"

# --- Run options -------------------------------------------------------------
save_json = False
n_chunks_intervals = [30, 31]  # set to None to process all chunks
n_repetitions = 1

# --- Generation --------------------------------------------------------------
gc = GenerateQAContent(file_name, model_id)
text = gc.get_text()
raw_outputs = gc.generate_content(
    n_chunks_intervals=n_chunks_intervals,
    n_repetitions=n_repetitions,
    save_json=save_json,
    batch_size=16,
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.98s/it]
Device set to use cuda:0
  0%|          | 0/1 [00:00<?, ?it/s]

i: 0
Processing batch with samples (0, 16) 


100%|██████████| 1/1 [00:03<00:00,  3.09s/it]


In [7]:
# Pretty-print `raw_outputs`, the value returned by gc.generate_content(...).
pprint(raw_outputs)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n'
 '\n'
 'Cutting Knowledge Date: December 2023\n'
 'Today Date: 26 Jul 2024\n'
 '\n'
 'You are a concise and helpful medical tutor. Based on the provided text, '
 "generate a JSON object with exactly ONE question (as 'instruction') and ONE "
 "answer (as 'output').\n"
 '\n'
 '- The content must relate to health, exercise, sports, fitness, or '
 'physiotherapy.\n'
 '- Do not include multiple questions or answers.\n'
 '- Do not repeat the instruction in the output.\n'
 '- Keep the output brief and informative.\n'
 '- If the text is not relevant, return: {"instruction": "NULL", "output": '
 '"NULL"}\n'
 '\n'
 '- Respond ONLY with the JSON object. Do NOT include any explanation or '
 'commentary.<|eot_id|><|start_header_id|>user<|end_header_id|>\n'
 '\n'
 'In the following weeks after Josiah was stabilized, he underwent extensive '
 'surgery to repair the torn ligaments and tendons. He then was placed in '
 'straight leg brace