### Structured jsonl Data Generation for LLM

In [15]:
import fitz  # PyMuPDF
import json
from tqdm import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import ChatOllama
import re

In [13]:
import fitz
from pathlib import Path

def extract_text_from_pdfs(pdf_dir: str) -> str:
    all_text = []

    for pdf_path in Path(pdf_dir).glob("*.pdf"):
        doc = fitz.open(pdf_path)

        # Optional: add document separator
        all_text.append(f"\n\n===== START OF {pdf_path.name} =====\n\n")

        for page in doc:
            page_text = page.get_text()
            if page_text:
                all_text.append(page_text)

        all_text.append(f"\n\n===== END OF {pdf_path.name} =====\n\n")

    return "\n".join(all_text)


In [14]:
pdf_path = "C:\\Users\\Himanshu\\Desktop\\GenAi-Project\\Data\\raw"
raw_text = extract_text_from_pdfs(pdf_path)
print(raw_text)
print("Characters extracted:", len(raw_text))



===== START OF iLibrary-Backend Internal Documentation.pdf =====


iLibrary-Backend Internal Documentation
1. Executive Overview
The iLibrary Backend is a Spring Boot monolithic application designed as a comprehensive RESTful API for
managing a private study-library system. It addresses typical library needs: user account management, role-
based  access,  subscription  plans,  real-time  seat  booking,  and  secure  entry  via  QR  codes
.  For
example, a student can sign up, purchase a weekly or monthly pass, reserve a study seat for a set duration,
receive a QR code by email, and then scan it for library access. All core logic – authentication, payment
processing,  and  seat  management  –  is  implemented  within  a  single  deployable  service  (no  separate
microservices) for simplicity and cohesion.
LLM Fine-Tuning & RAG Context: The repository’s clear structure and rich documentation (interactive API
docs, diagrams, and thorough README) make it well-suited for integration into

In [16]:
def clean_text(text: str) -> str:
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\x00', '')
    return text.strip()


In [18]:
full_corpus = clean_text(raw_text)
print(full_corpus)

===== START OF iLibrary-Backend Internal Documentation.pdf ===== iLibrary-Backend Internal Documentation 1. Executive Overview The iLibrary Backend is a Spring Boot monolithic application designed as a comprehensive RESTful API for managing a private study-library system. It addresses typical library needs: user account management, role- based access, subscription plans, real-time seat booking, and secure entry via QR codes . For example, a student can sign up, purchase a weekly or monthly pass, reserve a study seat for a set duration, receive a QR code by email, and then scan it for library access. All core logic – authentication, payment processing, and seat management – is implemented within a single deployable service (no separate microservices) for simplicity and cohesion. LLM Fine-Tuning & RAG Context: The repository’s clear structure and rich documentation (interactive API docs, diagrams, and thorough README) make it well-suited for integration into an LLM-based knowledge system

In [19]:
def chunk_text(text: str, chunk_size=700, overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap
    )
    return splitter.split_text(text)

In [20]:
chunks = chunk_text(raw_text)
print("Total chunks:", len(chunks))
print("Sample chunk:", chunks[0])

Total chunks: 288
Sample chunk: ===== START OF iLibrary-Backend Internal Documentation.pdf =====


In [21]:
PROMPT_TEMPLATE = """
You are generating supervised fine-tuning data for a Llama-3 Instruct model.

Given the context below:
1. Generate ONE clear user question.
2. Generate a concise, correct answer using ONLY the context.

Context:
{context}

Return in JSON:
{{
  "question": "...",
  "answer": "..."
}}
"""

In [22]:
def extract_json(text):
    """Extract first JSON object from text"""
    match = re.search(r"\{[\s\S]*\}", text)
    if match:
        return json.loads(match.group())
    raise ValueError("No JSON found")

In [23]:
def generate_qa_from_chunk(chunk: str, llm):
    prompt = PROMPT_TEMPLATE.format(context=chunk)
    response = llm.invoke(prompt)

    # IMPORTANT FIX
    content = response.content  # <-- THIS is the text

    return extract_json(content)

In [24]:
def to_llama_chat_format(question: str, answer: str):
    return {
        "messages": [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ]
    }


In [25]:
training_data = []

llm = ChatOllama(
    model="Llama3.2",
    temperature=0,
    # other params...
)
for chunk in tqdm(chunks):
    try:
        qa = generate_qa_from_chunk(chunk, llm)  # llm must be defined
        sample = to_llama_chat_format(
            qa["question"],
            qa["answer"]
        )
        training_data.append(sample)
    except Exception as e:
        continue
training_data


100%|██████████| 288/288 [1:32:25<00:00, 19.25s/it]


[{'messages': [{'role': 'user',
    'content': "What does 'Return' mean in the context of a library's cataloging system?"},
   {'role': 'assistant',
    'content': 'The act of sending an item back to its owner or the library, often due to being lost, damaged, or no longer needed.'}]},
 {'messages': [{'role': 'user',
    'content': 'How does a student reserve a study seat for a set duration?'},
   {'role': 'assistant',
    'content': 'A student can reserve a study seat for a set duration by purchasing a weekly or monthly pass, reserving a seat for that duration, and then receiving a QR code by email which they can scan for library access.'}]},
 {'messages': [{'role': 'user',
    'content': 'How can I ensure that my LLM-based knowledge system has accurate and up-to-date information?'},
   {'role': 'assistant',
    'content': "Index the repository's documentation and data as factual knowledge to reduce hallucinations and provide current information."}]},
 {'messages': [{'role': 'user',
  

In [26]:
output_file = "training_data.jsonl"

with open(output_file, "w", encoding="utf-8") as f:
    for item in training_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("Saved:", output_file)

Saved: training_data.jsonl
