In [11]:
%%capture
!pip install PyPDF2 requests
!pip install ollama

In [12]:
# Show the models known to the local Ollama installation; the generation
# step below requests "llama3.1:8b", so it should appear in this listing.
!ollama list

NAME           ID              SIZE      MODIFIED   
llama3.1:8b    46e0c10c039e    4.9 GB    9 days ago    


In [13]:
import PyPDF2
import json
import re
import os
import ollama

In [14]:

def extract_text_from_pdf(file_path):
    """Read a PDF and return its text as a list with one string per page.

    Args:
        file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        list[str]: The extracted text of every page, in page order. A page
        whose extraction yields ``None`` or an empty string is stored as
        ``""`` so that list positions stay aligned with page positions.
    """
    text_per_page = []
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Call extract_text() a single time per page and reuse the
            # result instead of extracting the same page twice.
            page_text = page.extract_text()
            text_per_page.append(page_text if page_text else "")
    return text_per_page


In [15]:
def _parse_json_response(response_text):
    """Decode the model reply as JSON, tolerating code fences and stray prose.

    The reply is tried as-is first, then the contents of a Markdown code
    fence (if any), then the span from the first ``[`` to the last ``]``.

    Raises:
        json.JSONDecodeError: If none of the candidates is valid JSON (the
        error from the first attempt is re-raised).
    """
    candidates = [response_text]

    fenced = re.search(r"```(?:json)?\s*(.*?)```", response_text, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1).strip())

    start = response_text.find("[")
    end = response_text.rfind("]")
    if start != -1 and end > start:
        candidates.append(response_text[start:end + 1])

    first_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            if first_error is None:
                first_error = e
    raise first_error


def generate_qa_pairs_with_ollama(text):
    """Ask the local ``llama3.1:8b`` model for Q&A pairs about ``text``.

    Args:
        text: Document text that is embedded into the prompt.

    Returns:
        list: The decoded JSON list produced by the model (expected to hold
        dicts with ``"question"`` and ``"answer"`` keys). An empty list is
        returned when the reply is empty, is not valid JSON, or decodes to
        something other than a list; the reason is printed in each case.
    """
    prompt = f"""
        You are an expert doctor and a highly intelligent medical chatbot built to provide accurate, detailed, and comprehensive responses based on medical documents. I will provide you with a PDF, such as a medical book or a document, and your task is to carefully read and fully understand every single detail in it. Treat the PDF as your only source of knowledge for this task—do not add anything from outside it or make assumptions beyond what’s written.

        Once you’ve understood the PDF, act like a skilled doctor who knows everything about the topic in the document. Your job is to:

        1 . Analyze the entire content deeply, including all medical terminology, diseases, causes, risk factors, symptoms, signs, reasons, diagnosis methods, treatments, prevention strategies, and anything else mentioned (like prescriptions or doctor’s advice if included).
        2. Extract every piece of information without skipping anything—no matter how small or detailed it is.
        3. Based on this understanding, create a set of meaningful questions that cover all aspects of the content in the PDF. These questions should explore the diseases (how they happen, where they come from, what causes them), risk factors, symptoms, signs, diagnosis processes, treatment options, prevention methods, and any other relevant topics like medical terms or prescribed actions.
        4. Provide **extremely detailed, accurate, and thorough answers** to each question using only the PDF’s content. Write answers in simple, easy-to-understand language (like explaining to a patient or student) while remaining precise like a doctor. Make answers as long and comprehensive as needed to include every relevant detail from the text—no need to limit length, but keep them focused.
        5. Create **as many meaningful questions as you can**—aim for the maximum possible number—by examining every paragraph or section-like part of the PDF. These questions should cover all aspects, such as how diseases occur, their origins, causes, risk factors, symptoms, signs, diagnosis processes, treatments, prevention methods, medical terms, and any specific advice or prescriptions.
        6  Each **answer must be detailed (200-250 words max)**
        Make sure your responses are complete and thorough. If the PDF mentions a disease, explain its causes, how it develops, what the risks are, how it’s identified, how it’s treated, and how to prevent it. If there’s a prescription or specific medical advice, include that too, exactly as written. Take your time—there’s no rush. Even if the PDF is long, process every page and every line to ensure nothing is missed. Your goal is to demonstrate a deep understanding of the document and provide expert-level insights based solely on it."

        ### Guidelines:
        - Generate ** Q&A pairs**, pulling questions from every paragraph, section, table, or annex.
        - Answers must be **extremely detailed and exhaustive**, capturing all knowledge from the relevant part of the document—long answers are encouraged to include every detail.
        - **Use only the provided text**—do not include external info or stray from the document’s content.
        - Ignore metadata (e.g., publication details, authors) unless it’s medically relevant; focus only on the medical content.
        - Process every line, section, table, and annex to ensure nothing is missed.
        - Output must be structured as **valid JSON**.

            ### Text:
        {text}

        ### Expected Output (JSON format):
        ```json
        [
        {{
            "question": "What is the main cause of this condition based on the document?",
            "answer": "The PDF explains that this condition is mainly caused by an infection that starts in the lungs and spreads due to poor hygiene."
        }},
        {{
            "question": "How can this disease be prevented according to the text?",
            "answer": "According to the PDF, preventing this disease involves regular handwashing, avoiding crowded places, and getting vaccinated as recommended."
        }}
        ]
        ```
        """
    print("Generating response from Ollama...")
    response = ollama.generate(
        model="llama3.1:8b",
        prompt=prompt,
        options={
            "temperature": 0.5,
            "num_predict": 10000,
            "top_p": 0.9,
            "top_k": 40,
        },
    )

    print("Raw response from Ollama:", response)
    # `or ""` guards against a response field that is present but None.
    response_text = (response.get("response", "") or "").strip()

    # The early return belongs inside this guard; leaving it at function level
    # would discard every reply before it is parsed.
    if not response_text:
        print("Error: Ollama returned an empty response.")
        return []

    try:
        # The prompt shows the expected output inside a ```json fence, so the
        # reply may be fenced or wrapped in prose rather than bare JSON.
        qa_pairs = _parse_json_response(response_text)
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        print("Full response:", response_text)
        return []

    if not isinstance(qa_pairs, list):
        print("Error: Response is not a list of Q&A pairs.")
        return []

    print(f"Generated {len(qa_pairs)} Q&A pairs.")
    return qa_pairs


In [16]:
def format_qa_pairs(qa_pairs):
    """Convert question/answer dicts into instruction/output records.

    Args:
        qa_pairs: Iterable of items, each expected to be a dict with
            ``"question"`` and ``"answer"`` keys.

    Returns:
        list[dict]: One ``{"instruction": ..., "output": ...}`` dict per
        well-formed input item, in input order. Items that are not dicts or
        lack either key are skipped with a printed warning, so one malformed
        entry does not abort the whole batch.
    """
    formatted_pairs = []
    for index, qa in enumerate(qa_pairs):
        if not isinstance(qa, dict) or "question" not in qa or "answer" not in qa:
            print(f"Skipping malformed Q&A entry at index {index}: {qa!r}")
            continue
        formatted_pairs.append({
            "instruction": qa["question"],
            "output": qa["answer"]
        })
    return formatted_pairs


In [17]:
# Directory that receives the generated Q&A JSON files; create it now
# (no error if it already exists).
output_dir = "heart"
os.makedirs(output_dir, exist_ok=True)

# PDF files to process; the path has no directory part, so it is resolved
# against the current working directory.
pdf_paths = [
    "21. Heart attack. Know the symptoms. Take action (Article) Autor National Heart, Lung, and Blood Institute.pdf",
]


In [18]:
# pdf_paths = [
#     "16. Trends in Cardiovascular Deaths Autor Australian Institute of Health and Welfare.pdf"

#      ]
# Number of consecutive PDF pages joined into a single prompt.
PAGES_PER_CHUNK = 5

for pdf in pdf_paths:
    all_qa = []
    text_chunks = extract_text_from_pdf(pdf)

    # Chunking, generation and saving all happen inside the per-PDF loop so
    # that every file in pdf_paths is processed, not only the last one.
    for i in range(0, len(text_chunks), PAGES_PER_CHUNK):
        chunk_text = "\n".join(text_chunks[i:i + PAGES_PER_CHUNK])
        # Clamp the reported range so the log never names pages past the end.
        last_page = min(i + PAGES_PER_CHUNK, len(text_chunks))
        print(f"Processing pages {i+1} to {last_page}...")

        qa_pairs = generate_qa_pairs_with_ollama(chunk_text)
        if not qa_pairs:
            continue

        formatted_pairs = format_qa_pairs(qa_pairs)
        all_qa.extend(formatted_pairs)

    # Output file is named after the PDF, with spaces replaced by underscores.
    base_name = os.path.splitext(os.path.basename(pdf))[0]
    safe_name = base_name.replace(" ", "_")
    output_file_path = os.path.join(output_dir, f"{safe_name}.json")

    # Persist this PDF's results before all_qa is reset for the next PDF.
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(all_qa, f, indent=2, ensure_ascii=False)
    print(f"Saved Q&A to: {output_file_path}")



Processing pages 1 to 5...
Generating response from Ollama...


In [16]:
# Write the collected Q&A records as pretty-printed UTF-8 JSON; non-ASCII
# characters are kept as-is rather than escaped.
with open(output_file_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(all_qa, indent=2, ensure_ascii=False))
print(f"Saved Q&A to: {output_file_path}")


Saved Q&A to: heart\16._Trends_in_Cardiovascular_Deaths_Autor_Australian_Institute_of_Health_and_Welfare.json
