## Notebook Objective:
- Feed each page into a TextNode
- Use semantic splitter on it.


In [1]:
import pandas as pd
import os
import json
from llama_index.legacy import Document
from llama_index.legacy.schema import TextNode
from llama_index.legacy.node_parser import SentenceWindowNodeParser, SemanticSplitterNodeParser
from llama_index.legacy.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.legacy.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
) 

from dotenv import load_dotenv
# Pull variables from the project's .env file into the process environment so
# the lookups below can see them.
load_dotenv()
# These three use os.getenv, so each is None when its variable is not set.
HF_KEY = os.getenv('HUGGINGFACE_API_KEY')
dense_embedder_api = os.getenv("HF_API_URL")
OPENAI_KEY = os.getenv("OPENAI_API_KEY")
# These two index os.environ directly, so this cell raises KeyError when either
# variable is not set.
GEMINI_KEY   = os.environ['GEMINI_KEY']
GROQ_API_KEY = os.environ["GROQ_API_KEY"]

import logging
# INFO-level logging with a timestamp and level prefix on every record; the
# logging.info call later in the notebook relies on this configuration.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
# Read the extracted page data from JSON. The converter defined in the next
# cell indexes `pdf_data` by position and reads a 'Text' field from the first
# value of each entry.
with open("../data/all_pages_temp.json", "r", encoding="utf-8") as fin:
    pdf_data = json.load(fin)

In [3]:
def convert_text_to_nodes(data):
    """Wrap the text of every page in ``data`` in its own ``TextNode``.

    Args:
        data: Sequence of page records, indexable by position. Each record is
            a mapping whose first value is a mapping with a ``'Text'`` entry.

    Returns:
        A list with one ``TextNode`` per page, in input order. Each node's
        text is the page text followed by two newline characters.
    """
    # Collect the raw page strings first, each terminated by a blank line.
    page_texts = [
        list(data[idx].values())[0]['Text'] + "\n\n"
        for idx in range(len(data))
    ]

    # One node per page; the pages are not merged into a single document.
    return [TextNode(text=page_text) for page_text in page_texts]

page_nodes = convert_text_to_nodes(pdf_data)

# Sanity-check the conversion by previewing a single page (index 4, shown as
# "page 5"). The page is indexed directly rather than found by looping over
# every node; the bounds check means nothing is printed when the document has
# fewer than five pages.
preview_index = 4
if preview_index < len(page_nodes):
    print(f"page {preview_index + 1}")
    print(page_nodes[preview_index].text)
    print("-" * 100)

page 5
The heart is a muscular pump located in the center of the chest and slightly towards the left. 
The heart has two halves. The right side receives low oxygen blood from all parts of the body through veins and pumps it to the lungs via the pulmonary arteries to pick up oxygen.
The left side receives oxygen-rich blood from the lungs through the pulmonary veins and delivers it to all parts of the body, including the vital organs such as the heart, lungs, kidneys and brain.


----------------------------------------------------------------------------------------------------


## Generating QA Pairs dataset using Groq


In [50]:
from groq import Groq
from langchain_groq import ChatGroq
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
import re

def extract_answer(llm_response: str) -> dict:
    """Extract the first JSON object embedded in an LLM reply.

    The reply may wrap the JSON in prose or code fences. Every ``{`` in the
    text is tried in order as the start of a JSON object, and the first one
    that decodes cleanly is returned. Text after the object (including text
    that itself contains braces) is ignored, so a valid object followed by a
    remark such as ``"(format: {...})"`` is still parsed.

    Args:
        llm_response: Raw text returned by the model.

    Returns:
        The decoded JSON object as a dict. When the text has no ``{`` followed
        by a ``}``, returns ``{"error": "No valid JSON content found"}``. When
        braces are present but no position decodes to a JSON object, returns
        ``{"error": "Invalid JSON content"}``.
    """
    first_brace = llm_response.find("{")
    # Same precondition as a "{...}" pattern: an opening brace with a closing
    # brace somewhere after it.
    if first_brace == -1 or llm_response.find("}", first_brace) == -1:
        return {"error": "No valid JSON content found"}

    decoder = json.JSONDecoder()
    position = first_brace
    while position != -1:
        try:
            # raw_decode stops at the end of the object that starts at
            # `position`, so trailing text cannot invalidate the parse.
            parsed, _ = decoder.raw_decode(llm_response, position)
            return parsed
        except json.JSONDecodeError:
            # Not a JSON object here (e.g. a brace in the prose); try the next one.
            position = llm_response.find("{", position + 1)

    return {"error": "Invalid JSON content"}
    
def generate_questions(page_text):
    """Ask the Groq-hosted LLM for five test questions about ``page_text``.

    Args:
        page_text: Text of one manual page; inserted verbatim into the prompt.

    Returns:
        The dict that ``extract_answer`` parses from the model's reply. The
        prompt requests the shape ``{"questions": [...]}``; if no JSON object
        can be parsed, the ``{"error": ...}`` dict from ``extract_answer`` is
        returned instead, so callers should check for the ``"questions"`` key.
    """
    from typing import List

    class Header(BaseModel):
        # The field is a list of question strings. The schema is rendered into
        # the prompt's format instructions, so its type must agree with the
        # example output shown to the model.
        questions: List[str] = Field(description="Question about the content in the text corpus")

    # The parser is only used to produce the format instructions for the prompt.
    parser = JsonOutputParser(pydantic_object=Header)

    # The example output is valid JSON (quoted strings, no trailing comma, no
    # numbering) so that a reply copying it can be parsed by extract_answer.
    template = \
    '''You are a medical professional tasked with generating questions for a test about the content of the text corpus of medical information from a manual made for hospital nurses in Singapore.
    You are to only refer to the text corpus below for the generation of questions. 
    For the text corpus below, generate 5 questions to be answered in a test by the nurses.

    Text corpus:
    {text}
    {format_instructions}

    Example output:
    {{"questions": ["This is the first question", "This is the second question", "This is the third question", "This is the fourth question", "This is the fifth question"]}}
    Ensure and double check that the answer is in accordance to the format above.
    '''

    prompt = PromptTemplate(
        template=template,
        input_variables=["text"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    final_prompt = prompt.format(text=page_text)

    # NOTE(review): Groq() is built without arguments, so it presumably picks
    # up GROQ_API_KEY from the environment - confirm.
    client = Groq()
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {
                "role": "user",
                "content": final_prompt
            }
        ],
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )

    # The reply is streamed; stitch the chunks together (a chunk's content may
    # be None, which is treated as an empty string).
    answer = "".join(chunk.choices[0].delta.content or "" for chunk in completion)
    return extract_answer(answer)
    
def generate_answers(page_text, questions):
    """Ask the Groq-hosted LLM to answer ``questions`` using only ``page_text``.

    Args:
        page_text: Text of one manual page; inserted verbatim into the prompt.
        questions: The questions to answer, inserted verbatim into the prompt
            (the notebook passes a JSON-encoded list of strings).

    Returns:
        The dict that ``extract_answer`` parses from the model's reply. The
        prompt requests the shape ``{"answers": [...]}``; if no JSON object
        can be parsed, the ``{"error": ...}`` dict from ``extract_answer`` is
        returned instead, so callers should check for the ``"answers"`` key.
    """
    from typing import List

    class Header(BaseModel):
        # The field is a list of answer strings. The schema is rendered into
        # the prompt's format instructions, so its type must agree with the
        # example output shown to the model.
        answers: List[str] = Field(description="Answer to the question with reference to the content in the text corpus")

    # The parser is only used to produce the format instructions for the prompt.
    parser = JsonOutputParser(pydantic_object=Header)

    # The example output is valid JSON (quoted strings, no trailing comma) so
    # that a reply copying it can be parsed by extract_answer.
    template = \
    '''You are a medical professional tasked with answering questions for a test about the content of the text corpus of medical information from a manual made for hospital nurses in Singapore.
    You are to only refer to the text corpus below for the answering of questions. 
    For the text corpus below, generate 5 professional answers for each of the questions.

    Text Corpus:
    {text}
    
    List of questions:
    {question_list}
    {format_instructions}

    Example output:
    {{"answers": ["This is the answer to the first question", "This is the answer to the second question", "This is the answer to the third question", "This is the answer to the fourth question", "This is the answer to the fifth question"]}}
    Ensure and double check that the answer is in accordance to the format above.
    '''

    prompt = PromptTemplate(
        template=template,
        input_variables=["text", "question_list"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    final_prompt = prompt.format(text=page_text, question_list=questions)

    # NOTE(review): Groq() is built without arguments, so it presumably picks
    # up GROQ_API_KEY from the environment - confirm.
    client = Groq()
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {
                "role": "user",
                "content": final_prompt
            }
        ],
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )

    # The reply is streamed; stitch the chunks together (a chunk's content may
    # be None, which is treated as an empty string).
    answer = "".join(chunk.choices[0].delta.content or "" for chunk in completion)
    return extract_answer(answer)

def generate_QA_pairs(question_dict, answer_dict):
    """Pair each generated question with the answer at the same position.

    Args:
        question_dict: Mapping with a ``'questions'`` sequence.
        answer_dict: Mapping with an ``"answers"`` sequence.

    Returns:
        A list of ``{"Question": ..., "Answer": ...}`` dicts, one per question,
        in question order. Answers beyond the number of questions are ignored.

    Raises:
        KeyError: If ``"answers"`` or ``'questions'`` is missing.
        IndexError: If there are fewer answers than questions.
    """
    # "answers" is looked up before 'questions', so a missing "answers" key is
    # the error reported when both are absent.
    replies = answer_dict["answers"]
    prompts = question_dict['questions']
    return [
        {"Question": prompts[position], "Answer": replies[position]}
        for position in range(len(prompts))
    ]

# End-to-end trial of the QA pipeline on one page (index 4, displayed as
# "page 5" in the preview cell).
sample_text = page_nodes[4].text
# Step 1: have the LLM write questions about the page.
question_dict = generate_questions(sample_text)
# The questions are inserted into the answer prompt as text, so the list is
# serialized to a JSON string first.
# NOTE(review): this lookup raises KeyError when generate_questions returned an
# {"error": ...} dict instead of {"questions": [...]}.
question_str = json.dumps(question_dict['questions'])
# Step 2: have the LLM answer those questions from the same page text.
answer_dict = generate_answers(sample_text, question_str)
# Step 3: pair questions and answers by position.
qa_pairs = generate_QA_pairs(question_dict, answer_dict)
# Bare expression so the notebook displays the resulting list.
qa_pairs

2024-07-28 01:03:35,039 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-28 01:03:35,919 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


[{'Question': 'What is the primary function of the heart in the human body?',
  'Answer': 'The primary function of the heart in the human body is to act as a muscular pump that circulates blood throughout the body.'},
 {'Question': 'Which side of the heart receives low oxygen blood from the body?',
  'Answer': 'The right side of the heart receives low oxygen blood from the body.'},
 {'Question': 'What is the purpose of the pulmonary arteries in the heart?',
  'Answer': 'The pulmonary arteries are responsible for transporting low oxygen blood from the body to the lungs for oxygenation.'},
 {'Question': 'What type of blood does the left side of the heart deliver to the body?',
  'Answer': 'The left side of the heart delivers oxygen-rich blood to the body.'},
 {'Question': 'Which vital organs receive oxygen-rich blood from the left side of the heart?',
  'Answer': 'The vital organs that receive oxygen-rich blood from the left side of the heart include the heart, lungs, kidneys, and brain.

## Fine-tuning the embedding model

In [69]:
# Build the fine-tuning dataset from a single page (index 4) for now. The slice
# yields an empty list, rather than raising, when the document is shorter.
test_nodes = page_nodes[4:5]
# NOTE(review): `llm` is not defined in any cell visible in this notebook; it
# must already exist in the kernel for this call to work - confirm where it is
# created.
train_dataset = generate_qa_embedding_pairs(test_nodes, llm=llm)
# Save and reload through one shared path so that the dataset read back is the
# file that was just written.
train_dataset_path = "../data/train_dataset.json"
train_dataset.save_json(train_dataset_path)
train_dataset = EmbeddingQAFinetuneDataset.from_json(train_dataset_path)


  0%|          | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-07-27 16:06:21,649 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
100%|██████████| 1/1 [00:04<00:00,  4.18s/it]


## Semantic splitter
- For now, the observations don't seem useful.

In [44]:
# Embedding model handed to the semantic splitter below.
embed_model = HuggingFaceEmbedding(
    model_name='sentence-transformers/all-mpnet-base-v2')
# NOTE(review): a failed load would presumably raise in the constructor above
# rather than return a falsy object, so this check likely always passes - confirm.
if embed_model:
    logging.info("Embedding model loaded...")

# NOTE(review): buffer_size=1 and breakpoint_percentile_threshold=96 are
# unexplained tuning values; record why they were chosen.
splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=96, embed_model=embed_model
)
# NOTE(review): `pdf_node` is not defined in any cell visible in this notebook
# (the converter cell produces `page_nodes`); it must already exist in the
# kernel for this call to work - confirm which input is intended.
semantic_nodes = splitter.get_nodes_from_documents(pdf_node)
# Bare expression so the notebook displays the full list of resulting nodes.
semantic_nodes

2024-07-27 13:48:51,549 - INFO - Embedding model loaded...


[TextNode(id_='d47e1863-f988-4ba2-85cc-828f260cae8e', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7f759e4b-7199-4142-872b-e5f03e860a75', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='e82b3bfab191e96e60c4e08eafb1a8a7f3b60f52743dd04a3bfeeb1ac73debeb'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='5c7ac8c1-e353-4db7-a87c-86db49b67e5b', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='0b8ed8bc0484f82185864022c23ab05a32eef785439112cadffea01638d924af')}, text='SRFAC\nSINGAPORE RESUSCITATION AND FIRST AID COUNCIL\nBasic Cardiac Life Support and Automated External Defibrillation (BCLS+AED) Provider Course Manual \n\nSingapore Resuscitation and First Aid Council \nAll rights reserved.\nNo part of this book may be reproduced, in any form or by any means, without prior written permission of SRFAC. \nREV 1 / 2022 \n\nCONTENTS\n1: CARDIAC ARREST AND YOU \

In [45]:
# Print every semantic chunk, each followed by a blank line and a 100-dash rule.
for el in semantic_nodes:
    print(el.text, "", "-" * 100, sep="\n")

SRFAC
SINGAPORE RESUSCITATION AND FIRST AID COUNCIL
Basic Cardiac Life Support and Automated External Defibrillation (BCLS+AED) Provider Course Manual 

Singapore Resuscitation and First Aid Council 
All rights reserved.
No part of this book may be reproduced, in any form or by any means, without prior written permission of SRFAC. 
REV 1 / 2022 

CONTENTS
1: CARDIAC ARREST AND YOU 
1.1: Introduction 02 
1.2: The Heart, the Lungs and the Circulation 03 
1.3: Risk Factors for Heart Attack 06 
1.4: What Happens in a Heart Attack 07 
1.5: What Happens in a Cardiac Arrest 08 
1.6: Other Common Causes of Cardiac Arrest 10 
1.7: The Chain of Survival 11 
1.8: SCDF myResponder Mobile Application 13 

2: CARDIO PULMONARY RESUSCITATION (CPR) 
2.1: The Importance of Early CPR 16 
2.2: Adult One-Rescuer CPR 17 
2.3: CPR in Special Circumstances 27 

3: AUTOMATED EXTERNAL DEFIBRILLATION (AED) 
3.1: The Importance of Early Defibrillation 30 
3.2: Automated External Defibrillators (AEDs) 32 
3.3: Pre