In [1]:
import pandas as pd
import os
import json
from tqdm import trange
import time
import re

from llama_index.legacy import Document
from llama_index.legacy.schema import TextNode
from llama_index.legacy.node_parser import SentenceWindowNodeParser, SemanticSplitterNodeParser
from llama_index.legacy.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.legacy.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
) 
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from groq import Groq


## Load environment variables
# Each os.environ[...] lookup raises KeyError when the variable is not set, so a
# missing key is reported here rather than later in the notebook.
# NOTE(review): HF_KEY and OPENAI_KEY are not referenced in the visible part of
# the notebook — confirm they are used further down.
HF_KEY             = os.environ["HUGGINGFACE_API_KEY"]
OPENAI_KEY         = os.environ["OPENAI_API_KEY"]
GROQ_API_KEY       = os.environ["GROQ_API_KEY"]
# Model identifier passed as `model=` to the chat-completion calls below.
CHAT_MODEL         = "llama3-70b-8192"
# NOTE(review): the client is built without arguments; presumably the Groq SDK
# reads GROQ_API_KEY from the environment itself — confirm.
client = Groq()

import logging
# Configure root logging at INFO level with a timestamp, level and message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
# Reading all the files
def export_text(input_dir="../data/combined_txts", output_path="../data/all_pdf_text.json"):
    """Split every text file in ``input_dir`` into sections and dump them as JSON.

    Each file is read as UTF-8 and split on runs of three newlines; the
    resulting sections from all files are concatenated into one flat list
    that is written to ``output_path`` as a JSON array.

    Args:
        input_dir: Directory holding the source text files.
        output_path: Destination JSON file (overwritten if it exists).

    Returns:
        None. The result is written to ``output_path``.

    Raises:
        OSError: If ``input_dir`` cannot be listed or a file cannot be
            read/written.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    all_sections = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # section order (and therefore every downstream index) reproducible.
    for file_name in sorted(os.listdir(input_dir)):
        file_path = os.path.join(input_dir, file_name)

        # Skip sub-directories and hidden entries (e.g. the
        # ".ipynb_checkpoints" folder Jupyter creates), which would make
        # open()/read() fail.
        if file_name.startswith(".") or not os.path.isfile(file_path):
            continue

        with open(file_path, "r", encoding="utf-8") as fin:
            text = fin.read()

        # Drop whitespace-only sections (e.g. produced by a trailing
        # separator) so no empty text is exported.
        all_sections.extend(
            section for section in text.split("\n\n\n") if section.strip()
        )

    with open(output_path, "w", encoding="utf-8") as fout:
        json.dump(all_sections, fout, ensure_ascii=False, indent=4)

## Generating QA Pairs

In [3]:
# Prompt template used to ask the chat model for test questions about one
# section of text. It is rendered through PromptTemplate in
# generate_questions(), which fills two placeholders:
#   {text}                - the section text the questions must be based on
#   {format_instructions} - output-format instructions from JsonOutputParser
GENERATE_QUESTION_PROMPT = \
'''You are a professor proficient in medical aid. You are tasked with generating questions for a test about the content of the text corpus of medical information from a manual made for hospital nurses in Singapore.
You are to only refer to the text corpus below for the generation of questions. 
For the text corpus below, generate 5 high quality test questions for an upcoming examination for hospital nurses. 
Do not provide the answers.

Text corpus:
{text}
{format_instructions}
Ensure and double check that the answer is in accordance to the format above.
'''

# Prompt template used to ask the chat model to answer previously generated
# questions about one section of text. It is rendered through PromptTemplate
# in generate_answers(), which fills three placeholders:
#   {text}                - the section text the answers must be based on
#   {question_list}       - the questions to answer (passed in by the caller)
#   {format_instructions} - output-format instructions from JsonOutputParser
GENERATE_ANSWER_PROMPT = \
'''You are a professor proficient in medical aid. You are tasked with generating answers for a test about the content of the text corpus of medical information from a manual made for hospital nurses in Singapore.
You are to only refer to the text corpus below for the answering of questions. 
For the text corpus below, generate 5 high quality and well elaborated answers for each of the questions.

Text Corpus:
{text}

List of questions:
{question_list}
{format_instructions}

Ensure and double check that the answer is in accordance to the format above.
'''

def extract_answer(input_string):
    """Extract the JSON object embedded in a model reply.

    The reply may contain extra prose around the JSON. Parsing is attempted
    in two steps:

    1. Everything from the first ``{`` to the last ``}`` is parsed as JSON.
    2. If that fails, the first ``{"questions": [...]}`` or
       ``{"answers": [...]}`` object found by regex is parsed instead.

    Args:
        input_string: Raw text returned by the model.

    Returns:
        dict: The decoded JSON object.

    Raises:
        ValueError: If no JSON object can be located or decoded. The function
            never returns ``None``.
    """
    json_start = input_string.find('{')
    json_end = input_string.rfind('}')

    # Check the raw rfind() result: adding 1 before the comparison would turn
    # the "not found" sentinel (-1) into 0 and hide the failure.
    if json_start == -1 or json_end < json_start:
        raise ValueError("Invalid input: No JSON data found.")

    json_data = input_string[json_start:json_end + 1]

    try:
        # Convert the JSON string to a Python dictionary
        return json.loads(json_data)
    except json.JSONDecodeError:
        pass

    # Fallback: the outer slice contained stray braces or prose, so look for
    # the specific payload object ("questions" or "answers" holding a list).
    pattern = r'\{\s*"(?:questions|answers)"\s*:\s*\[.*?\]\s*\}'
    match = re.search(pattern, input_string, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            pass

    raise ValueError("Invalid input: JSON data could not be parsed.")

def generate_questions(page_text, question_prompt, client):
    """Ask the chat model to write test questions about ``page_text``.

    Args:
        page_text: Text the questions must be based on; substituted for
            ``{text}`` in the prompt.
        question_prompt: Prompt template containing ``{text}`` and
            ``{format_instructions}`` placeholders.
        client: Chat client exposing ``chat.completions.create``.

    Returns:
        dict: JSON object parsed from the model reply. It is expected to hold
        a ``"questions"`` list; if it holds an ``"error"`` key instead, the
        error is logged and the dict is still returned.

    Raises:
        ValueError: If the reply cannot be parsed into a JSON object.
    """
    # Local import keeps the block self-contained; typing is standard library.
    from typing import List

    class QuestionList(BaseModel):
        # A list, not a single string: the prompt asks for several questions
        # and callers iterate over this field.
        questions: List[str] = Field(description="List of questions about the content in the text corpus")

    parser = JsonOutputParser(pydantic_object=QuestionList)

    prompt = PromptTemplate(
        template=question_prompt,
        input_variables=["text"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    final_prompt = prompt.format(text=page_text)

    completion = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )

    # The reply is streamed; a chunk's delta content may be None, so treat
    # that as an empty string when stitching the text together.
    answer = "".join(chunk.choices[0].delta.content or "" for chunk in completion)

    question_dict = extract_answer(answer)
    # Guard against a non-dict result (e.g. None) so the failure is reported
    # clearly instead of as a TypeError from the membership test below.
    if not isinstance(question_dict, dict):
        raise ValueError("Model reply did not contain a JSON object with questions.")
    if "error" in question_dict:
        logging.error(f"{question_dict['error']}")
    return question_dict
    
def generate_answers(page_text, questions, answer_prompt, client):
    """Ask the chat model to answer ``questions`` using only ``page_text``.

    Args:
        page_text: Text the answers must be based on; substituted for
            ``{text}`` in the prompt.
        questions: The questions to answer; substituted for
            ``{question_list}`` in the prompt.
        answer_prompt: Prompt template containing ``{text}``,
            ``{question_list}`` and ``{format_instructions}`` placeholders.
        client: Chat client exposing ``chat.completions.create``.

    Returns:
        dict: JSON object parsed from the model reply. It is expected to hold
        an ``"answers"`` list; if it holds an ``"error"`` key instead, the
        error is logged and the dict is still returned.

    Raises:
        ValueError: If the reply cannot be parsed into a JSON object.
    """
    # Local import keeps the block self-contained; typing is standard library.
    from typing import List

    class AnswerList(BaseModel):
        # Typed items and an explicit ordering hint, because callers pair
        # answers with questions by position.
        answers: List[str] = Field(description="List of answers, one per question and in the same order as the questions, with reference to the content in the text corpus")

    parser = JsonOutputParser(pydantic_object=AnswerList)

    prompt = PromptTemplate(
        template=answer_prompt,
        input_variables=["text", "question_list"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    final_prompt = prompt.format(text=page_text, question_list=questions)

    completion = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0,
        max_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )

    # The reply is streamed; a chunk's delta content may be None, so treat
    # that as an empty string when stitching the text together.
    answer = "".join(chunk.choices[0].delta.content or "" for chunk in completion)

    answer_dict = extract_answer(answer)
    # Guard against a non-dict result (e.g. None) so the failure is reported
    # clearly instead of as a TypeError from the membership test below.
    if not isinstance(answer_dict, dict):
        raise ValueError("Model reply did not contain a JSON object with answers.")
    if "error" in answer_dict:
        logging.error(f"{answer_dict['error']}")
    return answer_dict

def generate_section_QA_pairs(section_text, client, question_prompt, answer_prompt):
    """Generate question/answer pairs for a single section of text.

    Questions are generated first, then sent back to the model together with
    the section text to obtain answers. Questions and answers are paired by
    position.

    Args:
        section_text: The section to build questions and answers from.
        client: Chat client forwarded to the generation helpers.
        question_prompt: Prompt template for question generation.
        answer_prompt: Prompt template for answer generation.

    Returns:
        list[dict]: One ``{"Question": ..., "Answer": ...}`` dict per pair.
        Empty when the model returned no usable questions or answers. If the
        counts differ, only the leading pairs that have both parts are kept.
    """
    question_dict = generate_questions(section_text, question_prompt, client)
    question_list = (question_dict or {}).get("questions")
    # A bare string would otherwise be iterated character by character.
    if isinstance(question_list, str):
        question_list = [question_list]
    if not question_list:
        logging.error("No questions were generated for this section; skipping it.")
        return []

    question_str = json.dumps(question_list)
    answer_dict = generate_answers(section_text, question_str, answer_prompt, client)
    answer_list = (answer_dict or {}).get("answers")
    if isinstance(answer_list, str):
        answer_list = [answer_list]
    if not answer_list:
        logging.error("No answers were generated for this section; skipping it.")
        return []

    if len(answer_list) != len(question_list):
        # The model is not guaranteed to return one answer per question;
        # report the mismatch rather than failing with an IndexError.
        logging.warning(
            "Received %d answers for %d questions; pairing only the first %d.",
            len(answer_list),
            len(question_list),
            min(len(answer_list), len(question_list)),
        )

    # zip() stops at the shorter list, so every pair has both parts.
    return [
        {"Question": question, "Answer": answer}
        for question, answer in zip(question_list, answer_list)
    ]

def generate_all_qa_pairs(
    input_path="../data/all_pdf_text.json",
    output_path="../data/QA_pairs.json",
    max_sections=None,
    pause_every=30,
    pause_seconds=65,
):
    """Generate QA pairs for every stored section and save them as JSON.

    Sections are read from ``input_path`` (a JSON array of strings) and
    processed in order with the module-level ``client`` and prompt
    templates. The accumulated pairs are rewritten to ``output_path`` after
    each section, so partial results survive an interrupted run.

    Args:
        input_path: JSON file containing the list of section texts.
        output_path: JSON file the QA pairs are written to (overwritten).
        max_sections: Process at most this many sections; ``None`` (default)
            processes all of them. Use a small number for a quick trial run
            instead of editing the loop.
        pause_every: Sleep after every ``pause_every`` processed sections to
            stay under the API rate limit; a falsy value disables pausing.
        pause_seconds: Length of each pause in seconds.

    Returns:
        None. The result is written to ``output_path``.
    """
    with open(input_path, "r", encoding="utf-8") as fin:
        all_sections = json.load(fin)

    if max_sections is not None:
        all_sections = all_sections[:max(0, max_sections)]

    total = len(all_sections)
    all_pairs = []
    for i in trange(total):
        section_qa_pair = generate_section_QA_pairs(
            all_sections[i], client, GENERATE_QUESTION_PROMPT, GENERATE_ANSWER_PROMPT
        )
        all_pairs.extend(section_qa_pair)

        # Checkpoint after every section so completed work is never lost.
        with open(output_path, "w", encoding="utf-8") as fout:
            json.dump(all_pairs, fout, ensure_ascii=False, indent=4)

        # Ensure API rate limit does not exceed: pause after each full batch
        # of `pause_every` sections, but not after the final section.
        processed = i + 1
        if pause_every and processed % pause_every == 0 and processed < total:
            time.sleep(pause_seconds)

In [4]:
# Run QA-pair generation: reads ../data/all_pdf_text.json, calls the chat API
# through the module-level client, and writes the pairs to ../data/QA_pairs.json.
generate_all_qa_pairs()

  0%|          | 0/64 [00:00<?, ?it/s]2024-08-14 15:51:40,724 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-14 15:51:41,591 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
  2%|▏         | 1/64 [00:02<02:51,  2.72s/it]2024-08-14 15:51:43,456 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-14 15:51:44,063 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
  3%|▎         | 2/64 [00:04<02:20,  2.26s/it]2024-08-14 15:51:45,326 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-14 15:51:46,148 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
  5%|▍         | 3/64 [00:06<01:59,  1.96s/it]2024-08-14 15:51:46,927 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK