# Generate QnA synthetic dataset from a Complex PDF using Azure AI Document Intelligence

### Overview
We process the PDF by dividing it into three parts.

- **Text-heavy** - Text-heavy PDFs can be processed with open-source libraries, without the need for toolkits like Azure AI Document Intelligence or Unstructured.
- **Image-heavy** - For image-heavy PDFs, each page can be converted to an image and summarized by a multimodal LLM like GPT-4o.
- **Mixed** - After reading the document with Azure AI Document Intelligence, we replace the image descriptions inside the figure tags with text summarized by a multimodal LLM. (Often the image descriptions are blank or have only a short caption.)

## 1. Read & Preprocess PDF file
---

### Split the PDFs into individual pages
Only use a portion of the PDF documents for testing.

In [11]:
import os
import json
from openai import AzureOpenAI
from dotenv import load_dotenv
load_dotenv()

# Azure OpenAI client used by the chat-completion cells below. Endpoint, key and
# API version are read from environment variables (load_dotenv() above loads .env).
client = AzureOpenAI(
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key        = os.getenv("AZURE_OPENAI_API_KEY"),
    api_version    = os.getenv("AZURE_OPENAI_API_VERSION")
)

In [10]:
import os

# Sanity-check the Azure OpenAI settings WITHOUT echoing the secret: the API key
# is reduced to a set/missing flag so it never ends up in saved notebook output.
(
    os.getenv("AZURE_OPENAI_ENDPOINT"),
    "<set>" if os.getenv("AZURE_OPENAI_API_KEY") else "<missing>",
    os.getenv("AZURE_OPENAI_API_VERSION"),
)

('https://oai-daekeun.openai.azure.com/',
 '<redacted>',
 '2024-05-01-preview')

In [13]:
# The user turn carries two numbered sentences; the system turn asks for a
# positive/negative label per sentence, returned as compact JSON.
user_message = """1. 모니터가 너무 뜨거워. 2. 모니터가 시장 반응이 너무 뜨거워."""
system_message = """너는 긍정과 부정을 구분할 수 있는 에이전트야. 결과는 JSON 형식으로 공백 없이 반환해줘. 답변 예:{"1": "긍정", "2": "부정"}"""

# Assemble the chat turns first, then send them in a single request.
_sentiment_turns = [
    dict(role="system", content=system_message),
    dict(role="user", content=user_message),
]
response = client.chat.completions.create(
    messages=_sentiment_turns,
    max_tokens=60,
    model="gpt-4o",
)

# Last expression of the cell: the model's reply text is shown as the cell output.
response.choices[0].message.content

TypeError: Missing required arguments; Expected either ('messages' and 'model') or ('messages', 'model' and 'stream') arguments to be given

In [None]:
discipline

In [36]:
# System role shared by the synthetic-QnA requests in this notebook.
system_prompt = "You are an AI assistant good at generating Synthetic QnA."

# Request a Korean-language taxonomy (fields -> sub-fields -> disciplines) as JSON.
prompt = """
Create a taxonomy of human knowledge and capabilities in JSON format. Break it down into fields, sub-fields, and disciplines. Response must be Korean language.
"""

_taxonomy_turns = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=prompt),
]
# JSON mode is enabled via response_format; the reply is capped at 1000 tokens.
response = client.chat.completions.create(
    messages=_taxonomy_turns,
    max_tokens=1000,
    response_format=dict(type="json_object"),
    model="gpt-4o",
)

In [37]:
print(response.choices[0].message.content)

{
  "지식과 능력의 분류": {
    "자연과학": {
      "물리학": {
        "고전물리학": {},
        "양자물리학": {},
        "상대성이론": {}
      },
      "화학": {
        "분석화학": {},
        "유기화학": {},
        "물리화학": {}
      },
      "생물학": {
        "세포생물학": {},
        "유전학": {},
        "생태학": {}
      },
      "지구과학": {
        "지질학": {},
        "기상학": {},
        "천문학": {}
      }
    },
    "공학": {
      "기계공학": {
        "열역학": {},
        "로봇공학": {},
        "재료공학": {}
      },
      "전기공학": {
        "전자공학": {},
        "네트워크공학": {},
        "제어공학": {}
      },
      "화학공학": {
        "공정공학": {},
        "재료공학": {},
        "에너지공학": {}
      },
      "생명공학": {
        "유전자공학": {},
        "생체재료학": {},
        "생의학공학": {}
      }
    },
    "인문학": {
      "문학": {
        "고전문학": {},
        "현대문학": {},
        "비평문학": {}
      },
      "역사학": {
        "고대사": {},
        "중세사": {},
        "근현대사": {}
      },
      "철학": {
        "형이상학": {},
        "윤리학": {},
        "논리학": {}
      },
      "언어학": {

In [38]:
# Decode the model reply; when it is not valid JSON, store an error marker dict instead.
taxonomy = response.choices[0].message.content
try:
    _decoded_taxonomy = json.loads(taxonomy)
except json.JSONDecodeError:
    taxonomy_json = {"error": "Failed to parse JSON"}
else:
    taxonomy_json = _decoded_taxonomy


In [40]:
taxonomy_json

{'지식과 능력의 분류': {'자연과학': {'물리학': {'고전물리학': {}, '양자물리학': {}, '상대성이론': {}},
   '화학': {'분석화학': {}, '유기화학': {}, '물리화학': {}},
   '생물학': {'세포생물학': {}, '유전학': {}, '생태학': {}},
   '지구과학': {'지질학': {}, '기상학': {}, '천문학': {}}},
  '공학': {'기계공학': {'열역학': {}, '로봇공학': {}, '재료공학': {}},
   '전기공학': {'전자공학': {}, '네트워크공학': {}, '제어공학': {}},
   '화학공학': {'공정공학': {}, '재료공학': {}, '에너지공학': {}},
   '생명공학': {'유전자공학': {}, '생체재료학': {}, '생의학공학': {}}},
  '인문학': {'문학': {'고전문학': {}, '현대문학': {}, '비평문학': {}},
   '역사학': {'고대사': {}, '중세사': {}, '근현대사': {}},
   '철학': {'형이상학': {}, '윤리학': {}, '논리학': {}},
   '언어학': {'구조주의언어학': {}, '역사언어학': {}, '사회언어학': {}}},
  '사회과학': {'심리학': {'발달심리학': {}, '임상심리학': {}, '사회심리학': {}},
   '사회학': {'문화사회학': {}, '경제사회학': {}, '정치사회학': {}},
   '경제학': {'거시경제학': {}, '미시경제학': {}, '국제경제학': {}},
   '정치학': {'정치이론': {}, '비교정치학': {}, '국제관계학': {}}},
  '예술': {'시각예술': {'회화': {}, '조각': {}, '그래픽디자인': {}},
   '공연예술': {'연극': {}, '무용': {}, '음악': {}},
   '문학예술': {'시': {}, '소설': {}, '희곡': {}},
   '미디어예술': {'영화': {}, '텔레비전'

In [17]:
system_prompt = "You are an AI assistant good at generating Synthetic QnA."


# Taxonomy generation function
def generate_taxonomy():
    """Ask the chat model for a taxonomy of human knowledge and return it parsed.

    Returns:
        The decoded JSON object on success, or ``{"error": "Failed to parse JSON"}``
        when the reply is missing or is not valid JSON (for example because it
        was cut off at ``max_tokens``).
    """
    # The prompt names JSON explicitly because JSON mode (response_format below)
    # expects the messages themselves to instruct the model to produce JSON.
    prompt = """
    Create a taxonomy of human knowledge and capabilities in JSON format. Break it down into fields, sub-fields, and disciplines.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        # 60 tokens cannot hold a full taxonomy; use the budget of the exploratory cell.
        max_tokens=1000,
        messages=[
            # Use the QnA system prompt, not the sentiment-classifier `system_message`.
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    taxonomy = response.choices[0].message.content
    try:
        return json.loads(taxonomy)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply whose content is None.
        return {"error": "Failed to parse JSON"}


In [None]:
def generate_subjects(discipline):
    prompt = f"""
    You are an expert in {discipline}. Create a comprehensive list of subjects a student should learn under this discipline. 
    For each subject, provide the level (e.g., undergraduate, graduate) and include key subtopics. 
    Present the result in JSON format. Language must be Korean.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={ "type": "json_object" },
        max_tokens=100,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ]
    )
    subjects = re

In [53]:
# Inline experiment: list the subjects of one discipline (hard-coded here).
discipline = "Computer Science"

# The prompt pins the per-subject fields to level, name and subtopics.
prompt = f"""
You are an expert in {discipline}. Create a comprehensive list of subjects a student should learn under this discipline. 
For each subject, provide the level (e.g., 100, 200, 300, 400, 500) and include key subtopics. 
Present the result in JSON format. 
For each subject s, it should be s.level, s.name and s.subtopics.
"""

_subject_turns = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=prompt),
]
response = client.chat.completions.create(
    messages=_subject_turns,
    max_tokens=1000,
    response_format=dict(type="json_object"),
    model="gpt-4o",
)

# Keep the raw reply in `subjects` and its decoded form (or an error marker) in `subjects_json`.
subjects = response.choices[0].message.content
try:
    _decoded_subjects = json.loads(subjects)
except json.JSONDecodeError:
    subjects_json = {"error": "Failed to parse JSON"}
else:
    subjects_json = _decoded_subjects

In [57]:
print(subjects)

{
  "subjects": [
    {
      "name": "Introduction to Computer Science",
      "level": 100,
      "subtopics": [
        "Programming basics",
        "Algorithms and Data Structures",
        "Software Development Methodologies",
        "Introduction to Databases",
        "Basic Operating Systems Concepts"
      ]
    },
    {
      "name": "Data Structures and Algorithms",
      "level": 200,
      "subtopics": [
        "Arrays and Linked Lists",
        "Stacks and Queues",
        "Trees and Graphs",
        "Sorting and Searching Algorithms",
        "Big-O Notation and Complexity Analysis"
      ]
    },
    {
      "name": "Object-Oriented Programming",
      "level": 200,
      "subtopics": [
        "Classes and Objects",
        "Inheritance and Polymorphism",
        "Encapsulation and Abstraction",
        "Design Patterns",
        "Exception Handling"
      ]
    },
    {
      "name": "Computer Architecture",
      "level": 300,
      "subtopics": [
        "CPU Des

In [90]:
# Inline experiment: build a syllabus for one hard-coded subject.
subtopics = ["Basic Programming Concepts", "Algorithms and Data Structures"]
level = "100"
subject = "Introduction to Computer Science"

# The prompt spells out the per-session fields the model should return.
prompt = f"""
You are an expert in creating educational syllabi. Create a detailed syllabus for the subject "{subject}" at the {level} level. 
The syllabus should be broken down into multiple class sessions, each covering different key concepts. 
The subtopics for this subject include: {subtopics}. Provide the syllabus in JSON format with the following structure:
[
    {{
        "session_name": "Session 1 Name",
        "description": "Brief description of the session",
        "key_concepts": ["Key concept 1", "Key concept 2", ...]
    }},
    ...
]
"""

_syllabus_turns = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=prompt),
]
response = client.chat.completions.create(
    messages=_syllabus_turns,
    max_tokens=2000,
    response_format=dict(type="json_object"),
    model="gpt-4o",
)

# Keep the raw reply in `syllabus` and its decoded form (or an error marker) in `syllabus_json`.
syllabus = response.choices[0].message.content
try:
    _decoded_syllabus = json.loads(syllabus)
except json.JSONDecodeError:
    syllabus_json = {"error": "Failed to parse JSON"}
else:
    syllabus_json = _decoded_syllabus

In [91]:
syllabus_json

{'syllabus': [{'session_name': 'Introduction to Computer Science',
   'description': 'Overview of computer science, its history, and its significance in modern society.',
   'key_concepts': ['History of Computing',
    'Branches of Computer Science',
    'Impact of Computing on Society']},
  {'session_name': 'Understanding Computers and Operating Systems',
   'description': 'Introduction to computer hardware and software, basic understanding of operating systems.',
   'key_concepts': ['Computer Hardware',
    'Computer Software',
    'Operating Systems']},
  {'session_name': 'Introduction to Basic Programming Concepts',
   'description': 'Introduction to basic programming concepts using a high-level programming language (e.g., Python).',
   'key_concepts': ['Programming Languages',
    'Syntax and Semantics',
    'Basic Input/Output',
    'Variables and Data Types']},
  {'session_name': 'Control Structures in Programming',
   'description': 'Understanding and using control structures l

In [81]:
syllabus_json['syllabus'][0]

{'session_name': 'Introduction to Computer Science',
 'description': 'An overview of the field of computer science and its significance.',
 'key_concepts': ['History of computer science',
  'Applications of computer science',
  'Introduction to hardware and software']}

In [84]:
syllabus_json["syllabus"][0]["session_name"]

'Introduction to Computer Science'

In [87]:
print(prompt)


    Generate a homework question based on the following class session and key concepts
    ====    
    class session: Introduction to Computer Science
    
    key_concepts: ['History of computer science', 'Applications of computer science', 'Introduction to hardware and software']
    ====
    
    The question should be challenging and cover multiple key concepts from the syllabus.
    Language must be Korean. Output format must be text.
    


In [93]:
num_questions = 2

# Only the first session of the syllabus is used for this experiment.
class_session = syllabus_json["syllabus"][0]["session_name"]
key_concepts = syllabus_json["syllabus"][0]["key_concepts"]

questions = []
for _ in range(num_questions):
    # Same prompt text on every pass; variety comes from the model's sampling.
    prompt = f"""
    Generate a homework question based on the following class session and key concepts
    ====    
    class session: {class_session}
    
    key_concepts: {key_concepts}
    ====
    
    The question should be challenging and cover multiple key concepts from the syllabus.
    Language must be Korean. Output format must be text.
    """
    _question_turns = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=prompt),
    ]

    response = client.chat.completions.create(
        messages=_question_turns,
        max_tokens=2000,
        model="gpt-4o",
    )

    # Collect the plain-text question returned for this pass.
    question = response.choices[0].message.content
    questions.append(question)

In [95]:
print(questions[1])

---

다음 질문에 답하십시오. 

---

**문제:**

컴퓨터 과학은 여러 가지 중요한 발전 단계를 거쳐 왔습니다. 아래에 제시된 개념들을 활용하여 다음 질문에 답변해 보세요.

1. 컴퓨터 과학의 발전사를 간략히 설명하시오. 초기 컴퓨터의 등장부터 최근의 기술 발전까지 중요한 사건들을 포함하십시오.
2. 오늘날 컴퓨터 과학이 다양한 분야에서 어떻게 응용되고 있는지에 대해 두 가지 사례를 들어 설명하시오. 사례 중 하나는 소프트웨어 응용에, 다른 하나는 하드웨어 발전에 초점을 맞추어 작성하시오.
3. 하드웨어와 소프트웨어의 차이점을 정의하고, 각 요소가 컴퓨터 과학의 발전에 어떤 역할을 했는지 설명하시오.

**힌트:**

- 초기 컴퓨터로는 예를 들어, ENIAC과 같은 컴퓨터를 들 수 있습니다.
- 소프트웨어 응용으로는 예를 들어, 인공지능이나 데이터 분석 등을 생각해 볼 수 있습니다.
- 하드웨어의 발전으로는 예를 들어, 반도체 기술의 발전이나 퀀텀 컴퓨팅 등을 들 수 있습니다.

답변은 500자 내외로 작성하십시오.

---


In [107]:
import random
# Single-session sampling: randomly pick 1-5 key concepts from one session.
# Multi-session sampling: merge the key concepts of two sessions and randomly pick 2-5 of them.

def sample_class_sessions_and_key_concepts(class_sessions, key_concepts, single_session=True):
    """Sample class session(s) and a random subset of their key concepts.

    :param class_sessions: list of class sessions
    :param key_concepts: list of key-concept lists, one per session (same order
        and length as ``class_sessions``)
    :param single_session: sample from one session when True, otherwise from
        two distinct sessions whose key concepts are merged
    :return: ``(session, concepts)`` in single-session mode, or
        ``([session_a, session_b], concepts)`` in multi-session mode
    :raises ValueError: if the two lists differ in length, if there are too few
        sessions, or if the selected session(s) offer fewer distinct key
        concepts than the minimum (1 for single, 2 for multi)
    """
    def _distinct(items):
        # Order-preserving de-duplication that also works for unhashable items;
        # the lists are small, so the quadratic scan is irrelevant.
        unique = []
        for item in items:
            if item not in unique:
                unique.append(item)
        return unique

    if len(class_sessions) != len(key_concepts):
        raise ValueError("class_sessions and key_concepts must have the same length")

    if single_session:
        if not class_sessions:
            raise ValueError("At least one session is required for single-session sampling")
        session_index = random.randint(0, len(class_sessions) - 1)
        selected = class_sessions[session_index]
        pool = _distinct(key_concepts[session_index])
        min_concepts = 1
    else:
        if len(class_sessions) < 2:
            raise ValueError("Not enough sessions for multi-session sampling")
        session_indices = random.sample(range(len(class_sessions)), k=2)
        selected = [class_sessions[i] for i in session_indices]
        # Two sessions may share a concept; de-duplicate so it is not picked twice.
        pool = _distinct(key_concepts[session_indices[0]] + key_concepts[session_indices[1]])
        min_concepts = 2

    if len(pool) < min_concepts:
        # Without this guard random.randint() fails with an opaque "empty range" error.
        raise ValueError(
            f"Need at least {min_concepts} distinct key concept(s) to sample from, got {len(pool)}"
        )

    max_concepts = min(5, len(pool))
    selected_key_concepts = random.sample(pool, k=random.randint(min_concepts, max_concepts))
    return selected, selected_key_concepts

# Example run: three sessions, each with three key concepts (lists are index-aligned).
key_concepts = [
    ["Bubble Sort", "Merge Sort", "Quick Sort"],
    ["Binary Search", "Depth-First Search", "Breadth-First Search"],
    ["Dijkstra's Algorithm", "Floyd-Warshall Algorithm", "Kruskal's Algorithm"]
]
class_sessions = [
    "Session 1: Sorting Algorithms",
    "Session 2: Searching Algorithms",
    "Session 3: Graph Algorithms",
]

# Draw from one session, then show the (session, concepts) pair.
single_session_result = sample_class_sessions_and_key_concepts(
    class_sessions, key_concepts, single_session=True
)
print("Single Session Sampling:\n", single_session_result)

# Draw from two merged sessions, then show the ([sessions], concepts) pair.
multi_session_result = sample_class_sessions_and_key_concepts(
    class_sessions, key_concepts, single_session=False
)
print("Multi Session Sampling:\n", multi_session_result)


Single Session Sampling:
 ('Session 2: Searching Algorithms', ['Depth-First Search', 'Binary Search'])
Multi Session Sampling:
 (['Session 3: Graph Algorithms', 'Session 2: Searching Algorithms'], ['Breadth-First Search', 'Binary Search', "Dijkstra's Algorithm"])


{'syllabus': [{'session_name': 'Introduction to Computer Science',
   'description': 'An overview of the field of computer science and its significance.',
   'key_concepts': ['History of computer science',
    'Applications of computer science',
    'Introduction to hardware and software']},
  {'session_name': 'Basics of Programming Languages',
   'description': 'Introduction to different programming languages and their paradigms.',
   'key_concepts': ['Types of programming languages',
    'Programming paradigms',
    'Hello World in multiple languages']},
  {'session_name': 'Getting Started with Python',
   'description': 'Setting up Python and writing first simple programs.',
   'key_concepts': ['Installing Python',
    'Python IDEs',
    'Writing and running a basic Python script']},
  {'session_name': 'Variables and Data Types',
   'description': 'Understanding different data types and variables in Python.',
   'key_concepts': ['Variables',
    'Data types (int, float, str, bool)',

In [68]:
print(questions[0])

    	{
        "syllabus_question": {
            "session_name": "Introduction to Algorithms, Control Structures: Conditionals, Control Structures: Loops, Functions: Definition and Usage",
            "description": "여러 개념을 활용하여 문제를 해결하는 과제를 통해 학습한 내용을 종합적으로 복습하세요.",
            "question": "주어진 정수 배열이 오름차순으로 정렬되어 있는지를 확인하는 Python 프로그램을 작성하세요. 이 프로그램은 다음과 같은 기능을 포함해야 합니다: \n1. 배열을 입력받는 함수 (함수를 정의하고, 함수 인자를 활용하세요).\n2. 배열의 각 요소를 비교하여 정렬 상태를 확인하는 조건문 (if, else, elif)을 사용하세요.\n3. 배열을 순회하며 검사를 수행하는 루프 (for loop 또는 while loop)를 사용하세요.\n4. 정렬되었는지 여부를 반환하는 값 (True 또는 False)을 반환하는 함수입니다.",
            "key_concepts": ["Functions", "Conditionals", "Loops", "Algorithm efficiency"]
        }
    }



In [71]:
print(questions[1])

    
    	{
	"question": "다음 과제는 여러 개의 세션에서 배운 주요 개념을 조합하여 풀어야 합니다. Python을 사용하여 아래의 요구 사항을 만족하는 프로그램을 작성하십시오.\n\n1. 사용자로부터 여러 개의 정수를 입력받아 리스트에 저장하십시오. (세션: '기본 데이터 구조: 리스트와 튜플')\n2. 리스트에 있는 숫자 중에서 가장 큰 값을 찾기 위해 'for 루프'를 사용하여 탐색 알고리즘을 구현하십시오. (세션: '제어 구조: 루프', '탐색 알고리즘')\n3. 입력받은 정수 리스트를 오름차순으로 정렬하기 위해 '버블 정렬' 알고리즘을 사용하십시오. (세션: '정렬 알고리즘')\n4. 정렬된 리스트의 중간 값을 출력하십시오. 만약 리스트의 길이가 짝수라면, 중간 두 값의 평균을 출력하십시오. (세션: '기본 데이터 구조: 리스트와 튜플')\n5. 이 모든 과정을 하나의 함수 안에서 처리하고, 해당 함수의 반환값으로 최종 결과를 출력하십시오. (세션: '함수: 정의와 사용')\n\n이 프로그램을 작성한 후, 각 단계에서 어떤 프로그래밍 개념이 사용되었는지 설명하는 주석을 추가하십시오. ",
	"keywords": [
		"리스트와 튜플",
		"제어 구조: 루프",
		"탐색 알고리즘",
		"정렬 알고리즘",
		"함수: 정의와 사용"
	]
}


In [None]:
# Instruction generation function
def generate_instructions(syllabus, num_questions=5):
    """Generate homework questions for a syllabus, then answer each of them.

    Calls the notebook-level Azure OpenAI ``client`` through the chat
    completions API; the legacy ``openai.Completion`` endpoint used previously
    is not available in the openai>=1.0 package that provides ``AzureOpenAI``.

    Args:
        syllabus: Syllabus text (for example a JSON string) embedded in the prompt.
        num_questions: Number of question/answer pairs to produce.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts, one per question.
    """
    def _chat(prompt, max_tokens):
        # One chat round-trip; returns the stripped reply text ("" if none).
        response = client.chat.completions.create(
            model="gpt-4o",
            max_tokens=max_tokens,
            temperature=0.7,
            messages=[
                {"role": "system", "content": "You are an AI assistant good at generating Synthetic QnA."},
                {"role": "user", "content": prompt},
            ],
        )
        return (response.choices[0].message.content or "").strip()

    question_prompt = f"""
        Generate a homework question based on the following syllabus:
        
        {syllabus}
        
        The question should be challenging and cover multiple key concepts from the syllabus.
        """
    # The prompt is identical for each question; variety comes from sampling (temperature 0.7).
    questions = [_chat(question_prompt, max_tokens=200) for _ in range(num_questions)]

    instructions = []
    for question in questions:
        answer_prompt = f"""
        Here is a homework question:
        
        {question}
        
        Provide a detailed answer to the question.
        """
        instructions.append({
            "question": question,
            "answer": _chat(answer_prompt, max_tokens=500),
        })

    return instructions


In [None]:
def generate_syllabus(subject, level, subtopics):
    """Ask the chat model for a session-by-session syllabus and return it parsed.

    Calls the notebook-level Azure OpenAI ``client`` through the chat
    completions API (the legacy ``openai.Completion`` endpoint is not available
    in openai>=1.0).

    Args:
        subject: Subject name embedded in the prompt.
        level: Course level embedded in the prompt.
        subtopics: Subtopics to cover, embedded in the prompt as given.

    Returns:
        The decoded JSON object (sessions under the ``"syllabus"`` key, as the
        prompt requests), or ``{"error": "Failed to parse JSON"}`` when the
        reply is missing or not valid JSON.
    """
    # JSON mode returns a JSON object, so the session list is requested under a
    # fixed "syllabus" key instead of as a bare top-level array.
    prompt = f"""
    You are an expert in creating educational syllabi. Create a detailed syllabus for the subject "{subject}" at the {level} level. 
    The syllabus should be broken down into multiple class sessions, each covering different key concepts. 
    The subtopics for this subject include: {subtopics}. Provide the syllabus in JSON format with the following structure:
    {{
        "syllabus": [
            {{
                "session_name": "Session 1 Name",
                "description": "Brief description of the session",
                "key_concepts": ["Key concept 1", "Key concept 2", ...]
            }},
            ...
        ]
    }}
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        max_tokens=1500,
        messages=[
            {"role": "system", "content": "You are an AI assistant good at generating Synthetic QnA."},
            {"role": "user", "content": prompt},
        ],
    )
    syllabus = response.choices[0].message.content
    try:
        syllabus_json = json.loads(syllabus)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply whose content is None.
        syllabus_json = {"error": "Failed to parse JSON"}
    return syllabus_json

In [None]:
import openai
import json

# OpenAI API 키 설정
openai.api_key = 'your-api-key'

# Taxonomy generation function
def generate_taxonomy():
    """Ask the chat model for a taxonomy of human knowledge and return it parsed.

    Calls the notebook-level Azure OpenAI ``client`` through the chat
    completions API (the legacy ``openai.Completion`` endpoint is not available
    in openai>=1.0).

    Returns:
        The decoded JSON object, or ``{"error": "Failed to parse JSON"}`` when
        the reply is missing or not valid JSON.
    """
    # The flat "disciplines" list is requested explicitly because
    # glan_instruction_generation() reads taxonomy["disciplines"].
    prompt = """
    Create a taxonomy of human knowledge and capabilities. Break it down into fields, sub-fields, and disciplines.
    Return a JSON object with two keys: "taxonomy" (the nested fields -> sub-fields -> disciplines hierarchy) and "disciplines" (a flat list of every discipline name).
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        max_tokens=1500,
        messages=[
            {"role": "system", "content": "You are an AI assistant good at generating Synthetic QnA."},
            {"role": "user", "content": prompt},
        ],
    )
    taxonomy = response.choices[0].message.content
    try:
        taxonomy_json = json.loads(taxonomy)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply whose content is None.
        taxonomy_json = {"error": "Failed to parse JSON"}
    return taxonomy_json

# Subject generation function
def generate_subjects(discipline):
    """Ask the chat model for the subjects of a discipline and return them parsed.

    Calls the notebook-level Azure OpenAI ``client`` through the chat
    completions API (the legacy ``openai.Completion`` endpoint is not available
    in openai>=1.0).

    Args:
        discipline: Discipline name embedded in the prompt.

    Returns:
        The decoded JSON object (subjects under the ``"subjects"`` key, as the
        prompt requests), or ``{"error": "Failed to parse JSON"}`` when the
        reply is missing or not valid JSON.
    """
    # The schema is pinned so the keys match what glan_instruction_generation()
    # reads: subject_name, level and subtopics.
    prompt = f"""
    You are an expert in {discipline}. Create a comprehensive list of subjects a student should learn under this discipline. For each subject, provide the level (e.g., undergraduate, graduate) and include key subtopics. Present the result in JSON format with the following structure:
    {{"subjects": [{{"subject_name": "...", "level": "...", "subtopics": ["...", ...]}}, ...]}}
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        max_tokens=1500,
        messages=[
            {"role": "system", "content": "You are an AI assistant good at generating Synthetic QnA."},
            {"role": "user", "content": prompt},
        ],
    )
    subjects = response.choices[0].message.content
    try:
        subjects_json = json.loads(subjects)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply whose content is None.
        subjects_json = {"error": "Failed to parse JSON"}
    return subjects_json

# Syllabus generation function
def generate_syllabus(subject, level, subtopics):
    """Ask the chat model for a session-by-session syllabus and return it parsed.

    Calls the notebook-level Azure OpenAI ``client`` through the chat
    completions API (the legacy ``openai.Completion`` endpoint is not available
    in openai>=1.0).

    Args:
        subject: Subject name embedded in the prompt.
        level: Course level embedded in the prompt.
        subtopics: Subtopics to cover, embedded in the prompt as given.

    Returns:
        The decoded JSON object (sessions under the ``"syllabus"`` key, as the
        prompt requests), or ``{"error": "Failed to parse JSON"}`` when the
        reply is missing or not valid JSON.
    """
    # JSON mode returns a JSON object, so the session list is requested under a
    # fixed "syllabus" key instead of as a bare top-level array.
    prompt = f"""
    You are an expert in creating educational syllabi. Create a detailed syllabus for the subject "{subject}" at the {level} level. The syllabus should be broken down into multiple class sessions, each covering different key concepts. The subtopics for this subject include: {subtopics}. Provide the syllabus in JSON format with the following structure:
    {{
        "syllabus": [
            {{
                "session_name": "Session 1 Name",
                "description": "Brief description of the session",
                "key_concepts": ["Key concept 1", "Key concept 2", ...]
            }},
            ...
        ]
    }}
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        max_tokens=1500,
        messages=[
            {"role": "system", "content": "You are an AI assistant good at generating Synthetic QnA."},
            {"role": "user", "content": prompt},
        ],
    )
    syllabus = response.choices[0].message.content
    try:
        syllabus_json = json.loads(syllabus)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply whose content is None.
        syllabus_json = {"error": "Failed to parse JSON"}
    return syllabus_json

# Instruction generation function
def generate_instructions(syllabus, num_questions=5):
    """Generate homework questions for a syllabus, then answer each of them.

    Calls the notebook-level Azure OpenAI ``client`` through the chat
    completions API; the legacy ``openai.Completion`` endpoint used previously
    is not available in the openai>=1.0 package that provides ``AzureOpenAI``.

    Args:
        syllabus: Syllabus text (for example a JSON string) embedded in the prompt.
        num_questions: Number of question/answer pairs to produce.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts, one per question.
    """
    def _chat(prompt, max_tokens):
        # One chat round-trip; returns the stripped reply text ("" if none).
        response = client.chat.completions.create(
            model="gpt-4o",
            max_tokens=max_tokens,
            temperature=0.7,
            messages=[
                {"role": "system", "content": "You are an AI assistant good at generating Synthetic QnA."},
                {"role": "user", "content": prompt},
            ],
        )
        return (response.choices[0].message.content or "").strip()

    question_prompt = f"""
        Generate a homework question based on the following syllabus:
        
        {syllabus}
        
        The question should be challenging and cover multiple key concepts from the syllabus.
        """
    # The prompt is identical for each question; variety comes from sampling (temperature 0.7).
    questions = [_chat(question_prompt, max_tokens=200) for _ in range(num_questions)]

    instructions = []
    for question in questions:
        answer_prompt = f"""
        Here is a homework question:
        
        {question}
        
        Provide a detailed answer to the question.
        """
        instructions.append({
            "question": question,
            "answer": _chat(answer_prompt, max_tokens=500),
        })

    return instructions

# Implementation of Algorithm 1
def glan_instruction_generation():
    """Run the taxonomy -> subjects -> syllabus -> instructions pipeline.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts collected over all
        disciplines and subjects. The list is empty (never ``None``) when the
        taxonomy could not be generated or contains no ``"disciplines"`` list,
        so callers can always iterate over the result.
    """
    # 1. Generate the taxonomy
    taxonomy = generate_taxonomy()
    if not isinstance(taxonomy, dict):
        print("Error generating taxonomy:", "unexpected response")
        return []
    if "error" in taxonomy:
        print("Error generating taxonomy:", taxonomy["error"])
        return []

    instructions_list = []

    # 2. Process each discipline
    for discipline in taxonomy.get('disciplines', []):
        # 3. Generate the subjects
        subjects = generate_subjects(discipline)
        if "error" in subjects:
            print(f"Error generating subjects for {discipline}:", subjects["error"])
            continue

        # A JSON object wraps the list (e.g. under "subjects"); iterating the
        # dict itself would yield its keys, not the subject records.
        subject_items = subjects.get("subjects", []) if isinstance(subjects, dict) else subjects

        for subject_data in subject_items:
            if not isinstance(subject_data, dict):
                continue
            # Accept both key spellings seen in model output.
            subject = subject_data.get("subject_name") or subject_data.get("name")
            if not subject:
                continue
            level = subject_data.get("level", "")
            subtopics = ", ".join(str(topic) for topic in subject_data.get("subtopics", []))

            # 4. Generate the syllabus
            syllabus = generate_syllabus(subject, level, subtopics)
            if "error" in syllabus:
                print(f"Error generating syllabus for {subject}:", syllabus["error"])
                continue

            # 5. Generate the instructions
            instructions = generate_instructions(json.dumps(syllabus))
            instructions_list.extend(instructions)

    return instructions_list

# Run the pipeline and print every generated question/answer pair.
# `or []` guards against a None result so the loop below cannot crash.
instructions_list = glan_instruction_generation() or []
for i, instruction in enumerate(instructions_list):
    print(f"Instruction {i+1}:\nQuestion: {instruction['question']}\nAnswer: {instruction['answer']}\n")


In [None]:

import os, shutil, random
import openai
from dotenv import load_dotenv
from langchain_community.document_loaders.csv_loader import CSVLoader

load_dotenv()

# Folder that holds the source PDFs.
raw_data_dir = "raw_data"
# Sibling folder name derived from raw_data_dir ("splitted_raw_data").
splitted_raw_data_dir = f"splitted_{raw_data_dir}"

# Source PDF for this run; the commented-out line is an alternative input document.
#file_path = f"{raw_data_dir}/prod-unst-pdf/[Sales Talk] 3. QnA3_Handling Objection_(S24)_240227.pdf"
file_path = f"{raw_data_dir}/prod-unst-pdf/SM-S92X_UG_UU_Kor_Rev.1.1_240129.pdf"


In [None]:
import fitz

# Inclusive (from_page, to_page) ranges to extract; PyMuPDF page numbers are
# 0-based, so (1, 15) copies the 2nd through 16th pages of the source.
split_pages = [(1, 15)]

# Both documents are opened as context managers so their file handles are
# released once every part has been written.
with fitz.open(file_path) as doc1:
    for idx, (first_page, last_page) in enumerate(split_pages):
        # Start from a new empty PDF and copy the requested page range into it.
        with fitz.open() as doc2:
            doc2.insert_pdf(doc1, from_page=first_page, to_page=last_page)

            # Save the extracted part next to the source document.
            doc2.save(f"{raw_data_dir}/prod-unst-pdf/s24-user-manual-part{idx}.pdf")

In [None]:
from util.common_utils import delete_folder_and_make_folder
from util.preprocess import remove_short_sentences, remove_small_images, analyze_pdf_page_content, split_pdf

# Work on the part extracted in the previous cell.
file_path = f"{raw_data_dir}/prod-unst-pdf/s24-user-manual-part0.pdf"
# `result` is iterated below as a mapping of content type -> pages.
result = analyze_pdf_page_content(file_path)
# NOTE(review): presumably wipes and recreates the output folder — confirm in util.common_utils.
delete_folder_and_make_folder(splitted_raw_data_dir)    

print("### PDF Content Analysis Result:")
# Write one PDF per content type, named after the type (later cells read Mixed.pdf, Text.pdf, Image.pdf).
for content_type, pages in result.items():
    print(f"{content_type} pages: {pages}")
    split_pdf(file_path, f"{splitted_raw_data_dir}/{content_type}.pdf", pages)

In [None]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import ContentFormat
from openai import AzureOpenAI

# Azure AI Document Intelligence settings, read from environment variables.
doc_intelligence_endpoint = os.getenv("AZURE_DOC_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("AZURE_DOC_INTELLIGENCE_KEY")

# Client used below to run the layout analysis on the mixed-content PDF.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=doc_intelligence_endpoint, 
    credential=AzureKeyCredential(doc_intelligence_key),
    headers={"x-ms-useragent":"sample-code-figure-understanding/1.0.0"},
)

# Azure OpenAI settings, read from environment variables.
aoai_api_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
aoai_api_version = os.getenv("AZURE_OPENAI_API_VERSION")
aoai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

# Rebinds `client`: this instance targets one deployment through base_url and
# retries a failed request at most once.
client = AzureOpenAI(
    api_key=aoai_api_key,  
    api_version=aoai_api_version,
    base_url=f"{aoai_api_endpoint}/openai/deployments/{aoai_deployment_name}",
    max_retries=1
)

### Case 1: Mixed page (Images and text mixed appropriately)
After reading the document with Azure AI Document Intelligence, we replace the image descriptions inside the figure tags with text summarized by a multimodal LLM. (Often the image descriptions are blank or have only a short caption.)


#### Analyze Document

In [None]:
# PDF containing the pages classified as "Mixed" by the split step.
pdf_mixed_path = f"{splitted_raw_data_dir}/Mixed.pdf"

# Submit the file to the "prebuilt-layout" model and request Markdown output.
with open(pdf_mixed_path, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", analyze_request=f, content_type="application/octet-stream", 
        output_content_format=ContentFormat.MARKDOWN 
    )

# Wait for the analysis to finish; md_content is the document content that the
# figure-description step rewrites and the splitter later chunks.
result = poller.result()
md_content = result.content

#### Updates the content of the figure description (empty content or caption) with the image summary text generated by gpt-4o.

In [None]:
%%time
from util.preprocess import (
    image_complexity, is_bounding_box_larger_than, crop_image_from_file,
    understand_image_with_gpt, update_figure_description
)
output_folder = "pdf_mixed_tmp"
delete_folder_and_make_folder(output_folder)
language = "Korean"
max_tokens = 1024
# `result` comes from analyzing pdf_mixed_path, so region.page_number is relative
# to that file. Cropping must read the same PDF, not the unsplit `file_path`.
input_file_path = pdf_mixed_path


def _summarize_figure_region(region, idx):
    """Crop one figure region and return its GPT summary.

    Returns ``None`` when the region is skipped (bounding box too small or the
    image is not classified "Complex"), and ``""`` when the model rejects the
    request with a BadRequestError.
    """
    # To learn more about bounding regions, see https://aka.ms/bounding-region
    boundingbox = (
        region.polygon[0],  # x0 (left)
        region.polygon[1],  # y0 (top)
        region.polygon[4],  # x1 (right)
        region.polygon[5],  # y1 (bottom)
    )
    if not is_bounding_box_larger_than(boundingbox):
        return None

    # page_number is 1-indexed
    cropped_image = crop_image_from_file(input_file_path, region.page_number - 1, boundingbox)
    if image_complexity(cropped_image)[0] != "Complex":
        print(f'simple image at idx {idx}')
        return None

    # Name the crop after the source file (extension removed) and the figure index.
    file_name_without_extension = os.path.splitext(os.path.basename(input_file_path))[0]
    output_file = f"{file_name_without_extension}_cropped_image_{idx}.png"
    cropped_image_filename = os.path.join(output_folder, output_file)
    cropped_image.save(cropped_image_filename)
    print(f"\tFigure {idx} cropped and saved as {cropped_image_filename}")

    try:
        return understand_image_with_gpt(
            client, aoai_deployment_name, cropped_image_filename, "",
            max_tokens=max_tokens, language=language,
        )
    except openai.BadRequestError as e:
        # Best effort: a rejected image leaves the description empty.
        print(f"BadRequestError: {e}")
        return ""


if result.figures:
    print("Figures:")
    for idx, figure in enumerate(result.figures):
        # Note: figure bounding regions currently contain both the bounding region
        # of the figure caption and the figure body, so caption regions are skipped.
        caption_regions = (figure.caption.bounding_regions or []) if figure.caption else []

        img_description = ""
        for region in figure.bounding_regions or []:
            if region in caption_regions:
                continue
            image_summarization = _summarize_figure_region(region, idx)
            if image_summarization is None:
                continue
            img_description += image_summarization
            print(f"\tDescription of figure {idx}: {img_description}")

        # Replace the idx-th figure's description in the markdown with the summary.
        md_content = update_figure_description(md_content, img_description, idx)

In [None]:
# from IPython.display import display, Markdown, Latex
# display(Markdown(md_content[:200]))

Generate chunks for mixed pages

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

# Split the markdown in `md_content` into overlapping chunks for the mixed case.
# Separators are tried in the order listed. Because is_separator_regex=True,
# every entry is interpreted as a regular expression, so the full stop has to
# be escaped: a bare "." would match *any* character instead of a sentence end.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        r'<!-- PageNumber="\d+" -->',
        r"\n\n",
        r"\n",
        " ",
        r"\.",
        "",
    ],
    is_separator_regex=True,
    chunk_size=2000,
    chunk_overlap=200,
)

mixed_chunks = text_splitter.split_text(md_content)
print("Length of splits (mixed case): " + str(len(mixed_chunks)))

### Case 2: Text-heavy
Text-heavy PDFs can be processed with open source without the need to use toolkits like Azure AI Document Intelligence or Unstructured.

In [None]:
from langchain_community.document_loaders.pdf import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Load the text-heavy PDF and split it into overlapping chunks.
pdf_text_path = f"{splitted_raw_data_dir}/Text.pdf"
loader = PyMuPDFLoader(pdf_text_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
)

text_chunks = text_splitter.split_documents(documents)

# Dump every chunk so the split can be inspected by eye.
for idx, chunk in enumerate(text_chunks):
    print(f"Chunk {idx}\n{chunk}")
    print("=" * 80)

# Downstream cells only need the raw text, so keep page_content and drop the
# rest of each split document.
text_chunks = [d.page_content for d in text_chunks]
print("Length of splits (text-heavy case): " + str(len(text_chunks)))

### Case 3: Image-heavy
For image-heavy PDFs, convert each entire page to an image and let a multimodal LLM such as GPT-4o summarize each page.

### Preprocess Image

In [None]:
# Working directory that will hold one rendered image per PDF page.
image_dir = "./pdf_image_tmp"
# presumably removes any existing folder and recreates it empty — helper is
# defined outside this section; TODO confirm
delete_folder_and_make_folder(image_dir) 

In [None]:
import fitz
from glob import glob

# Render every page of the image-heavy PDF to a JPEG file in `image_dir`.
pdf_image_path = f"{splitted_raw_data_dir}/Image.pdf"
# Margin trimmed from each edge of the page (page-coordinate units).
#clip_x, clip_y = 10, 45
clip_x, clip_y = 10, 10

# Open the document in a context manager so it is closed once all pages have
# been rendered, even if saving a page fails.
with fitz.open(pdf_image_path) as doc:
    for i, page in enumerate(doc):
        # page.rect unpacks to two corners (x0, y0, x1, y1), not to
        # (x, y, width, height); the margin is therefore subtracted from the
        # far corner directly.
        x0, y0, x1, y1 = page.rect
        clip = fitz.Rect(x0 + clip_x, y0 + clip_y, x1 - clip_x, y1 - clip_y)
        page.set_cropbox(clip)
        pix = page.get_pixmap()
        pix.save(f"{image_dir}/page_{i:03d}.jpg")

# Sorted so the zero-padded page index determines the order of the list.
images = sorted(glob(os.path.join(image_dir, "*.jpg")))

In [None]:
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

# Chain: prompt (system + image/text human turn) -> GPT-4o -> plain string.
max_tokens = 1024
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version="2024-05-01-preview",
    max_tokens=max_tokens,
    temperature=0,
)

# System turn: fixes the assistant's role.
system_prompt = "You are an assistant tasked with describing table or image, specialized in Smartphone product."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)

# Human turn: the image as a data URL, followed by the instruction text.
# {image_base64} is the single template variable of the whole prompt.
# NOTE(review): the data URL declares image/png while the page images in this
# notebook are saved with a .jpg extension — confirm the service accepts this.
human_prompt = [
    {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64,{image_base64}"},
    },
    {
        "type": "text",
        "text": "Given image, give a concise summary in Korean. Don't insert any XML tag such as <text> and </text> when answering.",
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

prompt = ChatPromptTemplate.from_messages(
    [system_message_template, human_message_template]
)

summarize_chain = prompt | llm | StrOutputParser()

In [None]:
%%time
# Summarize every rendered page image with the multimodal chain.
from util.preprocess import encode_image_base64
#images = glob(os.path.join(image_path, "*.jpg"))
# One base64 string per page image; each string is passed to the chain as its input.
base64_images = [encode_image_base64(img_path) for img_path in images]
# Run the chain over all images with at most 8 concurrent calls.
image_summaries = summarize_chain.batch(base64_images, {"max_concurrency": 8})
# presumably drops summaries that are too short to be useful — helper is
# defined outside this section; TODO confirm
image_summaries = remove_short_sentences(image_summaries)


In [None]:
# Report how many page summaries are available for the image-heavy case.
print(f"Length of image_summaries (image-heavy case): {len(image_summaries)}")

## 2. Construct QnA Pairs
----

### Option 1. 
Leverage the azure-ai-generative package. The QADataGenerator class in this package makes it easy to generate synthetic QnA pairs. However, the class does not support custom prompts when used as-is, so we subclass it and create the CustomQADataGenerator class.


In [1]:
from langchain_community.document_loaders import PubMedLoader

# Load sample documents from PubMed for the query "liver" (load_max_docs=10).
# NOTE: this rebinds `loader` and `documents`, which the text-heavy cell also assigns.
loader = PubMedLoader("liver", load_max_docs=10)
documents = loader.load()

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [None]:
# Azure OpenAI chat model bound to the name `generator_llm`.
# NOTE: the next cell repeats this exact definition, so this cell is redundant.
generator_llm = AzureChatOpenAI(
    temperature=0, 
    max_tokens=max_tokens,
    openai_api_version="2024-05-01-preview",
    azure_deployment="gpt-4"                       
)


In [None]:
# Both chat models share every setting except the Azure deployment they target.
_shared_llm_kwargs = {
    "temperature": 0,
    "max_tokens": max_tokens,
    "openai_api_version": "2024-05-01-preview",
}

generator_llm = AzureChatOpenAI(azure_deployment="gpt-4", **_shared_llm_kwargs)
critic_llm = AzureChatOpenAI(azure_deployment="gpt-4o", **_shared_llm_kwargs)


In [None]:
from langchain_openai import AzureOpenAIEmbeddings


In [None]:
# Create the embedding client and embed one short string as a connectivity check.
embeddings = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-large", openai_api_version="2024-05-01-preview")
text = "this is a test document"

# `query_result` is not used by any later cell in this section.
query_result = embeddings.embed_query(text)

In [4]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
# documents = load your documents
# Completion-token cap applied to both chat models below.
max_tokens=1024


# Embedding model handed to the test-set generator.
embeddings = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-large", openai_api_version="2024-05-01-preview")
# Model passed as the generator LLM (deployment "gpt-4").
generator_llm = AzureChatOpenAI(
    temperature=0, 
    max_tokens=max_tokens,
    openai_api_version="2024-05-01-preview",
    azure_deployment="gpt-4"                       
)


# Model passed as the critic LLM (deployment "gpt-4o").
critic_llm = AzureChatOpenAI(
    temperature=0, 
    max_tokens=max_tokens,
    openai_api_version="2024-05-01-preview",
    azure_deployment="gpt-4o"                       
)

# Wire the three LangChain objects into a ragas TestsetGenerator.
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)


In [6]:

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# documents = load your documents

# generator with openai models
# generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
# critic_llm = ChatOpenAI(model="gpt-4")
# embeddings = OpenAIEmbeddings()


# Rebuild the generator with keyword arguments and an explicit chunk_size.
generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
    chunk_size=1024
)
# Change resulting question type distribution
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

# testsetgenerator = TestsetGenerator(generator_llm=ragas_llm, critic_llm=ragas_llm,embeddings_model=embedding_function, testset_distribution=testset_distribution)
# test_size = 3
# testset = testsetgenerator.generate(docs, test_size=test_size)



# generate testset
# NOTE(review): the output recorded for this cell is
# "RuntimeError: This event loop is already running", raised inside the ragas
# executor thread. In Jupyter this is commonly worked around by calling
# nest_asyncio.apply() before generating — TODO confirm for the installed
# ragas version.
testset = generator.generate_with_langchain_docs(documents, test_size=5, distributions=distributions, raise_exceptions=False)

# use generator.generate_with_llamaindex_docs if you use llama-index as document loader
#testset = generator.generate_with_langchain_docs(documents, 5, distributions) 
#testset = generator.generate(input_batch, test_size=5)
# Last expression of the cell: shows the generated test set as a DataFrame.
testset.to_pandas()

Exception in thread Thread-7:
Traceback (most recent call last):
  File "/Users/daekeun/.pyenv/versions/3.12.2/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "/Users/daekeun/.pyenv/versions/3.12.2/envs/py312-dev/lib/python3.12/site-packages/ragas/executor.py", line 87, in run
    results = self.loop.run_until_complete(self._aresults())
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/daekeun/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 661, in run_until_complete
    self._check_running()
  File "/Users/daekeun/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 620, in _check_running
    raise RuntimeError('This event loop is already running')
RuntimeError: This event loop is already running
  self._invoke_excepthook(self)


ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.

  await self._event_pipe_gc()


In [7]:
# Re-run of the generation call from the previous cell; the recorded output
# shows the same "event loop is already running" failure.
testset = generator.generate_with_langchain_docs(documents, test_size=5, distributions=distributions, raise_exceptions=False)

Exception in thread Thread-8:
Traceback (most recent call last):
  File "/Users/daekeun/.pyenv/versions/3.12.2/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "/Users/daekeun/.pyenv/versions/3.12.2/envs/py312-dev/lib/python3.12/site-packages/ragas/executor.py", line 87, in run
    results = self.loop.run_until_complete(self._aresults())
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/daekeun/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 661, in run_until_complete
    self._check_running()
  File "/Users/daekeun/.pyenv/versions/3.12.2/lib/python3.12/asyncio/base_events.py", line 620, in _check_running
    raise RuntimeError('This event loop is already running')
RuntimeError: This event loop is already running


ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.

In [None]:
# Display the current value of `input_batch`.
# NOTE(review): within this part of the notebook `input_batch` is only
# assigned in later cells, so this cell depends on out-of-order execution —
# confirm or remove.
input_batch

In [None]:
from util.qa import CustomQADataGenerator
# Model settings handed to the QnA generator; the deployment name is read from
# the AZURE_OPENAI_DEPLOYMENT_NAME environment variable.
model_config = {
    "deployment": os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    "model": "gpt-4o",
    "max_tokens": 2000,
}

# templates_dir points at the "ko" prompt-template folder — presumably the
# Korean prompts; TODO confirm.
qa_generator = CustomQADataGenerator(model_config=model_config, templates_dir="./prompt_template/ko")

In [None]:
import asyncio
from collections import Counter
from typing import Dict
import os
from azure.ai.generative.synthetic.qa import QAType
concurrency = 6  # number of concurrent calls
# Semaphore that generate_async() acquires to cap in-flight requests at `concurrency`.
sem = asyncio.Semaphore(concurrency)

# QA type requested from the generator; the commented line is the alternative.
#qa_type = QAType.CONVERSATION
qa_type = QAType.LONG_ANSWER

async def generate_async(text: str) -> Dict:
    """Generate QnA pairs for one text chunk, limited by the shared semaphore.

    Args:
        text: Source text handed to ``qa_generator.generate_async``.

    Returns:
        Whatever ``qa_generator.generate_async`` returns for this text.
    """
    # Three questions are requested per text, using the module-level `qa_type`.
    request = {"text": text, "qa_type": qa_type, "num_questions": 3}
    async with sem:
        generated = await qa_generator.generate_async(**request)
    return generated

In [None]:
input_batch = mixed_chunks + text_chunks + image_summaries
results = await asyncio.gather(*[generate_async(text) for text in input_batch], return_exceptions=True)

question_answer_list = []
token_usage = Counter()
for result in results:
    if isinstance(result, Exception):
        raise result  # exception raised inside generate_async()
    question_answer_list.append(result["question_answers"])
    token_usage += result["token_usage"]

print("Successfully generated QAs")

In [None]:
# Inspect the question/answer entries generated for the first input chunk.
question_answer_list[0]

### Option 2. 
Write the entire QnA generation pipeline yourself, without relying on a separate toolkit.

In [None]:
from langchain_openai import AzureChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from util.qa_pair import get_qna_prompt_template, QAPair

# Chat model for Option 2; API version and deployment come from variables
# defined outside this section of the notebook.
llm = AzureChatOpenAI(
    temperature=0, 
    max_tokens=1024,
    openai_api_version=aoai_api_version,
    azure_deployment=aoai_deployment_name                    
)

# Parse the model reply as JSON, with QAPair supplied as the pydantic schema.
parser = JsonOutputParser(pydantic_object=QAPair)
prompt = get_qna_prompt_template()
#prompt = get_qna_repair_cost_prompt_template()
# NOTE: rebinds `llm` and `prompt`, which the image-summary cell also assigns.
chain = prompt | llm | parser

In [None]:
# Build one prompt-input dict per chunk, in the order mixed -> text -> image.
# The domain string is shared by every entry, so it is defined once instead of
# being repeated in three copy-pasted loops.
qna_domain = "Samsung Galaxy S series Smartphone, especially S23 and S24 series"

input_batch = []
for chunks in (mixed_chunks, text_chunks, image_summaries):
    input_batch.extend(
        {"context": doc, "domain": qna_domain, "num_questions": "3"}
        for doc in chunks
    )


In [None]:
%%time
#input_query = {"context": md_content, "domain": "Samsung Galaxy S series Smartphone", "num_questions": "3"}
# Run the QnA chain over every input dict with at most 5 concurrent calls.
qa_pair = chain.batch(input_batch, {"max_concurrency": 5})

## 3. Save to jsonl for fine-tuning
---

In [None]:
# Persist the raw QnA pairs produced by Option 2.
# NOTE: `save_jsonl`, `output_dir` and `save_filename` are defined in the
# following cell, so that cell must be executed before this one.
# (A stray `dd` token that raised NameError was removed from this cell.)
save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl")

In [None]:
import json
from util.common_utils import convert_to_oai_format, save_jsonl

# Folder that receives the generated dataset files; created if missing.
output_dir = './dataset'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): `system_prompt_msg` is defined here but not passed to anything
# in this cell — confirm whether convert_to_oai_format() was meant to receive it.
system_prompt_msg = """You are an AI assistant that provides guidance to help users self-service resolve abnormalities in their Galaxy mobile phone.\n
Please answer the questions accurately. If the question is in Korean, write your answer in Korean. If the question is in English, write your answer in English."""

save_filename = "cs-self-solve"
# Converts the Option 1 results (`question_answer_list`); the Option 2 results
# in `qa_pair` are not used here.
oai_qa_pair = convert_to_oai_format(question_answer_list)

#save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl")
save_jsonl(oai_qa_pair, f"{output_dir}/{save_filename}-oai.jsonl")

### Clean up

In [None]:
# Shell command: irreversibly deletes the split-PDF directory and the
# temporary/output folders listed below.
!rm -rf {splitted_raw_data_dir} pdf_image_tmp pdf_mixed_tmp outputs_tmp images