<a target="_blank" href="https://colab.research.google.com/github/UpstageAI/cookbook/blob/main/cookbooks/upstage/Solar-Full-Stack LLM-101/05_3_OracleDB.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Try17: 
1. KB 구축
- parsing: Upstage 모델, html 태그가 나와서 카테고리별로 자름. (페이지별 아님)
- KB 추가: csv로 추가할 수 있음
- 임베딩: Upstage 모델, FAISS
2. 모델링
- Upstage 모델
3. 답변 생성
- 질문에서 core_question 뽑고 pdf llm으로 찾기
- N/A 나오면 core_question을 wiki로 검색하게 함. 이때 검색 언어는 core_question의 언어를 감지하게 하고, 검색은 단어 하나씩 검색하게 함. N/A 안 나올 때까지 반복.
4. 자잘한 것들 추가
- 답변 생성 시 맞게 검색해놓고 답안지를 선택하지 않을 경우가 있음: 답변은 선택지에서 선택하도록 프롬프팅
- acc를 위해 비교 시 답안 추출이 안 되는 경우가 있음: normalize_answer 해서 비교하게 함
- N/A가 한 번이 아니라 여러 번 나오는 경우가 있음: 안 나올 때까지 반복하게 함

In [None]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from openai import OpenAI # openai==1.52.2

# The Upstage API key is read from the environment so that the secret is never
# stored in the notebook (and therefore never committed or shared with it).
api_key = os.environ.get("UPSTAGE_API_KEY", "")
if not api_key:
    raise RuntimeError(
        "Set the UPSTAGE_API_KEY environment variable to your Upstage API key."
    )

# Directory that holds the input files used by this notebook.
data_path = "/Users/susie/Desktop/Temp_Laptop2/Python_Files/G/24-2/NLP/Team/baseline/"

# OpenAI-compatible client pointed at the Upstage Solar endpoint.
client = OpenAI(
    api_key=api_key,
    base_url="https://api.upstage.ai/v1/solar"
)

In [2]:
# Input files, resolved against data_path: the PDF to parse and the CSV of
# questions/answers that is read for evaluation.
PDF_PATH = os.path.join(data_path, 'ewha.pdf')
CSV_PATH = os.path.join(data_path, 'TestSamples.csv')
# TABLE_PAGES is not needed

In [None]:
import requests
from langchain.schema import Document
from bs4 import BeautifulSoup

def extract_text_or_table(pdf_path, timeout=300):
    """
    Extract and categorize text from a PDF using the Upstage document-parse API.

    The PDF is uploaded, the HTML in the response is parsed, and one Document
    is created per matching HTML element. Elements are grouped by category
    (all tables first, then figures, and so on), not by their order on the page.

    Args:
        pdf_path: Path of the PDF file to upload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of Document objects whose metadata holds the "category" and the
        element's raw "html". An empty list is returned when the request
        fails or the response carries no HTML.
    """
    url = "https://api.upstage.ai/v1/document-ai/document-parse"
    # Reuse the module-level api_key instead of repeating the secret here.
    headers = {"Authorization": f"Bearer {api_key}"}

    try:
        with open(pdf_path, "rb") as file:
            # Without a timeout a stalled connection would block forever.
            response = requests.post(
                url, headers=headers, files={"document": file}, timeout=timeout
            )
    except requests.RequestException as exc:
        print(f"Error: request to the document-parse API failed - {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return []

    try:
        data = response.json()
    except ValueError:
        print("Error: API response is not valid JSON.")
        return []

    # "content" may be missing or null; treat both as "no HTML".
    html_content = (data.get("content") or {}).get("html", "")
    if not html_content:
        print("Error: No HTML content found in API response.")
        return []

    soup = BeautifulSoup(html_content, "html.parser")

    # Category name -> CSS selector identifying that category in the HTML.
    categories = {
        "table": "table",
        "figure": "figure",
        "chart": "img[data-category='chart']",
        "heading1": "h1",
        "header": "header",
        "footer": "footer",
        "caption": "caption",
        "paragraph": "p[data-category='paragraph']",
        "equation": "p[data-category='equation']",
        "list": "p[data-category='list']",
        "index": "p[data-category='index']",
        "footnote": "p[data-category='footnote']",
    }

    documents = [
        Document(
            page_content=element.get_text(strip=True),
            metadata={"category": category, "html": str(element)},
        )
        for category, selector in categories.items()
        for element in soup.select(selector)
    ]

    if not documents:
        print("No sections were extracted.")
    return documents
    
import re

def clean_extracted_text(text):
    """Tidy the line breaks of text extracted from a PDF.

    A newline between a lowercase ASCII letter (or a comma) and a lowercase
    ASCII letter is treated as a mid-sentence wrap and replaced by a space.
    Whitespace followed by a newline right after '.', '?' or '!' is collapsed
    to a single newline.
    """
    mid_sentence_break = re.compile(r'(?<=[a-z,])\n(?=[a-z])')
    sentence_end_break = re.compile(r'(?<=[.?!])\s*\n')

    joined = mid_sentence_break.sub(' ', text)
    return sentence_end_break.sub('\n', joined)

import csv

def load_problem_data(csv_path):
    """Load multiple-choice problems from a CSV file.

    Each row is read as: question, one column per choice, answer (last column).
    The choices are labelled (A), (B), (C), ... in column order, so a row with
    four choices produces the same text as before while rows with a different
    number of choices no longer fail.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file.

    Returns:
        A list of dicts with the keys "type" (always "problem"), "content"
        (the question followed by one labelled line per choice) and "answer".
        Blank rows and rows with fewer than two columns are skipped.
    """
    problems = []
    # newline="" lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(csv_path, "r", encoding="utf-8", newline="") as file:
        for row in csv.reader(file):
            # A usable row needs at least a question and an answer.
            if len(row) < 2:
                continue
            question, *choices, answer = row
            lines = [question]
            lines.extend(
                f"({chr(ord('A') + index)}) {choice}"
                for index, choice in enumerate(choices)
            )
            problems.append(
                {"type": "problem", "content": "\n".join(lines), "answer": answer}
            )
    return problems


def combine_kb(documents, problems):
    """Merge parsed PDF entries and problem entries into one knowledge-base list.

    Every input item is wrapped in a dict that carries a "type" tag ("pdf" or
    "problem") and the item itself under "content". PDF entries come first,
    followed by the problem entries.
    """
    tagged_sources = (("pdf", documents), ("problem", problems))
    return [
        {"type": tag, "content": item}
        for tag, items in tagged_sources
        for item in items
    ]

def _content_text(content):
    """Return the text held by one KB "content" payload, or None if it has none."""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        if "page_content" in content:
            return content["page_content"]
        # Problem entries are dicts that keep their text under "content".
        inner = content.get("content")
        return inner if isinstance(inner, str) else None
    # Document-like objects expose their text as a page_content attribute.
    return getattr(content, "page_content", None)


def ensure_text_format(kb):
    """
    Convert a KB (list of dictionaries or strings) into a list of lists,
    where each dictionary or string forms its own single-element list.

    A dictionary item contributes the text found under its "content" key. That
    value may be a plain string, a dict with "page_content" (a parsed PDF
    section), a dict with a string "content" (a problem entry), or an object
    with a page_content attribute. Previously problem entries were silently
    dropped because only the first two shapes were recognised.

    Args:
        kb: List of KB items.

    Returns:
        A list of one-element lists, one per item that carries text. Items
        without recognisable text are skipped.

    Raises:
        ValueError: If kb is not a list.
    """
    if not isinstance(kb, list):
        raise ValueError("KB should be a list.")

    processed_kb = []
    for item in kb:
        if isinstance(item, str):
            text = item
        elif isinstance(item, dict) and "content" in item:
            text = _content_text(item["content"])
        else:
            continue
        if text is not None:
            processed_kb.append([text])
    return processed_kb

import numpy as np
from openai import OpenAI
from langchain.vectorstores import FAISS
from langchain.schema import Document

def split_text(text, max_length=1000):
    """
    Cut text into consecutive pieces of at most max_length characters.

    The pieces are returned in order; an empty text yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), max_length):
        pieces.append(text[start:start + max_length])
    return pieces

def preprocess_texts(texts):
    """
    Normalise raw inputs into non-empty, stripped strings.

    Non-string items are converted with str(). Every item is stripped of
    surrounding whitespace, and items that end up empty are dropped.
    """
    as_strings = (item if isinstance(item, str) else str(item) for item in texts)
    stripped = (value.strip() for value in as_strings)
    return [value for value in stripped if value]

class UpstageEmbeddings:
    """Embeddings adapter around an OpenAI-compatible client.

    Exposes embed_documents / embed_query, the two methods the vector store
    calls on an embeddings object.

    NOTE(review): the same model is used for documents and queries. Upstage
    appears to offer a separate passage model for documents; confirm whether
    documents should be embedded with it.
    """

    def __init__(self, client, model="embedding-query"):
        """Store the API client and the name of the embedding model to call."""
        self.client = client
        self.model = model

    def _embed(self, text):
        """Request one embedding from the API and return its vector."""
        response = self.client.embeddings.create(model=self.model, input=text)
        return response.data[0].embedding

    def embed_documents(self, texts):
        """
        Generate embeddings for a list of texts.

        Returns:
            One embedding per input text, in the same order.

        Raises:
            RuntimeError: If any text cannot be embedded. Skipping the failed
                text would return fewer vectors than texts, and the caller
                would then pair vectors with the wrong documents.
        """
        embeddings = []
        for text in texts:
            try:
                embeddings.append(self._embed(text))
            except Exception as e:
                raise RuntimeError(
                    f"Error generating embedding for text: {str(text)[:50]}..."
                ) from e
        return embeddings

    def embed_query(self, text):
        """
        Generate an embedding for a single query.

        Raises:
            RuntimeError: If the embedding request fails. A None result could
                not be used for a vector search anyway.
        """
        try:
            return self._embed(text)
        except Exception as e:
            raise RuntimeError(
                f"Error generating embedding for query: {str(text)[:50]}..."
            ) from e

def create_vector_store(kb_list):
    """Build a FAISS vector store from a KB given as a list of lists of texts.

    Every text is preprocessed, cut into chunks of at most 1000 characters and
    wrapped in a Document. The chunks are then embedded with UpstageEmbeddings
    using the module-level client.
    """
    chunk_documents = [
        Document(page_content=piece)
        for entry in kb_list
        for raw_text in entry
        for usable_text in preprocess_texts([raw_text])
        for piece in split_text(usable_text, max_length=1000)
    ]

    embedder = UpstageEmbeddings(client=client)
    return FAISS.from_documents(chunk_documents, embedder)

def retrieve_context(question, vector_store, k=3):
    """
    Retrieve the most relevant context for a given question from the vector store.

    Args:
        question: Text to embed and search with.
        vector_store: Store exposing similarity_search_by_vector.
        k: Number of nearest chunks to retrieve.

    Returns:
        The retrieved chunks joined by blank lines. An empty string is returned
        when nothing is found or retrieval fails, so callers can detect the
        absence of context with a simple emptiness check. The previous
        non-empty placeholder sentence defeated that check.
    """
    try:
        question_embedding = client.embeddings.create(
            model="embedding-query",
            input=question
        ).data[0].embedding
        results = vector_store.similarity_search_by_vector(question_embedding, k=k)
    except Exception as e:
        # Boundary with remote services: report the failure and return no context.
        print(f"Error during retrieval: {e}")
        return ""
    return "\n\n".join(doc.page_content for doc in results)
    
from openai import OpenAI

def create_qa_chain(vector_store, model, prompt_template):
    """Bundle the chat model, the vector store and the prompt template.

    Args:
        vector_store: Vector store to expose under the "retriever" entry.
        model: Name of the chat model to call.
        prompt_template: Prompt template string stored under "prompt".

    Returns:
        A dict with "llm" (a callable taking a prompt string and returning the
        model's reply text), "retriever" ({"embedding": vector_store}) and
        "prompt" (the template, unchanged).
    """
    retriever = {"embedding": vector_store}

    def qa_model(prompt):
        # Use the module-level client rather than building a second one from
        # a hard-coded API key.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        # The reply text can be None; return "" so callers always get a string.
        return response.choices[0].message.content or ""

    return {"llm": qa_model, "retriever": retriever, "prompt": prompt_template}

import pandas as pd

def read_data(csv_path):
    """
    Load the evaluation CSV and return two lists.

    input: path of a CSV file with 'prompts' and 'answers' columns
    output: (prompts, answers), each column converted to a Python list
    """
    frame = pd.read_csv(csv_path)
    prompt_list = frame['prompts'].tolist()
    answer_list = frame['answers'].tolist()
    return prompt_list, answer_list

import re

def extract_answer(text):
    """Return the first parenthesised capital letter in text, e.g. "(B)".

    "N/A" is returned when the text contains no such marker.
    """
    found = re.search(r"\([A-Z]\)", text)
    if found is None:
        return "N/A"
    return found.group(0)

import wikipediaapi
import re

# Wikipedia clients for Korean and English lookups; both identify themselves
# with the same user agent string.
wiki_wiki_ko = wikipediaapi.Wikipedia(user_agent='NLP_Team/1.0 (ewhanthbeot@ewhain.net)', language='ko')
wiki_wiki_en = wikipediaapi.Wikipedia(user_agent='NLP_Team/1.0 (ewhanthbeot@ewhain.net)', language='en')

def detect_language(text):
    """Guess the language of text from the characters it contains.

    Returns 'ko' if any Hangul syllable is present, otherwise 'en' if any
    ASCII letter is present, otherwise 'unknown'.
    """
    script_patterns = (('ko', r'[가-힣]'), ('en', r'[a-zA-Z]'))
    for code, pattern in script_patterns:
        if re.search(pattern, text):
            return code
    return 'unknown'

def extract_core_question(question, core_extraction_model):
    """Ask the model for the core of a question, leaving out the answer choices.

    The instruction is written in Korean or English depending on the detected
    language of the question. The model's reply is returned with surrounding
    whitespace removed.

    Raises:
        ValueError: If the question's language is neither Korean nor English.
    """
    language = detect_language(question)
    if language not in ('ko', 'en'):
        raise ValueError("Unsupported language detected.")

    if language == 'ko':
        prompt = f"다음 질문에서 핵심 내용을 추출하세요 (선택지 제외):\n\n{question}\n\n핵심 내용:"
    else:
        prompt = f"Extract the core part of the following question (exclude choices):\n\n{question}\n\nCore content:"

    return core_extraction_model(prompt).strip()

def fetch_from_wikipedia(query, language):
    """
    Look up a Wikipedia page by title and return the start of its summary.

    The Korean client is used when language is 'ko'; any other value uses the
    English client. Returns the first 500 characters of the summary, or None
    when the page does not exist.
    """
    if language == 'ko':
        wiki_api = wiki_wiki_ko
    else:
        wiki_api = wiki_wiki_en

    page = wiki_api.page(query)
    if not page.exists():
        return None
    return page.summary[:500]
    
def normalize_answer(answer):
    """
    Normalize the format of the answer to handle cases like 'A' vs '(A)'.

    Surrounding whitespace is removed first, so ' (A) ' and '( A )' are
    recognised as well. A single pair of enclosing parentheses is then
    dropped. Non-string answers are converted with str().

    Returns:
        The answer text without enclosing parentheses or surrounding whitespace.
    """
    text = str(answer).strip()
    if len(text) >= 2 and text.startswith("(") and text.endswith(")"):
        text = text[1:-1].strip()
    return text

# Main 실행

In [None]:
# STEP 1. Build the knowledge base (KB)
# step 1.1 Parse ewha.pdf into category-level documents
documents = extract_text_or_table(PDF_PATH)

# step 1.2 Clean the text of every parsed document, keeping its metadata.
# Only documents whose page_content is a string are kept.
cleaned_documents = [
    {"page_content": clean_extracted_text(doc.page_content), "metadata": doc.metadata}
    for doc in documents
    if isinstance(doc.page_content, str)
]

# Load the problem data from CSV (the file name is relative to the working directory)
problems = load_problem_data("problems.csv")

# Merge the PDF data and the problem data
kb = combine_kb(cleaned_documents, problems)

# Convert the merged KB into a list of lists of strings
list_kb = ensure_text_format(kb)

# step 1.4 Embed the whole KB <Embedding model> and make it searchable
# <VectorStore model>; texts are split to keep the token count per request down
vector_store = create_vector_store(list_kb)

# STEP 2. Modeling <LLM model> <QA approach> - prepare the language model,
# the modeling approach and the prompt.
model = "solar-pro"
# Prompt template: {context} and {question} are filled in with str.format in
# the evaluation loop. The literal is not a raw string, so every \n written
# below becomes a real newline in the prompt.
custom_prompt = """
다음 문서에서 이유를 찾고 질문에 답하세요: \n
{context}\n
1. Tables in the document follow a hierarchical structure:
- **Example**: 
    ```
    [['대학', '학부/학과/전공', '입학정원'], 
    ['사범대학', '교육학과\n유아교육과...', '27\n29...']]
    ```
- This indicates:
    - A university (e.g., 사범대학) contains colleges (e.g., 교육학과).
    - A college can further have departments or majors (e.g., 특수교육과 includes 유아특수교육전공).
\n
질문: {question}\n
답변은 선택지에서 선택하세요."""
qa_chain = create_qa_chain(vector_store, model, custom_prompt)

# STEP 3. Load the questions and gold answers from the samples CSV as lists.
prompts, answers = read_data(CSV_PATH)

# Splits a question into Hangul words, Latin words and numbers, which are used
# one at a time as Wikipedia page titles.
_WORD_PATTERN = r'[가-힣]+|[a-zA-Z]+|\d+'
_NO_INFO_CONTEXT = "No relevant information found."


def _next_wikipedia_hit(candidates, language):
    """Return (word, summary) for the next candidate that has a Wikipedia page.

    candidates is an iterator and is consumed as words are tried, so repeated
    calls continue with the words not tried yet. Returns (None, None) once the
    candidates are exhausted.
    """
    for word in candidates:
        summary = fetch_from_wikipedia(word, language)
        if summary:
            return word, summary
    return None, None


correct = 0

for i, question in enumerate(prompts):
    try:
        core_question = extract_core_question(question, qa_chain["llm"])
    except ValueError:
        # Unsupported language: fall back to the full question instead of
        # aborting the whole evaluation run.
        core_question = question
    print(f"핵심 질문 추출: {core_question}")

    language = detect_language(core_question)
    # One shared iterator, so a word already looked up is never fetched again.
    candidates = iter(re.findall(_WORD_PATTERN, core_question))

    context = retrieve_context(core_question, vector_store)

    if not context.strip():
        print("PDF에서 관련 정보를 찾을 수 없습니다. Wikipedia를 검색합니다.")
        word, summary = _next_wikipedia_hit(candidates, language)
        if summary:
            print(f"Wikipedia에서 관련 정보를 찾았습니다: {word}")
            context = summary
        else:
            context = _NO_INFO_CONTEXT

    # Ask the model, and keep retrying with new Wikipedia context while no
    # answer choice can be extracted from its reply.
    while True:
        prompt = qa_chain["prompt"].format(context=context, question=question)
        result = qa_chain["llm"](prompt)
        predicted_answer = extract_answer(result)

        if predicted_answer != "N/A":
            break

        print("답변이 N/A로 표시됨: 다시 확인 중...")

        # Skip words without a Wikipedia page, so that a missing page is never
        # inserted into the prompt as the literal text "None".
        word, summary = _next_wikipedia_hit(candidates, language)
        if not summary:
            # Nothing left to try: keep the N/A prediction for this question.
            break
        print(f"Wikipedia에서 관련 정보를 찾았습니다: {word}")
        context = summary

    normalized_predicted = normalize_answer(predicted_answer)
    # str() guards against non-string cells (e.g. an empty answer read as NaN).
    normalized_actual = normalize_answer(str(answers[i]))

    print(f"질문 {i + 1}: {question}")
    print(f"생성된 답변: {result}")
    print(f"예상 답: {normalized_predicted}, 실제 답: {normalized_actual}\n")

    if normalized_predicted == normalized_actual:
        correct += 1

# Avoid a ZeroDivisionError when the CSV contains no questions.
accuracy = correct / len(prompts) * 100 if prompts else 0.0
print(f"총 {len(prompts)}개의 질문 중 {correct}개 맞춤. 정확도: {accuracy:.2f}%")
