<a href="https://colab.research.google.com/github/mingyung-park/NLP_Code/blob/main/WikiRetriever_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
# WikiRetriever_Colab CODE

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/NLP/experiments/
!chmod +x install_packages.sh
!./install_packages.sh
!pip install wikipedia-api

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/NLP/experiments
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014

In [2]:
!pip install wikipedia-api nltk



In [12]:
%cd /content/drive/MyDrive/Colab Notebooks/NLP/experiments/module
import wikipediaapi
from langchain_upstage import UpstageEmbeddings, ChatUpstage
from langchain.vectorstores import FAISS
from langchain.schema import Document
from dotenv import load_dotenv
from langchain_experimental.text_splitter import SemanticChunker
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
import os
import re
import numpy as np

load_dotenv()

class WikipediaRetriever:
    def __init__(self, language='en', top_k = 3):
        self.wiki = wikipediaapi.Wikipedia('NLP_RAG(kateking001130@ewhain.net)',
                                           language,
                                           extract_format=wikipediaapi.ExtractFormat.WIKI)
        self.model = ChatUpstage(api_key=os.getenv("UPSTAGE_API_KEY"))
        self.embedding_model = UpstageEmbeddings(api_key=os.getenv("UPSTAGE_API_KEY"),
                                                 model="solar-embedding-1-large")
        self.semantic_chunker = SemanticChunker(
            embeddings=self.embedding_model,
            breakpoint_threshold_type="interquartile",
            breakpoint_threshold_amount=0.5
        )
        self.vector_store = None
        self.top_k = top_k

    def extract_keywords_with_model(self, question):
        prompt_template = PromptTemplate(
            input_variables=["question"],
            template="""
            You are an AI model designed to extract up to 6 key English keywords from questions.
            Question: "{question}"
            Response: Provide only the key English keywords, separated by commas.
            """
        )

        chain = LLMChain(
            llm=self.model,
            prompt=prompt_template
        )

        response = chain.run({"question": question})
        keywords = response.strip().split(",")
        return [keyword.strip() for keyword in keywords]

    def search_wikipedia(self, keywords):
        documents = []
        for keyword in keywords:
            page = self.wiki.page(keyword)
            if page.exists():
                documents.append(Document(
                    page_content=page.text,  # Full text including sections
                    metadata={"Title": keyword, "URL": page.fullurl}
                ))
        return documents

    def create_vector_store(self, documents):
        chunked_documents = self.semantic_chunker.split_documents(documents)
        self.vector_store = FAISS.from_documents(chunked_documents, self.embedding_model)

    def retrieve_context(self, question, top_k=3):
        if not self.vector_store:
            return "Vector store is not initialized. Please create it first."

        query_embedding = self.embedding_model.embed_query(question)
        results = self.vector_store.similarity_search_by_vector(query_embedding, k=self.top_k)
        return "\n\n".join([doc.page_content for doc in results])

    def retrieve(self, query):
        keywords = self.extract_keywords_with_model(query)
        if not keywords:
            return []

        documents = self.search_wikipedia(keywords)
        if not documents:
            return []

        self.create_vector_store(documents)
        return self.retrieve_context(query)

    def as_retriever(self):
        return self.retrieve

/content/drive/MyDrive/Colab Notebooks/NLP/experiments/module


In [9]:
retriever = WikipediaRetriever(language='en')

question = "What are foreign judgments in international law?"

keywords = retriever.extract_keywords_with_model(question)
print(keywords,end="\n=============\n")

documents = retriever.search_wikipedia(keywords)
print(documents,end="\n=============\n")

retriever.create_vector_store(documents)
print("Vector store created.")

context = retriever.retrieve_context(question)
print("Retrieved Context:\n", context)


  chain = LLMChain(
  response = chain.run({"question": question})


['foreign judgments', 'international law']
[Document(metadata={'Title': 'foreign judgments'}, page_content='In law, the enforcement of foreign judgments is the recognition and enforcement in one jurisdiction of judgments rendered in another ("foreign") jurisdiction. Foreign judgments may be recognized based on bilateral or multilateral treaties or understandings, or unilaterally without an express international agreement.\n\nDefinition of terms\nThe "recognition" of a foreign judgment occurs when the court of one country or jurisdiction accepts a judicial decision made by the courts of another "foreign" country or jurisdiction, and issues a judgment in substantially identical terms without rehearing the substance of the original lawsuit.\nIn English law, there is a clear distinction between recognition of foreign judgments, and enforcement of foreign judgments. Recognition means treating the claim as having been determined in favour of one of the litigating parties. This is an acknowle

In [10]:
retriever = WikipediaRetriever(language='en')

question = """QUESTION40)____________ refers to a strategic process involving stakeholder assessment to create long-term relationships with customers, while maintaining, supporting, and enhancing the natural environment.
(A) Greenwashing(B) Eco-branding(C) Recycle and reprocess management(D) Sustainable Development(E) Environmental Stewardship(F) Superfund reauthorization(G) Eco-strategy(H) Environmental Impact Assessment(I) Green Marketing(J) Eco-efficient Operations"""

keywords = retriever.extract_keywords_with_model(question)
print(keywords,end="\n=============\n")

documents = retriever.search_wikipedia(keywords)
print(documents,end="\n=============\n")

retriever.create_vector_store(documents)
print("Vector store created.")

context = retriever.retrieve_context(question)
print("Retrieved Context:\n", context)


['Sustainable Development', 'Environmental Stewardship', 'Eco-strategy', 'Green Marketing', 'Eco-efficient Operations']
[Document(metadata={'Title': 'Sustainable Development'}, page_content='Sustainable development is an approach to growth and human development that aims to meet the needs of the present without compromising the ability of future generations to meet their own needs. The aim is to have a society where living conditions and resources meet human needs without undermining planetary integrity. Sustainable development aims to balance the needs of the economy, environment, and social well-being. The Brundtland Report in 1987 helped to make the concept of sustainable development better known. \nSustainable development overlaps with the idea of sustainability which is a normative concept. UNESCO formulated a distinction between the two concepts as follows: "Sustainability is often thought of as a long-term goal (i.e. a more sustainable world), while sustainable development refer

# delete

In [None]:
# STEP 1. KB 구축
# step 1.1 ewha.pdf에서 pdf parsing
import requests
from langchain.schema import Document
from bs4 import BeautifulSoup

def extract_text_or_table(pdf_path):
    api_key = "up_zsOzpjQ8Ow7NFmiWQPTh2x7P4Y4MQ"
    url = "https://api.upstage.ai/v1/document-ai/document-parse"
    headers = {"Authorization": f"Bearer {api_key}"}
    documents = []

    with open(pdf_path, "rb") as file:
        response = requests.post(url, headers=headers, files={"document": file})

    if response.status_code == 200:
        data = response.json()
        html_content = data.get("content", {}).get("html", "")
        if not html_content:
            print("Error: No HTML content found in API response.")
            return []

        soup = BeautifulSoup(html_content, "html.parser")

        categories = {
            "table": "table",
            "figure": "figure",
            "chart": "img[data-category='chart']",
            "heading1": "h1",
            "header": "header",
            "footer": "footer",
            "caption": "caption",
            "paragraph": "p[data-category='paragraph']",
            "equation": "p[data-category='equation']",
            "list": "p[data-category='list']",
            "index": "p[data-category='index']",
            "footnote": "p[data-category='footnote']"
        }

        for category, selector in categories.items():
            elements = soup.select(selector)
            for element in elements:
                content = element.get_text(strip=True)
                metadata = {"category": category, "html": str(element)}
                documents.append(Document(page_content=content, metadata=metadata))

        if not documents:
            print("No sections were extracted.")
        return documents
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return []

# step 1.2 pdf parsing한 것을 cleaning text
import re

def clean_extracted_text(text):
    # 문장 중간의 줄바꿈 제거
    cleaned_text = re.sub(r'(?<=[a-z,])\n(?=[a-z])', ' ', text)
    # 문장 끝 줄바꿈 유지
    cleaned_text = re.sub(r'(?<=[.?!])\s*\n', '\n', cleaned_text)

    return cleaned_text

# step 1.3 추가 KB, KB 통합
import csv

def load_problem_data(csv_path):
    problems = []
    with open(csv_path, "r", encoding="utf-8") as file:
        reader = csv.reader(file)
        for row in reader:
            # 모든 데이터를 하나의 context 필드로 통합
            context = " ".join(row)
            problems.append({"type": "problem", "content": context})
    return problems

def combine_kb(documents, problems):
    combined_kb = [{"type": "pdf", "content": doc} for doc in documents]
    combined_kb += [{"type": "problem", "content": problem["content"]} for problem in problems]
    return combined_kb

def ensure_text_format(kb):
    if isinstance(kb, list):
        processed_kb = []
        for item in kb:
            if isinstance(item, dict):
                if "content" in item:
                    content = item["content"]
                    if isinstance(content, dict) and "page_content" in content:
                        processed_kb.append([content["page_content"]])
                    elif isinstance(content, str):
                        processed_kb.append([content])
            elif isinstance(item, str):
                processed_kb.append([item])
        return processed_kb
    else:
        raise ValueError("KB should be a list.")

# step 1.4 전체 KB 임베딩하고 <Embedding 모델> 검색할 수 있게 <VectorStore 모델> 함 - 토큰 너무 많아지는 것을 방지하기 위해 split
import numpy as np
from openai import OpenAI
from langchain.vectorstores import FAISS
from langchain.schema import Document

def split_text(text, max_length=1000):
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

def preprocess_texts(texts):
    cleaned_texts = []
    for text in texts:
        if not isinstance(text, str):
            text = str(text)
        if len(text.strip()) > 0:
            cleaned_texts.append(text.strip())
    return cleaned_texts

class UpstageEmbeddings:
    def __init__(self, client, model="embedding-query"):
        self.client = client
        self.model = model

    def embed_documents(self, texts):
        embeddings = []
        for text in texts:
            try:
                response = self.client.embeddings.create(
                    model=self.model,
                    input=text
                )
                embeddings.append(response.data[0].embedding)
            except Exception as e:
                print(f"Error generating embedding for text: {text[:50]}... \n{e}")
        return embeddings

    def embed_query(self, text):
        try:
            response = self.client.embeddings.create(
                model=self.model,
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding for query: {text[:50]}... \n{e}")
            return None

def create_vector_store(kb_list):
    documents = []
    for entry in kb_list:
        for text in entry:
            processed_texts = preprocess_texts([text])
            for processed_text in processed_texts:
                chunks = split_text(processed_text, max_length=1000)
                documents.extend([Document(page_content=chunk) for chunk in chunks])

    embeddings = UpstageEmbeddings(client=client)

    vector_store = FAISS.from_documents(documents, embeddings)
    return vector_store

# STEP 2. 모델링 <llm 모델> <QA 방식> - 언어 모델, 모델링 방식, 프롬프팅 준비
from openai import OpenAI

def create_qa_chain(vector_store, model, prompt_template):
    client = OpenAI(
        api_key="up_zsOzpjQ8Ow7NFmiWQPTh2x7P4Y4MQ",
        base_url="https://api.upstage.ai/v1/solar"
    )

    retriever = {"embedding": vector_store}

    def qa_model(prompt):
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content

    return {"llm": qa_model, "retriever": retriever, "prompt": prompt_template}

# STEP 3. samples에서 질문과 정답 가져오기
import pandas as pd

def read_data(csv_path):
    data = pd.read_csv(csv_path)
    return data['prompts'].tolist(), data['answers'].tolist()

import re

def extract_answer(text):
    match = re.search(r"\([A-Z]\)", text)
    return match.group(0) if match else "N/A"

# STEP 4. 답변 생성
import wikipediaapi
import re

wiki_wiki_ko = wikipediaapi.Wikipedia(language='ko', user_agent='NLP_Team/1.0 (ewhanthbeot@ewhain.net)')
wiki_wiki_en = wikipediaapi.Wikipedia(language='en', user_agent='NLP_Team/1.0 (ewhanthbeot@ewhain.net)')

# Retrieve에서 관련성 높은 것만 필터링.
def filter_context_by_relevance(question, context, threshold=0.5):
    question_embedding = client.embeddings.create(
        model="embedding-query",
        input=question
    ).data[0].embedding

    context_embedding = client.embeddings.create(
        model="embedding-query",
        input=context
    ).data[0].embedding

    # Compute similarity (cosine similarity)
    similarity = np.dot(question_embedding, context_embedding) / (
        np.linalg.norm(question_embedding) * np.linalg.norm(context_embedding)
    )

    if similarity >= threshold:
        return context
    else:
        return None

# KB에서 검색. 관련성 높은 것만 필터링. (검색 시 질문에서 필요없는 것을 삭제하면 오히려 성능이 낮아짐.)
def retrieve_context(question, vector_store):
    try:
        question_embedding = client.embeddings.create(
            model="embedding-query",
            input=question
        ).data[0].embedding

        results = vector_store.similarity_search_by_vector(question_embedding, k=3)
        return "\n\n".join([doc.page_content for doc in results])
    except Exception as e:
        print(f"Error during retrieval: {e}")
        return "No relevant context found."

def retrieve_context_with_filter(question, vector_store):
    raw_context = retrieve_context(question, vector_store)
    if not raw_context.strip():
        return "No relevant context found."

    filtered_context = filter_context_by_relevance(question, raw_context)
    if not filtered_context:
        print("검색된 문서가 질문과 관련이 없어 제외되었습니다.")
        return "No relevant context found."

    return filtered_context

# Wiki에서 검색. 관련성 높은 것만 필터링. 검색 시 질문에서 필요없는 건 삭제.
def extract_high_quality_keywords(query):
    keywords = query.split()
    stopwords = {"question", "which", "of", "the", "following", "describes", "a", "key", "in", "that", "is", "to", "as", "early", "as"}
    high_quality_keywords = [word.strip().lower() for word in keywords if word.lower() not in stopwords and len(word) > 3]
    return high_quality_keywords

def fetch_from_wikipedia(query, language, max_keywords=5):
    wiki_api = wiki_wiki_ko if language == 'ko' else wiki_wiki_en
    high_quality_keywords = extract_high_quality_keywords(query)
    documents = []

    for keyword in high_quality_keywords[:max_keywords]:
        print(f"Searching Wikipedia for keyword: {keyword}")
        try:
            search_results = wiki_api.search(keyword, results=1)
            if search_results:
                page = wiki_api.page(search_results[0])
                if page.exists():
                    summary = page.summary[:500]  # 요약을 500자로 제한
                    # `Document` 객체로 변환
                    documents.append(Document(page_content=summary, metadata={"source": "Wikipedia", "title": page.title}))
        except Exception as e:
            print(f"Error fetching Wikipedia page for keyword '{keyword}': {e}")

    if documents:
        return documents  # `Document` 객체 리스트 반환
    else:
        print("No relevant Wikipedia page found.")
        return []

def add_to_vector_store(documents, vector_store):
    for doc in documents:
        vector_store.add_documents([doc])

# llm 호출. 언어 감지 후 호출.
def detect_language(text):
    if re.search(r'[가-힣]', text):
        return 'ko'
    elif re.search(r'[a-zA-Z]', text):
        return 'en'
    else:
        return 'unknown'

def run_llm_with_retry(question, combined_context, qa_chain, max_retries=5):
    language = detect_language(question)
    keywords = extract_high_quality_keywords(question)  # Only high-quality keywords
    retries = 0

    while retries < max_retries:
        prompt = qa_chain["prompt"].format(context=combined_context, question=question)
        result = qa_chain["llm"](prompt)
        predicted_answer = extract_answer(result)

        if predicted_answer != "N/A":
            return predicted_answer

        print(f"답변이 N/A로 표시됨: 다시 확인 중...")

        if keywords:
            keyword = keywords.pop(0)
            wiki_context = fetch_from_wikipedia(keyword, language)
            combined_context = f"PDF Context:\n{combined_context}\n\nWikipedia Context:\n{wiki_context or 'N/A'}"
        else:
            break

        retries += 1

    print("최대 재시도 횟수에 도달했습니다.")
    return "N/A"

def normalize_answer(answer):
    if answer.startswith("(") and answer.endswith(")"):
        return answer[1:-1]
    return answer.strip()