In [36]:
import os
import warnings
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import numpy as np
import argparse

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# API keys: keep any value that is already present in the environment so a real
# key is never overwritten by the placeholder below.
# NOTE: because setdefault is used, editing a key here and re-running the cell
# does not replace a value that is already set in the running kernel.
os.environ.setdefault('OPENAI_API_KEY', '****')
os.environ.setdefault('LANGCHAIN_API_KEY', '****')

# Tracing configuration (not secret), always applied.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_PROJECT'] = 'task'

# PDF loading function
def load_pdf(file_path):
    """Load a PDF and return the text content of each loaded document.

    Args:
        file_path: Path handed to ``PyPDFLoader``.

    Returns:
        A list holding the ``page_content`` of every document the loader
        produced, or ``None`` if anything fails (the error is printed rather
        than raised).
    """
    try:
        page_texts = []
        for page in PyPDFLoader(file_path).load():
            page_texts.append(page.page_content)
        return page_texts
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"PDF 로드 중 오류 발생: {e}")
        return None

# Text splitter function
def split_text(documents, splitter_type="recursive", chunk_size=500, chunk_overlap=0):
    """Split raw texts into chunks and return the chunk strings.

    Args:
        documents: List of strings passed to the splitter's ``create_documents``.
        splitter_type: ``"character"`` for ``CharacterTextSplitter`` or
            ``"recursive"`` for ``RecursiveCharacterTextSplitter``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        A list with the ``page_content`` of every chunk, or ``None`` if an
        error occurs (including an unsupported ``splitter_type``); the error
        is printed rather than raised.
    """
    try:
        # Reject unknown splitter types up front; the error is reported by the
        # handler below like any other failure.
        if splitter_type not in ("character", "recursive"):
            raise ValueError("지원되지 않는 Splitter 타입입니다.")

        splitter_cls = (
            CharacterTextSplitter
            if splitter_type == "character"
            else RecursiveCharacterTextSplitter
        )
        splitter = splitter_cls(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        chunks = splitter.create_documents(documents)
        return [chunk.page_content for chunk in chunks]
    except Exception as e:
        print(f"텍스트 분리 중 오류 발생: {e}")
        return None

# Embedding
def initialize_embedding(model_name="intfloat/multilingual-e5-large-instruct", device="cpu"):
    """Create a ``HuggingFaceEmbeddings`` instance.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
        device: Device string placed in ``model_kwargs`` (default ``"cpu"``,
            which matches the previous hard-coded value).

    Returns:
        The embeddings object, or ``None`` if construction fails (the error is
        printed rather than raised).
    """
    try:
        hf_embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={"device": device},
            # Ask the encoder for normalized embeddings.
            encode_kwargs={"normalize_embeddings": True},
        )
        return hf_embeddings
    except Exception as e:
        print(f"Embedding 초기화 중 오류 발생: {e}")
        return None

# Search_query
def search_query(query, texts, embeddings, top_k=5):
    """Rank ``texts`` by cosine similarity to ``query`` and print the best matches.

    Args:
        query: Query string passed to ``embeddings.embed_query``.
        texts: List of text chunks passed to ``embeddings.embed_documents``.
        embeddings: Object exposing ``embed_query`` and ``embed_documents``.
        top_k: How many of the best matches to print (default 5). The returned
            index array always covers every text.

    Returns:
        A NumPy array of indices into ``texts`` ordered from most to least
        similar, or ``None`` if an error occurs (the error is printed rather
        than raised).
    """
    try:
        query_vec = np.asarray(embeddings.embed_query(query), dtype=float)
        doc_matrix = np.asarray(embeddings.embed_documents(texts), dtype=float)

        if query_vec.size == 0 or doc_matrix.size == 0:
            raise ValueError("Embedding 결과가 비어 있습니다.")

        # Cosine similarity: divide the dot product by both vector norms so the
        # ranking is correct even when the vectors are not unit-length.
        # A zero norm is replaced by 1 to avoid division by zero; the score of
        # such a vector stays 0.
        query_norm = np.linalg.norm(query_vec) or 1.0
        doc_norms = np.linalg.norm(doc_matrix, axis=1)
        doc_norms[doc_norms == 0] = 1.0
        scores = (doc_matrix @ query_vec) / (doc_norms * query_norm)
        sorted_idx = scores.argsort()[::-1]  # highest score first

        print(f"[Query] {query}\n{'='*40}")
        for i, idx in enumerate(sorted_idx[:top_k]):  # print the top matches
            print(f"[{i}] (유사도: {scores[idx]:.4f}) {texts[idx]}")
        return sorted_idx
    except Exception as e:
        print(f"검색 중 오류 발생: {e}")
        return None



In [None]:
# Main
if __name__ == "__main__":
    FILE_PATH = "C:/Users/kowm6/Desktop/test.pdf"  # PDF path entered directly (machine-specific)
    model_name = "intfloat/multilingual-e5-large-instruct"

    # Load the PDF; load_pdf returns None on failure.
    documents = load_pdf(FILE_PATH)

    if documents:
        # Split the text (no separator argument; only chunk_size and chunk_overlap are used).
        texts = split_text(documents, splitter_type="recursive", chunk_size=500, chunk_overlap=0)
        if texts:
            hf_embeddings = initialize_embedding(model_name)
            if hf_embeddings:
                # The question is a plain single-line string so that no leading
                # newline or indentation is embedded or echoed in the output.
                search_query(
                    "1. AITP was designed to address which limitations of existing instruction-tuning datasets?",
                    texts,
                    hf_embeddings,
                )

[Query] 
                    what is this?
                    
[0] (유사도: 0.7738) Rewritten SetRewriting
Supervised 
Fine-TuningRaw Text
Text: # Confidence Interval calculation for 
Power Density Estimation in MATLAB
First of all, I am new to these statistics stuff but 
very interested in the background. I try to……
Original SFT Dataset
Figure 2: The pipeline of AITP. AITP first generates a difference set, then rewrites the raw text into instruction-response
pairs to form a rewritten set, and finally combines the rewritten set with the original SFT dataset for model training.
[1] (유사도: 0.7693) comparing their distribution to that of the pre-training cor-
pus. Underrepresented data is then rewritten into high-
quality instruction-response pairs, enhancing dataset cover-
age and alignment. As shown in Figure 2, AITP involves
three stages: (1) generating a difference set based on density
comparisons, (2) rewriting raw text into instruction-response
1arXiv:2501.09368v2  [cs.AI]  17 Jan 2025

#