# 0. 패키지 설치 및 환경설정, 모듈 로드

In [None]:
# Mount Google Drive at /content/drive so later cells can use paths under it.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
%pip install "unstructured[pdf]"

In [None]:
!apt-get install -y poppler-utils

In [None]:
%pip install huggingface_hub[hf_xet]

In [None]:
!pip install langchain_huggingface

In [None]:
%pip install -U langchain-community

In [None]:
!pip install faiss-cpu

In [None]:
from unstructured.partition.pdf import partition_pdf
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import numpy as np
import pickle
import os

In [None]:
import os
from getpass import getpass

# SECURITY: never hard-code an API key in the notebook. The key that was
# previously embedded in this cell has been exposed and should be revoked.
# Reuse a key that is already present in the environment; otherwise prompt
# for it interactively without echoing it to the output.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OPENAI_API_KEY: ')

In [None]:
import os
from PIL import Image
from langchain.schema import Document
from langchain.vectorstores import FAISS
from transformers import CLIPProcessor, CLIPModel
from transformers import AutoTokenizer, AutoModel
import torch

# 1. PDF 업로드 및 이미지 저장

In [None]:
# Source PDF on Google Drive.
pdf_path = "/content/drive/MyDrive/EBS 2026학년도 수능특강 사회탐구영역 생활과 윤리(교사용).pdf"

# Directory that receives the images extracted from the PDF.
image_output_dir = "/content/drive/MyDrive/output_images"
os.makedirs(image_output_dir, exist_ok=True)

# Partition the PDF, requesting image extraction, table-structure inference
# and title-based chunking.
elements = partition_pdf(
    filename=pdf_path,
    # image / table options
    extract_images_in_pdf=True,
    pdf_image_output_dir_path=image_output_dir,
    infer_table_structure=True,
    # chunking options
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
)

# Preview the first five elements as a sanity check.
for elem in elements[:5]:
    print(type(elem), "\n", elem, "\n---")

In [None]:
# Save the partitioned elements to Google Drive, at the same path the load
# cell reads from, so the cache can actually be reloaded in a later session.
with open("/content/drive/MyDrive/elements.pkl", "wb") as f:
    pickle.dump(elements, f)

In [None]:
# Reload the cached elements from Google Drive.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open("/content/drive/MyDrive/elements.pkl", mode="rb") as pkl_file:
    elements = pickle.load(pkl_file)

In [None]:
# Disabled snippet, kept as a bare string literal so that it does not execute.
# NOTE(review): the label inside the string says "save as Excel file", but the
# code zips the folder '/content/figures' into '/content/figures.zip' with
# shutil.make_archive; no Excel file is involved.
'''
엑셀 파일로 저장

import shutil

folder_path = '/content/figures'
output_zip = '/content/figures.zip'

shutil.make_archive(output_zip.replace('.zip', ''), 'zip', folder_path)
'''

# 2. 실습 실행

In [None]:
# 1. Collect the paths of the image files (.png / .jpg / .jpeg) in the output folder.
image_paths = [
    os.path.join(image_output_dir, name)
    for name in os.listdir(image_output_dir)
    if name.lower().endswith((".png", ".jpg", ".jpeg"))
]

In [None]:
# 2. Keep the text of every element whose text is neither empty nor whitespace-only.
texts = [element.text for element in elements if element.text and element.text.strip()]

In [None]:
# 3. Text embedding model: BGE-M3 (BAAI/bge-m3), used because the textbook
#    passages are long.
bge_model = AutoModel.from_pretrained("BAAI/bge-m3")
bge_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")

# Image / text embedding model: CLIP.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
def embed_text_bge(text):
    """Embed ``text`` with the BGE-M3 model.

    The text is tokenized (truncated to at most 8192 tokens), run through
    ``bge_model`` without gradient tracking, and the last hidden state at
    token position 0 of the first sequence is returned as a NumPy array.
    """
    encoded = bge_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=8192,
    )
    with torch.no_grad():
        hidden_states = bge_model(**encoded).last_hidden_state
    first_token_state = hidden_states[:, 0]
    return first_token_state.cpu().numpy()[0]

def embed_image_clip(image_path):
    """Embed the image stored at ``image_path`` with CLIP.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        The first row of ``clip_model.get_image_features`` for the image,
        converted to a NumPy array.
    """
    # Open the file in a context manager so its handle is released
    # deterministically; ``convert`` returns a new image that stays usable
    # after the source file has been closed.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = clip_model.get_image_features(**inputs)
    return outputs[0].cpu().numpy()

In [None]:
# 4. DummyEmbeddingFunction 선언

class DummyEmbeddingFunction:
    """Serve precomputed vectors through an ``embed_documents`` interface.

    The object keeps a cursor into ``vectors``; every ``embed_documents``
    call hands out the next ``len(texts)`` vectors in order, ignoring the
    text contents. Query embedding is not supported.
    """

    def __init__(self, vectors):
        self.vectors = vectors
        self.i = 0  # index of the next vector to hand out

    def embed_documents(self, texts):
        """Return the next ``len(texts)`` precomputed vectors.

        Raises:
            ValueError: If fewer than ``len(texts)`` vectors remain. Returning
                a short slice would silently pair documents with the wrong
                (or no) vectors, so the mismatch is reported instead.
        """
        start = self.i
        stop = start + len(texts)
        if stop > len(self.vectors):
            raise ValueError(
                f"Requested {len(texts)} vectors but only "
                f"{len(self.vectors) - start} precomputed vectors remain."
            )
        self.i = stop
        return self.vectors[start:stop]

    def embed_query(self, text):
        """Always raise: only precomputed document vectors are available."""
        raise NotImplementedError("Query embedding은 지원하지 않습니다.")

In [None]:
# 5. Build the documents and their embedding matrices.

# Text: one Document per extracted text, embedded with BGE-M3.
text_docs = [Document(page_content=passage) for passage in texts]
text_vectors = np.array(list(map(embed_text_bge, texts)), dtype="float32")

# Images: one placeholder Document per image path, embedded with CLIP.
image_docs = [
    Document(page_content="Image", metadata={"image_path": img_path})
    for img_path in image_paths
]
image_vectors = np.array(
    [embed_image_clip(doc.metadata["image_path"]) for doc in image_docs],
    dtype="float32",
)

In [None]:
# Disabled snippet, kept as a string literal so that it does not execute.
# It pickles the image and text documents together with their vectors.
# Fixed inside the snippet: the text embeddings were written through an
# undefined name (`save_path`); they now use `txt_save_path`.
'''
img_save_path = "/content/drive/MyDrive/img_text_embeddings.pkl"  # 원하는 경로로 지정

with open(img_save_path, "wb") as f:
    pickle.dump({
        "documents": image_docs,
        "vectors": image_vectors,
    }, f)

print(f"✅ 저장 완료: {img_save_path}")


txt_save_path = "/content/drive/MyDrive/text_embeddings.pkl"  # 원하는 경로로 지정

with open(txt_save_path, "wb") as f:
    pickle.dump({
        "documents": text_docs,
        "vectors": text_vectors,
    }, f)

print(f"✅ 저장 완료: {txt_save_path}")
'''

In [None]:
# 6. Build the FAISS vector stores from the precomputed embeddings.

# Text vector store
text_embedding_fn = DummyEmbeddingFunction(text_vectors)
vectorstore_text = FAISS.from_documents(text_docs, text_embedding_fn)

# Image vector store
image_embedding_fn = DummyEmbeddingFunction(image_vectors)
vectorstore_image = FAISS.from_documents(image_docs, image_embedding_fn)

In [None]:
# Disabled snippet, kept as a string literal so that it does not execute.
# It saves both FAISS stores to Google Drive and loads them back.
# Fixed inside the snippet:
#   * the text and image stores were saved into each other's folders;
#   * FAISS.load_local needs allow_dangerous_deserialization=True in recent
#     langchain-community releases because the index metadata is pickled
#     (only load folders you created yourself).
'''
# 벡터스토어 저장
vectorstore_text.save_local("/content/drive/MyDrive/vectorstore_text")
vectorstore_image.save_local("/content/drive/MyDrive/vectorstore_image")

from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.base import Embeddings

# 벡터스토어 로드
class DummyEmbedding(Embeddings):
    def embed_documents(self, texts): return []
    def embed_query(self, text): return []

vectorstore_text = FAISS.load_local(
    folder_path="/content/drive/MyDrive/vectorstore_text",
    embeddings=DummyEmbedding(),
    allow_dangerous_deserialization=True,
)

vectorstore_image = FAISS.load_local(
    folder_path="/content/drive/MyDrive/vectorstore_image",
    embeddings=DummyEmbedding(),
    allow_dangerous_deserialization=True,
)
'''

In [None]:
# 7. unified_search

def unified_search(query, top_k=3):
    """Search the text and image vector stores with a single query string.

    Args:
        query: Query text.
        top_k: Number of results to fetch from each store.

    Returns:
        A dict with ``"text_results"`` (matches from ``vectorstore_text``,
        searched with the BGE-M3 embedding of the query) and
        ``"image_results"`` (matches from ``vectorstore_image``, searched
        with the CLIP text embedding of the query).
    """
    # Text search: BGE-M3 embedding of the query.
    bge_vector = embed_text_bge(query)
    text_results = vectorstore_text.similarity_search_by_vector(bge_vector, k=top_k)

    # Image search: CLIP text embedding of the query. truncation=True keeps
    # long queries within the CLIP text encoder's maximum input length
    # instead of failing inside the model.
    clip_inputs = clip_processor(
        text=[query], return_tensors="pt", padding=True, truncation=True
    )
    with torch.no_grad():
        clip_vector = clip_model.get_text_features(**clip_inputs)[0].cpu().numpy()
    image_results = vectorstore_image.similarity_search_by_vector(clip_vector, k=top_k)

    return {
        "text_results": text_results,
        "image_results": image_results
    }

# 3. 활용

In [None]:
# Example: run a unified search and show a 100-character preview of each text hit.
query = "행복에 대한 윤리적 관점"
results = unified_search(query)

print("📘 텍스트 검색 결과:")
for text_hit in results["text_results"]:
    preview = text_hit.page_content[:100]
    print("-", preview)

print("\n🖼 이미지 검색 결과:")
for image_hit in results["image_results"]:
    print("-", image_hit.metadata.get("image_path", "경로 없음"))

In [None]:
# Example: fetch the top 5 hits per store and show up to 1000 characters of each text hit.
query = "석가모니의 주장"
results = unified_search(query, top_k=5)

print("📘 텍스트 검색 결과:")
for text_hit in results["text_results"]:
    preview = text_hit.page_content[:1000]
    print("-", preview)

print("\n🖼 이미지 검색 결과:")
for image_hit in results["image_results"]:
    print("-", image_hit.metadata.get("image_path", "경로 없음"))

# 4. retriever 활용

In [None]:
# Wrap each vector store as a similarity retriever configured with k=3.
retriever_text = vectorstore_text.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
retriever_image = vectorstore_image.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
from typing import Any, List

from langchain.schema import BaseRetriever, Document
from pydantic import PrivateAttr
import torch

class UnifiedMultiModalRetriever(BaseRetriever):
    """Retriever that merges text hits and image hits for one query.

    The query is embedded twice: with BGE-M3 for the text store and with the
    CLIP text encoder for the image store. Each store is searched by vector
    and the text results are returned first, followed by the image results.

    Args:
        retriever_text: Vector-store retriever over the text documents.
        retriever_image: Vector-store retriever over the image documents.
            Both are expected to expose ``vectorstore`` and ``search_kwargs``
            (as the retrievers returned by ``as_retriever`` do).
    """

    # `Any` must be imported from typing: these annotations are evaluated
    # when the class body runs.
    _retriever_text: Any = PrivateAttr()
    _retriever_image: Any = PrivateAttr()

    def __init__(self, retriever_text, retriever_image):
        super().__init__()
        self._retriever_text = retriever_text
        self._retriever_image = retriever_image

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Return text matches followed by image matches for ``query``."""
        # Search the stores behind the injected retrievers (rather than
        # module-level globals), honouring each retriever's configured k.
        text_k = self._retriever_text.search_kwargs.get("k", 3)
        image_k = self._retriever_image.search_kwargs.get("k", 3)

        # Text embedding -> text search
        bge_vector = embed_text_bge(query)
        text_results = self._retriever_text.vectorstore.similarity_search_by_vector(
            bge_vector, k=text_k
        )

        # CLIP text embedding -> image search. truncation=True keeps long
        # queries within the CLIP text encoder's maximum input length.
        clip_inputs = clip_processor(
            text=[query], return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            clip_vector = clip_model.get_text_features(**clip_inputs)[0].cpu().numpy()
        image_results = self._retriever_image.vectorstore.similarity_search_by_vector(
            clip_vector, k=image_k
        )

        return text_results + image_results

In [None]:
# Build the unified retriever and run one example query through it.
retriever = UnifiedMultiModalRetriever(retriever_text, retriever_image)

# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated.
results = retriever.invoke("프롬의 주장은 뭘까?")

# Image documents carry an "image_path" in their metadata; everything else
# is treated as a text document.
for doc in results:
    if "image_path" in doc.metadata:
        print("🖼 이미지:", doc.metadata["image_path"])
    else:
        print("📘 텍스트:", doc.page_content[:1000])