In [None]:
import os
import sys
from pathlib import Path

# Add the project root to the Python path
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root))

from dotenv import load_dotenv
load_dotenv()
from langchain_upstage import UpstageEmbeddings
embeddings = UpstageEmbeddings(model="embedding-passage")

DB_URI = os.environ["POSTGRES_URI"]     

from langchain_postgres import PGVector
from langchain_core.documents import Document
vector_store = PGVector(
    embeddings=embeddings,
    collection_name="SPRI_ALL",
    connection=DB_URI
)

In [None]:
import pickle
papers = []
for path in sorted(Path(project_root).joinpath("outputs").glob("SPRI_*_split_documents.pkl")):
    
    with open(path, "rb") as f:
        split_documents = pickle.load(f)
        for doc in split_documents:
            name = path.stem.replace("_output_split_documents", "")
            doc.metadata["title"] = name
           
            if name in ["SPRI_2022", "SPRI_2023", "SPRI_2025"]:
                doc.metadata["category"] = "AI-index"
                if name == "SPRI_2022":
                    doc.metadata["tag"] = ["AI인덱스", "AI트렌드", "AI윤리", "AI인재"]
                elif name == "SPRI_2023":
                    doc.metadata["tag"] = ["AI인덱스", "AI트렌드", "AI규제", "AI인재"]
                elif name == "SPRI_2025":
                    doc.metadata["tag"] = ["AI인덱스", "AI트렌드", "AI스타트업", "정부정책"]
            elif "SPRI_is" in name:
                doc.metadata["category"] = "issue-report"
                if name == "SPRI_is_211":
                    doc.metadata["tag"] = ["AI스타트업", "기술사업화", "기업가정신", "AI인재"]
                elif name == "SPRI_is_212":
                    doc.metadata["tag"] = ["SW안전", "기능안전", "오토파일럿"]
                elif name == "SPRI_is_213":
                    doc.metadata["tag"] = ["미래기술", "산업전망", "AI스타트업", "AI인재"]
            elif "SPRI_c" in name:
                doc.metadata["category"] = "column"
                if name == "SPRI_c_1":
                    doc.metadata["tag"] = ["Govtech", "정부혁신", "디지털정부", "제도주의"]
                elif name == "SPRI_c_2":
                    doc.metadata["tag"] = ["개방형혁신", "SaaS", "AI인재"]
                elif name == "SPRI_c_3":
                    doc.metadata["tag"] = ["산업전망", "기업가정신", "AI윤리", "AI인재"]
                
                    
    papers.append(split_documents)


In [61]:
for paper in papers:
    vector_store.add_documents(paper)

In [27]:
retriever = vector_store.as_retriever(search_kwargs={"k": 10, "filter": 
{"$and": [
            {"tag": {"$eq": ["AI인재"]}},
            {"tag": {"$ne": ["AI트렌드"]}}
        ]}
})

In [28]:
retriever.invoke("AI 인덱스")

[Document(id='250c30ca-09f7-4d71-b8f3-cf9322068842', metadata={'id': 84, 'tag': ['AI인덱스', 'AI트렌드', 'AI윤리', 'AI인재'], 'page': 42, 'title': 'SPRI_2022', 'category': 'AI-index', 'image_id': [], 'image_path': [], 'text_summary': [], 'image_summary': []}, page_content='SPRi 이슈리포트 IS-139\n\n\n  \nAI Index 2022의 주요 내용 및 시사점\n\n참고문헌\n====\n\n국외문헌\n====\n\n  \n\n§ Daniel Zhang, Nestor Maslej, Erik Brynjolfsson, John Etchemendy, Terah  \nLyons, James Manyika, Helen Ngo, Juan Carlos Niebles, Michael Sellitto, Ellie  \nSakhaee, Yoav Shoham, Jack Clark, and Raymond Perrault, “The AI Index  \n2022 Annual Report,” AI Index Steering Committee, Stanford Institute for  \nHuman-Centered AI, Stanford University, March 2022\n\n37'),
 Document(id='fc0a9c91-3106-4ae6-a450-7412e143973e', metadata={'id': 46, 'tag': ['AI인덱스', 'AI트렌드', 'AI규제', 'AI인재'], 'page': 27, 'title': 'SPRI_2023', 'category': 'AI-index', 'image_id': [322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 335, 336, 337, 338, 339], 'image