In [1]:
import sys
from pathlib import Path
# Project root: the parent of the directory the notebook is running in.
root = Path().resolve().parent  # adjust level as needed
# Prepend the root to the import path only once, so re-running this cell does
# not pile up duplicate sys.path entries.
if str(root) not in sys.path:
    sys.path.insert(0, str(root))

In [9]:
import pickle
# Tags attached to every chunk of a report, keyed by the report name
# (the pickle file stem with the "_output_split_documents" suffix removed).
TAGS_BY_NAME = {
    "SPRI_2022": ["AI인덱스", "AI트렌드", "AI윤리", "AI인재"],
    "SPRI_2023": ["AI인덱스", "AI트렌드", "AI규제", "AI인재"],
    "SPRI_2025": ["AI인덱스", "AI트렌드", "AI스타트업", "정부정책"],
    "SPRI_is_211": ["AI스타트업", "기술사업화", "기업가정신", "AI인재"],
    "SPRI_is_212": ["SW안전", "기능안전", "오토파일럿"],
    "SPRI_is_213": ["미래기술", "산업전망", "AI스타트업", "AI인재"],
    "SPRI_c_1": ["Govtech", "정부혁신", "디지털정부", "제도주의"],
    "SPRI_c_2": ["개방형혁신", "SaaS", "AI인재"],
    "SPRI_c_3": ["산업전망", "기업가정신", "AI윤리", "AI인재"],
}
# Report names that belong to the "AI-index" category.
AI_INDEX_NAMES = ("SPRI_2022", "SPRI_2023", "SPRI_2025")

# Start from an empty list so re-running the cell does not duplicate documents.
papers = []
for path in sorted(Path(root).joinpath("outputs").glob("SPRI_*_split_documents.pkl")):
    # The report name depends only on the file, so derive it once per file
    # instead of once per chunk.
    name = path.stem.replace("_output_split_documents", "")

    # Category is decided by exact name for the AI-index reports and by
    # substring for issue reports / columns; unknown names get no category.
    if name in AI_INDEX_NAMES:
        category = "AI-index"
    elif "SPRI_is" in name:
        category = "issue-report"
    elif "SPRI_c" in name:
        category = "column"
    else:
        category = None
    tags = TAGS_BY_NAME.get(name)

    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only load pickle files that come from a trusted source.
    with open(path, "rb") as f:
        split_documents = pickle.load(f)

    for doc in split_documents:
        doc.metadata["title"] = name
        if category is not None:
            doc.metadata["category"] = category
        if tags is not None:
            # Copy so that documents never share one mutable tag list.
            doc.metadata["tag"] = list(tags)

    papers.extend(split_documents)

In [None]:
import os
from elasticsearch import Elasticsearch
from langchain_upstage import UpstageEmbeddings
from dotenv import load_dotenv
# Load environment variables from the .env file under elastic-start-local/.
load_dotenv(root.joinpath("elastic-start-local/.env"))

# Both settings are required; a missing one raises KeyError here.
URL, API_KEY = os.environ["ES_LOCAL_URL"], os.environ["ES_LOCAL_API_KEY"]

# Elasticsearch client shared by the cells below.
es = Elasticsearch(URL, api_key=API_KEY)

In [17]:
# Embedding model used to turn text into vectors (Upstage "embedding-passage")
model = UpstageEmbeddings(model="embedding-passage")
# Index name
index_name = "vector_search_demo"
# Length of the vectors stored in the "vector" field. It must equal the length
# of the embeddings returned by `model`.
EMBEDDING_DIMS = 4096

# Create index with vector field mapping
mapping = {
    "mappings": {
        "properties": {
            # Raw chunk text, analyzed for full-text search
            "text": {"type": "text"},
            # Document metadata; keyword fields are matched exactly
            "metadata": {
                "properties": {
                    "page": {"type": "integer"},
                    "image_id": {"type": "integer"},
                    "image_path": {"type": "keyword"},
                    "text_summary": {"type": "text"},
                    "image_summary": {"type": "text"},
                    "id": {"type": "keyword"},
                    "title": {"type": "keyword"},
                    "category": {"type": "keyword"},
                    "tag": {"type": "keyword"}
                }
            },
            # Embedding of the chunk text, indexed for cosine-similarity kNN
            "vector": {
                "type": "dense_vector",
                "dims": EMBEDDING_DIMS,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

# Delete index if exists and create new one.
# NOTE: this is destructive - every run of this cell drops all indexed documents.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, body=mapping)
print(f"Created index: {index_name}")

Created index: vector_search_demo


In [None]:
import uuid
# Index documents with their embeddings
print("\nIndexing documents...")
for i, doc in enumerate(papers, 1):
    print(doc.metadata)

    # Generate embedding
    # NOTE(review): passages are embedded through embed_query here; confirm
    # whether embed_documents is the intended call for this model.
    embedding = model.embed_query(doc.page_content)

    # Build the Elasticsearch payload under its own name so the loop variable
    # `doc` keeps referring to the source document.
    es_document = {
        "text": doc.page_content,
        "metadata": doc.metadata,
        "vector": embedding
    }
    # A random id per document; passed as a string, which is what the
    # document id is on the Elasticsearch side.
    es.index(index=index_name, id=str(uuid.uuid4()), document=es_document)
    print(f"Indexed: doc_id:{i}")


Indexing documents...
{'page': 1, 'image_id': [], 'image_path': [], 'text_summary': [], 'image_summary': [], 'id': 0, 'title': 'SPRI_2022', 'category': 'AI-index', 'tag': ['AI인덱스', 'AI트렌드', 'AI윤리', 'AI인재']}
Indexed: doc_id:1
{'page': 2, 'image_id': [], 'image_path': [], 'text_summary': [], 'image_summary': [], 'id': 1, 'title': 'SPRI_2022', 'category': 'AI-index', 'tag': ['AI인덱스', 'AI트렌드', 'AI윤리', 'AI인재']}
Indexed: doc_id:2
{'page': 3, 'image_id': [], 'image_path': [], 'text_summary': [], 'image_summary': [], 'id': 2, 'title': 'SPRI_2022', 'category': 'AI-index', 'tag': ['AI인덱스', 'AI트렌드', 'AI윤리', 'AI인재']}
Indexed: doc_id:3
{'page': 4, 'image_id': [], 'image_path': [], 'text_summary': [], 'image_summary': [], 'id': 3, 'title': 'SPRI_2022', 'category': 'AI-index', 'tag': ['AI인덱스', 'AI트렌드', 'AI윤리', 'AI인재']}
Indexed: doc_id:4
{'page': 5, 'image_id': [], 'image_path': [], 'text_summary': '다음은 주어진 텍스트의 한국어 요약입니다.\n\n- Stanford의 Human-centered Artificial Intelligence Institute(HAI)가 AI Index

In [28]:

# Hybrid search: Combine vector search with text search
def hybrid_search(query_text, top_k=5):
    """Run a combined full-text + kNN search and return the raw response.

    Relies on the module-level `model` (to embed the query), `es` (the
    Elasticsearch client) and `index_name`.

    Args:
        query_text: Text matched against the "text" field and embedded for
            the kNN clause.
        top_k: Number of nearest neighbours requested from the kNN clause and
            the maximum number of hits returned.

    Returns:
        The response of `es.search`, with "text" and "metadata" as the
        returned source fields.
    """
    query_vector = model.embed_query(query_text)

    # Tag filter shared by the text query and the kNN clause:
    # (tag is 'AI인재' OR 'AI스타트업') AND tag is not 'AI인덱스'.
    common_filters = {
        "bool": {
            "should": [ # or
                {"term": {"metadata.tag": 'AI인재'}},
                {"term": {"metadata.tag": 'AI스타트업'}},
            ],
            # Spell out that at least one `should` term is required instead
            # of depending on the implicit default.
            "minimum_should_match": 1,
            "must_not": [
                {"term": {"metadata.tag": 'AI인덱스'}},
            ]
        }
    }

    # Combine KNN with text search
    hybrid_query = {
        # Without an explicit size the number of hits falls back to the
        # search default and ignores top_k.
        "size": top_k,
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "text": query_text
                        }
                    }
                ],
                "filter": common_filters  # Use same filter
            }
        },
        "knn": {
            "field": "vector",
            "query_vector": query_vector,
            "k": top_k,
            # Never let the candidate pool drop below k for large top_k.
            "num_candidates": max(100, top_k),
            "filter": common_filters  # Reuse same filter
        },
        "_source": ["text", "metadata"]
    }

    response = es.search(index=index_name, body=hybrid_query)
    return response

# Single source of truth for the query, so the printed label can never
# drift from what is actually searched.
query = "AI 트렌드"

print(f"\n\n{'='*60}")
print("HYBRID SEARCH (Vector + Text)")
print('='*60)
print(f"Query: {query}")

results = hybrid_search(query, top_k=5)



HYBRID SEARCH (Vector + Text)
Query: AI 트렌드


In [29]:
from langchain_core.documents import Document
# Convert the first five search hits into Document objects, taking the page
# content from the hit's "text" field and the metadata from its "metadata" field.
docs = [
    Document(page_content=hit['_source']['text'], metadata=hit['_source']['metadata'])
    for hit in results['hits']['hits'][:5]
]
    
