In [32]:
from pymongo import MongoClient
from pymongo.database import Database
from pymongo.collection import Collection
import requests
import json
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import os
import pandas as pd
import numpy as np

In [33]:
# embedding_model = embedding_model.to(device)

from langchain.embeddings.openai import OpenAIEmbeddings
import os

# Read the OpenAI API key from the environment (os.getenv yields None when the variable is unset).
openai_api_key = os.getenv('OPENAI_API_KEY')

# OpenAI API connection settings: embedding client built with the key above and the
# "text-embedding-ada-002" model.
EMBEDDING_MODEL = OpenAIEmbeddings(openai_api_key=openai_api_key, model="text-embedding-ada-002")

In [34]:
import os


# MongoDB connection string, read from the environment (None when the variable is unset).
MONGO_URI = os.getenv('MONGODB_ATLAS_CLUSTER_URI')

# Name of the database that is indexed into on the client.
DB_NAME = "test"

# Name of the collection that is indexed into on that database.
COLLECTION_NAME = "col1"

In [35]:
# Create the MongoDB client and bind the module-level `collection` handle.


client = MongoClient(MONGO_URI)
collection = client[DB_NAME][COLLECTION_NAME]
# db: Database = client[DATABASE_NAME]
# collection: Collection = db[COLLECTION_NAME]

In [36]:
import pandas as pd

# CSV file to load; dtype=str asks pandas to read every column as strings.
input_file = "../용어사전 - 종합본_6781개.csv"
df = pd.read_csv(input_file, dtype=str)

# print(df.head())

In [37]:
# Hybrid-search weighting settings.
# BM25_MIN_VALUE / BM25_MAX_VALUE are the bounds used to rescale the text-search score.
BM25_MAX_VALUE = 13.0
BM25_MIN_VALUE = 3.0
# Weights of the normalized vector score and the normalized text score in the combined score.
VECTOR_SCORE_WEIGHT = 0.3
TEXT_SCORE_WEIGHT = 0.7

In [38]:
# @title
def vector_search(query_vector, vector_index_name, num_candidates=64, limit=25):
    """
    Run a `$vectorSearch` aggregation against the module-level `collection`.

    Args:
        query_vector: Embedding to compare against the documents' "embedding" field.
        vector_index_name: Name of the vector search index to query.
        num_candidates: Number of nearest-neighbour candidates to consider.
            Raised to `limit` when it is smaller, because `$vectorSearch`
            does not accept `numCandidates` below `limit`.
        limit: Maximum number of documents to return.

    Returns:
        list[dict]: Matching documents with "_id", "metadata", "content",
        and the search score exposed under both "vectorScore" and "score",
        ordered by descending score.
    """
    # Keep the stage valid when a caller only raises `limit`.
    effective_candidates = max(num_candidates, limit)

    pipeline = [
        {
            "$vectorSearch": {
                "index": vector_index_name,
                "path": "embedding",
                "queryVector": query_vector,
                "numCandidates": effective_candidates,
                "limit": limit
            }
        },
        {
            "$project": {
                "metadata": 1,
                "content": 1,
                "vectorScore": {"$meta": "vectorSearchScore"},
                "score": {"$meta": "vectorSearchScore"}
            }
        },
        # Explicit ordering and cap, kept from the original pipeline.
        {
            "$sort": {"score": -1}
        },
        {
            "$limit": limit
        }
    ]

    return list(collection.aggregate(pipeline))

In [39]:
# @title
def text_search(query, text_index_name, limit=25):
    """
    Run a `$search` text query against the module-level `collection`.

    Args:
        query: Text to search for.
        text_index_name: Name of the search index to query.
        limit: Maximum number of documents to return.

    Returns:
        list[dict]: Matching documents with "_id", "metadata", "content",
        and the search score exposed under both "textScore" and "score",
        ordered by descending score.
    """
    searched_paths = ["content", "metadata.KO", "metadata.ENG"]
    relevance = {"$meta": "searchScore"}

    search_stage = {
        "$search": {
            "index": text_index_name,
            "text": {"query": query, "path": searched_paths},
        }
    }
    project_stage = {
        "$project": {
            "metadata": 1,
            "content": 1,
            "textScore": relevance,
            "score": relevance,
        }
    }
    order_stage = {"$sort": {"score": -1}}
    cap_stage = {"$limit": limit}

    cursor = collection.aggregate([search_stage, project_stage, order_stage, cap_stage])
    return [document for document in cursor]

In [40]:
# @title
def normalize_vector_score(vector_score):
    """
    Shift and halve a score so that -1 maps to 0 and 1 maps to 1.

    NOTE(review): this mapping suits a raw cosine similarity in [-1, 1]. If the
    value passed in is already a 0..1 search score, the result is squeezed into
    [0.5, 1] — confirm which range the caller supplies.
    """
    shifted = vector_score + 1
    return shifted / 2.0

def normalize_bm25_score(bm25_score):
    """
    Rescale a text-search score to the range [0, 1].

    The score is mapped linearly so that BM25_MIN_VALUE becomes 0 and
    BM25_MAX_VALUE becomes 1, then clamped on both sides. Scores at or below
    BM25_MIN_VALUE — including the 0 used when a document has no text-search
    hit — yield 0.0 rather than a negative value that would subtract from the
    combined score.

    Args:
        bm25_score: Raw text-search score.

    Returns:
        float: Normalized score in [0, 1].
    """
    scaled = (bm25_score - BM25_MIN_VALUE) / (BM25_MAX_VALUE - BM25_MIN_VALUE)
    return max(0.0, min(scaled, 1.0))

def calculate_convex_score(vector_score, bm25_score):
    """
    Combine a vector score and a text score into one weighted score.

    Each input is first passed through its normalizer; the results are then
    weighted by VECTOR_SCORE_WEIGHT and TEXT_SCORE_WEIGHT and summed.
    """
    vector_part = VECTOR_SCORE_WEIGHT * normalize_vector_score(vector_score)
    text_part = TEXT_SCORE_WEIGHT * normalize_bm25_score(bm25_score)
    return vector_part + text_part

In [29]:
# @title
def create_metadata_array(query, limit=10):
    """
    Run a hybrid search and return the hits' metadata as a JSON array string.

    Args:
        query: Text passed to `hybrid_search`.
        limit: Maximum number of hits to include.

    Returns:
        str: JSON array (non-ASCII characters kept as-is) holding each hit's
        "metadata" value, or an empty object for hits without one.
    """
    hits = hybrid_search(query, limit)
    collected = [hit.get("metadata", {}) for hit in hits[:limit]]
    return json.dumps(collected, ensure_ascii=False)

In [None]:
# @title
def hybrid_search(query, length=10, model=EMBEDDING_MODEL, min_score=0.5):
    """
    Run a vector search and a text search for `query` and merge the results.

    Hits are merged by "_id". Each merged hit gets a combined "score" from
    `calculate_convex_score`; a hit found by only one of the two searches is
    scored with 0 for the missing side. Every returned hit carries both
    "vectorScore" and "textScore".

    Args:
        query: Text to search for.
        length: Maximum number of hits to return.
        model: Embedding model passed to `get_embedding_from_openaiEmbedding`.
        min_score: Hits whose combined score is below this value are dropped.

    Returns:
        list[dict]: Hits sorted by descending combined score, at most `length`
        of them. A blank query returns an empty list without calling the
        embedding model or the database.
    """
    # Nothing useful to embed or search for; skip the external calls.
    if isinstance(query, str) and not query.strip():
        return []

    # Run the vector and text searches
    embedding = get_embedding_from_openaiEmbedding(query, model)
    vector_results = vector_search(embedding, "word_vector_index")
    text_results = text_search(query, "word_text_index")

    # Merge the results, keyed by document id
    combined_results = {}
    for result in vector_results:
        vector_score = result.get("vectorScore", 0)
        combined_results[result["_id"]] = {
            **result,
            "vectorScore": vector_score,
            "score": calculate_convex_score(vector_score, 0),
            # Filled in below when the text search also found this document.
            "textScore": 0,
        }

    for result in text_results:
        doc_id = result["_id"]
        text_score = result.get("textScore", 0)
        existing = combined_results.get(doc_id)
        if existing is None:
            combined_results[doc_id] = {
                **result,
                "vectorScore": 0,
                "score": calculate_convex_score(0, text_score)
            }
        else:
            existing["textScore"] = text_score
            existing["score"] = calculate_convex_score(existing["vectorScore"], text_score)

    # Drop hits whose combined score is below min_score
    filtered_results = [result for result in combined_results.values() if result["score"] >= min_score]

    # Sort by combined score, highest first
    sorted_results = sorted(filtered_results, key=lambda x: x["score"], reverse=True)

    # Return only the top `length` hits
    return sorted_results[:length]


In [30]:
def get_embedding_from_openaiEmbedding(text, model=EMBEDDING_MODEL):
    """
    Create embeddings for text with the given embedding model.

    The return shape follows the input type, not the batch size: a single
    string yields one embedding, and a list yields a list of embeddings even
    when it holds exactly one element.

    Args:
        text (str or List[str]): Text, or list of texts, to embed.
        model: Object exposing `embed_documents(list_of_texts)`.

    Returns:
        List[float] for a string input; List[List[float]] for a list input.

    Raises:
        ValueError: If the model call fails (the original exception is chained
            as the cause) or returns no embedding for a string input.
    """
    single_input = isinstance(text, str)
    texts = [text] if single_input else list(text)

    try:
        embeddings = model.embed_documents(texts)  # create the embeddings
    except Exception as e:
        raise ValueError(f"임베딩 생성 중 오류가 발생했습니다: {e}") from e

    if not single_input:
        return embeddings

    # A string input must produce exactly one vector; fail clearly instead of with IndexError.
    if not embeddings:
        raise ValueError("임베딩 생성 중 오류가 발생했습니다: 빈 결과가 반환되었습니다")
    return embeddings[0]


In [12]:

import uuid

# Counters so the closing message reflects what actually happened.
inserted_count = 0
failed_count = 0

# Process every row of the dataframe.
# NOTE(review): each run inserts new documents with fresh random ids, so re-running
# this cell duplicates the collection contents — confirm this is a one-off load.
for _, row in df.iterrows():
    # Content: all column values of the row joined with tabs.
    # NOTE(review): df is loaded without fillna, so missing cells presumably end up
    # as the text "nan" here — confirm whether that is intended.
    _content = '\t'.join(map(str, row.tolist()))

    # Create the embedding with OpenAIEmbeddings
    try:
        _embedding = get_embedding_from_openaiEmbedding(_content, model=EMBEDDING_MODEL)
    except Exception as e:
        print(f"임베딩 생성 중 오류 발생: {e}")
        failed_count += 1
        continue  # skip this row when embedding fails

    # Document to insert into MongoDB
    document = {
        "_id": str(uuid.uuid4()),  # unique id
        "metadata": {col: row[col] for col in df.columns},  # every column stored under metadata
        "content": _content,  # tab-joined row values
        "media": [],
        "embedding": _embedding,  # embedding vector created above
    }

    # Insert the document into the collection
    collection.insert_one(document)
    inserted_count += 1

# Only claim success when no row was skipped.
if failed_count == 0:
    print("데이터가 성공적으로 MongoDB에 저장되었습니다!")
else:
    print(f"{inserted_count}개 행을 저장했고, {failed_count}개 행은 임베딩 오류로 건너뛰었습니다.")


데이터가 성공적으로 MongoDB에 저장되었습니다!


In [20]:
def process_csv_with_glossary(input_file):
    """
    Add a "glossary" column of hybrid-search results to a CSV's rows.

    For each row, the "OriTextData" value is searched with
    `create_metadata_array` and the resulting JSON string is stored in the new
    "glossary" column.

    Args:
        input_file: Path of the CSV file to read; it must contain an
            "OriTextData" column.

    Returns:
        pandas.DataFrame: The loaded rows (missing values replaced with
        "(None)") plus the "glossary" column.

    Raises:
        KeyError: If the CSV has no "OriTextData" column.
    """
    # Read the CSV file with every column as strings
    df = pd.read_csv(input_file, dtype=str)
    df = df.fillna("(None)")
    df = df.replace([np.inf, -np.inf], 0)  # replace infinities with 0

    # One JSON string of glossary entries per source text
    df["glossary"] = [create_metadata_array(source_text) for source_text in df["OriTextData"]]

    # Hand the result back; without this the computed column is discarded.
    return df



In [43]:
# Sample query for a manual hybrid_search check; the cell displays the returned hits.
query = "어딜 놀러갈까? 서구문화회관이랑 동원화랑은 어때? 나는 씨네팔공산에도 관심이 있어."
# NOTE(review): type_filter is not read anywhere in the visible notebook — confirm whether it is still needed.
type_filter = "report"
hybrid_search(query)

NameError: name 'vector_results' is not defined

In [28]:
# Second sample query for a manual hybrid_search check; the cell displays the returned hits.
query = "누에박물관을 간 다음에 덕진문고를 가자. 고래산까지 갈 수 있으면 좋겠다.."
# NOTE(review): type_filter is not read anywhere in the visible notebook — confirm whether it is still needed.
type_filter = "report"
hybrid_search(query)

[{'_id': '82fe66aa-4b6f-49ea-a6c4-d13bded8f11f',
  'metadata': {'KO': '누에박물관', 'ENG': 'Silkworm Museum', 'JPN': '蚕博物館'},
  'content': '누에박물관\tSilkworm Museum\t蚕博物館',
  'vectorScore': 0.9270575046539307,
  'score': 0.9890586256980896,
  'textScore': 23.548418045043945},
 {'_id': 'f4e4f7bc-4bb2-4601-aea6-735dd0a7e946',
  'metadata': {'KO': '덕진문고', 'ENG': 'Deokjin Bookstore', 'JPN': 'トクジン文庫'},
  'content': '덕진문고\tDeokjin Bookstore\tトクジン文庫',
  'textScore': 20.895797729492188,
  'score': 0.85,
  'vectorScore': 0},
 {'_id': '35ed9c3e-37be-4856-bd37-c53006fccf8a',
  'metadata': {'KO': '고래산', 'ENG': 'Gorae Mountain', 'JPN': '高崍山'},
  'content': '고래산\tGorae Mountain\t高崍山',
  'textScore': 17.97307777404785,
  'score': 0.85,
  'vectorScore': 0},
 {'_id': 'b301631a-eeef-4a91-9d71-d3427fa779aa',
  'metadata': {'KO': '박물관 수', 'ENG': 'Su Museum', 'JPN': '博物館の数'},
  'content': '박물관 수\tSu Museum\t博物館の数',
  'textScore': 16.9704532623291,
  'score': 0.85,
  'vectorScore': 0},
 {'_id': '1fed1349-ab77-43c9

In [31]:
# Show the JSON glossary string for the current `query`, capped at 10 entries.
create_metadata_array(query, 10)

'[{"KO": "누에박물관", "ENG": "Silkworm Museum", "JPN": "蚕博物館"}, {"KO": "덕진문고", "ENG": "Deokjin Bookstore", "JPN": "トクジン文庫"}, {"KO": "고래산", "ENG": "Gorae Mountain", "JPN": "高崍山"}, {"KO": "박물관 수", "ENG": "Su Museum", "JPN": "博物館の数"}, {"KO": "가천박물관", "ENG": "Gacheon Museum", "JPN": "嘉泉博物館"}, {"KO": "다산박물관", "ENG": "Dasan Museum", "JPN": "茶山博物館"}, {"KO": "지질박물관", "ENG": "Geological Museum", "JPN": "地質博物館"}, {"KO": "가래산", "ENG": "Garae Mountain", "JPN": "加来山"}, {"KO": "떡박물관", "ENG": "Tteok Museum", "JPN": "トク博物館"}, {"KO": "소금박물관", "ENG": "Salt Museum", "JPN": "塩博物館"}]'

In [None]:
# Options passed to FastLanguageModel.from_pretrained when the model is loaded.
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [None]:
# Load the model and tokenizer from "merged_model" using the options defined above.
# NOTE(review): FastLanguageModel is not imported anywhere in the visible notebook —
# confirm the import exists, otherwise this cell raises NameError on a fresh kernel.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="merged_model", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
# Prompt text with system / user / assistant header tokens and three placeholders:
# {target_language}, {glossary} and {user_message}.
prompt_template_str = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Translate the original text into the target language
Adhere to the glossary for applicable terms

If a term does not have an entry in the target language,
adapt the most relevant one or translate appropriately based on context.

Apply proper capitalization rules for general nouns,
ensuring they are lowercase in the middle of a sentence unless they are proper nouns
or explicitly marked as capitalized in the glossary.

For terms like Scourge,
use lowercase and pluralize if the context suggests multiple entities.

Provide only the translated text without explanations.

### target language ###
{target_language}

### glossary ###
{glossary}

<|eot_id|><|start_header_id|>user<|end_header_id|>
{user_message}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# Define the PromptTemplate
prompt_template = PromptTemplate(
    input_variables=["target_language", "glossary", "user_message"],  # variables supplied by the caller
    template=prompt_template_str,
)

In [None]:
# Hugging Face 파이프라인 설정
# Text-generation pipeline built from the loaded model and tokenizer;
# return_full_text=False is passed so the prompt is not echoed back.
# NOTE(review): `pipeline` is not imported anywhere in the visible notebook —
# confirm the import exists, otherwise this cell raises NameError on a fresh kernel.
hf_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    #max_length=128,        # maximum length of the generated text
    #temperature=0.7,       # controls output diversity
    #top_k=50,              # top-k sampling
    #top_p=0.9,             # nucleus sampling
    return_full_text=False,
)

In [None]:
# Wrap the Hugging Face pipeline for use from LangChain.
# NOTE(review): HuggingFacePipeline is not imported anywhere in the visible notebook — confirm the import exists.
llm = HuggingFacePipeline(pipeline=hf_pipeline)

In [None]:
# Set up the LLMChain that pairs the wrapped model with the prompt template.
chain = LLMChain(llm=llm, prompt=prompt_template)

In [None]:
# Sample source text fed to the chain as {user_message} and used as the glossary search query.
user_message = """우디가 팀에 새로 합류했어요!
우디는 토이스토리 렐름에서만 이용이 가능하다구요!!!
"""

In [None]:
# Run the chain with English as the target language; the glossary is built from a
# hybrid search on the same message (up to 10 entries). Leading newlines are stripped
# from the returned text.
chain.run({
    "target_language": "English",
    "glossary": create_metadata_array(user_message, 10),
    "user_message": user_message
}).lstrip("\n")