In [None]:
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the data and the model

# CSV read below; 'utf-8-sig' decodes UTF-8 and drops a leading BOM if present.
file_path = 'fair_m.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig')

# Sentence-embedding checkpoint used for both the documents and the queries.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Embed titles and contexts separately
def create_embeddings(df, model):
    """Encode the ``Title`` and ``Context`` columns of *df* independently.

    Missing cells in either column are replaced with an empty string before
    the texts are passed to ``model.encode``.

    Args:
        df: Table providing ``Title`` and ``Context`` columns.
        model: Object exposing ``encode(list_of_texts)``.

    Returns:
        A ``(title_embeddings, context_embeddings)`` pair; each element is
        whatever ``model.encode`` returns for the corresponding column.
    """
    def _encode_column(column_name):
        texts = df[column_name].fillna('').tolist()
        return model.encode(texts)

    # Titles are encoded first, then contexts.
    return _encode_column('Title'), _encode_column('Context')

title_embeddings, context_embeddings = create_embeddings(df, model)

# Build one flat L2 Faiss index per field (title / context).
# The context index reuses the width taken from the title embeddings.
vector_dim = title_embeddings.shape[1]

index_title = faiss.IndexFlatL2(vector_dim)
index_title.add(title_embeddings)

index_context = faiss.IndexFlatL2(vector_dim)
index_context.add(context_embeddings)

def _as_text(value):
    """Return *value* as a string, mapping missing cells (None/NaN/NA) to ''."""
    if isinstance(value, str):
        return value
    if pd.isna(value):
        return ''
    return str(value)


def retrieve_and_debug_with_filter(question, index_title, index_context, df, model, top_k=5, title_weight=0.7, keyword=None):
    """Search the title and context indexes and merge the hits into one ranking.

    Every row returned by the title index earns ``title_weight`` and every row
    returned by the context index earns ``1 - title_weight``; a row found by
    both gets the sum. The score does not depend on rank or distance. Rows are
    sorted by score (ties keep first-seen order), cut to ``top_k`` and only
    then filtered by ``keyword``, so fewer than ``top_k`` rows may come back.

    Args:
        question: Query text to embed.
        index_title: Index over title embeddings; must provide
            ``search(vectors, k)`` returning ``(distances, indices)``.
        index_context: Index over context embeddings, same interface.
        df: Table with ``Title`` and ``Context`` columns whose row positions
            match the ids stored in both indexes.
        model: Object exposing ``encode(list_of_texts)``.
        top_k: Number of neighbours requested from each index and the maximum
            number of merged rows considered.
        title_weight: Score given to a title hit; a context hit gets
            ``1 - title_weight``.
        keyword: Optional substring; when truthy, rows whose title and context
            both lack it are dropped.

    Returns:
        A list of ``(row_position, score, title, context)`` tuples ordered by
        descending score. ``row_position`` is a plain ``int``; ``title`` and
        ``context`` are strings, with missing cells returned as ``''``.
    """
    # Embed the question
    question_embedding = model.encode([question])

    # Query both indexes
    _, title_indices = index_title.search(question_embedding, top_k)
    _, context_indices = index_context.search(question_embedding, top_k)

    combined_scores = {}
    # Walk both result rows in lockstep so first-seen (tie-break) order stays
    # title-hit, context-hit, title-hit, ... exactly as ranks increase.
    for t_idx, c_idx in zip(title_indices[0], context_indices[0]):
        for raw_idx, weight in ((t_idx, title_weight), (c_idx, 1 - title_weight)):
            row_pos = int(raw_idx)
            if row_pos < 0:
                # Faiss pads with -1 when an index holds fewer than top_k
                # vectors; df.iloc[-1] would wrongly return the last row.
                continue
            combined_scores[row_pos] = combined_scores.get(row_pos, 0) + weight

    # Sort by score, best first
    sorted_results = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)

    # Keyword filtering
    filtered_results = []
    for row_pos, score in sorted_results[:top_k]:
        row = df.iloc[row_pos]
        # Cells can be NaN/None (empty CSV fields); normalise them so the
        # substring test below cannot raise TypeError.
        title = _as_text(row['Title'])
        context = _as_text(row['Context'])
        if keyword and keyword not in title and keyword not in context:
            continue  # keyword appears in neither the title nor the context
        filtered_results.append((row_pos, score, title, context))

    return filtered_results

# Question
question = "이순신 이야기 들려줘"
keyword = "이순신"

# Search and filter
filtered_results = retrieve_and_debug_with_filter(question, index_title, index_context, df, model, keyword=keyword)

# Print the search results
print("검색된 결과 (인덱스, 가중치, 제목, 내용):")
for idx, weight, title, context in filtered_results:
    print(f"Index: {idx}, Weight: {weight:.2f}, Title: {title}")
    # An empty CSV cell comes back as NaN (a float), which cannot be sliced;
    # show it as an empty preview instead of raising TypeError.
    preview = context if isinstance(context, str) else ""
    # Only mark the preview as cut off when text was actually dropped.
    ellipsis = "..." if len(preview) > 200 else ""
    print(f"Context: {preview[:200]}{ellipsis}")
    print("-" * 80)