In [1]:
import os
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import heapq

def create_embeddings(use_cpu=False):
    """Build the BAAI/bge-m3 HuggingFace embedding model on the chosen device.

    Args:
        use_cpu: When truthy, always run on the CPU. Otherwise CUDA is used
            if ``torch.cuda.is_available()`` reports a GPU, with the CPU as
            the fallback.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for that device.
    """
    # CUDA availability is only probed when the caller did not force the CPU.
    if use_cpu or not torch.cuda.is_available():
        target_device = "cpu"
    else:
        target_device = "cuda"

    return HuggingFaceEmbeddings(
        model_name="BAAI/bge-m3",
        model_kwargs={'device': target_device},
    )

def load_FAISS_vectorstore(vectorstore_path, embeddings):
    """Load one FAISS vector store from disk, reporting progress on stdout.

    Args:
        vectorstore_path: Directory that holds the saved FAISS index.
        embeddings: Embedding object handed to ``FAISS.load_local``.

    Returns:
        The loaded vector store, or ``None`` when the path does not exist or
        loading raised an exception (the error is printed, not re-raised).
    """
    if not os.path.exists(vectorstore_path):
        print(f"Vector store not found at {vectorstore_path}")
        return None

    try:
        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # unpickling the stored index metadata; only load trusted directories.
        store = FAISS.load_local(
            vectorstore_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )
        print(f"Loaded vector store from {vectorstore_path}")
        return store
    except Exception as err:
        print(f"Error loading vector store from {vectorstore_path}: {str(err)}")

    return None

def load_vectorstores_from_directory(parent_directory, embeddings):
    """Load every FAISS vector store found anywhere below ``parent_directory``.

    A directory counts as a vector store when it contains both ``index.faiss``
    and ``index.pkl``.

    Args:
        parent_directory: Root directory that is walked recursively.
        embeddings: Embedding object forwarded to ``load_FAISS_vectorstore``.

    Returns:
        A list of the stores that loaded successfully (possibly empty).
    """
    required_files = ('index.faiss', 'index.pkl')
    loaded_stores = []

    for folder, _subdirs, filenames in os.walk(parent_directory):
        if not all(name in filenames for name in required_files):
            continue
        store = load_FAISS_vectorstore(folder, embeddings)
        # Failed loads come back falsy and are skipped.
        if store:
            loaded_stores.append(store)

    return loaded_stores

def retrieve_from_multiple_stores(vectorstores, query, k=5, fetch_k=20):
    """Query several vector stores and return the globally best-scoring hits.

    Each store is asked for up to ``fetch_k`` scored candidates; the candidates
    of all stores are then merged and the ``k`` lowest-scoring ones returned.

    The previous implementation first ran ``max_marginal_relevance_search``
    with ``k == fetch_k`` and used only the *length* of its result to size a
    second, plain similarity search. That MMR pass never influenced which
    documents were returned, so it is dropped here: one scored search per
    store yields the same candidates without a second embedding and search.

    Args:
        vectorstores: Iterable of stores exposing
            ``similarity_search_with_score(query, k=...)``.
        query: Query text.
        k: Total number of results to return across all stores.
        fetch_k: Number of candidates requested from each store.

    Returns:
        A list of at most ``k`` ``(document, score)`` tuples in ascending
        score order.
    """
    scored_candidates = []
    for store in vectorstores:
        scored_candidates.extend(store.similarity_search_with_score(query, k=fetch_k))

    # Ascending sort assumes a lower score means a closer match (a distance).
    # NOTE(review): confirm the stores are not configured for a
    # higher-is-better metric such as inner product.
    ranked = sorted(scored_candidates, key=lambda pair: pair[1])
    # Clamp so a negative k yields no results instead of a tail-trimmed slice.
    return ranked[:max(k, 0)]

# def main():
#     # 設置參數
#     parent_directory = "/media/starklab/BACKUP/向量庫"  # 母資料夾路徑 /media/starklab/BACKUP/向量庫
#     k = 5  # 總共返回的文檔數量
#     fetch_k = 100  # 每個向量庫初始檢索的文檔數量
#     use_cpu = True  # 設置為 True 以使用 CPU

#     # 創建 embeddings 對象
#     embeddings = create_embeddings(use_cpu)

#     # 加載母資料夾中的所有向量存儲
#     vectorstores = load_vectorstores_from_directory(parent_directory, embeddings)
#     if not vectorstores:
#         print("No vector stores loaded. Exiting.")
#         return

#     # 自定問題
#     query = "幫我優化這個問題(請問台美文化的差異？)" #請問明月堂現在是第幾代接班？

#     # 執行檢索
#     results = retrieve_from_multiple_stores(vectorstores, query, k, fetch_k)

#     # 輸出檢索結果
#     print(f"\n檢索結果 for query: '{query}'\n")
#     for idx, (doc, score) in enumerate(results):
#         retrieval_filename = f"{os.path.splitext(doc.metadata['episode_name'])[0]}_節目名稱{doc.metadata['Podcast_name']}"
#         print(f"Result {idx+1}:")
#         print(f"  檔案名稱: {retrieval_filename}")
#         print(f"  相似度分數: {score:.4f}")
#         print(f"  內容摘要: {doc.page_content[:100]}...\n")

# if __name__ == "__main__":
#     main()

In [1]:
import os
import torch
import gradio as gr
from langchain.vectorstores import Chroma, FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_huggingface import HuggingFaceEmbeddings
import heapq
from langchain.schema import BaseRetriever
from pydantic import Field

# Set an environment variable to disable the tokenizers library's parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Retrieval settings, read as module-level globals by the retriever that
# setup_qa_chain() builds.
k = 5  # number of documents returned to the QA chain per query
fetch_k = 100  # number of candidates requested from each vector store

def create_embeddings(use_cpu=False):
    """Create the BAAI/bge-m3 embedding model used for retrieval.

    Args:
        use_cpu: Force CPU execution when truthy. When falsy, CUDA is selected
            if ``torch.cuda.is_available()`` is true, otherwise the CPU.

    Returns:
        A ``HuggingFaceEmbeddings`` instance bound to the selected device.
    """
    device = "cpu"
    # Only probe for a GPU when the caller has not forced CPU execution.
    if not use_cpu and torch.cuda.is_available():
        device = "cuda"

    device_settings = {'device': device}
    return HuggingFaceEmbeddings(model_name="BAAI/bge-m3", model_kwargs=device_settings)

def load_FAISS_vectorstore(vectorstore_path, embeddings):
    """Try to load a saved FAISS vector store and print what happened.

    Args:
        vectorstore_path: Directory containing the saved index.
        embeddings: Embedding object passed through to ``FAISS.load_local``.

    Returns:
        The vector store on success; ``None`` if the path is missing or the
        load raised (the exception text is printed rather than propagated).
    """
    path_exists = os.path.exists(vectorstore_path)
    if path_exists:
        try:
            # NOTE(review): this flag opts in to unpickling stored metadata;
            # point it at trusted index directories only.
            loaded_store = FAISS.load_local(
                vectorstore_path,
                embeddings,
                allow_dangerous_deserialization=True,
            )
            print(f"Loaded vector store from {vectorstore_path}")
            return loaded_store
        except Exception as load_error:
            print(f"Error loading vector store from {vectorstore_path}: {str(load_error)}")
            return None

    print(f"Vector store not found at {vectorstore_path}")
    return None

def load_vectorstores_from_directory(parent_directory, embeddings):
    """Collect all loadable FAISS vector stores beneath ``parent_directory``.

    Every directory in the tree that contains both ``index.faiss`` and
    ``index.pkl`` is passed to ``load_FAISS_vectorstore``.

    Args:
        parent_directory: Root of the directory tree to scan.
        embeddings: Embedding object used when loading each store.

    Returns:
        A list with one entry per store that loaded successfully.
    """
    # Lazily yield the directories that look like saved FAISS indexes.
    store_dirs = (
        folder
        for folder, _subdirs, filenames in os.walk(parent_directory)
        if 'index.faiss' in filenames and 'index.pkl' in filenames
    )
    attempts = (load_FAISS_vectorstore(path, embeddings) for path in store_dirs)
    # Drop the falsy results that signal a failed load.
    return [store for store in attempts if store]

def retrieve_from_multiple_stores(vectorstores, query, k=5, fetch_k=100):
    """Return the ``k`` best documents for ``query`` across several stores.

    Each store contributes up to ``fetch_k`` scored candidates; the merged
    candidates are ranked by ascending score and only the documents (without
    their scores) are returned.

    The previous implementation ran ``max_marginal_relevance_search`` with
    ``k == fetch_k`` and used only the length of its result to size a second,
    plain similarity search. The MMR pass never affected which documents were
    returned, so it is removed: a single scored search per store produces the
    same candidates with half the embedding and search work.

    Args:
        vectorstores: Iterable of stores exposing
            ``similarity_search_with_score(query, k=...)``.
        query: Query text.
        k: Total number of documents to return.
        fetch_k: Number of candidates requested from each store.

    Returns:
        A list of at most ``k`` documents, best (lowest score) first.
    """
    scored_candidates = []
    for store in vectorstores:
        scored_candidates.extend(store.similarity_search_with_score(query, k=fetch_k))

    # nsmallest assumes a lower score means a closer match (a distance).
    # NOTE(review): confirm the stores are not configured for a
    # higher-is-better metric such as inner product.
    best = heapq.nsmallest(k, scored_candidates, key=lambda pair: pair[1])
    return [document for document, _score in best]

def setup_qa_chain(use_cpu=False, groq_api_key=None,
                   parent_directory="/media/starklab/BACKUP/向量庫"):
    """Build the conversational RAG chain and the retriever that feeds it.

    Args:
        use_cpu: Forwarded to ``create_embeddings`` to force CPU embeddings.
        groq_api_key: Groq API key. When omitted, the ``GROQ_API_KEY``
            environment variable is used, so the secret never has to be
            stored in the notebook.
        parent_directory: Root directory scanned for saved FAISS stores.

    Returns:
        A ``(qa_chain, custom_retriever)`` tuple: the
        ``ConversationalRetrievalChain`` and the multi-store retriever it uses.

    Raises:
        RuntimeError: If no API key is passed and ``GROQ_API_KEY`` is unset.
    """
    # Resolve the credential first so a missing key fails fast, before the
    # embedding model and the vector stores are loaded.
    groq_api_key = groq_api_key or os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set. Export it in the environment or pass "
            "groq_api_key= to setup_qa_chain()."
        )

    model = 'llama-3.1-8b-instant'
    groq_chat = ChatGroq(groq_api_key=groq_api_key, model_name=model)
    # NOTE(review): this single memory object is shared by every caller of the
    # returned chain, so all chat sessions see one common history.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    # Load every FAISS store below parent_directory with the chosen device.
    embeddings = create_embeddings(use_cpu)
    vectorstores = load_vectorstores_from_directory(parent_directory, embeddings)
    if not vectorstores:
        print(f"Warning: no vector stores were loaded from {parent_directory}")

    class CustomRetriever(BaseRetriever):
        """Retriever that merges results from all loaded vector stores."""

        vectorstores: list = Field(default_factory=list)

        def __init__(self, vectorstores):
            super().__init__()
            self.vectorstores = vectorstores

        def _get_relevant_documents(self, query):
            # Uses the module-level k / fetch_k settings. The helper already
            # returns a plain list of documents, not (doc, score) tuples.
            results = retrieve_from_multiple_stores(self.vectorstores, query, k=k, fetch_k=fetch_k)
            return results

    custom_retriever = CustomRetriever(vectorstores)

    # System prompt (Traditional Chinese). {context}, {chat_history} and
    # {question} are filled in by the chain at run time.
    template = """我將作為您的Podcast搜尋引擎。當您向我詢問有關特定Podcast節目或內容的問題時，我將使用RAG（檢索增強生成）技術來回答您的問題。請注意，如果RAG檢索庫中沒有您所需的內容，我將告知您「RAG資料庫內沒有您所需的內容」。我希望您根據這些條件提問。

您的第一句話是「嗨」。

檢索資料信息（包括節目標題）：
{context}

聊天歷史：
{chat_history}

當前問題：
{question}

回答指南：
1. **問題處理**：首先對當前問題進行清晰的 prompt engineering，確保理解問題的核心需求。
2. **信息使用**：僅使用檢索資料中的信息來回答問題。如果資料不足以回答問題，請直接回答「RAG 資料庫沒有您想要的資料」。
3. **回答內容**：
   - **具體內容要點**：回答應包括具體的內容要點。
   - **時間戳**：每個內容要點應附上對應的時間戳。請使用完整的格式，例如（MM:SS~MM:SS）。如果只有一個時間點，則使用（MM:SS）。
   - **節目標題**：最後應提供節目標題（格式：（節目標題：[完整標題]））。
4. **回答格式示例**：
   - 「根據檢索資料，[內容摘要1]（時間戳）。此外，[內容摘要2]（時間戳）。[如有更多內容，繼續列舉]。（節目標題：[完整標題]）」
5. **回答語言和風格**：回答要清楚詳細，使用繁體中文。
6. **資訊限制**：不要添加任何檢索資料中沒有的信息。
7. **格式問題**: 請不要使用刪除線或任何其他特殊格式標記在你的回答中。
8. **記憶**: 如果使用者希望接續前面的問答再次提問，系統應該能夠檢索並提供對話紀錄（chat_history），並根據這些紀錄回答使用者的問題。
請根據上述指南回答問題：
"""

    # How each retrieved document is rendered into {context}; it relies on the
    # 'episode_name' and 'Podcast_name' metadata keys being present.
    document_prompt = PromptTemplate(
        input_variables=["page_content", "episode_name", "Podcast_name"],
        template="內容: {page_content}\n來源: {episode_name}, {Podcast_name}"
    )
    prompt = ChatPromptTemplate.from_template(template)

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=groq_chat,
        retriever=custom_retriever,
        memory=memory,
        combine_docs_chain_kwargs={
            "prompt": prompt,
            "document_variable_name": "context",
            "document_prompt": document_prompt
        }
    )

    return qa_chain, custom_retriever

# 修改主函數以接受 use_cpu 參數
def main(use_cpu=False,
         transcripts_dir='/media/starklab/BACKUP/Podcast_project/轉錄文本存放區'):
    """Launch the Gradio chat UI on top of the podcast RAG chain.

    Args:
        use_cpu: Forwarded to ``setup_qa_chain`` to force CPU embeddings.
        transcripts_dir: Directory whose sub-directories are listed in the UI
            as the programs available in the database.
    """
    qa_chain, retriever = setup_qa_chain(use_cpu)

    def get_program_list(folder_path):
        """Return a numbered, newline-separated list of sub-directory names."""
        try:
            # Sorted so the displayed list does not depend on listdir order.
            programs = sorted(
                name for name in os.listdir(folder_path)
                if os.path.isdir(os.path.join(folder_path, name))
            )
            return "\n".join(f"{i + 1}: {program}" for i, program in enumerate(programs))
        except FileNotFoundError:
            return "指定的資料夾不存在。"
        except Exception as e:
            return f"發生錯誤: {e}"

    def chat_function(message, history):
        """Answer one chat message and append the list of source episodes."""
        try:
            # NOTE(review): the chain performs its own retrieval inside
            # invoke(); this separate lookup on the raw message is only used
            # to list sources, which may differ from what the LLM was shown.
            results = retriever.get_relevant_documents(message)
            response = qa_chain.invoke({"question": message, "chat_history": history})
            answer = response['answer']

            # De-duplicate (episode, podcast) pairs while keeping retrieval
            # order, so "Result 1" is the best-ranked source. A set would make
            # the numbering arbitrary.
            unique_sources = list(dict.fromkeys(
                (
                    result.metadata.get('episode_name', 'Unknown Episode'),
                    result.metadata.get('Podcast_name', 'Unknown Podcast'),
                )
                for result in results
            ))

            # Format the source list shown below the answer.
            sources_str = "\n可參考下方節目集數：\n"
            for idx, (episode_name, podcast_name) in enumerate(unique_sources, 1):
                sources_str += f"Result {idx}: {episode_name}, {podcast_name}\n"

            # Combine the answer and the sources into a single reply string.
            return f"{answer}\n\n{sources_str}"

        except Exception as e:
            return f"發生錯誤: {str(e)}\n很抱歉，我無法處理您的問題。請再試一次或換個問題。"

    with gr.Blocks() as iface:
        gr.Markdown(f"## 目前資料庫中的節目有：\n{get_program_list(transcripts_dir)}\n\n請在下方提問：")

        chatbot = gr.ChatInterface(
            chat_function,
            title="Podcast Q&A Assistant",
            description="Ask questions about podcast content, and I'll provide answers based on the retrieved information.",
            theme="soft",
            examples=[
                "林書豪這個賽季遇到了什麼困難？",
                "請告訴我這個節目討論了哪些主題？",
                "這集節目中有提到哪些重要的觀點？"
            ],
            retry_btn="重試",
            undo_btn="撤銷",
            clear_btn="清除"
        )

    # share=True also requests a public Gradio share link.
    iface.launch(share=True)

# Entry point: start the chat UI when this cell/script is executed directly.
if __name__ == "__main__":
    use_cpu = True  # True forces the CPU; False uses the GPU when one is available
    main(use_cpu)

Loaded vector store from /media/starklab/BACKUP/向量庫/科技浪 Techwav
Loaded vector store from /media/starklab/BACKUP/向量庫/史塔克實驗室
Loaded vector store from /media/starklab/BACKUP/向量庫/老高與小茉 Mr & Mrs Gao
Loaded vector store from /media/starklab/BACKUP/向量庫/寧可當吃貨
Loaded vector store from /media/starklab/BACKUP/向量庫/週報時光機（生活歷史、冷知識)
Loaded vector store from /media/starklab/BACKUP/向量庫/古今中歪
Loaded vector store from /media/starklab/BACKUP/向量庫/屎作勇者
Loaded vector store from /media/starklab/BACKUP/向量庫/跳脫Do式圈
Loaded vector store from /media/starklab/BACKUP/向量庫/A’s 對話日誌
Loaded vector store from /media/starklab/BACKUP/向量庫/Joe &amp; Jet 未過濾的 with Jason
Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://a2cdbc98b19eae6ad0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


In [111]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to plain Python.

    NumPy integers become ``int``, NumPy floats become ``float`` and arrays
    become (nested) lists; anything else is deferred to ``json.JSONEncoder``,
    which raises ``TypeError`` for unsupported objects.
    """

    def default(self, obj):
        # (NumPy type, converter) pairs, checked in order.
        conversions = (
            (np.integer, int),
            (np.floating, float),
            (np.ndarray, lambda array: array.tolist()),
        )
        for numpy_type, convert in conversions:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

# Read the QA dataset (JSON); the evaluation below iterates over its
# 'questions' list.
with open('/home/starklab/Documents/QA集/qa_dataset.json', 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

# Initialise the sentence-transformer model used to embed the answers.
# NOTE(review): confirm this model suits the dataset's language; the retrieval
# pipeline in this notebook embeds with BAAI/bge-m3 instead.
model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded in one call with the module-level ``model``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single similarity value taken from the 1x1 result of
        ``cosine_similarity``.
    """
    text_pair = [text1, text2]
    vectors = model.encode(text_pair)
    first_vector, second_vector = vectors[0], vectors[1]
    similarity_matrix = cosine_similarity([first_vector], [second_vector])
    return similarity_matrix[0][0]

# Score every QA item: compare the LLM answer against the ground truth.
results = []

for item in qa_data['questions']:
    question = item['question']
    llm_answer = item['llm_answer']
    ground_truth = item['ground_truth']

    # Cosine similarity between the LLM answer and the ground truth
    similarity = calculate_similarity(llm_answer, ground_truth)

    results.append({
        'question': question,
        'similarity_score': similarity
    })

# Mean similarity over all questions. An empty dataset yields None (JSON null)
# instead of raising ZeroDivisionError.
if results:
    average_similarity = sum(item['similarity_score'] for item in results) / len(results)
else:
    average_similarity = None

# Print the summary followed by the per-question scores
print(f"Average similarity score: {average_similarity}")
for item in results:
    print(f"Question: {item['question']}")
    print(f"Similarity score: {item['similarity_score']}")
    print()

# Save the results to a file; the custom NumpyEncoder serialises NumPy values
with open('evaluation_results.json', 'w', encoding='utf-8') as f:
    json.dump({
        'average_similarity': average_similarity,
        'results': results
    }, f, ensure_ascii=False, indent=4, cls=NumpyEncoder)

print("評估完成，結果已保存到 evaluation_results.json")

Average similarity score: 0.52421535551548
Question: 根據林書豪的描述,新北國王隊在本賽季面臨了哪些主要挑戰?
Similarity score: 0.4405829608440399

Question: 林書豪提到球隊傷病問題如何影響了球隊的整體表現?
Similarity score: 0.696298360824585

Question: 與往季相比,林書豪認為本賽季在維持球隊狀況方面有什麼不同?
Similarity score: 0.6541098356246948

Question: 在冠軍賽中,林書豪遇到了什麼個人健康問題?這如何影響了他的表現?
Similarity score: 0.5578819513320923

Question: 林書豪如何描述球隊在季後賽期間的轉變?
Similarity score: 0.403282105922699

Question: 根據播客內容,目前科技業的裁員情況如何?
Similarity score: 0.5089830756187439

Question: 主持人們提到被裁員後,在找新工作時應該如何考慮薪資問題?
Similarity score: 0.3919658958911896

Question: 播客中提到了哪些可能導致員工想要離職的原因?
Similarity score: 0.5406186580657959

評估完成，結果已保存到 evaluation_results.json
