<a href="https://colab.research.google.com/github/Jessiemkh/RAG-LangChain-Colab/blob/main/%E3%80%8C%E7%B7%B4%E7%BF%92langChain_RAG_ipynb%E3%80%8D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 安裝必要的庫

In [None]:
!pip install gradio google-generativeai youtube_transcript_api python-docx PyPDF2 langchain chromadb  langchain-google-genai langchain-community faiss-cpu unstructured[pdf] python-docx pandas openpyxl




如果有需要，更新下面的庫

In [None]:
!pip install --upgrade google-generativeai langchain-google-genai youtube_transcript_api python-docx PyPDF2 gradio tenacity

Collecting langchain-google-genai
  Using cached langchain_google_genai-2.0.11-py3-none-any.whl.metadata (3.6 kB)
INFO: pip is looking at multiple versions of langchain-google-genai to determine which version is compatible with other requirements. This could take a while.


# 實作方式

1.   將 YouTube 影片轉成文本，並將上傳的檔案切分成 chunks
2.   創建向量數據庫，建立知識庫，創建RAG AI
3.   加入 Tavily 網路搜索做補充（RAG 和網路搜索分開）






In [None]:
import gradio as gr
import os
import pandas as pd
from langchain.document_loaders import YoutubeLoader, CSVLoader, UnstructuredPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.tools import TavilySearchResults
from google.colab import userdata
# Configure API keys.
# Copy each secret stored in Colab's userdata into the environment variable
# of the same name.
for _secret_name in ("GOOGLE_API_KEY", "TAVILY_API_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

def process_file(file_path):
    """Load a CSV, PDF, or Word file and split its text into chunks.

    Args:
        file_path: Path of the file to load. The loader is selected from the
            file extension, compared case-insensitively: ``csv``, ``pdf``,
            ``docx`` or ``doc``.

    Returns:
        A list of strings produced by running every loaded document's
        ``page_content`` through a ``RecursiveCharacterTextSplitter``
        configured with ``chunk_size=1000`` and ``chunk_overlap=200``.

    Raises:
        ValueError: If the file extension is missing or not supported.
    """
    # os.path.splitext only looks at the last path component, so a dot in a
    # directory name (or a file with no extension at all) is not mistaken
    # for a file extension.
    file_extension = os.path.splitext(file_path)[1].lstrip('.').lower()

    # Pick the loader that matches the file type.
    if file_extension == 'csv':
        loader = CSVLoader(file_path=file_path)
    elif file_extension == 'pdf':
        loader = UnstructuredPDFLoader(file_path)
    elif file_extension in ('docx', 'doc'):
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError(f"不支持的文件類型: {file_extension or '(無副檔名)'}")

    # Load the documents.
    documents = loader.load()

    # Configure the text splitter.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    # Split each document's text and collect all pieces in one flat list.
    chunks = []
    for doc in documents:
        chunks.extend(text_splitter.split_text(doc.page_content))

    return chunks

def process_youtube(url):
    """Load the documents for a YouTube URL and split their text into chunks.

    Args:
        url: YouTube video URL handed to ``YoutubeLoader.from_youtube_url``.

    Returns:
        A flat list of strings: every loaded document's ``page_content``
        split with ``chunk_size=1000`` and ``chunk_overlap=200``.
    """
    video_docs = YoutubeLoader.from_youtube_url(url).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return [
        piece
        for video_doc in video_docs
        for piece in splitter.split_text(video_doc.page_content)
    ]

def create_knowledge_base(chunks):
    """Build a FAISS vector store from a list of text chunks.

    Args:
        chunks: Text strings to index.

    Returns:
        The store returned by ``FAISS.from_texts``, built with
        ``GoogleGenerativeAIEmbeddings`` using model ``models/embedding-001``.
    """
    return FAISS.from_texts(
        chunks,
        GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
    )

def tavily_search(query):
    """Run a Tavily search for ``query`` and return the tool's result.

    Args:
        query: Search text passed to ``TavilySearchResults().run``.

    Returns:
        Whatever the ``TavilySearchResults`` tool returns for the query.
    """
    return TavilySearchResults().run(query)
def answer_query(knowledge_base, query, use_web_search):
    """Answer a question from the knowledge base, optionally adding web search.

    A ``RetrievalQA`` "stuff" chain over ``knowledge_base.as_retriever()``
    produces the RAG answer. When ``use_web_search`` is true, the question is
    also sent to ``tavily_search`` and the model is asked to merge both
    sources into one answer.

    Args:
        knowledge_base: Vector store exposing ``as_retriever()``.
        query: The user's question.
        use_web_search: Whether to combine the RAG answer with Tavily results.

    Returns:
        A string prefixed with ``"RAG + 網路搜索回答:"`` when web search is
        used, otherwise prefixed with ``"僅RAG回答:"``.
    """
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-001")
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=knowledge_base.as_retriever()
    )

    # invoke() replaces the deprecated Chain.run(); RetrievalQA reads the
    # question from the "query" key and returns its answer under "result".
    rag_response = qa_chain.invoke({"query": query})["result"]

    if use_web_search:
        tavily_result = tavily_search(query)
        # invoke() replaces the deprecated predict(); the reply text is on
        # the returned message's .content attribute.
        combined_response = llm.invoke(f"""
        基於以下信息回答問題:

        RAG系統回答: {rag_response}

        網絡搜索結果: {tavily_result}

        請綜合上述信息,提供一個全面的回答。如果RAG系統的回答已經足夠全面,可以主要使用RAG的內容,
        並用網絡搜索結果進行補充或驗證。如果RAG系統的回答不夠完整,請使用網絡搜索結果來擴展回答。

        問題: {query}
        """).content
        return f"RAG + 網路搜索回答:\n{combined_response}"
    else:
        return f"僅RAG回答:\n{rag_response}"

def process_input(file, youtube_url, query, use_web_search):
    """Gradio callback: build a knowledge base from the inputs and answer.

    Args:
        file: Uploaded file from the ``gr.File`` input, or a falsy value when
            nothing was uploaded.
        youtube_url: Optional YouTube URL; blank means "not provided".
        query: The user's question.
        use_web_search: Whether the answer should also use web search.

    Returns:
        The answer text, or a user-facing message describing what is missing
        or which stage failed. Errors are returned as text rather than raised
        so they show up in the output textbox.
    """
    # Reject an empty question up front so no loading or embedding work is
    # done for a request that cannot be answered.
    if not query or not query.strip():
        return "請輸入您的問題"

    # Treat a whitespace-only URL the same as no URL.
    youtube_url = (youtube_url or "").strip()

    chunks = []
    if file:
        try:
            # NOTE(review): the upload may arrive as an object exposing
            # `.name` or as a plain path string depending on the Gradio
            # version/configuration — accept both.
            file_path = getattr(file, "name", file)
            chunks.extend(process_file(file_path))
        except Exception as e:
            return f"處理文件時發生錯誤: {str(e)}"
    if youtube_url:
        try:
            chunks.extend(process_youtube(youtube_url))
        except Exception as e:
            return f"處理YouTube URL時發生錯誤: {str(e)}"

    if not chunks:
        return "請上傳文件或提供YouTube URL"

    # Report failures of the embedding / answering stage the same way the
    # loading stages above do, instead of letting them escape to Gradio.
    try:
        knowledge_base = create_knowledge_base(chunks)
        return answer_query(knowledge_base, query, use_web_search)
    except Exception as e:
        return f"回答問題時發生錯誤: {str(e)}"

# Input widgets, listed in the same order as process_input's parameters.
input_components = [
    gr.File(label="上傳文件 (CSV, PDF, Word)"),
    gr.Textbox(label="YouTube URL (可選)"),
    gr.Textbox(label="輸入您的問題"),
    gr.Checkbox(label="使用網路搜索", value=True)
]

# Single textbox that shows the string returned by process_input.
output_component = gr.Textbox(label="回答")

iface = gr.Interface(
    fn=process_input,
    inputs=input_components,
    outputs=output_component,
    title="智能文檔和youtube影片查詢系統 (RAG + 可選網路搜索)",
    description="上傳CSV、PDF或Word文件,或輸入YouTube URL,然後提出問題。您可以選擇是否使用網路搜索來補充RAG系統的回答。",
    # Directory for flagged data; change it to the folder you want to use.
    flagging_dir="/content/my_flagged_data",
    flagging_options=[("儲存此對話內容到CSV","儲存問答")],
)

iface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3f3acfd207d587e23d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


