<a href="https://colab.research.google.com/github/GuanRuLai/Python-project-Youtube-Video-Comment-Chatbot/blob/main/Youtube_video_comment_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary libraries

In [34]:
!pip install langchain
!pip install langchain_openai rich
!pip install youtube_search
!pip install yt-dlp
!pip install faiss-gpu
!pip install langchain-community

Collecting yt-dlp
  Downloading yt_dlp-2024.12.23-py3-none-any.whl.metadata (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.1/172.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2024.12.23-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2024.12.23


# Import libraries

In [35]:
import os
from google.colab import userdata
from langchain_community.tools import YouTubeSearchTool
from yt_dlp import YoutubeDL
from pytube.exceptions import VideoUnavailable, PytubeError
from openai import OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Set api key

In [52]:
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

# define a function for getting the url of the video

In [59]:
def search_url(query):
  tool = YouTubeSearchTool() # Build Youtube tool object
  result = tool.invoke(query + ",1")
  urls = eval(result) # Parse the string into a Python expression
  urls = [url.split("&")[0] for url in urls]
  return urls

# Define a function for downloading the the audio of the video

In [60]:
def url_download(url):
    try:
        # Set output file name
        drive_path = "/content/drive/MyDrive/Colab Notebooks/人工智慧-深度學習/深度學習模型實作"
        output_path = os.path.join(drive_path, "audio")

        # yt-dlp options for downloading audio
        ydl_opts = {
            'format': 'bestaudio/best', # Download the best quality audio
            'outtmpl': output_path,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio', # Extract audio
                'preferredcodec': 'mp3', # Convert to mp3
                'preferredquality': '192', # Audio quality
            }],
        }

        # Download the audio
        with YoutubeDL(ydl_opts) as ydl:
            print(f"開始下載: {url}")
            ydl.download([url])
            print("音訊下載成功")
    except Exception as e:
        print(f"下載過程中出現錯誤: {e}")

# Define a function for ASR using OpenAI Whisper API

In [61]:
def audio_to_text(file_path):
  client = OpenAI()
  audio_file = open(file_path, "rb")
  transcript = client.audio.transcriptions.create(
      model="whisper-1",
      file=audio_file
    )
  return transcript.text

# Define a function for data processing & RAG chain building

In [62]:
def rag(text):
  # split the data
  text_splitter = RecursiveCharacterTextSplitter(separators=[" "],
                                               chunk_size=300,
                                               chunk_overlap=20)
  splits = text_splitter.split_text(text)

  # convert text to vector & store into vector database
  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
  db = FAISS.from_texts(splits, embeddings)
  db.save_local("/content/drive/MyDrive/Colab Notebooks/人工智慧-深度學習/深度學習模型實作/youtube_db")

  new_db = FAISS.load_local(
      folder_path="/content/drive/MyDrive/Colab Notebooks/人工智慧-深度學習/深度學習模型實作/youtube_db",
      embeddings=embeddings,
      allow_dangerous_deserialization=True)

  # create conversation chain
  chat_model = ChatOpenAI(model="gpt-3.5-turbo")

  str_parser = StrOutputParser()

  template = (
      "你是專業的小幫手，請根據以下內容加上自身判斷回答問題:\n"
      "{context}\n"
      "問題: {question}"
      )
  prompt = ChatPromptTemplate.from_template(template)

  retriever = new_db.as_retriever()

  chain = (
      {"context": retriever, "question": RunnablePassthrough()}
      | prompt
      | chat_model
      | str_parser
  )
  return chain

# Main program

In [63]:
while True:
    msg = input("請問您要查詢的影片關鍵字是？：")
    if not msg.strip():
        break

    urls = search_url(msg)
    print(urls)
    url_download(urls[0])
    print("處理中，請稍後...")
    text = audio_to_text("/content/drive/MyDrive/Colab Notebooks/人工智慧-深度學習/深度學習模型實作/audio.mp3")
    print("處理完畢，可以開始進行問答！")

    chain = rag(text)

    while True:
        msg = input("我的問題：")
        if not msg.strip():
            break
        response = chain.invoke(msg)
        print(response)

請問您要查詢的影片關鍵字是？：海與天之間
['https://www.youtube.com/watch?v=xf-ghX2wB6o']
開始下載: https://www.youtube.com/watch?v=xf-ghX2wB6o
[youtube] Extracting URL: https://www.youtube.com/watch?v=xf-ghX2wB6o
[youtube] xf-ghX2wB6o: Downloading webpage
[youtube] xf-ghX2wB6o: Downloading ios player API JSON
[youtube] xf-ghX2wB6o: Downloading mweb player API JSON
[youtube] xf-ghX2wB6o: Downloading m3u8 information
[info] xf-ghX2wB6o: Downloading 1 format(s): 251
[download] /content/drive/MyDrive/Colab Notebooks/人工智慧-深度學習/深度學習模型實作/audio has already been downloaded
[download] 100% of    3.65MiB
[ExtractAudio] Destination: /content/drive/MyDrive/Colab Notebooks/人工智慧-深度學習/深度學習模型實作/audio.mp3
Deleting original file /content/drive/MyDrive/Colab Notebooks/人工智慧-深度學習/深度學習模型實作/audio (pass -k to keep)
音訊下載成功
處理中，請稍後...
處理完畢，可以開始進行問答！
我的問題：請問這首歌表達甚麼意境？
根據提供的歌詞內容，這首歌表達了一種失去與想念的情感。歌詞中描述了一個人在愛情中的迷失與無法放下的感受，以及對離別後的思念和無法忘懷的痛苦。整首歌透露出深沉的思念和無法忘記的情感，表達了對過去愛情的回憶和對對方的思念之情。
我的問題：
請問您要查詢的影片關鍵字是？：
