# Define StateSchema

In [4]:
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage
from typing_extensions import Annotated, Sequence, TypedDict

class MessagesState(TypedDict):
    """State schema for the graph: a single `messages` channel of chat messages."""
    # The `add_messages` annotation (from langgraph.graph.message) is attached as
    # metadata to the field; presumably LangGraph uses it as the reducer that
    # merges new messages into the existing sequence — confirm in LangGraph docs.
    messages: Annotated[Sequence[BaseMessage], add_messages]

## Initialize a new Chat

In [5]:
from utils import ChatHistoryManager
# Start a fresh chat session; its ID and directory are used by later cells
# (e.g. to name and locate the vector-store collection).
chat: ChatHistoryManager = ChatHistoryManager.new_chat()
# Interpolate the ID directly: the original f-string had no placeholder.
print(f"Chat ID: {chat.chat_id}")

Chat ID: 8d336eb9-3a14-4281-a5e3-9d527fc1c3a3


## Initialize the RAG System
Instantiating `TextDocumentRAG` from the `intellitube.rag` module automatically initializes a `Qdrant` client as the vector database.

In [6]:
import os
from typing import List
from intellitube.rag import TextDocumentRAG
from langchain_core.documents import Document

# One RAG store per chat: storage paths are derived from the chat's directory
# and the collection is named after the chat ID.
document_rag = TextDocumentRAG(
    path_on_disk=chat.chat_dirpath,
    # "collection" sub-folder inside the chat directory
    collection_path_on_disk=os.path.join(chat.chat_dirpath, "collection"),
    collection_name=chat.chat_id,
)

def add_to_vdb(docuemnts: List[Document]) -> None:
    """Split the given document(s) into chunks and add them to `document_rag`.

    Args:
        docuemnts: A list of `Document` objects. A single `Document` is also
            accepted and is wrapped into a one-element list. (The misspelled
            parameter name is kept so existing keyword callers keep working.)

    Returns:
        None. The documents are handed to `document_rag.add_documents`.
    """
    # isinstance (instead of an exact `type(...) ==` comparison) also wraps
    # instances of Document subclasses, which would otherwise be passed through
    # un-wrapped.
    if isinstance(docuemnts, Document):
        docuemnts = [docuemnts]

    document_rag.add_documents(
        docuemnts, split_text=True,
        split_config={
            "chunk_size": 512,
            "chunk_overlap": 128
        },
        # NOTE(review): presumably this skips the insert when the collection
        # already exists — confirm that a second document added to the same
        # chat is not silently dropped.
        skip_if_collection_exists=True,
    )

[32m2025-07-08 15:46:10.099[0m | [34m[1mDEBUG   [0m | [36mintellitube.rag[0m:[36minit_vector_store[0m:[36m66[0m - [34m[1mCreaing Client...[0m
[32m2025-07-08 15:46:10.111[0m | [34m[1mDEBUG   [0m | [36mintellitube.rag[0m:[36minit_vector_store[0m:[36m73[0m - [34m[1mCreaing vector store[0m


## Create Document Loader Tools

In [7]:
from langchain.tools import tool

### 1. Add YouTube Videos to the Vector Database

In [8]:
from utils import (
    YTContentData,
    webvtt_2_str,
    download_youtube_audio_or_transcript,
)

# Sample video URL; not referenced elsewhere in this cell.
# NOTE(review): looks like it is kept for manually trying out the tool below — confirm.
test_url = "https://www.youtube.com/watch?v=W3I3kAg2J7w&t=231s"

@tool
def load_youtube_transcript(youtube_url: str) -> str:
    """Load the given YouTube video's transcript to the vector database.
    It is required to answer user-queries based on the Transcript context."""
    # The docstring above is used as the tool description, so it is kept short;
    # implementation notes live in comments instead.
    #
    # youtube_url: URL of the video whose transcript should be indexed.
    # Returns: a short status message.

    # download the youtube transcript (or audio if transcript not available)
    yt_video_data: YTContentData = download_youtube_audio_or_transcript(
        video_url=youtube_url,
    )

    # The downloader may fall back to audio, in which case there is presumably
    # no transcript file to convert — report that instead of failing inside
    # the converter. TODO confirm how YTContentData signals a missing transcript.
    transcript_path = yt_video_data.transcript_path
    if not transcript_path:
        return "No transcript is available for this YouTube video!"

    # convert the WEBVTT format transcript to a plain text string
    vtt_str = webvtt_2_str(vtt_file_path=transcript_path)

    print(vtt_str[:100])    # print first 100 characters

    # add the transcript-string to the vector database; record the video URL
    # as the source, matching the metadata the text-file loader attaches
    add_to_vdb(Document(page_content=vtt_str, metadata={"source": youtube_url}))
    return "YouTube Video Transcript has been loaded successfully!"

### 2. Add PDF/Text Documents to the Vector Database

In [9]:
import os
from langchain_community.document_loaders import PyPDFLoader


@tool
def load_document(document_path: str) -> str:
    """Load the given Document's content to the vector database.
    It is required to answer user-queries based on the Document context."""
    # The docstring above is used as the tool description, so it is kept short;
    # implementation notes live in comments instead.
    #
    # document_path: path to a `.pdf` or `.txt` file (extension is matched
    #   case-insensitively).
    # Returns: a status message — success, or "Unsupported filetype" for any
    #   other extension. (The original was annotated `-> None` although every
    #   path returns a string.)

    # extension without the leading dot, lower-cased ('' when there is none)
    ext = os.path.splitext(document_path)[1][1:].lower()
    documents: List[Document]

    if ext == 'pdf':
        documents = PyPDFLoader(document_path).load()
    elif ext == 'txt':
        # Explicit encoding so the result does not depend on the platform's
        # locale default.
        with open(document_path, 'r', encoding='utf-8') as file:
            documents = [Document(
                page_content=file.read(),
                metadata={ "source": document_path }
            )]
    else:
        return f"Unsupported filetype: {ext}!"

    add_to_vdb(documents)
    return "The document has been loaded successfully!"
        

### 3. Add WebPages as Documents to the Vector Database

In [10]:
from langchain_community.document_loaders import WebBaseLoader

@tool
def load_webpage(webpage_url: str) -> str:
    """Load the given WebSite's content to the vector database.
    It is required to answer user-queries based on the WebPage's context."""
    # The docstring above is used as the tool description, so it is kept short;
    # implementation notes live in comments instead.
    #
    # webpage_url: URL of the page to fetch and index.
    # Returns: a status message. (The original was annotated `-> None`
    #   although it returns a string.)

    # WebBaseLoader(...).load() output is passed straight to the vector store helper
    add_to_vdb(WebBaseLoader(webpage_url).load())
    return "The webpage has been loaded successfully!"

USER_AGENT environment variable not set, consider setting it to identify your requests.


#### Finally, compile a list of the tools

In [12]:
# Collect every loader tool defined above into a single list.
# NOTE(review): presumably this list is later bound to the LLM/agent — confirm in the following cells.
tools = [load_youtube_transcript, load_document, load_webpage]

from pprint import pprint
# Pretty-print the tool objects to inspect the name, description and args schema of each
pprint(tools)

[StructuredTool(name='load_youtube_transcript', description="Load the given YouTube video's transcript to the vector database.\n    It is required to answer user-queries based on the the Transcript context.", args_schema=<class 'langchain_core.utils.pydantic.load_youtube_transcript'>, func=<function load_youtube_transcript at 0x7f45cba9e020>),
 StructuredTool(name='load_document', description="Load the given Document's content to the vector database.\n    It is required to answer user-queries based on the the Document context.", args_schema=<class 'langchain_core.utils.pydantic.load_document'>, func=<function load_document at 0x7f45cba9e7a0>),
 StructuredTool(name='load_webpage', description="Load the given WebSite's content to the vector database.\n    It is required to answer user-queries based on the the WebPage's context.", args_schema=<class 'langchain_core.utils.pydantic.load_webpage'>, func=<function load_webpage at 0x7f45cba9f560>)]
