## Initialize a new Chat

In [None]:
from intellitube.utils import ChatHistoryManager
# Start a fresh chat session; its id and directory are reused by later cells.
chat: ChatHistoryManager = ChatHistoryManager.new_chat()
# Plain string literal: the original f-string had no placeholders.
print("Chat ID:", chat.chat_id)

# Clean up chats that are no longer listed (the captured output below shows
# this call logging an error for every chat it cannot find).
chat.remove_unlisted_chats()

[32m2025-07-10 18:03:56.666[0m | [31m[1mERROR   [0m | [36mutils.chat_history[0m:[36mremove_unlisted_chats[0m:[36m209[0m - [31m[1mChat: 95c6a153-f7eb-4e22-a075-0e24f18c5b44 is non-existent![0m
[32m2025-07-10 18:03:56.666[0m | [31m[1mERROR   [0m | [36mutils.chat_history[0m:[36mremove_unlisted_chats[0m:[36m209[0m - [31m[1mChat: 74c0961d-2f87-448b-abcf-cbe751bd614d is non-existent![0m
[32m2025-07-10 18:03:56.667[0m | [31m[1mERROR   [0m | [36mutils.chat_history[0m:[36mremove_unlisted_chats[0m:[36m209[0m - [31m[1mChat: 3af7c60f-6458-442a-b0ed-ec173c6d667a is non-existent![0m


Chat ID: ebe0960e-eec1-4253-80aa-d870bce63e8d


## Initialize the RAG System
Using `TextDocumentRAG()` from the `intellitube.rag` module automatically initializes a `Qdrant` client as the vector database.

In [None]:
import os
from typing import List
from intellitube.rag import TextDocumentRAG
from langchain_core.documents import Document

# RAG helper for the current chat: its on-disk location is the chat directory,
# the collection lives in a "collection" sub-directory of it, and the
# collection is named after the chat id.
document_rag = TextDocumentRAG(
    path_on_disk=chat.chat_dirpath,
    collection_path_on_disk=os.path.join(chat.chat_dirpath, "collection"),
    collection_name=chat.chat_id,
)

def add_to_vdb(docuemnts: List[Document]) -> None:
    """Split the given document(s) into chunks and add them to the vector database.

    Args:
        docuemnts: A list of `Document` objects. A single `Document` (including
            an instance of a `Document` subclass) is also accepted and is
            wrapped in a one-element list. The misspelled parameter name is
            kept so existing keyword callers keep working.
    """
    # `isinstance` instead of an exact `type(...) ==` comparison, so that
    # Document subclasses are wrapped in a list as well.
    if isinstance(docuemnts, Document):
        docuemnts = [docuemnts]

    document_rag.add_documents(
        docuemnts, split_text=True,
        # Chunking parameters handed to the RAG helper's text splitter.
        split_config={
            "chunk_size": 512,
            "chunk_overlap": 128
        },
        # NOTE(review): presumably skips ingestion when the collection already
        # exists; if so, only the first source of a chat would be stored.
        # Confirm against TextDocumentRAG.add_documents.
        skip_if_collection_exists=True,
    )

## Create Document Loader Tools

In [None]:
from langchain.tools import tool

### 1. Add YouTube Videos to the Vector Database

In [None]:
from utils import (
    YTContentData,
    webvtt_2_str,
    download_youtube_audio_or_transcript,
)

# Sample YouTube URL; not referenced elsewhere in the visible notebook
# (presumably kept for manually trying out `load_youtube_transcript`).
test_url = "https://www.youtube.com/watch?v=W3I3kAg2J7w&t=231s"

# @tool
def load_youtube_transcript(youtube_url: str) -> str:
    """Load the given YouTube video's transcript to the vector database.
    It is required to answer user-queries based on the Transcript context.

    Args:
        youtube_url: URL of the YouTube video whose transcript should be loaded.

    Returns:
        A fixed status message confirming that the transcript was loaded.
    """

    print("Loading Youtube Transcript...")

    # download the youtube transcript (or audio if transcript not available)
    yt_video_data: YTContentData = download_youtube_audio_or_transcript(
        video_url=youtube_url,
    )

    # convert the WEBVTT format transcript to a plain text string
    # NOTE(review): if only audio was downloaded, `transcript_path` may not point
    # to a transcript file; confirm how the helper reports that case.
    vtt_str = webvtt_2_str(vtt_file_path=yt_video_data.transcript_path)

    print(vtt_str[:100])    # print first 100 characters

    # add the transcript-string to the vector database; record the video URL as
    # the source, mirroring the `source` metadata that `load_document` stores
    add_to_vdb(Document(page_content=vtt_str, metadata={"source": youtube_url}))
    return "YouTube Video Transcript has been loaded successfully!"

### 2. Add PDF/Text Documents to the Vector Database

In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader


# @tool
def load_document(document_path: str) -> str:
    """Load the given Document's content to the vector database.
    It is required to answer user-queries based on the Document context.

    Args:
        document_path: Path of a `.pdf` or `.txt` file. A leading `~` is
            expanded to the user's home directory.

    Returns:
        A status message: success, or "Unsupported filetype: <ext>!" when the
        extension is neither `pdf` nor `txt` (nothing is loaded in that case).
    """
    print("Loading Document...")

    # Paths such as "~/notes.txt" are not resolved by open() on their own.
    document_path = os.path.expanduser(document_path)
    # Extension without the leading dot, lower-cased ("" when there is none).
    ext = os.path.splitext(document_path)[1][1:].lower()
    documents: List[Document]

    if ext == 'pdf':
        documents = PyPDFLoader(document_path).load()
    elif ext == 'txt':
        # Explicit encoding: the default depends on the platform locale.
        with open(document_path, 'r', encoding='utf-8') as file:
            documents = [Document(
                page_content=file.read(),
                metadata={ "source": document_path }
            )]
    else:
        return f"Unsupported filetype: {ext}!"

    add_to_vdb(documents)
    return "The document has been loaded successfully!"
        

### 3. Add WebPages as Documents to the Vector Database

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# @tool
def load_webpage(webpage_url: str) -> str:
    """Load the given WebSite's content to the vector database.
    It is required to answer user-queries based on the WebPage's context.

    Args:
        webpage_url: URL of the web page to fetch and store.

    Returns:
        A fixed status message confirming that the page was loaded.
    """
    # Announce the work before starting it, as the other loaders do; the
    # original printed this only after the page had already been loaded.
    print("Loading Webpage...")
    add_to_vdb(WebBaseLoader(webpage_url).load())
    return "The webpage has been loaded successfully!"

### Pass the Query Tool
This is a function to be called by the Agent if none of the other tools can be used.

In [None]:
# @tool
def pass_user_query(user_query: str) -> str:
    """Use this tool when none of the other tools are useful.

    Args:
        user_query: The user's query, passed through unchanged.

    Returns:
        The query prefixed with "User: ". The original annotation said `None`
        although a string is returned.
    """
    print(f"Passes User Query: {user_query}")
    return f"User: {user_query}"

#### Finally, compile a list of the tools

In [None]:
# document_loader_tools = [load_youtube_transcript, load_document, load_webpage, pass_user_query]

# from pprint import pprint
# pprint(document_loader_tools)

# Dispatch table from a source type ("document", "youtube_video", "website")
# to the loader function that ingests that kind of source.
document_loader_functions = dict(
    document=load_document,
    youtube_video=load_youtube_transcript,
    website=load_webpage,
)

## Choose an LLM

In [None]:
from typing import Literal, Optional
# from langchain.chat_models import init_chat_model
from langchain_core.language_models import BaseChatModel

from dotenv import load_dotenv
load_dotenv()

def select_llm(
    model_provider: Literal['openai', 'groq', 'nvidia', 'google', 'ollama'],
    model_name: Optional[str] = None,
    temperature: float = 0.0,
) -> BaseChatModel:
    """Create a chat model for the requested provider.

    Each provider's integration package is imported only inside its own
    branch, so only the selected provider's import is executed.

    Args:
        model_provider: One of 'openai', 'groq', 'nvidia', 'google', 'ollama'.
        model_name: Model identifier; when falsy, a per-provider default is used.
        temperature: Sampling temperature passed to the model.

    Returns:
        The instantiated chat model.

    Raises:
        ValueError: If `model_provider` is not one of the supported values.
    """
    if model_provider == 'openai':
        from langchain_openai import ChatOpenAI
        chosen_model = model_name or "gpt-4o-mini"
        return ChatOpenAI(model=chosen_model, temperature=temperature)

    if model_provider == 'groq':
        from langchain_groq import ChatGroq
        chosen_model = model_name or "llama-3.3-70b-versatile"
        return ChatGroq(model=chosen_model, temperature=temperature)

    if model_provider == 'nvidia':
        from langchain_nvidia_ai_endpoints import ChatNVIDIA
        # alternative default: "nvidia/llama-3.1-nemotron-51b-instruct"
        chosen_model = model_name or "mistralai/mistral-small-24b-instruct"
        return ChatNVIDIA(model=chosen_model, temperature=temperature)

    if model_provider == 'google':
        from langchain_google_genai import ChatGoogleGenerativeAI
        chosen_model = model_name or "gemini-2.0-flash"
        return ChatGoogleGenerativeAI(model=chosen_model, temperature=temperature)

    if model_provider == 'ollama':
        from langchain_ollama import ChatOllama
        # alternative default: "granite3.3:8b"
        chosen_model = model_name or "llama3.2:3b"
        return ChatOllama(model=chosen_model, temperature=temperature)

    raise ValueError(f"Invalid model_provider: {model_provider}")

#### Test the LLM

In [None]:
# Set to True to send one sample prompt to the selected model.
TEST_LLM = False
# Module-level chat model used by the node functions defined later.
llm = select_llm(model_provider='google')
# llm = select_llm(model_provider='groq')
# llm = select_llm(model_provider='ollama')

if TEST_LLM:
    resp = llm.invoke("What is superiority complex? Respond with a nicely structured & formatted answer!")
    # Raw response object first, then its content rendered as Markdown.
    print(resp)
    
    from IPython.display import display, Markdown
    display(Markdown(resp.content))

## Define State Schema

### Messages State Schema

In [None]:
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage
from typing_extensions import Annotated, Sequence, TypedDict

class MessagesState(TypedDict):
    """Graph state holding the conversation.

    `messages` is a sequence of `BaseMessage` objects annotated with
    `add_messages` (LangGraph's message reducer).
    """
    messages: Annotated[Sequence[BaseMessage], add_messages]

## Create Nodes

### Router Node
This router will decide if the user has provided any Document/Website URL/YouTube URL. Depending on the type of URL it will call a function to load the document or just redirect the query to a RAG Agent for direct response generation if no URL is provided.

### 💡 Idea Behind Router Agent
The Router Agent will take the query and identify the following features in it:
 - User Query
 - URL/Path of a file/website/youtube-video

And then it will return a structured output with the following information:
 - `url`: URL/Path mentioned in the provided query (Can be `None` if absent)
 - `user_query`: The user query
 - `url_of`: `"website"`, `"youtube_video"`, `"document"`

In [None]:
from pydantic import BaseModel, Field
from typing_extensions import Any, Optional
from langchain_core.messages import AIMessage, SystemMessage

In [None]:
# Prompt for the router/extraction call. It asks for the user's query with any
# URL/path removed, the URL/path itself, and its type.
# Corrections relative to the earlier prompt text:
#   - Example 3 no longer keeps the file name inside `user_query` (rule 2).
#   - Example 4's heading now describes the input (one path inside a sentence).
#   - Example 7's output escapes backslashes so the rendered example is valid
#     JSON (in this non-raw Python string, "\\\\" renders as "\\").
system_message = SystemMessage("""
You are an extraction agent. Your task is to return a JSON response in this exact format:

{
  "user_query": "<EXACT user query WITHOUT any URLs or paths>",
  "url": "<the full URL or local path if present, else null>",
  "url_of": "<one of 'youtube_video', 'website', or 'document' if URL/path is present, else null>"
}

⚠️ VERY STRICT RULES (follow them or your output is invalid):
1. DO NOT paraphrase, correct, or modify the user's query — copy it EXACTLY as it appears.
2. REMOVE all URLs and file paths from the `user_query`.
3. IF a URL or file path exists, assign it to the `url` field and classify it using `url_of`.
4. `url_of` MUST be one of: "youtube_video", "website", or "document". Never invent new types.
5. IF no URL/path is found, both `url` and `url_of` must be null or omitted.

====================
EXAMPLES:

# Example 1 (simple website URL):
Input: How to use LangChain structured output? https://docs.langchain.com/docs/structured_outputs
Output:
{
  "user_query": "How to use LangChain structured output?",
  "url": "https://docs.langchain.com/docs/structured_outputs",
  "url_of": "website"
}

# Example 2 (YouTube video link):
Input: Summarize this video https://www.youtube.com/watch?v=dQw4w9WgXcQ
Output:
{
  "user_query": "Summarize this video",
  "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
  "url_of": "youtube_video"
}

# Example 3 (file name with extension):
Input: Convert file.txt to JSON
Output:
{
  "user_query": "Convert to JSON",
  "url": "file.txt",
  "url_of": "document"
}

# Example 4 (file path mentioned inside a sentence):
Input: I saved it in ./notes/lecture1.md. Please summarize.
Output:
{
  "user_query": "Please summarize.",
  "url": "./notes/lecture1.md",
  "url_of": "document"
}

# Example 5 (no URL or file path):
Input: You should stop wasting your time
Output:
{
  "user_query": "You should stop wasting your time"
}

# Example 6 (unclear context — do NOT assume):
Input: Read this https://mystery.link/something
Output:
{
  "user_query": "Read this",
  "url": "https://mystery.link/something",
  "url_of": "website"
}

# Example 7 (file path with Windows format):
Input: Please check C:\\Users\\Me\\Desktop\\data.csv
Output:
{
  "user_query": "Please check",
  "url": "C:\\\\Users\\\\Me\\\\Desktop\\\\data.csv",
  "url_of": "document"
}
====================

🧠 TIP: If you are unsure about the type of the `url`, classify based on the extension or domain. If no clue, default to "website".

NOW RETURN ONLY THE JSON OBJECT. Do NOT add explanations, comments, or markdown.
""")


In [None]:
# Structured-output schema for the router: the user's query with any URL/path
# removed, the URL/path itself (optional), and what that URL/path points to
# (optional). Documented with comments rather than a class docstring on
# purpose, so the schema handed to the LLM stays exactly as the fields
# describe it.
class RouterAgentResponse(BaseModel):
    # Required: the query text without any URL/path.
    user_query: str = Field(description=(
        "The user's original query EXACTLY as it appears, without any modification, rewording, or interpretation.\n"
        "You MUST NOT include any URLs, file paths, or hyperlinks in this field — only the natural language query.\n"
        "Preserve the casing, punctuation, and wording. Do NOT fix typos or grammar."
    ))
    # Optional: defaults to None when the input contains no URL/path.
    url: Optional[str] = Field(default=None, description=(
        "The exact URL or local document/file path mentioned in the user's input.\n"
        "If there is no URL or file path, leave this as null (do not fabricate one).\n"
        "Example: 'https://example.com/page', 'C:/Documents/myfile.txt', './notes.md'"
    ))
    # Optional: restricted to the three literal values below; None when no URL/path.
    url_of: Optional[Literal["youtube_video", "website", "document"]] = Field(default=None, description=(
        "The type of content the `url` field refers to:\n"
        "- 'youtube_video': if it's a YouTube video link\n"
        "- 'website': for general websites or web pages\n"
        "- 'document': for file paths (like .txt, .pdf, .md, etc.)\n"
        "If no URL/path is provided, this should be null."
    ))

In [None]:
# Old RouterAgentResponse
# NOTE: this block and the "Old Router Agent Node" block below are bare
# triple-quoted string literals used to park superseded code. Nothing inside
# them is defined or executed; each string ends at its "# '''" line.

'''
class RouterAgentResponse(BaseModel):
    user_query: str = Field(description="The EXACT user query/question/statement present in the given text (anything other than the URL/Path).")
    # Optional Field
    url: Optional[str] = Field(description=(
            "The URL or Local Document Path mentioned in the provided text."
            " No need to provide a value if the URL is absent in the given text."
        ), default=None)
    # Optional Field
    url_of: Optional[Literal["youtube_video", "website", "document"]] = Field(description=(
        "What the provided URL/Path represents.\n"
        "The values must be one of these:\n"
        " - youtube_video: If the provided URL represents a YouTube Video.\n"
        " - website: If the provided URL represents a website.\n"
        " - document: If the provided URL is a Path representing a document.\n"
        "No need to provide a value if the URL is absent in the given text."
    ), default=None)
# '''

# Old Router Agent Node
# (tool-calling variant; it refers to `document_loader_tools`, whose
# definition is commented out earlier in the notebook)
'''
def router_agent_node(state: MessagesState) -> MessagesState:
    user_query: str = state["messages"][-1].content
    tools_llm = llm.bind_tools(tools=document_loader_tools)
    
    system_prompt = SystemMessage(
f"""You are a very helpful assistant. You have access to {len(document_loader_tools)} tools.

Here is when to use which tool:
    - load_youtube_transcript: To load an YouTube Video's transcript
    - load_document: To load a Text/PDF document (can be a local path)
    - load_webpage: To laod an WebPage
    - pass_user_query: When you find no URL/Path in the user query

Here is the user query: {user_query}

Observe the user query and if you see any URL/Path of a file/document/YT Video/Website use the necessary tool to load it.
If you see no URL then just use the `pass_user_query` tool to pass the query to the next Agent.

You MUST use ONE of the above tools. DO NOT generate any extra text beyond what's instructed.
    """)

    ai_msg: AIMessage = tools_llm.invoke([system_prompt] + state["messages"])
    # Validate id the ai_msg has tool calls (IT MUST)
    # ...
    return {"messages": [ai_msg]}
# '''


def router_agent_node(state: MessagesState) -> MessagesState:
    """Extract the query, URL/path and source type from the latest message.

    Sends the module-level `system_message` prompt together with the last
    message in `state["messages"]` to the LLM, with the output constrained to
    the `RouterAgentResponse` schema.

    The unused local prompt (`_system_message`) that the earlier version built
    on every call has been removed; it was never sent to the model.

    Args:
        state: Graph state; only the last entry of `state["messages"]` is read.

    Returns:
        A dict whose `messages` list holds the single structured result.
    """
    structured_llm = llm.with_structured_output(RouterAgentResponse)
    # The structured call yields a RouterAgentResponse (the notebook's own test
    # cell calls `.model_dump_json()` on it), not an AIMessage as previously
    # annotated.
    # NOTE(review): because it is not a chat message, confirm how the
    # `add_messages` reducer treats it before using this function directly as
    # a graph node.
    routed: RouterAgentResponse = structured_llm.invoke([system_message, state["messages"][-1]])
    return {"messages": [routed]}

In [None]:
# Set to True to run the sample queries through the router.
TEST_DOC_LOAD_ROUTER = False

if TEST_DOC_LOAD_ROUTER:
    # Imported here because this cell runs before the notebook's other
    # HumanMessage imports.
    from langchain_core.messages import HumanMessage

    queries = [
        # Website URL
        "What is the model name mentioned?\nhttps://build.nvidia.com/nvidia/llama-3_1-nemotron-51b-instruct",
        # YouTube URL
        "What do you see here?\nhttps://www.youtube.com/watch?v=W3I3kAg2J7w&t=231s",
        # Local Document Path
        "Summarize this document: ~/data.json",
        # Just a normal Query
        "Why oranges are red and violates are blue?",
        # Path embedded in the middle of a sentence
        "This asdj/klrt/1234.py file appears to be corrupted. Can you read it?"
    ]

    for i, query in enumerate(queries):
        print("-"*15, f"Test {i + 1}", "*"*15, end='\n\n')
        # `router_agent_node` takes a state dict; the previous call passed a
        # `user_query=` keyword, which the function does not accept.
        response = router_agent_node({"messages": [HumanMessage(query)]})
        print(response["messages"][0].model_dump_json())
        print("\n\n")


# Boolean flag, like the other TEST_* switches (previously the truthy int 1).
# While it is True, running this cell sends every sample query to the LLM.
TEST_ROUTER_NODE = True

if TEST_ROUTER_NODE:
    from langchain_core.messages import HumanMessage

    queries = [
        # Website URL
        "What is the model name mentioned?\nhttps://build.nvidia.com/nvidia/llama-3_1-nemotron-51b-instruct",
        # YouTube URL
        "What do you see here?\nhttps://www.youtube.com/watch?v=W3I3kAg2J7w&t=231s",
        # Local Document Path
        "Summarize this document: ~/data.json",
        # Just a normal Query
        "Why oranges are red and violates are blue?",
        # Path embedded in the middle of a sentence
        "This asdj/klrt/1234.py file appears to be corrupted. Can you read it?"
    ]

    for query in queries:
        print("Query:", query)
        state = MessagesState(messages=[HumanMessage(query)])
        resp = router_agent_node(state)
        # The node returns the structured result as the only list entry.
        print(resp["messages"][0].model_dump_json())

## Document Loader Function Executor

### Chat Agent Node

In [None]:
def chat_agent_node(state: MessagesState) -> MessagesState:
    """Generate the assistant's reply to the conversation so far.

    Prepends the IntelliTube system prompt to `state["messages"]` and invokes
    the module-level `llm`.

    Args:
        state: Graph state containing the conversation messages.

    Returns:
        A state update whose `messages` list holds the model's reply.
    """
    # "partner" was misspelled as "parter" in the prompt sent to the model.
    system_prompt = SystemMessage("""You are IntelliTube AI, a smart research partner for the user.""")
    ai_msg: AIMessage = llm.invoke([system_prompt] + state["messages"])
    return {"messages": [ai_msg]}

## Create the Agent

In [None]:
from langgraph.prebuilt import ToolNode
from langgraph.graph import START, END, StateGraph

def document_loader_node(state: MessagesState) -> MessagesState:
    """Route the latest user message and load any referenced source.

    Calls `router_agent_node` on the current state, and when the result
    carries a URL/path with a known `url_of` type, runs the matching function
    from `document_loader_functions` and prints its status message.

    The routing result is not written to the state: it is a
    `RouterAgentResponse`, not a chat message, so an empty update is returned
    and the conversation reaches the chat agent unchanged.
    """
    routed = router_agent_node(state)["messages"][0]

    loader = None
    if routed is not None and routed.url:
        # `.get` yields None for a missing or unknown `url_of`.
        loader = document_loader_functions.get(routed.url_of)

    if loader is not None:
        print(loader(routed.url))

    return {"messages": []}


# The previous wiring built `ToolNode(tools=document_loader_tools)`, but
# `document_loader_tools` is commented out earlier in the notebook, so this cell
# raised NameError and `agent` was never created. The router also returns
# structured output rather than tool calls, so routing and loading now happen
# in a single node that dispatches through `document_loader_functions`.
graph = (
    StateGraph(state_schema=MessagesState)
    .add_node("document_loader", document_loader_node)
    .add_node("chat_agent", chat_agent_node)
    .add_edge(START, "document_loader")
    .add_edge("document_loader", "chat_agent")
    .add_edge("chat_agent", END)
)

agent = graph.compile()

In [None]:
from IPython.display import Image, display
# Show the compiled agent's graph inline as a PNG drawn from its Mermaid diagram.
display(Image(agent.get_graph().draw_mermaid_png()))

## Chat Function

In [2]:
from langchain_core.messages import HumanMessage

def chat_loop() -> None:
    """Run an interactive console chat until the user enters `/exit`.

    Each non-empty input is wrapped in a `HumanMessage`, added to the chat
    history, and passed through the compiled `agent`. The chat's messages are
    then replaced with the agent's resulting message list and the last message
    is pretty-printed.

    `chat.end_chat()` is always called on the way out, including when the loop
    is left through an exception or a keyboard interrupt.
    """
    try:
        while True:
            user_text: str = input(">> ").strip()

            if user_text.lower() == "/exit":
                break
            if not user_text:
                # Nothing typed: prompt again rather than send an empty message.
                continue

            # A separate name for the message object; the earlier version
            # rebound the `str` variable to a HumanMessage.
            human_msg = HumanMessage(user_text)
            chat.add_message(human_msg)
            chat.chat_messages = agent.invoke({"messages": chat.chat_messages})["messages"]
            ai_msg: AIMessage = chat.chat_messages[-1]
            ai_msg.pretty_print()
    finally:
        chat.end_chat()

In [3]:
# Start the interactive session; it reads console input until "/exit" is entered.
chat_loop()

NameError: name 'agent' is not defined