# Self Query Retriever Test Notebook

In [1]:
import os
import shutil
from loguru import logger
from intellitube.utils import ChatManager
from intellitube.vector_store import VectorStoreManager

## Create Vector Store

In [2]:
# On-disk locations for the test vector store; the collections directory
# lives inside the store directory.
vs_path = "test_data/qdrant_vector_store/test_vct_store"
vsc_path = os.path.join(vs_path, "collections")

# Start from a clean slate: delete whatever a previous run left behind.
# The nested collections directory is handled before its parent.
for path in (vsc_path, vs_path):
    if not os.path.exists(path):
        continue
    logger.info(f"Removing path: {path}")
    shutil.rmtree(path)

# Build the store manager on the (now empty) locations.
vectorstore = VectorStoreManager(path_on_disk=vs_path, collection_path_on_disk=vsc_path)

[32m2025-07-12 16:40:13.514[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mRemoving path: test_data/qdrant_vector_store/test_vct_store[0m
[32m2025-07-12 16:40:24.033[0m | [34m[1mDEBUG   [0m | [36mintellitube.vector_store[0m:[36minit_vector_store[0m:[36m73[0m - [34m[1mCreaing Client...[0m
[32m2025-07-12 16:40:24.245[0m | [34m[1mDEBUG   [0m | [36mintellitube.vector_store[0m:[36minit_vector_store[0m:[36m81[0m - [34m[1mCreaing vector store[0m


## Loading YouTube Video Transcript

In [3]:
from intellitube.utils import (
    YTContentData,
    download_youtube_audio_or_transcript,
    webvtt_2_str,
)

# Source video: Steve Jobs' 2005 Stanford Commencement Address
video_url = "https://www.youtube.com/watch?v=UF8uR6Z6KLc"

# Obtain the content record for the video, then convert its transcript file
# (referenced by `transcript_path`) into a single string.
video_data: YTContentData = download_youtube_audio_or_transcript(video_url)
transcript_file = video_data.transcript_path
vtt_str = webvtt_2_str(vtt_file_path=transcript_file)

# Show the returned record followed by a blank line, then the first
# 500 characters of the transcript text as a sanity check.
print(video_data, end='\n\n')
print(vtt_str[:500])

[32m2025-07-12 16:40:24.259[0m | [34m[1mDEBUG   [0m | [36mintellitube.utils.youtube[0m:[36mdownload_youtube_audio_or_transcript[0m:[36m133[0m - [34m[1mCache exists, validating cache...[0m
[32m2025-07-12 16:40:24.260[0m | [34m[1mDEBUG   [0m | [36mintellitube.utils.youtube[0m:[36mdownload_youtube_audio_or_transcript[0m:[36m145[0m - [34m[1mCache contains the requested data. Using cache.[0m


type='text' transcript_path='test_data/cache/youtube/downloads/0290de54-13f5-4bbe-b3de-7b06f46a5f07.vtt' audio_path=None

This program is brought to you by Stanford University.
Please visit us at stanford.edu
Thank You. I am honored to be with you today at your commencement
from one of the finest universities in the world.
Truth be told I never graduated from college
and this is the closest I've ever gotten to a college graduation.
Today I want to tell you three stories from my life. That's it.
No big deal. Just three stories.
The first story is about connecting the dots.
I dropped out of Reed College after the fir


## Adding to Vector Database

### Split Text

In [4]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the transcript into chunks of up to 512 units with a 256-unit overlap
# (units are characters under the splitter's default length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=256)
texts = text_splitter.split_text(vtt_str)

# Report the number of chunks and the first chunk, each followed by a dashed rule.
separator = "\n" + "-"*20 + "\n\n"
print(len(texts), end=separator)
print(texts[0], end=separator)

# Wrap every chunk in a Document tagged with the source URL and its
# 1-based position within the transcript.
documents = [
    Document(
        page_content=chunk,
        metadata={"source": video_url, "chunk_index": position},
    )
    for position, chunk in enumerate(texts, start=1)
]

47
--------------------

This program is brought to you by Stanford University.
Please visit us at stanford.edu
Thank You. I am honored to be with you today at your commencement
from one of the finest universities in the world.
Truth be told I never graduated from college
and this is the closest I've ever gotten to a college graduation.
Today I want to tell you three stories from my life. That's it.
No big deal. Just three stories.
The first story is about connecting the dots.
I dropped out of Reed College after the first 6 months,
--------------------



In [5]:
# The documents were already chunked in the splitting cell, so the manager is
# told not to split them again. NOTE(review): `skip_if_collection_exists`
# presumably makes this a no-op when the collection is already present —
# confirm against VectorStoreManager.add_documents.
add_options = {
    "split_text": False,
    "skip_if_collection_exists": True,
}
vectorstore.add_documents(documents, **add_options)

[32m2025-07-12 16:40:24.352[0m | [1mINFO    [0m | [36mintellitube.vector_store[0m:[36madd_documents[0m:[36m108[0m - [1mAdding documents...[0m


In [None]:
# TODO: explore `vectorstore.retriever` (unfinished scratch cell; remove once resolved).

## Creating the retriever

In [6]:
# Minimum relevance score a hit must reach to be returned by the retriever.
score_threshold = 0.6

# Build a retriever on the underlying vector store that filters results
# by the similarity score threshold above.
retriever = vectorstore.vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": score_threshold},
)