# Self Query Retriever Test Notebook

In [None]:
import os
import shutil
from loguru import logger
from intellitube.utils import ChatManager
from intellitube.vector_store import VectorStoreManager

## Create Vector Store

In [None]:
vs_path = "test_data/qdrant_vector_store/test_vct_store"
vsc_path = os.path.join(vs_path, "collections")
collection_name = "Steve Jobs' 2005 Stanford Commencement Address"

for path in [vsc_path, vs_path]:
    if os.path.exists(path):
        logger.info(f"Removing path: {path}")
        shutil.rmtree(path)

vectorstore = VectorStoreManager(
    path_on_disk=vs_path,
    collection_path_on_disk=vsc_path,
    collection_name=collection_name,
)

## Loading YouTube Video Transcript

In [None]:
from intellitube.utils import (
    YTContentData, download_youtube_audio_or_transcript, webvtt_2_str
)

video_url = "https://www.youtube.com/watch?v=UF8uR6Z6KLc"
video_data: YTContentData = download_youtube_audio_or_transcript(video_url)
vtt_str = webvtt_2_str(vtt_file_path=video_data.transcript_path)

print(video_data, end='\n\n')
print(vtt_str[:500])

## Adding to Vector Database

### Split Text

In [None]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=256,
)
texts = text_splitter.split_text(vtt_str)

print(len(texts), end="\n" + "-"*20 + "\n\n")
print(texts[0], end="\n" + "-"*20 + "\n\n")

documents = []
for i, chunk in enumerate(texts):
    document = Document(
        page_content=chunk,
        metadata={
            "source": video_url,
            "chunk_index": i + 1,
        }
    )
    documents.append(document)

In [None]:
vectorstore.add_documents(
    documents, split_text=False,
    skip_if_collection_exists=True,
)

In [None]:
# vectorstore.retriever.

## Creating the retriever

In [None]:
retriever = vectorstore.vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={'score_threshold': 0.6}
)

# Testing Qdrant Client

### Remove old database

In [1]:
import os
import shutil
from loguru import logger

vs_path = "test_data/qdrant_vector_store/test_vct_store"
vsc_path = os.path.join(vs_path, "collections")
collection_name = "Steve Jobs' 2005 Stanford Commencement Address"

for path in [vsc_path, vs_path]:
    if os.path.exists(path):
        logger.info(f"Removing path: {path}")
        shutil.rmtree(path)

[32m2025-07-13 13:02:52.741[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mRemoving path: test_data/qdrant_vector_store/test_vct_store[0m


### Initialize embedding model

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model_name: str = 'sentence-transformers/all-MiniLM-L12-v2'
embedding_model = HuggingFaceEmbeddings(model=embedding_model_name)

### Load Texts

In [3]:
from intellitube.utils import (
    YTContentData, download_youtube_audio_or_transcript, webvtt_2_str
)

video_url = "https://www.youtube.com/watch?v=UF8uR6Z6KLc"
video_data: YTContentData = download_youtube_audio_or_transcript(video_url)
vtt_str = webvtt_2_str(vtt_file_path=video_data.transcript_path)

print(video_data, end='\n\n')
print(vtt_str[:500])

[32m2025-07-13 13:03:03.270[0m | [34m[1mDEBUG   [0m | [36mintellitube.utils.youtube[0m:[36mdownload_youtube_audio_or_transcript[0m:[36m133[0m - [34m[1mCache exists, validating cache...[0m
[32m2025-07-13 13:03:03.271[0m | [34m[1mDEBUG   [0m | [36mintellitube.utils.youtube[0m:[36mdownload_youtube_audio_or_transcript[0m:[36m145[0m - [34m[1mCache contains the requested data. Using cache.[0m


type='text' transcript_path='test_data/cache/youtube/downloads/0290de54-13f5-4bbe-b3de-7b06f46a5f07.vtt' audio_path=None

This program is brought to you by Stanford University.
Please visit us at stanford.edu
Thank You. I am honored to be with you today at your commencement
from one of the finest universities in the world.
Truth be told I never graduated from college
and this is the closest I've ever gotten to a college graduation.
Today I want to tell you three stories from my life. That's it.
No big deal. Just three stories.
The first story is about connecting the dots.
I dropped out of Reed College after the fir


### Split texts

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=256,
)

texts = text_splitter.split_text(vtt_str)
metadatas = [
    {"source": video_url, "chunk_index": i + 1}
    for i, chunk in enumerate(texts)
]

In [5]:
from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from qdrant_client.http.models import Distance, VectorParams

client = QdrantClient(
    path=vs_path
)

client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=len(embedding_model.embed_query("hehe")), distance=Distance.COSINE
    )
)

vectorstore = Qdrant(
    client=client,
    collection_name=collection_name,
    embeddings=embedding_model,
)

  vectorstore = Qdrant(


### Add texts to database

In [6]:
vectorstore.add_texts(texts=texts, metadatas=metadatas)

['d1e7e7fc1fbf4a179245c12d2fff5d63',
 '449bce5111404bbca3f842f9ac3236eb',
 '855a68fe65024ba28205f7211b8a7990',
 'd736384cc58044ef8c060410e5b582c1',
 '38e09e4e7b1c4438b15d844eeaf500be',
 'f7950de356274c4caa45c5e8c2c1b1a4',
 '77a1ec95fa5b494cb866608f332ff062',
 'ffbcbcbd669e43028c586b42c7f9523a',
 '45ce9702d7564e3b8fa759dded41362e',
 'f42ef406c8dc486e8cae20fc91f94eaa',
 '58b12462247b44e7b1701eec2ef948cf',
 '0fbafabf8e2f4a84b585a2044e9f9b25',
 'edec695b4e284373b897793f84a1a06d',
 'ca86e49964ea48a88c0f36d8bdbcb038',
 'c1497cb9151743b7b00425ad1152911b',
 '0c3bc9220e174beeaa0c7ec761be9168',
 '63e9372c519b46bb9b38f0741831af69',
 'ab3f3e8810d149daa9cdaafc82ed69e1',
 '2f09eb4bf7a54c5bbad2f6d8206cc36d',
 '6344f43a8dae4d068a86975f9a2ab491',
 '8aae641daeb34e83983b886d294453af',
 'a954a6eba52c4275ae2302eaf9dc9e6a',
 '3235027dce2b4dfd9d09de01ec5b5ee1',
 'd4c4d9b71246459381751128f24a16be',
 '547e32080fc34efb9fc3c681e052e618',
 'd94fd95b71f14a6d9ef02de37b852a53',
 'b6c8412f8b6e4cf88f7b030723212693',
 

### Retrieve all the documents by metadata filtering
Thanks to this [stackoverflow question!](https://stackoverflow.com/questions/78118020/qdrant-client-scroll-filter-does-not-work)

In [None]:
from qdrant_client import models

scroll_result = vectorstore.client.scroll(
    collection_name=collection_name,
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(key="metadata.source", match=models.MatchValue(value=video_url)),
            # models.FieldCondition(key="metadata.chunk_index", match=models.MatchValue(value=2)),
        ]
    ),
    limit=1000,
    with_payload=True,
    with_vectors=False
)

print(len(scroll_result[0]))
# print(scroll_result[1])

47


In [10]:
vectorstore.similarity_search("why is death beautiful?", k=5)

[Document(metadata={'source': 'https://www.youtube.com/watch?v=UF8uR6Z6KLc', 'chunk_index': 40, '_id': '01492c3a233d4b98bb23dea4bab89239', '_collection_name': "Steve Jobs' 2005 Stanford Commencement Address"}, page_content="I can now say this to you with a bit more certainty than when\ndeath was a useful but purely intellectual concept:\nNo one wants to die.\nEven people who want to go to heaven don't want to die to get there.\nAnd yet death is the destination we all share.\nNo one has ever escaped it. And that is as it should be,\nbecause Death is very likely the single best invention of Life.\nIt is Life's change agent.\nIt clears out the old to make way for the new.\nRight now the new is you, but someday not too long from now,"),
 Document(metadata={'source': 'https://www.youtube.com/watch?v=UF8uR6Z6KLc', 'chunk_index': 33, '_id': 'e7fde5cccba346c99be793338df0265a', '_collection_name': "Steve Jobs' 2005 Stanford Commencement Address"}, page_content="Remembering that I'll be dead soo