# Self Query Retriever Test Notebook

## Retrieval Techniques We'll Be Using:
 - Self Query
 - Multi Query
 - Query Expansion

## Remove old database

In [1]:
import os
import shutil
from loguru import logger

# Names and on-disk locations for the vector store used in this notebook.
collection_name = "Steve Jobs' 2005 Stanford Commencement Address"
vs_path = "test_data/qdrant_vector_store/test_vct_store"
vsc_path = os.path.join(vs_path, "collections")

# Delete whatever an earlier run left behind: the nested collections
# directory is handled first, then the store directory that contains it.
for path in (vsc_path, vs_path):
    if not os.path.exists(path):
        continue
    logger.info(f"Removing path: {path}")
    shutil.rmtree(path)

## Initialize embedding model

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face model identifier handed to the embedding wrapper below.
embedding_model_name: str = "sentence-transformers/all-MiniLM-L12-v2"

embedding_model = HuggingFaceEmbeddings(
    model=embedding_model_name,
)

## Create the vector store

In [3]:
from intellitube.vector_store import VectorStoreManager

# Constructor arguments for the store manager: the embedding model plus the
# paths and collection name defined earlier in the notebook.
store_settings = {
    "embedding_model": embedding_model,
    "path_on_disk": vs_path,
    "collection_path_on_disk": vsc_path,
    "collection_name": collection_name,
}
vdb = VectorStoreManager(**store_settings)

# Keep direct handles to the manager's client and vector store for later cells.
client, vectorstore = vdb.client, vdb.vectorstore

2025-07-15 00:07:37.480 | DEBUG    | intellitube.vector_store:init_vector_store:120 - New Qdrant Client Initialized.
2025-07-15 00:07:37.489 | DEBUG    | intellitube.vector_store:init_vector_store:126 - New Vector Store Created.


## Loading YouTube Video Transcript

In [4]:
from intellitube.utils import (
    YTContentData, download_youtube_audio_or_transcript, webvtt_2_str
)

# The YouTube video whose transcript this notebook indexes.
video_url = "https://www.youtube.com/watch?v=UF8uR6Z6KLc"

# Fetch the content record for the URL, then convert the transcript file it
# points at into a string via webvtt_2_str.
video_data: YTContentData = download_youtube_audio_or_transcript(video_url)
transcript_file = video_data.transcript_path
vtt_str = webvtt_2_str(vtt_file_path=transcript_file)

# Show the content record, a blank line, then the first 500 characters.
print(video_data, end="\n\n")
print(vtt_str[:500])

2025-07-15 00:07:37.540 | DEBUG    | intellitube.utils.youtube:download_youtube_audio_or_transcript:133 - Cache exists, validating cache...
2025-07-15 00:07:37.541 | DEBUG    | intellitube.utils.youtube:download_youtube_audio_or_transcript:145 - Cache contains the requested data. Using cache.


type='text' transcript_path='test_data/cache/youtube/downloads/0290de54-13f5-4bbe-b3de-7b06f46a5f07.vtt' audio_path=None

This program is brought to you by Stanford University.
Please visit us at stanford.edu
Thank You. I am honored to be with you today at your commencement
from one of the finest universities in the world.
Truth be told I never graduated from college
and this is the closest I've ever gotten to a college graduation.
Today I want to tell you three stories from my life. That's it.
No big deal. Just three stories.
The first story is about connecting the dots.
I dropped out of Reed College after the fir


## Split texts & Add them to VDB

### 1. Split Texts + Create Metadatas

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings: target chunk length and the overlap between neighbours.
split_chunk_size = 512
split_chunk_overlap = 128

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=split_chunk_overlap,
    chunk_size=split_chunk_size,
)
texts = text_splitter.split_text(vtt_str)

# One metadata dict per chunk: the source URL plus a 1-based chunk position.
metadatas = [
    {"source": video_url, "chunk_index": position}
    for position, _ in enumerate(texts, start=1)
]

### 2. Add them to VDB

In [6]:
# Store every chunk with its metadata; the call's return value (the list of
# new point ids) is left as the cell's last expression so it is displayed.
vectorstore.add_texts(texts, metadatas=metadatas)

['4c4dcc59252c4ad8b8064fe5d0af23eb',
 'bfc74a56234e4678841689062bb91965',
 '1be80c1f4fd74189a254c147503cd956',
 '872ca096e30a45e0bace8f74725bf718',
 'b8df65c424954fe499585362e3ede9d6',
 '51dcdac3530f453bbc207a08ddf38963',
 'e79d9080bcce45988bf6629cc481ff3c',
 '6a985b0aa3394c62a160f554b38efb6c',
 '992aef62f4c74adc89203f1895932397',
 'cac6cd408a1345a8a2a910439203c106',
 '1b3fb46c468d4102a2194af25fd9f0a1',
 '4c561675bf7c4249978a91bf527dfc9b',
 'f703c83d7ae84bd3935e11f8977805be',
 '2e0445a9c1c8407c827b3c76f8b61f4c',
 '31eba1679c35444a9796d0ceac1413ef',
 'a1f5b9b3cc364a7b8ed5fec5e2f26a00',
 'c4bd9febc5c242faa8b110612a5b3951',
 '62d3632950344e1482debdf78a23e08d',
 '16336c1ec620431cab069016ca211f65',
 '3b6fffe43004411f8c0dc4f1016a71bc',
 'ec23e055690141e39bc3fe23b83a5ff0',
 '380e2841aee04ed7ae7a3d3ce4c0ffd0',
 '29ec9b5446e84a8f855af41df35327c7',
 '6c5c0b34502f4d71b106175ac7d844c1',
 'f419a7f41a8143359fd3bf5170792312',
 '7bf11a84f3c943ab99f720abcb21de8c',
 '0cf33c52637742679a9cf1f607dfb383',
 

## Create the Retriever

In [7]:
# Retriever in score-threshold mode, configured with a threshold of 0.6.
retriever = vectorstore.as_retriever(
    search_kwargs={"score_threshold": 0.6},
    search_type="similarity_score_threshold",
)

## Retrieve all the documents by metadata filtering
Thanks to this [Stack Overflow question](https://stackoverflow.com/questions/78118020/qdrant-client-scroll-filter-does-not-work)!

In [8]:
from qdrant_client import models

# Condition matching points whose payload stores this video URL under
# "metadata.source". To narrow the result to a single chunk, add another
# FieldCondition on "metadata.chunk_index" (for example MatchValue(value=2)).
source_condition = models.FieldCondition(
    key="metadata.source",
    match=models.MatchValue(value=video_url),
)

# Request one page of at most 1000 points, with payloads but without vectors.
scroll_result = client.scroll(
    collection_name=collection_name,
    scroll_filter=models.Filter(must=[source_condition]),
    limit=1000,
    with_payload=True,
    with_vectors=False,
)

# Index 0 of the result holds the returned points; print how many there are.
# Index 1 (not printed) is presumably the next-page offset - see the Qdrant docs.
print(len(scroll_result[0]))

32


In [9]:
# Ask the vector store for the 5 chunks most similar to the question; the
# result is left as the cell's last expression so the notebook displays it.
vectorstore.similarity_search(query="why is death beautiful?", k=5)

[Document(metadata={'source': 'https://www.youtube.com/watch?v=UF8uR6Z6KLc', 'chunk_index': 27, '_id': '0cf33c52637742679a9cf1f607dfb383', '_collection_name': "Steve Jobs' 2005 Stanford Commencement Address"}, page_content="death was a useful but purely intellectual concept:\nNo one wants to die.\nEven people who want to go to heaven don't want to die to get there.\nAnd yet death is the destination we all share.\nNo one has ever escaped it. And that is as it should be,\nbecause Death is very likely the single best invention of Life.\nIt is Life's change agent.\nIt clears out the old to make way for the new.\nRight now the new is you, but someday not too long from now,\nyou will gradually become the old and be cleared away."),
 Document(metadata={'source': 'https://www.youtube.com/watch?v=UF8uR6Z6KLc', 'chunk_index': 26, '_id': '7bf11a84f3c943ab99f720abcb21de8c', '_collection_name': "Steve Jobs' 2005 Stanford Commencement Address"}, page_content="told me that when they viewed the cells 