# Qdrant Search Testing

In [57]:
import sys
import json

from qdrant_client import AsyncQdrantClient
from pathlib import Path
from typing import Optional, Sequence
from tqdm.asyncio import tqdm

# Add parent directory to path to import from implementation package
# Notebooks are in implementation/notebooks/, so we go up two levels to project root
# NOTE: Path() is the current working directory, so this only lands on the
# project root when the kernel's working directory is the notebook's own folder.
sys.path.insert(0, str(Path().resolve().parent.parent))

from implementation.classes.movie import BaseMovie
from implementation.classes.enums import EntityCategory, Genre
from db.vector_search import run_vector_search
from db.vector_scoring import calculate_vector_scores
from implementation.vectorize import (
    create_anchor_vector_text,
    create_plot_events_vector_text,
    create_plot_analysis_vector_text,
    create_viewer_experience_vector_text,
    create_watch_context_vector_text,
    create_narrative_techniques_vector_text,
    create_production_vector_text,
    create_reception_vector_text,
)
from db.ingest_movie import ingest_movie_to_qdrant, ingest_movies_to_qdrant_batched
from implementation.classes.schemas import MetadataFilters, ExtractedEntitiesResponse, LexicalCandidate, ExtractedEntityData
from implementation.misc.helpers import tokenize_title_phrase, create_watch_provider_offering_key
from implementation.classes.watch_providers import FILTERABLE_WATCH_PROVIDER_IDS

# Qdrant connection settings (local instance)
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333

# Async client handed to run_vector_search in the search cell
qdrant_client = AsyncQdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

In [58]:
# Load the saved movie records from JSON and wrap each one in a BaseMovie.

json_path = Path("../../saved_imdb_movies.json")
with json_path.open("r", encoding="utf-8") as f:
    movies_data = json.load(f)

# Every record's keys are passed to BaseMovie as keyword arguments
movies = [BaseMovie(**record) for record in movies_data]

# List index, title and TMDB id of each loaded movie
for i, movie in enumerate(movies):
    print(f"{i}: {movie.title} ({movie.tmdb_id})")

0: ferris bueller's day off (9377)
1: zootopia (269149)
2: school of rock (1584)
3: frozen (109445)
4: the princess bride (2493)
5: coco (354912)
6: klaus (508965)
7: up (14160)
8: mulan (10674)
9: shrek (808)
10: the year without a santa claus (13397)
11: mad max: fury road (76341)
12: raiders of the lost ark (85)
13: the dark knight (155)
14: john wick (245891)
15: captain america: the first avenger (1771)
16: spider-man: across the spider-verse (569094)
17: avengers: endgame (299534)
18: star wars (11)
19: harry potter and the philosopher's stone (671)
20: the lord of the rings: the fellowship of the ring (120)
21: gladiator (98)
22: inception (27205)
23: the matrix (603)
24: interstellar (157336)
25: blade runner 2049 (335984)
26: jurassic park (329)
27: arrival (329865)
28: hereditary (493922)
29: the shining (694)
30: insidious (49018)
31: terrifier 3 (1034541)
32: saw (176)
33: se7en (807)
34: parasite (496243)
35: get out (419430)
36: american psycho (1359)
37: fight club (550)

## Ingesting

In [59]:
# Both ingestion calls below are commented out, so running this cell is a no-op.
# Presumably one of them is uncommented when `movies` needs to be (re)ingested
# into Qdrant -- confirm against db.ingest_movie before relying on this.

# # Batched
# await ingest_movies_to_qdrant_batched(movies)

# # Single
# # _ = await tqdm.gather(*[ingest_movie_to_qdrant(movie) for movie in movies], desc="Ingesting movies (Qdrant)")

## Running Searches

In [60]:
search_query = "well choerographed fights"
metadata_filters = MetadataFilters(
)

vector_search_results = await run_vector_search(
    query=search_query,
    metadata_filters=metadata_filters,
    qdrant_client=qdrant_client,
    original_limit=10,
    subquery_limit=10,
    anchor_limit=10,
)

debug_data = vector_search_results.debug
print(f"Vector search produced {debug_data.total_candidates} candidates in {debug_data.wall_clock_ms}ms")

Subquery LLM returned relevant_subquery_text as None for narrative_techniques, skipping subquery search.
Subquery LLM returned relevant_subquery_text as None for production, skipping subquery search.
Weight LLM returned not_relevant for plot_analysis, skipping original-query search.
Weight LLM returned not_relevant for narrative_techniques, skipping original-query search.
Subquery LLM returned relevant_subquery_text as None for plot_events, skipping subquery search.
Subquery LLM returned relevant_subquery_text as None for plot_analysis, skipping subquery search.
Weight LLM returned not_relevant for plot_events, skipping original-query search.
Weight LLM returned not_relevant for production, skipping original-query search.
Vector search complete: 7 jobs, 21 unique candidates, 3005.47ms wall clock
Vector search produced 21 candidates in 3005.47ms


## Scoring

In [61]:
# Turn the search result from the previous cell into final scores
final_scores = calculate_vector_scores(vector_search_result=vector_search_results)

In [62]:
# Rank the scored candidates, highest score first, and print one line each.
# `final_scores.final_scores` is consumed as a mapping of tmdb_id -> score.
scores_tuple = list(final_scores.final_scores.items())
sorted_scores = sorted(scores_tuple, key=lambda x: x[1], reverse=True)

# Index the locally loaded movies once instead of rescanning the list for
# every result. Iterating in reverse makes the FIRST movie win when a tmdb_id
# is duplicated, which matches a first-match linear scan.
movies_by_tmdb_id = {m.tmdb_id: m for m in reversed(movies)}

for score in sorted_scores:
    movie = movies_by_tmdb_id.get(score[0])
    if movie is None:
        # The search can return ids that are not in the local JSON (the index
        # may hold more movies than were loaded here). Report the id with a
        # placeholder title rather than failing with IndexError.
        print(f"<title not loaded> ({score[0]}) - {score[1]}")
        continue
    print(f"{movie.title} ({movie.tmdb_id}) - {score[1]}")


john wick (245891) - 0.8014908104842333
mad max: fury road (76341) - 0.6474586437930294
gladiator (98) - 0.5078540343936043
the matrix (603) - 0.4976366718896973
spider-man: across the spider-verse (569094) - 0.41643609438197415
avengers: endgame (299534) - 0.3958539961950093
the dark knight (155) - 0.3913690724392479
raiders of the lost ark (85) - 0.3834639002527124
jurassic park (329) - 0.2205533505347931
fight club (550) - 0.21623744845041512
terrifier 3 (1034541) - 0.17729784249291874
captain america: the first avenger (1771) - 0.1147997774981665
the lord of the rings: the fellowship of the ring (120) - 0.11442916293775197
scott pilgrim vs. the world (22538) - 0.10956567005680062
star wars (11) - 0.06058008858569175
get out (419430) - 0.029283676605844165
everything everywhere all at once (545611) - 0.02310259909288106
inception (27205) - 0.016845248695893822
parasite (496243) - 0.01630988603677434
saw (176) - 0.015014417075652719
the princess bride (2493) - 0.011079352136486519
