# Qdrant Search Testing

In [1]:
import sys
import json

from qdrant_client import AsyncQdrantClient
from pathlib import Path
from typing import Optional, Sequence
from tqdm.asyncio import tqdm

# Put the project root at the front of sys.path so the `implementation` and `db`
# packages imported below can be resolved.
# Notebooks are in implementation/notebooks/, so we go up two levels to project root.
# NOTE: Path() is the kernel's current working directory (not this file's location),
# so this only points at the project root when the kernel is started from the
# notebooks folder.
sys.path.insert(0, str(Path().resolve().parent.parent))

from implementation.classes.movie import BaseMovie
from implementation.classes.enums import EntityCategory, Genre
from db.vector_search import run_vector_search
from db.vector_scoring import calculate_vector_scores
from implementation.vectorize import (
    create_anchor_vector_text,
    create_plot_events_vector_text,
    create_plot_analysis_vector_text,
    create_viewer_experience_vector_text,
    create_watch_context_vector_text,
    create_narrative_techniques_vector_text,
    create_production_vector_text,
    create_reception_vector_text,
)
from implementation.classes.languages import Language
from db.ingest_movie import ingest_movie_to_qdrant, ingest_movies_to_qdrant_batched
from implementation.classes.schemas import MetadataFilters, ExtractedEntitiesResponse, LexicalCandidate, ExtractedEntityData
from implementation.misc.helpers import tokenize_title_phrase, create_watch_provider_offering_key
from implementation.classes.watch_providers import FILTERABLE_WATCH_PROVIDER_IDS
from db.qdrant import qdrant_client
from db.search import search
from db.postgres import pool
from db.lexical_search import lexical_search

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# LOAD MOVIES
# Read the saved movie dump; the path is relative to the kernel's working directory.
json_path = Path("../../saved_imdb_movies.json")
with json_path.open("r", encoding="utf-8") as f:
    movies_data = json.load(f)

# Build one BaseMovie per JSON record, passing the record's keys as keyword arguments.
movies = [BaseMovie(**record) for record in movies_data]

# Print a positional index of what was loaded: "<index>: <title> (<tmdb_id>)".
for i, movie in enumerate(movies):
    print(f"{i}: {movie.title} ({movie.tmdb_id})")

0: ferris bueller's day off (9377)
1: zootopia (269149)
2: school of rock (1584)
3: frozen (109445)
4: the princess bride (2493)
5: coco (354912)
6: klaus (508965)
7: up (14160)
8: mulan (10674)
9: shrek (808)
10: the year without a santa claus (13397)
11: mad max: fury road (76341)
12: raiders of the lost ark (85)
13: the dark knight (155)
14: john wick (245891)
15: captain america: the first avenger (1771)
16: spider-man: across the spider-verse (569094)
17: avengers: endgame (299534)
18: star wars (11)
19: harry potter and the philosopher's stone (671)
20: the lord of the rings: the fellowship of the ring (120)
21: gladiator (98)
22: inception (27205)
23: the matrix (603)
24: interstellar (157336)
25: blade runner 2049 (335984)
26: jurassic park (329)
27: arrival (329865)
28: hereditary (493922)
29: the shining (694)
30: insidious (49018)
31: terrifier 3 (1034541)
32: saw (176)
33: se7en (807)
34: parasite (496243)
35: get out (419430)
36: american psycho (1359)
37: fight club (550)

## Ingesting

In [3]:
# Ingestion is disabled: every line in this cell is commented out, so running it is a no-op.
# Uncomment ONE of the two options below to push `movies` into Qdrant.
# NOTE(review): presumably only needed when the collection is empty or stale — confirm.
# The "Single" variant is commented out twice, so both `# ` prefixes must be removed.

# # Batched
# await ingest_movies_to_qdrant_batched(movies)

# # Single
# # _ = await tqdm.gather(*[ingest_movie_to_qdrant(movie) for movie in movies], desc="Ingesting movies (Qdrant)")

## Running Searches

In [4]:
# Prepare the shared Postgres pool (imported from db.postgres) before running searches.
# NOTE(review): presumably the lexical side of `search` reads through this pool — confirm
# in db/search.py. Re-running this cell calls open() again on the same pool object.
# Open the pool and establish initial connections
await pool.open()
# Validate that connections actually work (fast-fail if Postgres is unreachable)
await pool.check()

In [5]:
# Ad-hoc natural-language queries used to exercise the search pipeline by hand.
# Entries are meant to be picked by position (e.g. OVERALL_TEST_QUERIES[9]), so append
# new queries at the end rather than reordering existing ones.
# NOTE(review): several entries contain misspelled names or inaccurate details
# ("leandro dicaprio", "Christoph Nolan's", "morgan friedman"); these look like
# deliberate fuzzy-matching probes — confirm before "correcting" any of them.
OVERALL_TEST_QUERIES = [
    "Shrek",
    "Tom Hanks movies",
    "90s comedies",
    "something feel-good and lighthearted",
    "that movie with the spinning top at the end",
    "films directed by Fincher starring Brad Pitt",
    "A24 horror movies",
    "movies with a character like Walter White",
    "Spielberg and Lucas collaborations",
    "that leandro dicaprio boat movie from 2001",
    "Christoph Nolan's space movie with Matt Damon",
    "shawshank movie prison escape morgan friedman",
    "horror movies but not slashers or torture porn",
    "80s action without Schwarzenegger or Stallone",
    "thrillers that aren't too stressful or dark",
    "critically acclaimed sci-fi from the last 5 years under 2 hours",
    "R-rated crime dramas from before 1980",
    "foreign language best picture nominees",
    "something my parents and kids can all watch together",
    "background movie while I work, nothing too demanding",
    "first date movie that's romantic but not cheesy",
    "movies about grief and learning to move on",
    "heist movies with a twist ending where the villain wins",
    "nonlinear storytelling like Pulp Fiction or Memento",
    "underrated 90s neo-noir thrillers with morally ambiguous protagonists, preferably under 2 hours, not directed by the usual suspects like Tarantino",
]

In [37]:
query = "Movies with silly gag humor"#OVERALL_TEST_QUERIES[9]
filters = MetadataFilters()

print(f'QUERY: "{query}"')

overall_search_results = await search(
    query=query,
    metadata_filters=filters,
    qdrant_client=qdrant_client,
    vector_candidate_limit_original=10,
    vector_candidate_limit_subquery=10,
    vector_candidate_limit_anchor=10,
)

sorted_candidates = sorted(overall_search_results.candidates, key=lambda x: (x.vector_score + x.lexical_score), reverse=True)

print()
print(f'Top 5 results for query: "{query}"')
for i, candidate in enumerate(sorted_candidates[:5]):
    movie = [m for m in movies if m.tmdb_id == candidate.movie_id][0]
    print(f"{i}: {movie.title}")
    print(f"  Vector score: {candidate.vector_score}")
    print(f"  Lexical score: {candidate.lexical_score}")

QUERY: "Movies with silly gag humor"
Subquery LLM returned relevant_subquery_text as None for production, skipping subquery search.
Weight LLM returned not_relevant for narrative_techniques, skipping original-query search.
Weight LLM returned not_relevant for production, skipping original-query search.
Weight LLM returned not_relevant for plot_events, skipping original-query search.
Subquery LLM returned relevant_subquery_text as None for plot_events, skipping subquery search.
Subquery LLM returned relevant_subquery_text as None for plot_analysis, skipping subquery search.
Subquery LLM returned relevant_subquery_text as None for narrative_techniques, skipping subquery search.
Vector search complete: 8 jobs, 18 unique candidates, 2433.22ms wall clock

Top 5 results for query: "Movies with silly gag humor"
0: the naked gun: from the files of police squad!
  Vector score: 1.0
  Lexical score: 0.0
1: scott pilgrim vs. the world
  Vector score: 0.38897371103858835
  Lexical score: 0.0
2: fe

In [18]:
# Summarise candidate counts and timings recorded on the most recent `search(...)` result.
# Requires the search cell to have been run first (reads `overall_search_results`).
debug = overall_search_results.debug
print(f"Found {debug.total_candidates} results in {debug.total_latency_ms}ms")

# Lexical stage figures.
lexical_debug = debug.lexical_debug
print(
    f"  Lexical: candidates: {lexical_debug.candidates_returned}, "
    f"latency: {lexical_debug.latency_ms}, "
    f"llm_generation_time: {lexical_debug.llm_generation_time_ms}"
)

# Vector stage figures.
vector_debug = debug.vector_debug
print(
    f"  Vector: candidates: {vector_debug.total_candidates}, "
    f"latency: {vector_debug.wall_clock_ms}, "
    f"total jobs: {vector_debug.total_jobs_executed}"
)

Found 24 results in 3338.056874461472ms
  Lexical: candidates: 0, latency: 1040.192541666329, llm_generation_time: 1040.1822496205568
  Vector: candidates: 24, latency: 3328.7, total jobs: 9
