In [1]:
import json
import sys
from pathlib import Path

# Add parent directory to path to import from implementation package
# Notebooks are in implementation/notebooks/, so we go up two levels to project root
sys.path.insert(0, str(Path().resolve().parent.parent))

from implementation.movie import IMDBMovie
from implementation.vectorize import (
    create_dense_anchor_vector_text,
    create_and_save_dense_anchor_vector,
    create_and_save_plot_events_vector,
    create_and_save_plot_analysis_vector,
    create_and_save_viewer_experience_vector,
    create_and_save_watch_context_vector,
    create_and_save_production_vector,
    clear_collections_from_chroma,
    fetch_all_vectors_from_chroma,
    search_similar_vectors,
    create_plot_events_vector_text,
    create_plot_analysis_vector_text
)
from implementation.visualize import visualize_vectors_tsne

# Location of the saved movie records. The relative path is resolved against the
# current working directory, which is expected to be the notebook's own folder.
json_path = Path("../../saved_imdb_movies.json")

# Parse the whole file in one go (UTF-8 encoded JSON).
movies_data = json.loads(json_path.read_text(encoding="utf-8"))

# Each record's keys are passed to IMDBMovie as keyword arguments.
movies = [IMDBMovie(**movie_dict) for movie_dict in movies_data]

print(f"Loaded {len(movies)} movies")

Loaded 50 movies


In [2]:
# Spot-check a single movie: show its title, then the text that would be
# embedded for its plot-events vector.
movie = movies[3]

print(movie.title)
plot_events_text = create_plot_events_vector_text(movie)
print(plot_events_text)
# print(movie.plot_events_metadata)

frozen
In Arendelle, young Princesses Elsa and Anna play in winter; Elsa (who can create ice and snow) accidentally injures Anna with her powers, turning a lock of Anna's hair white. Their parents, King Agnarr and Queen Iduna, take them to the trolls led by Grand Pabbie, who heal Anna and remove her memories of magic but warn Elsa to control her power and fear. The royal family isolates the sisters; Elsa grows up hidden and taught to suppress emotion, while Anna becomes lonely. Years later, after their parents die at sea, Elsa's coronation opens the castle gates. Anna meets Prince Hans of the Southern Isles and impulsively accepts his marriage proposal; when Anna presses Elsa to bless the match, Elsa panics, loses control and exposes her powers to the guests. The townspeople and the Duke of Weselton accuse Elsa of witchcraft; Elsa flees across the fjord, unknowingly plunging Arendelle into eternal winter. Anna volunteers to find Elsa and restore summer.

Anna travels north, stopping at

In [None]:
# EMBEDDING ALL VECTORS
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_movie_vectors(movie) -> None:
    """
    Per-movie worker meant to create and save the vectors for a single movie.

    NOTE: every create_and_save_* call below is currently commented out, so
    as written this function does nothing and returns None. Uncomment the
    vector types you want to (re)generate before running the executor cell.

    Six vector types are listed (dense anchor, plot events, plot analysis,
    viewer experience, watch context, production). Within one movie they are
    listed one after another; per the original note, running them
    sequentially is intended to avoid database contention.

    Args:
        movie: IMDBMovie instance to process
    """
    # create_and_save_dense_anchor_vector(movie, db_path="../chroma_db")
    # create_and_save_plot_events_vector(movie, db_path="../chroma_db")
    # create_and_save_plot_analysis_vector(movie, db_path="../chroma_db")
    # create_and_save_viewer_experience_vector(movie, db_path="../chroma_db")
    # create_and_save_watch_context_vector(movie, db_path="../chroma_db")
    # create_and_save_production_vector(movie, db_path="../chroma_db")

# Process movies in parallel using ThreadPoolExecutor.
# The work is described as I/O-bound (API calls and DB writes), so threads are used.
# max_workers caps how many movies are processed at the same time.
max_workers = 7  # Adjust this based on your API rate limits and system capacity

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Remember which movie each future belongs to, so that a failure can be
    # reported against a specific title instead of an anonymous "movie".
    future_to_movie = {
        executor.submit(process_movie_vectors, queued_movie): queued_movie
        for queued_movie in movies
    }
    # `futures` is kept as a plain list of the submitted tasks.
    futures = list(future_to_movie)

    # Handle tasks in completion order; a failing movie is reported and the
    # remaining movies are still processed.
    for future in as_completed(futures):
        try:
            future.result()  # Re-raises any exception raised inside the worker
        except Exception as e:
            failed_movie = future_to_movie[future]
            print(f"Error processing movie '{failed_movie.title}': {e}")

Processing: ferris bueller's day off (ID: tt0091042)
  Creating embedding...
Processing: zootopia (ID: tt2948356)
  Creating embedding...
Processing: school of rock (ID: tt0332379)
  Creating embedding...
Processing: frozen (ID: tt2294629)
  Creating embedding...
Processing: the princess bride (ID: tt0093779)
  Creating embedding...
Processing: coco (ID: tt2380307)
  Creating embedding...
Processing: klaus (ID: tt4729430)
  Creating embedding...
  Embedding created: [0.002006266498938203, 0.05060894414782524, -0.03597363457083702, 0.006505869794636965, -0.05255722254514694, -0.012605847790837288, 0.0033718033228069544, 0.061417266726493835, 0.002297638915479183, -0.020770076662302017, -0.0004265490861143917, -0.0080134691670537, -0.05547964572906494, -0.006842180620878935, -0.018787004053592682, 0.003267431166023016, -0.05747431516647339, 0.001361912814900279, 0.0026846861001104116, 0.044717710465192795, 0.01883339136838913, 0.012976949103176594, -0.018439097329974174, -0.0383625999093

In [None]:
# CLEARS ALL VECTORS -- BE CAREFUL!!! Uncomment only the collection(s) you intend to clear.
# clear_collections_from_chroma(collection_names=["dense_anchor_vectors"])
# clear_collections_from_chroma(collection_names=["dense_content_vectors"])
# clear_collections_from_chroma(collection_names=["dense_vibe_vectors"])

✓ Cleared 53 vector(s) from collection 'dense_anchor_vectors'
✓ Cleared 53 vector(s) from collection 'dense_content_vectors'


In [6]:
# FETCHING ALL VECTORS
# Loads one collection into `vector_collection`; later cells read its
# .ids, .embeddings, .metadatas and .documents by position.
# To inspect a different collection, swap which of the two lines below is commented out.
# NOTE(review): the similarity-search cell hard-codes collection_name="dense_vibe_vectors",
# so changing the collection here without updating that cell would query a
# different collection from the one the search vector came from.
# vector_collection = fetch_all_vectors_from_chroma(collection_name="dense_anchor_vectors")
vector_collection = fetch_all_vectors_from_chroma(collection_name="dense_vibe_vectors")

In [5]:
# Plot the fetched collection with the project's t-SNE helper, requesting 3 components
# (presumably a 3-D projection -- see implementation.visualize to confirm).
visualize_vectors_tsne(vector_collection, n_components=3)

: 

In [9]:
import numpy as np

index = 16
n_results = 5

# Look up the query entry by position: its title, its embedding, and the text
# that was embedded.
searching_name = vector_collection.metadatas[index].get("title")
searching_vector = vector_collection.embeddings[index]
searching_original_text = vector_collection.documents[index]

print(f"*** Getting closest vectors to {searching_name} \n")
print(searching_original_text)
print()

# Nearest-neighbour query; the query entry's own id is passed as a filter so it
# is not returned as its own closest match.
# The collection name is fixed here and must match the collection that was
# loaded into vector_collection.
results = search_similar_vectors(
    searching_vector,
    collection_name="dense_vibe_vectors",
    n_results=n_results,
    ids_to_filter_out=[vector_collection.ids[index]],
)

print(f"\n***The {n_results} closest vector(s) to {searching_name} are:")

# Distances are optional on the result object, so fetch them defensively once.
distances = getattr(results, 'distances', None)

for position, neighbor_meta in enumerate(results.metadatas):
    neighbor_title = neighbor_meta.get('title')

    # Fall back to a placeholder when no document is stored at this position.
    if position < len(results.documents):
        neighbor_text = results.documents[position]
    else:
        neighbor_text = "N/A"

    # Only show a distance when one exists for this position.
    if distances is None or position >= len(distances):
        print(f"{position+1}. {neighbor_title}")
    else:
        neighbor_distance = distances[position]
        print(f"{position+1}. {neighbor_title} (distance: {neighbor_distance:.4f})")

    print("******")
    print(f" {neighbor_text}")
    print("******")
    print("========================")

# Find the vectors furthest from the query by brute force over the collection
# that is already loaded in vector_collection (the query entry itself is skipped).
# Distance is cosine distance = 1 - cosine_similarity; per the original note this
# is the metric ChromaDB uses (NOTE(review): confirm the collection is configured for cosine).

# The query's unit vector is identical for every comparison, so build it once
# instead of once per candidate. The 1e-8 term avoids dividing by zero for an
# all-zero vector.
query_vec = np.array(searching_vector)
query_unit = query_vec / (np.linalg.norm(query_vec) + 1e-8)

distances_with_indices = []
for i in range(len(vector_collection.ids)):
    # Skip the query vector itself
    if i == index:
        continue

    candidate_vec = np.array(vector_collection.embeddings[i])
    candidate_unit = candidate_vec / (np.linalg.norm(candidate_vec) + 1e-8)

    # Dot product of two unit vectors is the cosine similarity
    cosine_distance = 1 - np.dot(query_unit, candidate_unit)

    distances_with_indices.append((i, cosine_distance))

# Sort by distance descending (furthest first) and keep the top n_results
distances_with_indices.sort(key=lambda pair: pair[1], reverse=True)
furthest = distances_with_indices[:n_results]
furthest_indices = [idx for idx, _ in furthest]

print(f"\nThe {n_results} furthest vector(s) from {searching_name} are:")
# Iterate the (index, distance) pairs directly so each printed distance cannot
# drift out of step with the entry it belongs to.
for rank, (idx, distance) in enumerate(furthest, 1):
    title = vector_collection.metadatas[idx].get('title')
    original_text = vector_collection.documents[idx]
    print(f"{rank}. {title} (distance: {distance:.4f})")
    print("******")
    print(f" {original_text}")
    print("******")
    print("========================")

# search_similar_vectors()

*** Getting closest vectors to harry potter and the philosopher's stone 

Dominant mood: wonder-filled, adventurous, wonderment with cozy nostalgia; magical, playful curiosity; warm, uplifting guardianship undercurrent
Movie energy: balanced pacing; steady, exploratory; episodic yet cohesive; intimate character focus with occasional grand spectacle
Intensity: mildly suspenseful with eerie moments; intermittent tension rather than constant dread; hopeful resolve amidst danger
Romance, humor, and sexuality: soft humor; light, family-friendly warmth; gentle, wholesome camaraderie; minimal romance elements
Final viewer impression: inspirational, comforting, awe-inspired; sense of belonging and possibility; enduring magical wonder
Genres: adventure, family, fantasy


***The 5 closest vector(s) to harry potter and the philosopher's stone are:
1. frozen (distance: 0.2341)
******
 Dominant mood: warm-hearted, uplifting, hopeful, magical, earnest
Movie energy: lush, expansive, slowly unfolding,

In [3]:
# List of title snippets to search for (case-insensitive)
title_snippets = ["titanic", "princess bride", "leap year", "scott pilgrim", "notebook", "gump"]

# For each snippet, print the plot-analysis vector text of every movie whose
# title contains that snippet.
for snippet in title_snippets:
    # Lower-case the snippet once per snippet rather than once per movie.
    snippet_lower = snippet.lower()
    matching_movies = [
        candidate for candidate in movies
        if snippet_lower in candidate.title.lower()
    ]

    if not matching_movies:
        print(f"No movies found matching '{snippet}'")
        print()
        continue

    # If multiple matches, announce how many before printing all of them
    if len(matching_movies) > 1:
        print(f"Found {len(matching_movies)} movies matching '{snippet}':")
        print()

    # The loop variable is named `matched_movie` rather than `movie` so that this
    # cell does not overwrite the notebook-level `movie` assigned in another cell.
    for matched_movie in matching_movies:
        print(matched_movie.title)
        print(create_plot_analysis_vector_text(matched_movie))
        print()

titanic
High level overview: A young aristocratic woman defies her controlling engagement to fall for a poor artist aboard an 'unsinkable' ocean liner, and their romance is destroyed when the ship strikes an iceberg and sinks, leading her to survival and a decades‑later reckoning with the past.
Setting: 1912 North Atlantic aboard an ocean liner; 1996 salvage ship
Premise: A forbidden cross‑class romance aboard a luxury liner collides with institutional hubris and disaster, told through a survivor's flashback and a modern salvage hunt for a lost jewel.
Main protagonists: A constrained, rebellious seventeen‑year‑old upper‑class woman and a free‑spirited, kind but poor itinerant artist.
Protagonists goals and motivation: She seeks freedom from an imposed, financial marriage and wants to choose love; he wants connection and to be accepted, and they plan to run away together by defying social strictures aboard the ship.
Stakes: If they fail, she remains trapped in a controlling marriage and

In [14]:
# Print "<title> <tmdb_id>" for every loaded movie.
# The loop variable is deliberately not called `movie`, so running this cell does
# not rebind the notebook-level `movie` that another cell sets to a specific film.
for listed_movie in movies:
    print(f"{listed_movie.title} {listed_movie.tmdb_id}")

zootopia 269149
school of rock 1584
frozen 109445
ferris bueller's day off 9377
the princess bride 2493
shrek 808
up 14160
coco 354912
mulan 10674
klaus 508965
mad max: fury road 76341
captain america: the first avenger 1771
the dark knight 155
john wick 245891
raiders of the lost ark 85
spider-man: across the spider-verse 569094
harry potter and the philosopher's stone 671
the lord of the rings: the fellowship of the ring 120
star wars 11
avengers: endgame 299534
gladiator 98
inception 27205
interstellar 157336
the matrix 603
blade runner 2049 335984
jurassic park 329
arrival 329865
hereditary 493922
the shining 694
insidious 49018
terrifier 3 1034541
saw 176
se7en 807
parasite 496243
get out 419430
american psycho 1359
fight club 550
titanic 597
forrest gump 13
past lives 666277
leap year 25195
the pianist 423
the notebook 11036
50 first dates 1824
fifty shades of grey 216015
murder on the orient express 392044
everything everywhere all at once 545611
scott pilgrim vs. the world 2253