# Scratch notebook for quickly trying different iterations of embedding and metadata generation

In [4]:
import json
import sys
from pathlib import Path

# Add the project root to sys.path so the `implementation` package can be imported.
# Notebooks are in implementation/notebooks/, so we go up two levels to project root.
# The membership check keeps this cell idempotent: re-running it does not stack
# duplicate entries at the front of sys.path.
project_root = str(Path().resolve().parent.parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from implementation.classes.movie import IMDBMovie

# Read the saved movie records as plain dictionaries
# (the path is resolved relative to the notebook's working directory)
json_path = Path("../../saved_imdb_movies.json")
movies_data = json.loads(json_path.read_text(encoding="utf-8"))

# NOTE(review): the conversion to IMDBMovie objects is currently disabled, so this
# cell does not define `movies`; later cells that read `movies` need it from elsewhere.
# movies = [IMDBMovie(**movie_dict) for movie_dict in movies_data]

In [2]:
# Print an "index: title" line for every loaded movie so one can be picked by index
for i, movie in enumerate(movies):
    print(f"{i}: {movie.title}")

0: ferris bueller's day off
1: zootopia
2: school of rock
3: frozen
4: the princess bride
5: coco
6: klaus
7: up
8: mulan
9: shrek
10: the year without a santa claus
11: mad max: fury road
12: raiders of the lost ark
13: the dark knight
14: john wick
15: captain america: the first avenger
16: spider-man: across the spider-verse
17: avengers: endgame
18: star wars
19: harry potter and the philosopher's stone
20: the lord of the rings: the fellowship of the ring
21: gladiator
22: inception
23: the matrix
24: interstellar
25: blade runner 2049
26: jurassic park
27: arrival
28: hereditary
29: the shining
30: insidious
31: terrifier 3
32: saw
33: se7en
34: parasite
35: get out
36: american psycho
37: fight club
38: titanic
39: forrest gump
40: past lives
41: the pianist
42: the notebook
43: 50 first dates
44: leap year
45: fifty shades of grey
46: murder on the orient express
47: everything everywhere all at once
48: scott pilgrim vs. the world
49: the naked gun: from the files of police sq

In [2]:
# Select the movie the single-movie experiments below operate on.
# Index 41 corresponds to 'the pianist' in the index/title listing printed above.
movie = movies[41]
movie.title

'the pianist'

In [6]:
# DELETE CURRENT PLOT / VIBE METADATA FROM SAVED MOVIES

# Every generated-metadata key that is blanked out on each movie dictionary
metadata_fields = (
    "plot_events_metadata",
    "plot_analysis_metadata",
    "viewer_experience_metadata",
    "watch_context_metadata",
    "narrative_techniques_metadata",
    "production_metadata",
    "reception_metadata",
)

# Overwrite each of those keys with None, movie by movie
for movie_dict in movies_data:
    for field_name in metadata_fields:
        movie_dict[field_name] = None

# Write the modified records back over the same JSON file they were loaded from
with open(json_path, "w", encoding="utf-8") as out_file:
    json.dump(movies_data, out_file, indent=2, ensure_ascii=False)

print(f"✓ Updated {len(movies_data)} movies and saved to {json_path}")

✓ Updated 50 movies and saved to ../../saved_imdb_movies.json


# Test LLM Generations

In [3]:
# PLOT METADATA GENERATION

# Collect the inputs for the generator from the currently selected movie.
# NOTE(review): generate_plot_summary is not imported in any visible cell — confirm where it comes from.
plot_summary_inputs = {
    "overview": movie.overview,
    "overall_keywords": movie.overall_keywords,
    "plot_keywords": movie.plot_keywords,
    "plot_summaries": movie.plot_summaries,
    "synopsis": movie.synopsis,
    "should_clean": True,
}
plot_metadata = generate_plot_summary(**plot_summary_inputs)

# Store the result on the movie so later cells can read it from the object
movie.plot_metadata = plot_metadata

# Last expression: display the generated metadata as the cell output
plot_metadata

PlotMetadata(overview=PlotOverview(premise='A celebrated Polish-Jewish pianist, referred to here as the pianist, struggles to survive in Nazi-occupied Warsaw after being separated from his family during World War II.', goal='The pianist aims to stay alive and avoid deportation or execution by hiding in Warsaw, using friends, false identities, and concealment to endure until liberation.', opposition='Nazi German occupation forces and collaborating systems (deportations, ghettoization, raids, and urban destruction) seek to remove, deport, or kill Jewish citizens and suppress resistance.', stakes='If the pianist fails he will be deported to an extermination camp or killed; failure also means the permanent loss of his family and cultural voice.'), beats=PlotBeatsRaw(inciting_incident="A bombing at the radio station during Germany's 1939 invasion and subsequent Nazi occupation lead to anti-Jewish laws and the pianist's family's forced relocation into the Warsaw Ghetto, initiating danger and

In [None]:
# Dive deeper into plot metadata: print the text that create_plot_analysis_vector_text
# builds for the currently selected movie.

import json 

# Optional: re-attach the freshly generated metadata before printing
# movie.plot_metadata = plot_metadata
print(create_plot_analysis_vector_text(movie))

# Disabled field-by-field inspection of plot_metadata, kept for ad-hoc debugging.
# NOTE(review): these attribute names may not match the current metadata model — confirm before re-enabling.
# print(plot_metadata.premise)
# print()
# print(plot_metadata.plot_summary)
# print()
# print(plot_metadata.beats)
# print()
# print(plot_metadata.conflict_stakes)
# print()
# print(plot_metadata.key_relationships)
# print()
# print(plot_metadata.themes)

# Disabled: dump the whole metadata object as indented JSON
# print(json.dumps(plot_metadata.model_dump(), indent=2))


[OVERVIEW]
premise=A celebrated Polish-Jewish pianist, referred to here as the pianist, struggles to survive in Nazi-occupied Warsaw after being separated from his family during World War II.
goal=The pianist aims to stay alive and avoid deportation or execution by hiding in Warsaw, using friends, false identities, and concealment to endure until liberation.
opposition=Nazi German occupation forces and collaborating systems (deportations, ghettoization, raids, and urban destruction) seek to remove, deport, or kill Jewish citizens and suppress resistance.
stakes=If the pianist fails he will be deported to an extermination camp or killed; failure also means the permanent loss of his family and cultural voice.

[BEATS]
inciting=A bombing at the radio station during Germany's 1939 invasion and subsequent Nazi occupation lead to anti-Jewish laws and the pianist's family's forced relocation into the Warsaw Ghetto, initiating danger and deprivation.
key_challenges=The pianist faces ghetto sta

In [5]:
# Serialize the generated plot metadata to indented JSON text and pass that text to
# clean_plot_summary; the result is kept in cleaned_metadata for the next cell.
cleaned_metadata = clean_plot_summary(json.dumps(plot_metadata.model_dump(), indent=2))

In [None]:
# Disabled: dump the cleaned metadata as indented JSON
# print(json.dumps(cleaned_metadata.model_dump(), indent=2))
# Replace the movie's plot metadata with the cleaned version, then print the
# vector text built from the updated movie.
movie.plot_metadata = cleaned_metadata
print(create_plot_analysis_vector_text(movie))

[PREMISE]
During World War II, a Polish Jewish pianist, PROTAGONIST, survives the Nazi occupation of Warsaw. After the invasion, he is separated from his FAMILY, forced into the ghetto, and goes into hiding among the ruins as the city is destroyed, relying on allies and his music to endure until liberation.

[PLOT]
PROTAGONIST, a renowned Polish Jewish pianist, plays on the Warsaw radio when the city is bombed at the outbreak of World War II. As German forces conquer Poland, his FAMILY is deported and he is forced into the Warsaw ghetto; later, during mass deportations, he is separated from his relatives. With the help of friends, he hides first in a friends apartment, then moves through a sequence of abandoned homes and hidden rooms as occupying forces tighten control, performing privately to survive and smuggle food while his FAMILY suffers and is sent to concentration camps. After a revolt and the destruction of the city, PROTAGONIST endures years of starvation and illness, moving 

In [None]:
# Generate vibe metadata for the currently selected movie.

# The generator takes the synopsis as a list (or None): wrap the single synopsis
# string when present, otherwise fall back to the debug synopses, otherwise None.
synopsis_list = (
    [movie.synopsis]
    if movie.synopsis
    else (movie.debug_synopses if movie.debug_synopses else None)
)

# The generator takes plain dicts, so dump each parental guide item
parental_guide_dicts = [guide_item.model_dump() for guide_item in movie.parental_guide_items]

# Pass every relevant movie attribute; an empty plot-summary list is sent as None
vibe_metadata = generate_vibe_metadata(
    overview=movie.overview,
    genres=movie.genres,
    overall_keywords=movie.overall_keywords,
    plot_keywords=movie.plot_keywords,
    synopsis=synopsis_list,
    plot_summaries=movie.debug_plot_summaries or None,
    maturity_rating=movie.maturity_rating,
    maturity_reasoning=movie.maturity_reasoning,
    parental_guide_items=parental_guide_dicts,
    reception_summary=movie.reception_summary,
)

# Display the raw return value; the batch helper in this notebook unpacks the same
# call as (result_type, vibe_metadata), so this name holds that whole pair.
vibe_metadata

('vibe',
 VibeMetadata(dominant_mood='tense, claustrophobic, atmospheric dread; austere, ominous, unnerving; disciplined, grimly functional', movie_energy='slow-burn, meticulous, claustrophobic pacing; methodical, restrained, escalating tension', intensity='slow-building unease to visceral shocks; suspenseful, grim, body-horror tinges', romance_humor_sexuality='minimal humor, restrained sexuality implied, clinical and procedural', final_viewer_impression='haunting, iconic, unsettled; reverent, eerie, enduring', viewing_context='solo late-night focus; adrenaline-tinged thriller experience; best on a dark, quiet night with immersive sound'))

In [None]:
# Dive deeper into vibe metadata

VibeMetadata(dominant_mood='electrifying, mind-bending, rebellious', movie_energy='high-octane yet calibrating to intimate moments; kinetic action interspersed with contemplative stretches', intensity_and_driver='taut and propulsive; suspense stems from high-stakes conflict and existential questions, with occasional bursts of adrenaline', humor='sparingly ironic and dry, occasional witticisms amid dense sci-fi concepts', romance_and_sexuality='subtle and implied; none central to the experience', viewing_context='great for immersive, single-screen viewing with focused attention; ideal for a night of sci-fi action and thought-provoking spectacle; enhances with a dim, theater-like environment; energetic but rewarding with repeat viewings')

# Update Generated Values in Current DB

In [5]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

def process_single_movie(args: Tuple[int, IMDBMovie]) -> Tuple[int, IMDBMovie, bool, str]:
    """
    Generate vibe metadata for one movie and report the outcome without raising.

    Args:
        args: An (index, movie) pair; index is the movie's position in the original list.

    Returns:
        A 4-tuple (index, movie, success, error_message):
        - index: the index that was passed in, unchanged.
        - movie: a copy of the movie with "vibe_metadata" replaced on success,
          otherwise the original movie object.
        - success: True only when the generator returned non-None metadata.
        - error_message: "" on success, "Generation returned None" when the
          generator produced nothing, or str(exception) when anything raised.
    """
    position, source_movie = args

    try:
        # The generator takes the synopsis as a list (or None): prefer the single
        # synopsis string, then the debug synopses, else leave it as None.
        if source_movie.synopsis:
            synopsis_list = [source_movie.synopsis]
        elif source_movie.debug_synopses:
            synopsis_list = source_movie.debug_synopses
        else:
            synopsis_list = None

        # The generator takes plain dicts for the parental guide entries
        parental_guide_dicts = [
            guide_item.model_dump() for guide_item in source_movie.parental_guide_items
        ]

        # The call returns a pair; only the second element (the metadata) is used here
        _result_type, generated = generate_vibe_metadata(
            overview=source_movie.overview,
            genres=source_movie.genres,
            overall_keywords=source_movie.overall_keywords,
            plot_keywords=source_movie.plot_keywords,
            synopsis=synopsis_list,
            plot_summaries=source_movie.debug_plot_summaries or None,
            maturity_rating=source_movie.maturity_rating,
            maturity_reasoning=source_movie.maturity_reasoning,
            parental_guide_items=parental_guide_dicts,
            reception_summary=source_movie.reception_summary,
        )

        # Nothing generated: report failure and hand back the untouched movie
        if generated is None:
            return (position, source_movie, False, "Generation returned None")

        # model_copy builds a new instance, leaving the input movie unmodified
        refreshed_movie = source_movie.model_copy(update={"vibe_metadata": generated})
        return (position, refreshed_movie, True, "")

    except Exception as exc:
        # Any failure is converted into a result tuple so the batch keeps going
        return (position, source_movie, False, str(exc))

# Regenerate vibe metadata for every movie in parallel and save the results.
# A fixed pool of 10 worker threads is used (max_workers=10 below); threads are
# presumably a good fit because the generation calls look I/O-bound — TODO confirm.
print(f"Processing {len(movies)} movies in parallel...")
updated_movies_dict = {}  # index -> movie, used to rebuild the original order afterwards
failed_movies = []  # (index, title, error message) for every movie that was not updated

# Create list of (index, movie) tuples for processing
movie_args = [(i, movie) for i, movie in enumerate(movies)]

# Process movies in parallel
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future to the exact (index, movie) pair it was submitted with, so the
    # original index is always known without searching `movies` by equality.
    future_to_args = {executor.submit(process_single_movie, args): args for args in movie_args}

    # Handle tasks in completion order (not submission order)
    completed = 0
    for future in as_completed(future_to_args):
        # Dedicated names keep this loop from overwriting the notebook-level `movie`
        # that the single-movie cells rely on.
        source_index, source_movie = future_to_args[future]
        completed += 1

        try:
            index, updated_movie, success, error_msg = future.result()
            updated_movies_dict[index] = updated_movie

            if success:
                print(f"[{completed}/{len(movies)}] ✓ {source_movie.title}")
            else:
                print(f"[{completed}/{len(movies)}] ✗ {source_movie.title} - {error_msg}")
                failed_movies.append((index, source_movie.title, error_msg))
        except Exception as e:
            # The worker raised instead of returning a result tuple: keep the
            # original movie in its own slot and record the failure.
            print(f"[{completed}/{len(movies)}] ✗ {source_movie.title} - Unexpected error: {str(e)}")
            updated_movies_dict[source_index] = source_movie
            failed_movies.append((source_index, source_movie.title, f"Unexpected error: {str(e)}"))

# Reconstruct movies list in original order
updated_movies = [updated_movies_dict[i] for i in range(len(movies))]

print(f"\nCompleted processing {len(movies)} movies")
print(f"Successfully updated: {len(updated_movies) - len(failed_movies)}")
print(f"Failed: {len(failed_movies)}")

if failed_movies:
    print("\nFailed movies:")
    for idx, title, reason in failed_movies:
        print(f"  {idx}: {title} - {reason}")

# Save updated movies back to JSON file
# NOTE(review): the first cell of this notebook loads "../../saved_imdb_movies.json";
# confirm that "../saved_imdb_movies.json" is the intended target here.
json_path = Path("../saved_imdb_movies.json")
# Convert IMDBMovie objects to dictionaries for JSON serialization
movies_data_updated = [movie.model_dump() for movie in updated_movies]

# Serialize fully before opening the file: opening with "w" truncates it, so a
# serialization error after that point would destroy the saved data.
serialized_movies = json.dumps(movies_data_updated, indent=2, ensure_ascii=False)
with open(json_path, "w", encoding="utf-8") as f:
    f.write(serialized_movies)

print(f"\n✓ Saved {len(updated_movies)} movies to {json_path}")

Processing 49 movies in parallel...
[1/49] ✓ hereditary
[2/49] ✓ the lion king
[3/49] ✓ everything everywhere all at once
[4/49] ✓ spider-man: across the spider-verse
[5/49] ✓ the dark knight
[6/49] ✓ a complete unknown
[7/49] ✓ spirited away
[8/49] ✓ the iron giant
[9/49] ✓ dune: part two
[10/49] ✓ klaus
[11/49] ✓ the wild robot
[12/49] ✓ godzilla minus one
[13/49] ✓ it's a wonderful life
[14/49] ✓ poltergeist
[15/49] ✓ the lego movie
[16/49] ✓ seven samurai
[17/49] ✓ leap year
[18/49] ✓ parasite
[19/49] ✓ the godfather
[20/49] ✓ pulp fiction
[21/49] ✓ the shawshank redemption
[22/49] ✓ the matrix
[23/49] ✓ casablanca
[24/49] ✓ inception
[25/49] ✓ 2001: a space odyssey
[26/49] ✓ citizen kane
[27/49] ✓ alien
[28/49] ✓ psycho
[29/49] ✓ blade runner
[30/49] ✓ get out
[31/49] ✓ mad max: fury road
[32/49] ✓ la la land
[33/49] ✓ no country for old men
[34/49] ✓ pan's labyrinth
[35/49] ✓ amélie
[36/49] ✓ the grand budapest hotel
[37/49] ✓ eternal sunshine of the spotless mind
[38/49] ✓ moonl

In [None]:
# PLOT METADATA REGENERATION

import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple
import json
from pathlib import Path

# Make the `implementation` package importable from this notebook.
# Notebooks are in implementation/notebooks/, so the project root is two levels up;
# it is only prepended to sys.path when it is not already listed.
project_root = str(Path().resolve().parent.parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from implementation.classes.movie import IMDBMovie
from implementation.llms.vector_metadata_generation_methods import generate_plot_metadata

def process_single_movie_plot(args: Tuple[int, IMDBMovie]) -> Tuple[int, IMDBMovie, bool, str]:
    """
    Generate plot metadata for one movie and report the outcome without raising.

    Args:
        args: An (index, movie) pair; index is the movie's position in the original list.

    Returns:
        A 4-tuple (index, movie, success, error_message):
        - index: the index that was passed in, unchanged.
        - movie: a copy of the movie with "plot_metadata" replaced on success,
          otherwise the original movie object.
        - success: True only when the generator returned non-None metadata.
        - error_message: "" on success, "Generation returned None" when the
          generator produced nothing, or str(exception) when anything raised.
    """
    position, source_movie = args

    try:
        # generate_plot_metadata returns a (result_type, plot_metadata) pair; only
        # the metadata is used. Falsy plot summaries / synopsis are sent as [].
        # NOTE(review): synopsis is forwarded as-is when truthy and as [] otherwise —
        # confirm the generator accepts both shapes.
        _result_type, generated = generate_plot_metadata(
            overview=source_movie.overview,
            plot_keywords=source_movie.plot_keywords,
            plot_summaries=source_movie.plot_summaries or [],
            synopsis=source_movie.synopsis or [],
            overall_keywords=source_movie.overall_keywords,
        )

        # Nothing generated: report failure and hand back the untouched movie
        if generated is None:
            return (position, source_movie, False, "Generation returned None")

        # model_copy builds a new instance, leaving the input movie unmodified
        refreshed_movie = source_movie.model_copy(update={"plot_metadata": generated})
        return (position, refreshed_movie, True, "")

    except Exception as exc:
        # Any failure is converted into a result tuple so the batch keeps going
        return (position, source_movie, False, str(exc))

# Load movies from JSON file (relative to notebook location)
# NOTE(review): the first cell of this notebook loads "../../saved_imdb_movies.json";
# confirm that "../saved_imdb_movies.json" is the intended file here.
json_path = Path("../saved_imdb_movies.json")
with open(json_path, "r", encoding="utf-8") as f:
    movies_data = json.load(f)

# Convert each dictionary to an IMDBMovie object
movies = [IMDBMovie(**movie_dict) for movie_dict in movies_data]

# Regenerate plot metadata for every movie in parallel.
# A fixed pool of 10 worker threads is used (max_workers=10 below); threads are
# presumably a good fit because the generation calls look I/O-bound — TODO confirm.
print(f"Processing {len(movies)} movies in parallel...")
updated_movies_dict = {}  # index -> movie, used to rebuild the original order afterwards
failed_movies = []  # (index, title, error message) for every movie that was not updated

# Create list of (index, movie) tuples for processing
movie_args = [(i, movie) for i, movie in enumerate(movies)]

# Process movies in parallel
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future to the exact (index, movie) pair it was submitted with, so the
    # original index is always known without searching `movies` by equality.
    future_to_args = {executor.submit(process_single_movie_plot, args): args for args in movie_args}

    # Handle tasks in completion order (not submission order)
    completed = 0
    for future in as_completed(future_to_args):
        # Dedicated names keep this loop from overwriting the notebook-level `movie`
        # that the single-movie cells rely on.
        source_index, source_movie = future_to_args[future]
        completed += 1

        try:
            index, updated_movie, success, error_msg = future.result()
            updated_movies_dict[index] = updated_movie

            if success:
                print(f"[{completed}/{len(movies)}] ✓ {source_movie.title}")
            else:
                print(f"[{completed}/{len(movies)}] ✗ {source_movie.title} - {error_msg}")
                failed_movies.append((index, source_movie.title, error_msg))
        except Exception as e:
            # The worker raised instead of returning a result tuple: keep the
            # original movie in its own slot and record the failure.
            print(f"[{completed}/{len(movies)}] ✗ {source_movie.title} - Unexpected error: {str(e)}")
            updated_movies_dict[source_index] = source_movie
            failed_movies.append((source_index, source_movie.title, f"Unexpected error: {str(e)}"))

# Reconstruct movies list in original order
updated_movies = [updated_movies_dict[i] for i in range(len(movies))]

print(f"\nCompleted processing {len(movies)} movies")
print(f"Successfully updated: {len(updated_movies) - len(failed_movies)}")
print(f"Failed: {len(failed_movies)}")

if failed_movies:
    print("\nFailed movies:")
    for idx, title, reason in failed_movies:
        print(f"  {idx}: {title} - {reason}")

# Save updated movies back to JSON file
# Convert IMDBMovie objects to dictionaries for JSON serialization
movies_data_updated = [movie.model_dump() for movie in updated_movies]

# Serialize fully before opening the file: opening with "w" truncates it, so a
# serialization error after that point would destroy the saved data.
serialized_movies = json.dumps(movies_data_updated, indent=2, ensure_ascii=False)
with open(json_path, "w", encoding="utf-8") as f:
    f.write(serialized_movies)

print(f"\n✓ Saved {len(updated_movies)} movies to {json_path}")

Processing 50 movies in parallel...
[1/50] ✓ up
[2/50] ✓ mulan
[3/50] ✓ school of rock
[4/50] ✓ shrek
[5/50] ✓ zootopia
[6/50] ✓ the princess bride
[7/50] ✓ frozen
[8/50] ✓ klaus
[9/50] ✓ ferris bueller's day off
[10/50] ✓ coco
[11/50] ✓ the year without a santa claus
[12/50] ✓ raiders of the lost ark
[13/50] ✓ mad max: fury road
[14/50] ✓ the dark knight
[15/50] ✓ star wars
[16/50] ✓ john wick
[17/50] ✓ harry potter and the philosopher's stone
[18/50] ✓ spider-man: across the spider-verse
[19/50] ✓ avengers: endgame
[20/50] ✓ captain america: the first avenger
[21/50] ✓ gladiator
[22/50] ✓ the lord of the rings: the fellowship of the ring
[23/50] ✓ the matrix
[24/50] ✓ jurassic park
[25/50] ✓ inception
[26/50] ✓ the shining
[27/50] ✓ hereditary
[28/50] ✓ arrival
[29/50] ✓ blade runner 2049
[30/50] ✓ interstellar
[31/50] ✓ american psycho
[32/50] ✓ se7en
[33/50] ✓ terrifier 3
[34/50] ✓ insidious
[35/50] ✓ titanic
[36/50] ✓ fight club
[37/50] ✓ forrest gump
[38/50] ✓ saw
[39/50] ✓ paras