In [1]:
from __future__ import annotations

import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys
from pathlib import Path

# Add parent directory to path to import from implementation package
# Notebooks are in implementation/notebooks/, so we go up two levels to project root
sys.path.insert(0, str(Path().resolve().parent.parent))

from implementation.vectorize import create_plot_analysis_vector_text, create_plot_events_vector_text
from openai import OpenAI
from implementation.llm_generations import generate_plot_events_metadata, generate_plot_analysis_metadata, generate_plot_metadata
from dotenv import load_dotenv
from typing import List, Optional
from pathlib import Path
from implementation.movie import IMDBMovie
from implementation.schemas import PlotAnalysisMetadata

# Load environment variables (the API key is expected to come from here).
load_dotenv()

# Read the key and fail fast when it is missing or empty, so no later cell
# runs against a client that was built without credentials.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError(
        "OPENAI_API_KEY environment variable not set. "
        "Please set it before importing this module."
    )

# Build the OpenAI client a single time, when this cell runs.
client = OpenAI(api_key=api_key)

In [2]:
# LOAD MOVIES

# The path is relative, so it is resolved against the notebook's working directory.
json_path = Path("../../saved_imdb_movies.json")
with json_path.open("r", encoding="utf-8") as f:
    movies_data = json.load(f)

# Each stored dictionary is unpacked as keyword arguments into an IMDBMovie.
movies = [IMDBMovie(**record) for record in movies_data]

In [3]:
# Print each loaded title next to its position in `movies`; later cells
# select movies by these positions (e.g. movies[17], movies[12]).
for index, movie in enumerate(movies):
    print(f"{index}: {movie.title}")

0: ferris bueller's day off
1: zootopia
2: school of rock
3: frozen
4: the princess bride
5: coco
6: klaus
7: up
8: mulan
9: shrek
10: the year without a santa claus
11: mad max: fury road
12: raiders of the lost ark
13: the dark knight
14: john wick
15: captain america: the first avenger
16: spider-man: across the spider-verse
17: avengers: endgame
18: star wars
19: harry potter and the philosopher's stone
20: the lord of the rings: the fellowship of the ring
21: gladiator
22: inception
23: the matrix
24: interstellar
25: blade runner 2049
26: jurassic park
27: arrival
28: hereditary
29: the shining
30: insidious
31: terrifier 3
32: saw
33: se7en
34: parasite
35: get out
36: american psycho
37: fight club
38: titanic
39: forrest gump
40: past lives
41: the pianist
42: the notebook
43: 50 first dates
44: leap year
45: fifty shades of grey
46: murder on the orient express
47: everything everywhere all at once
48: scott pilgrim vs. the world
49: the naked gun: from the files of police sq

In [4]:
# Position 17 is "avengers: endgame" in the title listing printed by the previous cell.
movie = movies[17]

# Show the text that create_plot_events_vector_text produces for this movie.
print(create_plot_events_vector_text(movie))

In 2018, Clint Barton (Hawkeye) practices archery with his daughter Lila at his rural homestead; she disintegrates into ash during Thanos' snap, leaving Clint devastated. Three weeks later Tony Stark and Nebula are adrift aboard the Guardians' ship Benatar; Tony records a goodbye to Pepper Potts as supplies run out. Carol Danvers (Captain Marvel) finds and returns them to Earth. Tony reunites with Pepper, Steve Rogers, Natasha Romanoff, Bruce Banner and James Rhodes; Rocket joins Nebula as the Guardians mourn. The team locates Thanos' garden ship and ambushes him; they cut off his gauntlet only to learn Thanos destroyed the Infinity Stones to prevent reuse. Thor beheads Thanos. With the stones gone there is no way to reverse the snap. Five years pass. Steve runs a support group; Tony and Pepper are married with daughter Morgan; Clint has become Ronin, killing criminals worldwide; Scott Lang (Ant-Man) escapes the quantum realm and reunites with his now-teenage daughter Cassie.
Scott pro

In [5]:
# Position 12 is "raiders of the lost ark" in the title listing printed earlier.
movie2 = movies[12]

# generate_plot_metadata returns a pair; only the second element is used here.
disregard, results = generate_plot_metadata(
    title=movie2.title,
    overview=movie2.overview,
    plot_summaries=movie2.debug_plot_summaries,
    plot_synopses=movie2.debug_synopses,
    plot_keywords=movie2.plot_keywords,
    featured_reviews=movie2.featured_reviews,
)

# process_movie_twice in this notebook treats a None second element as
# "generation failed". Check for it here as well, so the cell stops with a
# clear message instead of a TypeError from unpacking None.
if results is None:
    raise RuntimeError(f"Plot metadata generation returned no result for {movie2.title}")

plot_events_metadata, plot_analysis_metadata = results

# Attach both results to the movie object (this mutates the entry held in `movies`).
movie2.plot_events_metadata = plot_events_metadata
movie2.plot_analysis_metadata = plot_analysis_metadata

Generating plot metadata for raiders of the lost ark


KeyboardInterrupt: 

In [None]:
# GENERATE A BUNCH OF RESULTS SO I CAN COMPARE LATER

import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Positions in `movies` to process. Per the title listing printed earlier these are
# frozen (3), raiders of the lost ark (12), titanic (38) and
# murder on the orient express (46).
indices = [3, 12, 38, 46]

movies_to_save = [movies[position] for position in indices]

def _safe_filename(title: str) -> str:
    """Turn a movie title into a lowercase, underscore-separated file stem."""
    # Anything that is not alphanumeric, space, '-' or '_' becomes '_';
    # spaces are then converted to '_' as well.
    cleaned = "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in title)
    return cleaned.replace(' ', '_').lower()


def process_movie_twice(
    movie: IMDBMovie,
    num_runs: int = 2,
    output_suffix: str = "_v2",
) -> tuple[str, list[dict], bool]:
    """
    Generate plot metadata for one movie several times and save the runs to a JSONL file.

    Each run calls generate_plot_metadata with the movie's title, overview,
    keywords, summaries, synopses and featured reviews. Runs that fail (an
    exception, or a None metadata element) are reported with print() and
    skipped; the remaining runs are still saved.

    Args:
        movie: The IMDBMovie object to process.
        num_runs: How many times to generate metadata. Defaults to 2, the
            behaviour the function name refers to.
        output_suffix: Text appended to the sanitized title to form the output
            file name "./<title><suffix>.jsonl". Defaults to "_v2".

    Returns:
        Tuple of (movie_title, list_of_results, success). Each result is a dict
        with keys "run" (1-based), "title", "plot_events_metadata" and
        "plot_analysis_metadata". success is True only when at least one run
        produced a result and the file was written without error.
    """
    results = []

    for run_num in range(1, num_runs + 1):
        try:
            # The first element of the returned pair is not needed here.
            _status, metadata_tuple = generate_plot_metadata(
                title=movie.title,
                overview=movie.overview,
                plot_keywords=movie.plot_keywords,
                plot_summaries=movie.debug_plot_summaries,
                plot_synopses=movie.debug_synopses,
                featured_reviews=movie.featured_reviews,
            )

            if metadata_tuple is None:
                print(f"Warning: Failed to generate metadata for {movie.title} (run {run_num})")
                continue

            plot_events_metadata, plot_analysis_metadata = metadata_tuple
            # Keep both parts of a run together in one JSON-serializable dict.
            results.append({
                "run": run_num,
                "title": movie.title,
                "plot_events_metadata": plot_events_metadata.model_dump(),
                "plot_analysis_metadata": plot_analysis_metadata.model_dump()
            })
        except Exception as e:
            # Best effort: one failed run must not discard the other runs.
            print(f"Error processing {movie.title} (run {run_num}): {e}")

    # Nothing to save: report failure without creating an empty file.
    if not results:
        return (movie.title, results, False)

    output_file = f"./{_safe_filename(movie.title)}{output_suffix}.jsonl"

    try:
        # One JSON object per line (JSONL), one line per successful run.
        with open(output_file, "w", encoding="utf-8") as f:
            for result in results:
                f.write(json.dumps(result, ensure_ascii=False) + "\n")
    except Exception as e:
        print(f"Error saving file for {movie.title}: {e}")
        return (movie.title, results, False)

    return (movie.title, results, True)

# Process all movies in parallel
print(f"Processing {len(movies_to_save)} movies, generating metadata twice for each...")
successful_movies, failed_movies = [], []

# Different movies run concurrently on the thread pool; the work for any one
# movie (including writing its own output file) happens inside
# process_movie_twice on a single worker.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which movie each future belongs to, for error reporting.
    future_to_movie = {}
    for movie in movies_to_save:
        future_to_movie[executor.submit(process_movie_twice, movie)] = movie

    # Handle results in completion order, counting finished tasks as we go.
    completed = 0
    for completed, future in enumerate(as_completed(future_to_movie), start=1):
        movie = future_to_movie[future]

        try:
            movie_title, results, success = future.result()
            if not success:
                failed_movies.append(movie_title)
                print(f"[{completed}/{len(movies_to_save)}] ✗ {movie_title} - Failed to save results")
            else:
                successful_movies.append(movie_title)
                print(f"[{completed}/{len(movies_to_save)}] ✓ {movie_title} - Saved {len(results)} results to file")
        except Exception as e:
            # The task itself raised; fall back to the movie recorded at submit time.
            failed_movies.append(movie.title)
            print(f"[{completed}/{len(movies_to_save)}] ✗ {movie.title} - Error: {e}")

# Final summary, listing any titles that did not complete.
print(f"\nDone! Successfully processed {len(successful_movies)} movies")
if failed_movies:
    print(f"Failed: {len(failed_movies)} movies")
    for movie_title in failed_movies:
        print(f"  - {movie_title}")



Processing 4 movies, generating metadata twice for each...
Generating plot metadata for frozen
Generating plot metadata for raiders of the lost ark
Generating plot metadata for titanic
Generating plot metadata for murder on the orient express
Plot events metadata for raiders of the lost ark (completed in 14.68 seconds):
Plot events metadata for murder on the orient express (completed in 19.85 seconds):
Plot events metadata for titanic (completed in 21.07 seconds):
Plot events metadata for frozen (completed in 33.46 seconds):

Plot analysis metadata for raiders of the lost ark (completed in 22.40 seconds):
Generating plot metadata for raiders of the lost ark

Plot analysis metadata for titanic (completed in 25.85 seconds):
Generating plot metadata for titanic

Plot analysis metadata for murder on the orient express (completed in 28.07 seconds):
Generating plot metadata for murder on the orient express

Plot analysis metadata for frozen (completed in 19.44 seconds):
Generating plot metad

In [None]:
# DIRECT VERSION COMPARISON

def _load_plot_analysis_runs(jsonl_path: str) -> list[PlotAnalysisMetadata]:
    """
    Read a JSONL results file and return one PlotAnalysisMetadata per non-blank line.

    Every line is parsed as JSON first; the 'plot_analysis_metadata' entry of
    each parsed object is then passed to PlotAnalysisMetadata as keyword arguments.

    Args:
        jsonl_path: Path of the JSONL file to read.

    Returns:
        List of PlotAnalysisMetadata objects in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON; the message names
            the file and the 1-based line number.
    """
    records = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:  # skip blank lines
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Include the file name: two files are read by this cell.
                raise ValueError(f"Invalid JSON on line {line_no} of {jsonl_path}: {e}") from e
    return [PlotAnalysisMetadata(**record['plot_analysis_metadata']) for record in records]


movie = movies[12]#[3,12,38,46]

# Same title-to-filename rule that process_movie_twice uses when saving:
# non-alphanumeric characters (other than space, '-', '_') become '_',
# then spaces become '_' and the result is lowercased.
safe_filename = "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in movie.title)
safe_filename = safe_filename.replace(' ', '_').lower()
output_file_v1 = f"./{safe_filename}.jsonl"
output_file_v2 = f"./{safe_filename}_v2.jsonl"

v1_data = _load_plot_analysis_runs(output_file_v1)
v2_data = _load_plot_analysis_runs(output_file_v2)

# Print every run of each version, separated by a blank line.
print(f"Generated results for {movie.title}")
print("========== V1 RESULTS ==========")
for result in v1_data:
    print(result)
    print()
print("========== V2 RESULTS ==========")
for result in v2_data:
    print(result)
    print()




Generated results for raiders of the lost ark
In 1936 an archaeologist is recruited to beat a rival authoritarian force to a legendary sacred artifact, decoding clues, infiltrating digs, rescuing allies, and ultimately witnessing the artifact destroy those who exploited it; the story is a race-against-evil treasure hunt that punishes hubris and affirms duty and respect for the sacred.
Race to secure supernatural artifact
Treasure-hunt adventure, Pulp action-adventure
large-scale conflict
linear narrative
recommitment to duty, hubris punished
quest/adventure
Good versus ideological evil, Sacred power versus greed
Greed invites destruction, Respect sacred limits','Duty over personal gain

An archaeologist races to secure a powerful ancient artifact before a fascist rival can exploit its power, assembling a team, solving map-room puzzles, surviving trap-filled tombs and ambushes, and rekindling a past relationship; the race ends when the artifact's supernatural power destroys its abusers 

In [None]:
# Relies on movie2.plot_analysis_metadata having been set by the earlier
# single-movie generation cell.
pmd = movie2.plot_analysis_metadata

# Full object first, followed by a blank line.
print(pmd, end="\n\n")

# Then the individual parts, each entry followed by a blank line.
print(
    f"Core Engine: ({pmd.core_engine.core_engine_label}) {pmd.core_engine.explanation_and_justification}",
    end="\n\n",
)

for arc in pmd.character_arcs:
    print(
        f"({arc.character_name}) {arc.arc_transformation_label}: {arc.arc_transformation_description}",
        end="\n\n",
    )

for theme in pmd.themes_primary:
    print(f"{theme.theme_label}: {theme.explanation_and_justification}", end="\n\n")

for lesson in pmd.lessons_learned:
    print(f"{lesson.lesson_label}: {lesson.explanation_and_justification}", end="\n\n")

In the 1930s a resourceful archaeologist is recruited to race rival ideological forces to recover a powerful sacred relic; after locating it amid betrayals and exotic set pieces the relic is seized and opened by the antagonists, whose hubris unleashes supernatural destruction, exposing the moral cost of trying to weaponize sacred power and forcing restraint.
Race to secure a powerful relic: A timed contest to recover a supernatural artifact drives every major beat: the protagonist and antagonists race to secure a relic whose possession promises decisive power.
Pulp action-adventure, Treasure hunt
global conflict
linear narrative
reluctant hero: A pragmatic, self-reliant scholar grows into a decisive protector who prioritizes preventing ideological misuse of a sacred power over personal gain.
ideological hubris: A collaborator driven by ambition and ideology pursues control of the relic and is consumed by hubris, meeting annihilation when the power is unleashed.
quest/adventure
Ambition

In [None]:
# PLOT METADATA REGENERATION

from typing import Tuple

def process_single_movie_plot(args: Tuple[int, IMDBMovie]) -> Tuple[int, IMDBMovie, bool, str]:
    """
    Regenerate plot-events and plot-analysis metadata for one movie.

    The plot-events metadata is generated first; its plot_summary is then fed
    to the plot-analysis generation as the synopsis. The input movie is never
    modified: on success a copy carrying both new metadata objects is returned.

    Args:
        args: Tuple of (index, movie), where index is the movie's position in
            the original list.

    Returns:
        Tuple of (index, movie, success, error_message):
        - index: The index that was passed in (lets the caller restore order).
        - movie: The updated copy on success, the original movie on failure.
        - success: True when both generation steps produced a result.
        - error_message: Empty string on success, otherwise a description of
          the failure.
    """
    position, source_movie = args

    try:
        events = generate_plot_events_metadata(
            title=source_movie.title,
            overview=source_movie.overview,
            plot_keywords=source_movie.plot_keywords,
            plot_summaries=source_movie.debug_plot_summaries,
            plot_synopses=source_movie.debug_synopses
        )
        # A falsy result means the first step produced nothing to build on.
        if not events:
            return (position, source_movie, False, "Plot events metadata returned None")

        analysis = generate_plot_analysis_metadata(
            title=source_movie.title,
            overview=source_movie.overview,
            plot_synopsis=events.plot_summary,
            plot_keywords=source_movie.plot_keywords,
            featured_reviews=source_movie.featured_reviews
        )
        if not analysis:
            return (position, source_movie, False, "Plot analysis metadata returned None")

        refreshed = source_movie.model_copy(update={
            "plot_events_metadata": events,
            "plot_analysis_metadata": analysis
        })
    except Exception as failure:
        # Any error keeps the original movie and reports the message to the caller.
        return (position, source_movie, False, str(failure))

    return (position, refreshed, True, "")

# Load movies from JSON file (relative to notebook location)
json_path = Path("../../saved_imdb_movies.json")
with open(json_path, "r", encoding="utf-8") as f:
    movies_data = json.load(f)

# Convert each dictionary to an IMDBMovie object
movies = [IMDBMovie(**movie_dict) for movie_dict in movies_data]

# Number of worker threads for the I/O-bound API calls made per movie.
MAX_WORKERS = 25

# Process all movies in parallel using ThreadPoolExecutor
print(f"Processing {len(movies)} movies in parallel...")
updated_movies_dict = {}  # index -> movie, so the original order can be rebuilt
failed_movies = []

# Create list of (index, movie) tuples for processing
movie_args = [(i, movie) for i, movie in enumerate(movies)]

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map every future to its full (index, movie) pair. Keeping the index here
    # means a failed future can be recorded at the right position directly,
    # without searching `movies` for an equal object.
    future_to_args = {executor.submit(process_single_movie_plot, args): args for args in movie_args}

    # Process completed tasks as they finish
    completed = 0
    for future in as_completed(future_to_args):
        submitted_index, movie = future_to_args[future]
        completed += 1

        try:
            index, updated_movie, success, error_msg = future.result()
            updated_movies_dict[index] = updated_movie

            if success:
                print(f"[{completed}/{len(movies)}] ✓ {movie.title}")
            else:
                print(f"[{completed}/{len(movies)}] ✗ {movie.title} - {error_msg}")
                failed_movies.append((index, movie.title, error_msg))
        except Exception as e:
            # The worker reports its own failures through the returned tuple,
            # so this only triggers for errors raised outside that handling.
            # Keep the original movie in its slot so the rebuild below is complete.
            print(f"[{completed}/{len(movies)}] ✗ {movie.title} - Unexpected error: {str(e)}")
            updated_movies_dict[submitted_index] = movie
            failed_movies.append((submitted_index, movie.title, f"Unexpected error: {str(e)}"))

# Reconstruct movies list in original order
updated_movies = [updated_movies_dict[i] for i in range(len(movies))]

print(f"\nCompleted processing {len(movies)} movies")
print(f"Successfully updated: {len(updated_movies) - len(failed_movies)}")
print(f"Failed: {len(failed_movies)}")

if failed_movies:
    print("\nFailed movies:")
    for idx, title, reason in failed_movies:
        print(f"  {idx}: {title} - {reason}")

# Convert IMDBMovie objects to dictionaries for JSON serialization
movies_data_updated = [movie.model_dump() for movie in updated_movies]

# Write to a sibling temp file first and swap it in afterwards. The target is
# also the input file, so an error or interrupt during the dump must not leave
# it truncated.
tmp_path = json_path.with_name(json_path.name + ".tmp")
with open(tmp_path, "w", encoding="utf-8") as f:
    json.dump(movies_data_updated, f, indent=2, ensure_ascii=False)
tmp_path.replace(json_path)

print(f"\n✓ Saved {len(updated_movies)} movies to {json_path}")

Processing 50 movies in parallel...
[1/50] ✓ raiders of the lost ark
[2/50] ✓ john wick
[3/50] ✓ school of rock
[4/50] ✓ up
[5/50] ✓ coco
[6/50] ✓ the lord of the rings: the fellowship of the ring
[7/50] ✓ zootopia
[8/50] ✓ the year without a santa claus
[9/50] ✓ gladiator
[10/50] ✓ klaus
[11/50] ✓ the princess bride
[12/50] ✓ captain america: the first avenger
[13/50] ✓ mad max: fury road
[14/50] ✓ ferris bueller's day off
[15/50] ✓ frozen
[16/50] ✓ avengers: endgame
[17/50] ✓ the dark knight
[18/50] ✓ star wars
[19/50] ✓ interstellar
[20/50] ✓ the matrix
[21/50] ✓ shrek
[22/50] ✓ mulan
[23/50] ✓ spider-man: across the spider-verse
[24/50] ✓ harry potter and the philosopher's stone
[25/50] ✓ inception
[26/50] ✓ jurassic park
[27/50] ✓ blade runner 2049
[28/50] ✓ the shining
[29/50] ✓ terrifier 3
[30/50] ✓ hereditary
[31/50] ✓ past lives
[32/50] ✓ arrival
[33/50] ✓ american psycho
[34/50] ✓ saw
[35/50] ✓ fight club
[36/50] ✓ 50 first dates
[37/50] ✓ insidious
[38/50] ✓ fifty shades of 

In [None]:
# DEBUGGING BAD MATCHES

# Title fragments to look up; matching ignores case.
title_snippets = ["jurass", "interstellar", "avengers: endga"]

# For every fragment, print the title and plot-analysis vector text of each matching movie.
for snippet in title_snippets:
    # Lowercase the fragment once rather than once per movie.
    needle = snippet.lower()
    matching_movies = [
        candidate for candidate in movies
        if needle in candidate.title.lower()
    ]

    if matching_movies:
        # Announce the count only when the fragment is ambiguous.
        if len(matching_movies) > 1:
            print(f"Found {len(matching_movies)} movies matching '{snippet}':")
            print()

        for movie in matching_movies:
            print(movie.title)
            print(create_plot_analysis_vector_text(movie))
            print()
    else:
        print(f"No movies found matching '{snippet}'")
        print()

jurassic park
An industrialist opens a theme park of resurrected animals; when scientific hubris causes containment failure—via sabotage and unforeseen biology—invited experts and children must survive cascading system failures and escape the island, exposing the costs of that scientific hubris.
Scientific hubris causes containment failure
Science fiction thriller, Creature feature
large scale conflict
linear narrative
humbling, protective bonding
cautionary tale
Scientific hubris and consequences, Human vulnerability to nature
Unchecked innovation has costs, Control is fragile','Respect natural limits
dinosaur adventure, jungle adventure, action, adventure, sci-fi, thriller

interstellar
As Earth becomes uninhabitable, a former pilot leads an exploratory mission through a wormhole to find a new home, and the mission's exploration and sacrifices drive both a species-level survival effort and a personal struggle where time dilation and betrayal complicate attempts to save humanity; a fi