In [1]:
from __future__ import annotations

import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys
from pathlib import Path

# Add parent directory to path to import from implementation package
# Notebooks are in implementation/notebooks/, so we go up two levels to project root
sys.path.insert(0, str(Path().resolve().parent.parent))

from implementation.vectorize import create_plot_analysis_vector_text
from openai import OpenAI
from implementation.llm_generations import generate_plot_events_metadata, generate_plot_analysis_metadata, generate_plot_metadata
from dotenv import load_dotenv
from typing import List, Optional
from pathlib import Path
from implementation.movie import IMDBMovie

# Populate the process environment via python-dotenv so the key lookup below can succeed
load_dotenv()

# Fail fast when the key is absent or empty, before any API-backed cell runs
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    missing_key_message = (
        "OPENAI_API_KEY environment variable not set. "
        "Please set it before importing this module."
    )
    raise ValueError(missing_key_message)

# One client object for the whole session, built from the validated key
client = OpenAI(api_key=api_key)

In [3]:
# LOAD MOVIES

# Read the saved movie records from disk (expected to be a JSON list of dicts,
# since each entry is unpacked as keyword arguments below)
json_path = Path("../../saved_imdb_movies.json")
with json_path.open(mode="r", encoding="utf-8") as f:
    movies_data = json.load(f)

# Build one IMDBMovie per record, passing the record's keys as keyword arguments
movies = list(IMDBMovie(**movie_dict) for movie_dict in movies_data)

In [4]:
import time

# Pick a single movie to spot-check; the saved output of this cell reports "forrest gump"
movie = movies[39]

# Gather the keyword arguments up front so the generation call reads as one line
plot_metadata_inputs = {
    "title": movie.title,
    "overview": movie.overview,
    "plot_keywords": movie.plot_keywords,
    "plot_summaries": movie.plot_summaries,
    "plot_synopses": movie.synopsis,
}
plot_events_metadata, plot_analysis_metadata = generate_plot_metadata(**plot_metadata_inputs)

# Show both results using their default string form, events first
for metadata in (plot_events_metadata, plot_analysis_metadata):
    print(metadata)

Generating plot metadata for forrest gump
Generating plot metadata for forrest gump
Plot events metadata for forrest gump (completed in 13.65 seconds):
In 1981 Savannah, Georgia, Forrest Gump sits on a bus-stop bench and tells strangers his life story. He begins with childhood in Greenbow, Alabama (born to Mrs. Gump), where he wears leg braces and is bullied. He befriends Jenny Curran on the school bus; Jenny teaches him to read and encourages him to run. When his braces break while fleeing bullies, Forrest discovers he can run extremely fast and earns a football scholarship to the University of Alabama, playing under Coach Bear Bryant and meeting Governor George Wallace and student Vivian Malone during integration. After college Forrest enlists in the U.S. Army (1967). In basic training he befriends Benjamin "Bubba" Blue, who plans a shrimping business with Forrest. In Vietnam Forrest repeatedly saves fellow soldiers and is wounded; Bubba is killed. Forrest carries wounded men to safe

In [15]:
# (label, attribute) pairs for the plot-events section, in display order
plot_events_fields = [
    ("Plot Summary", "plot_summary"),
    ("Setting", "setting"),
    ("Major Characters", "major_characters"),
]

# (label, attribute) pairs for the plot-analysis section, in display order
plot_analysis_fields = [
    ("Core Engine", "core_engine"),
    ("Genre Signature", "genre_signature"),
    ("Generalized Plot Overview", "generalized_plot_overview"),
    ("Conflict Scale", "conflict_scale"),
    ("Narrative Delivery", "narrative_delivery"),
    ("Character arcs", "character_arc_shapes"),
    ("Narrative archetypes", "narrative_archetype"),
    ("Themes", "themes_primary"),
    ("Lessons Learned", "lessons_learned"),
]

# Each events field is followed by a blank line
for label, attribute in plot_events_fields:
    print(f"{label}: {getattr(plot_events_metadata, attribute)}")
    print()

print("==============================")

# Each analysis field is preceded by a blank line, so nothing trails the last one
for label, attribute in plot_analysis_fields:
    print()
    print(f"{label}: {getattr(plot_analysis_metadata, attribute)}")



Plot Summary: In 1981 Savannah, Georgia, Forrest Gump sits on a bus-stop bench and recounts his life. Born in 1944 in Greenbow, Alabama, Forrest (Tom Hanks) grows up with leg braces and an IQ of 75. His mother (Mrs. Gump) runs a boarding house and teaches him self-respect. On the school bus he befriends Jenny Curran; they become inseparable childhood friends. Bullies chase Forrest; when his leg braces break he discovers he can run extremely fast. His running wins him a football scholarship to the University of Alabama, where he meets coach Bear Bryant and once returns a dropped book to Vivian Malone during Governor George Wallace's 1963 “schoolhouse door” standoff. After college in 1967 Forrest enlists in the U.S. Army. In basic he befriends Benjamin "Bubba" Blue, who teaches him about shrimping; they ship to Vietnam with Lieutenant Dan Taylor as their platoon leader. In Vietnam their unit is ambushed: Bubba is killed, Forrest carries several wounded soldiers to safety and is wounded b

In [None]:
# PLOT METADATA REGENERATION

from typing import Tuple

def process_single_movie_plot(args: Tuple[int, IMDBMovie]) -> Tuple[int, IMDBMovie, bool, str]:
    """
    Generate plot-events and plot-analysis metadata for a single movie.

    The two generation calls run sequentially because the analysis step takes
    the ``plot_summary`` produced by the events step as its synopsis. Every
    exception raised along the way is caught and reported through the return
    value rather than propagated, so one failing movie does not abort a batch.

    Args:
        args: Tuple of (index, movie) where index is the movie's position in the original list

    Returns:
        Tuple of (index, updated_movie, success, error_message)
        - index: Original position in the list (for maintaining order)
        - updated_movie: Result of ``movie.model_copy(update=...)`` carrying both
          metadata objects (or the original movie if generation failed)
        - success: Boolean indicating if generation was successful
        - error_message: Non-empty description if generation failed (the
          exception's class name when the exception carries no message),
          empty string otherwise
    """
    i, movie = args

    try:
        plot_events_metadata = generate_plot_events_metadata(
            title=movie.title,
            overview=movie.overview,
            plot_keywords=movie.plot_keywords,
            plot_summaries=movie.plot_summaries,
            plot_synopses=movie.synopsis
        )

        if not plot_events_metadata:
            raise RuntimeError("Plot events metadata returned None")

        # The analysis step consumes the summary produced by the events step
        plot_analysis_metadata = generate_plot_analysis_metadata(
            title=movie.title,
            overview=movie.overview,
            plot_synopsis=plot_events_metadata.plot_summary,
            plot_keywords=movie.plot_keywords,
            reception_summary=movie.reception_summary
        )

        if not plot_analysis_metadata:
            raise RuntimeError("Plot analysis metadata returned None")

        updated_movie = movie.model_copy(update={
            "plot_events_metadata": plot_events_metadata,
            "plot_analysis_metadata": plot_analysis_metadata
        })

        return (i, updated_movie, True, "")

    except Exception as e:
        # Error occurred, keep original movie. Some exceptions stringify to "",
        # which would be indistinguishable from the success value of
        # error_message, so fall back to the exception's class name.
        return (i, movie, False, str(e) or type(e).__name__)

# Load movies from JSON file (relative to notebook location)
# NOTE(review): the "LOAD MOVIES" cell reads ../../saved_imdb_movies.json, while this
# cell reads and overwrites ../saved_imdb_movies.json — confirm which path is intended.
json_path = Path("../saved_imdb_movies.json")
with open(json_path, "r", encoding="utf-8") as f:
    movies_data = json.load(f)

# Convert each dictionary to an IMDBMovie object
movies = [IMDBMovie(**movie_dict) for movie_dict in movies_data]

# Process all movies in parallel using ThreadPoolExecutor.
# The tasks are I/O-bound (API calls), so the pool size is a tunable constant.
MAX_WORKERS = 25
print(f"Processing {len(movies)} movies in parallel...")
updated_movies_dict = {}  # index -> movie, so the original order can be restored
failed_movies = []  # (index, title, error message) for every movie that failed

# Create list of (index, movie) tuples for processing
movie_args = [(i, movie) for i, movie in enumerate(movies)]

# Process movies in parallel
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Key every future by the index it was submitted with, so the result can be
    # filed under the right slot even if retrieving it raises (no equality
    # search over the movie list, which would pick the wrong slot for
    # duplicate records).
    future_to_index = {
        executor.submit(process_single_movie_plot, args): args[0]
        for args in movie_args
    }

    # Process completed tasks as they finish
    completed = 0
    for future in as_completed(future_to_index):
        index = future_to_index[future]
        movie = movies[index]
        completed += 1

        try:
            _, updated_movie, success, error_msg = future.result()
        except Exception as e:
            # Unexpected error in result retrieval: keep the original movie
            updated_movie = movie
            success = False
            error_msg = f"Unexpected error: {str(e)}"

        updated_movies_dict[index] = updated_movie

        if success:
            print(f"[{completed}/{len(movies)}] ✓ {movie.title}")
        else:
            print(f"[{completed}/{len(movies)}] ✗ {movie.title} - {error_msg}")
            failed_movies.append((index, movie.title, error_msg))

# Reconstruct movies list in original order
updated_movies = [updated_movies_dict[i] for i in range(len(movies))]

print(f"\nCompleted processing {len(movies)} movies")
print(f"Successfully updated: {len(updated_movies) - len(failed_movies)}")
print(f"Failed: {len(failed_movies)}")

if failed_movies:
    print("\nFailed movies:")
    for idx, title, reason in failed_movies:
        print(f"  {idx}: {title} - {reason}")

# Save updated movies back to JSON file
# Convert IMDBMovie objects to dictionaries for JSON serialization
movies_data_updated = [movie.model_dump() for movie in updated_movies]

# Write to a sibling temp file and swap it in afterwards, so an error raised
# while serializing cannot leave the source file half-written.
tmp_json_path = json_path.with_name(json_path.name + ".tmp")
with open(tmp_json_path, "w", encoding="utf-8") as f:
    json.dump(movies_data_updated, f, indent=2, ensure_ascii=False)
tmp_json_path.replace(json_path)

print(f"\n✓ Saved {len(updated_movies)} movies to {json_path}")

Processing 50 movies in parallel...
[1/50] ✓ the year without a santa claus
[2/50] ✓ school of rock
[3/50] ✓ zootopia
[4/50] ✓ up
[5/50] ✓ shrek
[6/50] ✓ klaus
[7/50] ✓ mad max: fury road
[8/50] ✓ mulan
[9/50] ✓ gladiator
[10/50] ✓ john wick
[11/50] ✓ ferris bueller's day off
[12/50] ✓ coco
[13/50] ✓ captain america: the first avenger
[14/50] ✓ spider-man: across the spider-verse
[15/50] ✓ the dark knight
[16/50] ✓ the princess bride
[17/50] ✓ inception
[18/50] ✓ frozen
[19/50] ✓ star wars
[20/50] ✓ the lord of the rings: the fellowship of the ring
[21/50] ✓ raiders of the lost ark
[22/50] ✓ the matrix
[23/50] ✓ avengers: endgame
[24/50] ✓ interstellar
[25/50] ✓ harry potter and the philosopher's stone
[26/50] ✓ the shining
[27/50] ✓ blade runner 2049
[28/50] ✓ arrival
[29/50] ✓ saw
[30/50] ✓ hereditary
[31/50] ✓ terrifier 3
[32/50] ✓ se7en
[33/50] ✓ insidious
[34/50] ✓ parasite
[35/50] ✓ titanic
[36/50] ✓ past lives
[37/50] ✓ jurassic park
[38/50] ✓ leap year
[39/50] ✓ 50 first dates


In [None]:
# DEBUGGING BAD MATCHES

# Title fragments to look up; matching ignores case
title_snippets = ["jurass", "interstellar", "avengers: endga"]

# For every fragment, print the title and plot-analysis vector text of each matching movie
for snippet in title_snippets:
    # Lowercase the fragment once, then test it against each lowercased title
    needle = snippet.lower()
    matching_movies = [movie for movie in movies if needle in movie.title.lower()]

    if len(matching_movies) == 0:
        print(f"No movies found matching '{snippet}'")
        print()
    else:
        # Announce the count only when the fragment is ambiguous
        if len(matching_movies) >= 2:
            print(f"Found {len(matching_movies)} movies matching '{snippet}':")
            print()

        # Title, then vector text, then a blank separator line per movie
        for movie in matching_movies:
            print(movie.title)
            print(create_plot_analysis_vector_text(movie))
            print()

jurassic park
An industrialist opens a theme park of resurrected animals; when scientific hubris causes containment failure—via sabotage and unforeseen biology—invited experts and children must survive cascading system failures and escape the island, exposing the costs of that scientific hubris.
Scientific hubris causes containment failure
Science fiction thriller, Creature feature
large scale conflict
linear narrative
humbling, protective bonding
cautionary tale
Scientific hubris and consequences, Human vulnerability to nature
Unchecked innovation has costs, Control is fragile','Respect natural limits
dinosaur adventure, jungle adventure, action, adventure, sci-fi, thriller

interstellar
As Earth becomes uninhabitable, a former pilot leads an exploratory mission through a wormhole to find a new home, and the mission's exploration and sacrifices drive both a species-level survival effort and a personal struggle where time dilation and betrayal complicate attempts to save humanity; a fi