# Postgres Methods & Experimentation

In [1]:
import os
import sys
import psycopg2
import json
# import redis

# from qdrant_client import QdrantClient
from pathlib import Path
from typing import Optional, Sequence

# Add the project root to sys.path so the implementation package can be imported
# Notebooks are in implementation/notebooks/, so we go up two levels to project root
sys.path.insert(0, str(Path().resolve().parent.parent))

from implementation.classes.movie import BaseMovie
from implementation.misc.helpers import normalize_string, create_watch_provider_offering_int
from implementation.classes.enums import Genre, MaturityRating, WatchProviderType

# PostgreSQL
# Connection settings are read from environment variables; os.environ.get
# yields None for any variable that is not set.
pg = psycopg2.connect(
    **{
        option: os.environ.get(env_var)
        for option, env_var in (
            ("host", "POSTGRES_HOST"),
            ("dbname", "POSTGRES_DB"),
            ("user", "POSTGRES_USER"),
            ("password", "POSTGRES_PASSWORD"),
        )
    }
)

# # Redis
# r = redis.Redis(host="localhost", port=6379, decode_responses=True)

# # Qdrant
# qdrant = QdrantClient(host="localhost", port=6333)

In [2]:
# LOAD MOVIES

# The saved records are read from a path relative to the working directory
# (two levels up from it).
json_path = Path("../../saved_imdb_movies.json")
with json_path.open(mode="r", encoding="utf-8") as f:
    movies_data = json.load(f)

# Build one BaseMovie per saved record, passing the record's keys as keyword arguments.
movies = [BaseMovie(**record) for record in movies_data]

In [3]:
# DATABASE METHODS

def _execute_write(query: str, params: tuple, fetch_one: bool = False):
    """Execute a single write statement, commit it, and optionally return one row.

    Args:
        query: SQL statement with ``%s`` placeholders.
        params: Values bound to the placeholders by the driver.
        fetch_one: When True, fetch one row from the cursor (for example the
            result of a ``RETURNING`` clause) before committing.

    Returns:
        The row from ``cur.fetchone()`` when ``fetch_one`` is True, otherwise None.

    Raises:
        Any exception raised while executing, fetching, or committing. The
        open transaction is rolled back before the exception is re-raised.
    """
    try:
        # Context-managed cursor so it is closed even when execute() fails.
        with pg.cursor() as cur:
            cur.execute(query, params)
            row = cur.fetchone() if fetch_one else None
        # Persist the write once the cursor work has finished.
        pg.commit()
    except Exception:
        # Without a rollback the shared connection stays inside the failed
        # transaction and rejects every later statement issued through it.
        pg.rollback()
        raise
    return row


def upsert_movie_card(
    movie_id: int,
    title: str,
    year: Optional[int],
    poster_url: Optional[str],
    release_ts: Optional[int],
    runtime_minutes: Optional[int],
    maturity_rank: Optional[int],
    genre_ids: Sequence[int],
    watch_offer_keys: Sequence[int],
    audio_language_ids: Sequence[int],
    reception_score: Optional[float],
    title_token_count: int,
) -> None:
    """Write one movie's canonical metadata to public.movie_card.

    A new row is inserted for ``movie_id``. When a row with that key already
    exists, every metadata column is overwritten with the supplied values and
    ``updated_at`` is refreshed; ``created_at`` is not part of the update.

    The three ID sequences are copied into plain lists before being bound as
    statement parameters.
    """
    # Positional values; the order must match the column list of the INSERT.
    row_values = (
        movie_id,
        title,
        year,
        poster_url,
        release_ts,
        runtime_minutes,
        maturity_rank,
        list(genre_ids),
        list(watch_offer_keys),
        list(audio_language_ids),
        reception_score,
        title_token_count,
    )
    statement = """
    INSERT INTO public.movie_card (
        movie_id, title, year, poster_url, release_ts, runtime_minutes,
        maturity_rank, genre_ids, watch_offer_keys, audio_language_ids,
        reception_score, title_token_count, created_at, updated_at
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now(), now())
    ON CONFLICT (movie_id) DO UPDATE SET
        title = EXCLUDED.title,
        year = EXCLUDED.year,
        poster_url = EXCLUDED.poster_url,
        release_ts = EXCLUDED.release_ts,
        runtime_minutes = EXCLUDED.runtime_minutes,
        maturity_rank = EXCLUDED.maturity_rank,
        genre_ids = EXCLUDED.genre_ids,
        watch_offer_keys = EXCLUDED.watch_offer_keys,
        audio_language_ids = EXCLUDED.audio_language_ids,
        reception_score = EXCLUDED.reception_score,
        title_token_count = EXCLUDED.title_token_count,
        updated_at = now();
    """
    _execute_write(statement, row_values)


def upsert_lexical_dictionary(norm_str: str) -> int:
    """Insert or touch ``norm_str`` in lex.lexical_dictionary and return its string_id.

    A new string gets both ``touched_at`` and ``created_at`` set to the current
    time; an existing one only has ``touched_at`` refreshed. In both cases the
    statement returns the row's ``string_id``.
    """
    statement = """
    INSERT INTO lex.lexical_dictionary (norm_str, touched_at, created_at)
    VALUES (%s, now(), now())
    ON CONFLICT (norm_str) DO UPDATE SET
        touched_at = now()
    RETURNING string_id;
    """
    returned_row = _execute_write(statement, (norm_str,), fetch_one=True)
    # The RETURNING clause yields a single column: the string_id.
    string_id = returned_row[0]
    return string_id


def upsert_title_token_string(string_id: int, norm_str: str) -> None:
    """Store a title token in lex.title_token_strings, keyed by ``string_id``.

    If a row for ``string_id`` already exists, its ``norm_str`` is replaced
    with the supplied value.
    """
    token_row = (string_id, norm_str)
    statement = """
    INSERT INTO lex.title_token_strings (string_id, norm_str)
    VALUES (%s, %s)
    ON CONFLICT (string_id) DO UPDATE SET
        norm_str = EXCLUDED.norm_str;
    """
    _execute_write(statement, token_row)


def insert_title_token_posting(term_id: int, movie_id: int) -> None:
    """Record that title token ``term_id`` occurs in movie ``movie_id``.

    Adds the pair to lex.inv_title_token_postings; a pair that is already
    present is left untouched.
    """
    posting = (term_id, movie_id)
    statement = """
    INSERT INTO lex.inv_title_token_postings (term_id, movie_id)
    VALUES (%s, %s)
    ON CONFLICT (term_id, movie_id) DO NOTHING;
    """
    _execute_write(statement, posting)


def insert_person_posting(term_id: int, movie_id: int) -> None:
    """Record that person term ``term_id`` is attached to movie ``movie_id``.

    Adds the pair to lex.inv_person_postings; a pair that is already present
    is left untouched.
    """
    posting = (term_id, movie_id)
    statement = """
    INSERT INTO lex.inv_person_postings (term_id, movie_id)
    VALUES (%s, %s)
    ON CONFLICT (term_id, movie_id) DO NOTHING;
    """
    _execute_write(statement, posting)


def insert_character_posting(term_id: int, movie_id: int) -> None:
    """Record that character term ``term_id`` is attached to movie ``movie_id``.

    Adds the pair to lex.inv_character_postings; a pair that is already
    present is left untouched.
    """
    posting = (term_id, movie_id)
    statement = """
    INSERT INTO lex.inv_character_postings (term_id, movie_id)
    VALUES (%s, %s)
    ON CONFLICT (term_id, movie_id) DO NOTHING;
    """
    _execute_write(statement, posting)


def insert_studio_posting(term_id: int, movie_id: int) -> None:
    """Record that studio term ``term_id`` is attached to movie ``movie_id``.

    Adds the pair to lex.inv_studio_postings; a pair that is already present
    is left untouched.
    """
    posting = (term_id, movie_id)
    statement = """
    INSERT INTO lex.inv_studio_postings (term_id, movie_id)
    VALUES (%s, %s)
    ON CONFLICT (term_id, movie_id) DO NOTHING;
    """
    _execute_write(statement, posting)


def upsert_genre_dictionary(genre_id: int, name: str) -> None:
    """Store the display name for ``genre_id`` in lex.genre_dictionary.

    An existing row for the same ``genre_id`` has its ``name`` replaced.
    """
    lookup_row = (genre_id, name)
    statement = """
    INSERT INTO lex.genre_dictionary (genre_id, name)
    VALUES (%s, %s)
    ON CONFLICT (genre_id) DO UPDATE SET
        name = EXCLUDED.name;
    """
    _execute_write(statement, lookup_row)


def upsert_provider_dictionary(provider_id: int, name: str) -> None:
    """Store the display name for ``provider_id`` in lex.provider_dictionary.

    An existing row for the same ``provider_id`` has its ``name`` replaced.
    """
    lookup_row = (provider_id, name)
    statement = """
    INSERT INTO lex.provider_dictionary (provider_id, name)
    VALUES (%s, %s)
    ON CONFLICT (provider_id) DO UPDATE SET
        name = EXCLUDED.name;
    """
    _execute_write(statement, lookup_row)


def upsert_watch_method_dictionary(method_id: int, name: str) -> None:
    """Store the display name for ``method_id`` in lex.watch_method_dictionary.

    An existing row for the same ``method_id`` has its ``name`` replaced.
    """
    lookup_row = (method_id, name)
    statement = """
    INSERT INTO lex.watch_method_dictionary (method_id, name)
    VALUES (%s, %s)
    ON CONFLICT (method_id) DO UPDATE SET
        name = EXCLUDED.name;
    """
    _execute_write(statement, lookup_row)


def upsert_maturity_dictionary(maturity_rank: int, label: str) -> None:
    """Store the label for ``maturity_rank`` in lex.maturity_dictionary.

    An existing row for the same ``maturity_rank`` has its ``label`` replaced.
    """
    lookup_row = (maturity_rank, label)
    statement = """
    INSERT INTO lex.maturity_dictionary (maturity_rank, label)
    VALUES (%s, %s)
    ON CONFLICT (maturity_rank) DO UPDATE SET
        label = EXCLUDED.label;
    """
    _execute_write(statement, lookup_row)

def upsert_language_dictionary(language_id: int, name: str) -> None:
    """Store the display name for ``language_id`` in lex.language_dictionary.

    An existing row for the same ``language_id`` has its ``name`` replaced.
    """
    lookup_row = (language_id, name)
    statement = """
    INSERT INTO lex.language_dictionary (language_id, name)
    VALUES (%s, %s)
    ON CONFLICT (language_id) DO UPDATE SET
        name = EXCLUDED.name;
    """
    _execute_write(statement, lookup_row)

In [None]:
# Ingest a single movie object

def ingest_movie(movie: BaseMovie) -> None:
    """Ingest one BaseMovie into movie_card plus all lexical posting tables.

    Phase 1 validates the required fields, resolves genre, language and
    watch-provider IDs through the shared lexical dictionary, and upserts the
    movie_card row. Phase 2 writes postings for title tokens, people,
    characters and studios.

    Every database helper called here goes through ``_execute_write``, which
    commits per statement, so a failure part-way through leaves the earlier
    writes in place.

    Args:
        movie: The movie to ingest. ``tmdb_id``, ``title``, ``release_date``
            (a string) and ``duration`` (an int) are required; list-valued
            attributes that are missing or not lists are treated as empty.

    Raises:
        ValueError: If the ID or title is missing or empty, the release date
            is not a string or is a non-empty string that does not parse as
            ``%Y-%m-%d``, or the duration is not an int.
    """
    from datetime import datetime, timezone

    def list_attr(obj: object, attr_name: str) -> list:
        """Return ``obj.<attr_name>`` when it is a list, otherwise an empty list."""
        value = getattr(obj, attr_name, [])
        return value if isinstance(value, list) else []

    def upsert_phrase_term(value: str) -> int | None:
        """Normalize a phrase and upsert it into lexical_dictionary."""
        normalized = normalize_string(value)
        if not normalized:
            return None
        return upsert_lexical_dictionary(normalized)

    # ================================
    # PHASE 1: CANONICAL INGESTION
    # ================================

    # Validate before converting: int(None) would raise TypeError and
    # str(None) would yield the literal "None", hiding the missing value.
    raw_movie_id = getattr(movie, "tmdb_id", None)
    if not raw_movie_id:
        raise ValueError("Movie ingestion failed: ID is required but not found.")
    movie_id = int(raw_movie_id)

    raw_title = getattr(movie, "title", None)
    if not raw_title:
        raise ValueError("Movie ingestion failed: Title is required but not found.")
    title = str(raw_title)

    release_date = getattr(movie, "release_date", None)
    if not isinstance(release_date, str):
        raise ValueError("Movie ingestion failed: Release date is required but not found.")

    # An empty release_date string is accepted and leaves both fields as None.
    year: Optional[int] = None
    release_ts: Optional[int] = None
    if release_date:
        try:
            parsed_release = datetime.strptime(release_date, "%Y-%m-%d").replace(tzinfo=timezone.utc)
            year = parsed_release.year
            release_ts = int(parsed_release.timestamp())
        except ValueError as exc:
            # Chain the original error so the offending value stays visible in the traceback.
            raise ValueError(
                "Movie ingestion failed: Release date is required but an error occurred during parsing."
            ) from exc

    # Falls back to None when the movie object has no poster_url attribute.
    poster_url: Optional[str] = getattr(movie, "poster_url", None)

    runtime_minutes = getattr(movie, "duration", None)
    if not isinstance(runtime_minutes, int):
        raise ValueError("Movie ingestion failed: Duration is required but not found.")

    maturity_rating, maturity_rank = movie.maturity_rating_and_rank()
    # Keep lookup table aligned with encoded maturity ranks.
    upsert_maturity_dictionary(maturity_rank, maturity_rating)

    genre_ids: list[int] = []
    for genre_name in list_attr(movie, "genres"):
        normalized_genre = normalize_string(str(genre_name))
        if not normalized_genre:
            continue
        # Derive genre ID from the shared lexical dictionary.
        genre_id = upsert_lexical_dictionary(normalized_genre)
        genre_ids.append(genre_id)
        # Keep lookup table populated for debugging/admin views.
        upsert_genre_dictionary(genre_id, str(genre_name))

    audio_language_ids: list[int] = []
    for language in list_attr(movie, "languages"):
        normalized_language = normalize_string(str(language))
        if not normalized_language:
            continue
        # Derive language ID from the shared lexical dictionary.
        language_id = upsert_lexical_dictionary(normalized_language)
        audio_language_ids.append(language_id)
        # Keep lookup table populated for debugging/admin views.
        upsert_language_dictionary(language_id, str(language))

    # A set, so the same (provider, method) key is stored only once.
    watch_offer_key_set: set[int] = set()
    for provider in list_attr(movie, "watch_providers"):
        provider_name = str(getattr(provider, "name", "") or "")
        normalized_provider = normalize_string(provider_name)
        if not normalized_provider:
            continue

        # Derive provider ID from the shared lexical dictionary.
        provider_id = upsert_lexical_dictionary(normalized_provider)
        # Keep lookup table populated for debugging/admin views.
        upsert_provider_dictionary(provider_id, provider_name)

        for provider_type in list_attr(provider, "types"):
            method_id = int(provider_type)

            # Skip values that do not map to a WatchProviderType member.
            try:
                watch_provider_type = WatchProviderType(method_id)
            except ValueError:
                continue

            upsert_watch_method_dictionary(method_id, str(watch_provider_type))

            watch_offer_key = create_watch_provider_offering_int(provider_id, method_id)
            watch_offer_key_set.add(watch_offer_key)

    watch_offer_keys = sorted(watch_offer_key_set)

    reception_score = movie.reception_score()

    title_tokens = movie.normalized_title_tokens()
    title_token_count = len(title_tokens)

    # Upsert canonical movie-card metadata first so other systems can reference it.
    upsert_movie_card(
        movie_id=movie_id,
        title=title,
        year=year,
        poster_url=poster_url,
        release_ts=release_ts,
        runtime_minutes=runtime_minutes,
        maturity_rank=maturity_rank,
        genre_ids=genre_ids,
        watch_offer_keys=watch_offer_keys,
        audio_language_ids=audio_language_ids,
        reception_score=reception_score,
        title_token_count=title_token_count,
    )

    # ================================
    # PHASE 2: LEXICAL INGESTION
    # ================================

    # Title token ingestion: dictionary + title_token_strings + postings.
    for token in title_tokens:
        term_id = upsert_lexical_dictionary(token)
        upsert_title_token_string(term_id, token)
        insert_title_token_posting(term_id, movie_id)

    # Person phrase ingestion across actors/directors/writers/composers/producers.
    # Names are collected into a set so a person with several roles is posted once.
    people_phrases: set[str] = set()
    for role_attr in ("actors", "directors", "writers", "composers", "producers"):
        for name in list_attr(movie, role_attr):
            normalized_name = normalize_string(str(name))
            if normalized_name:
                people_phrases.add(normalized_name)

    for person_phrase in people_phrases:
        term_id = upsert_lexical_dictionary(person_phrase)
        insert_person_posting(term_id, movie_id)

    # Character phrase ingestion.
    for character in list_attr(movie, "characters"):
        term_id = upsert_phrase_term(str(character))
        if term_id is not None:
            insert_character_posting(term_id, movie_id)

    # Studio phrase ingestion.
    for studio in list_attr(movie, "production_companies"):
        term_id = upsert_phrase_term(str(studio))
        if term_id is not None:
            insert_studio_posting(term_id, movie_id)

In [8]:
# Ingest a single record: the second entry (index 1) of the loaded movies list.
movie_to_ingest = movies[1]

# Show which title is about to be processed.
print(f"Ingesting movie: {movie_to_ingest.title}")

# Runs both ingestion phases (movie_card upsert, then lexical postings) for this movie.
ingest_movie(movie_to_ingest)

Ingesting movie: zootopia
