In [232]:
import os
from typing import Optional, Dict, Any
import requests
from bs4 import BeautifulSoup
import json
import re
import html as html_lib
from collections import OrderedDict

In [170]:
# IMDb title pages live under https://www.imdb.com/title/{movie_id}/

# Browser-like request headers (the User-Agent / Sec-CH-UA values identify as
# Chrome 139 on macOS). perform_fetch() copies this dict for every request.
IMDB_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    # Safer with requests unless you know you can decode br/zstd:
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "max-age=0",
    "Priority": "u=0, i",
    "Referer": "https://www.google.com/",
    "Sec-CH-UA": '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
    "Sec-CH-UA-Mobile": "?0",
    "Sec-CH-UA-Platform": '"macOS"',
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
}

def perform_fetch(url: str, timeout: float = 30) -> "requests.Response":
    """GET ``url`` with the browser-like ``IMDB_HEADERS`` and return the response.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds passed to ``requests`` as the request timeout
            (defaults to the previously hard-coded 30).

    Returns:
        The ``requests.Response`` object (not a dict); callers read ``.text``
        to get the HTML.

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` for 4xx/5xx replies.
    """
    # Copy so the per-request Cookie header never leaks into the shared constant.
    headers = dict(IMDB_HEADERS)

    # Put your real cookie string in an env var instead of hard-coding it.
    # Example: export IMDB_COOKIE='session-id=...; session-token=...; ...'
    cookie = os.getenv("IMDB_COOKIE")
    if cookie:
        headers["Cookie"] = cookie

    with requests.Session() as session:
        resp = session.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        return resp
        

def fetch_movie_data(movie_id: str) -> "requests.Response":
    """Fetch the main IMDb title page for ``movie_id`` (e.g. ``"tt0245429"``).

    Returns whatever ``perform_fetch`` returns: the HTTP response object,
    whose ``.text`` holds the page HTML.
    """
    return perform_fetch(f"https://www.imdb.com/title/{movie_id}/")

def fetch_summary_synopsis(movie_id: str) -> "requests.Response":
    """Fetch the ``plotsummary`` page of the IMDb title ``movie_id``.

    Returns whatever ``perform_fetch`` returns: the HTTP response object,
    whose ``.text`` holds the page HTML.
    """
    return perform_fetch(f"https://www.imdb.com/title/{movie_id}/plotsummary/")

def fetch_plot_keywords(movie_id: str) -> "requests.Response":
    """Fetch the ``keywords`` page of the IMDb title ``movie_id``.

    Returns whatever ``perform_fetch`` returns: the HTTP response object,
    whose ``.text`` holds the page HTML.
    """
    return perform_fetch(f"https://www.imdb.com/title/{movie_id}/keywords/")

def fetch_parent_guide(movie_id: str) -> "requests.Response":
    """Fetch the ``parentalguide`` page of the IMDb title ``movie_id``.

    Returns whatever ``perform_fetch`` returns: the HTTP response object,
    whose ``.text`` holds the page HTML.
    """
    return perform_fetch(f"https://www.imdb.com/title/{movie_id}/parentalguide/")

def fetch_cast_crew(movie_id: str) -> "requests.Response":
    """Fetch the ``fullcredits`` page of the IMDb title ``movie_id``.

    Returns whatever ``perform_fetch`` returns: the HTTP response object,
    whose ``.text`` holds the page HTML.
    """
    return perform_fetch(f"https://www.imdb.com/title/{movie_id}/fullcredits/")


In [None]:
item = fetch_movie_data("tt0245429")

# Write as UTF-8 explicitly: without `encoding`, open() uses the locale's
# default, which may be unable to encode every character in the page.
with open("sample_imdb_response.html", "w", encoding="utf-8") as f:
    f.write(item.text)

In [83]:
synopsis = fetch_summary_synopsis("tt0245429")

# Write as UTF-8 explicitly: without `encoding`, open() uses the locale's
# default, which may be unable to encode every character in the page.
with open("sample_synopsis.html", "w", encoding="utf-8") as f:
    f.write(synopsis.text)

In [123]:
plot_keywords = fetch_plot_keywords("tt0245429")

# Write as UTF-8 explicitly: without `encoding`, open() uses the locale's
# default, which may be unable to encode every character in the page.
with open("sample_plot_keywords.html", "w", encoding="utf-8") as f:
    f.write(plot_keywords.text)

In [142]:
parent_guide = fetch_parent_guide("tt0245429")

# Write as UTF-8 explicitly: without `encoding`, open() uses the locale's
# default, which may be unable to encode every character in the page.
with open("sample_parent_guide.html", "w", encoding="utf-8") as f:
    f.write(parent_guide.text)

In [171]:
cast_crew = fetch_cast_crew("tt0245429")

# Write as UTF-8 explicitly: without `encoding`, open() uses the locale's
# default, which may be unable to encode every character in the page.
with open("sample_cast_crew.html", "w", encoding="utf-8") as f:
    f.write(cast_crew.text)

In [None]:
def _safe_get(obj, path, default=None):
    cur = obj
    for key in path:
        if isinstance(cur, dict) and key in cur:
            cur = cur[key]
        else:
            return default
    return cur


def _parse_next_data(soup: BeautifulSoup) -> dict:
    script = soup.find("script", id="__NEXT_DATA__", type="application/json")
    if not script or not script.string:
        return {}
    try:
        return json.loads(script.string)
    except json.JSONDecodeError:
        return {}


def _lowercased_texts(items, path):
    """Return the stripped, lower-cased strings found at ``path`` in each item.

    Non-list containers, non-string values and blank strings are skipped, so
    an unexpected payload shape yields an empty list instead of raising.
    """
    if not isinstance(items, list):
        return []
    texts = []
    for item in items:
        value = _safe_get(item, path)
        if isinstance(value, str) and value.strip():
            texts.append(value.strip().lower())
    return texts


def extract_imdb_attributes(html_text: str) -> dict:
    """Extract title attributes from the ``__NEXT_DATA__`` JSON of a title page.

    Args:
        html_text: HTML of an IMDb title page.

    Returns:
        A dict with the keys ``original_title``, ``maturity_rating``,
        ``overview``, ``keywords``, ``popularity``, ``imdb_rating``,
        ``metacritic_rating``, ``user_review_summary``, ``genres``,
        ``countries_of_origin``, ``production_companies``,
        ``filming_locations``, ``languages`` and ``budget``. Scalar fields are
        ``None`` and list fields are empty when the data is not present.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    nd = _parse_next_data(soup)

    page_props = _safe_get(nd, ["props", "pageProps"], {}) or {}
    # _safe_get (rather than .get) tolerates a non-dict pageProps.
    atf = _safe_get(page_props, ["aboveTheFoldData"], {}) or {}
    mcd = _safe_get(page_props, ["mainColumnData"], {}) or {}

    # Core fields (pulled from __NEXT_DATA__)
    original_title = _safe_get(mcd, ["originalTitleText", "text"])
    maturity_rating = _safe_get(atf, ["certificate", "rating"])  # e.g. "PG"
    overview = _safe_get(atf, ["plot", "plotText", "plainText"])
    popularity = _safe_get(atf, ["meterRanking", "currentRank"])  # e.g. 418
    imdb_rating = _safe_get(atf, ["ratingsSummary", "aggregateRating"])  # e.g. 8.6
    metacritic_rating = _safe_get(atf, ["metacritic", "metascore", "score"])  # e.g. 96

    # Interests -> "keywords" list (Japanese, Anime, Coming-of-Age, ...).
    # Kept in their original casing, unlike the lower-cased lists below.
    interest_edges = _safe_get(atf, ["interests", "edges"], []) or []
    keywords = []
    for edge in interest_edges:
        label = _safe_get(edge, ["node", "primaryText", "text"])
        if label:
            keywords.append(label)

    # Review summary arrives as HTML-escaped HTML; unescape, then strip the tags.
    review_html = _safe_get(mcd, ["reviewSummary", "overall", "medium", "value", "plaidHtml"])
    user_review_summary = None
    if isinstance(review_html, str) and review_html.strip():
        unescaped = html_lib.unescape(review_html)
        user_review_summary = BeautifulSoup(unescaped, "html.parser").get_text(" ", strip=True)

    genres = _lowercased_texts(_safe_get(atf, ["genres", "genres"]), ["text"])
    production_companies = _lowercased_texts(
        _safe_get(atf, ["production", "edges"]),
        ["node", "company", "companyText", "text"],
    )
    countries_of_origin = _lowercased_texts(
        _safe_get(mcd, ["countriesDetails", "countries"]), ["text"]
    )
    filming_locations = _lowercased_texts(
        _safe_get(mcd, ["filmingLocations", "edges"]), ["node", "text"]
    )
    languages = _lowercased_texts(
        _safe_get(mcd, ["spokenLanguages", "spokenLanguages"]), ["text"]
    )

    # Budget comes from the structured JSON only; no DOM fallback is implemented.
    budget = _safe_get(mcd, ["productionBudget", "budget", "amount"])

    return {
        "original_title": original_title,  # e.g. "Sen to Chihiro no kamikakushi"
        "maturity_rating": maturity_rating,  # e.g. "PG"
        "overview": overview,
        "keywords": keywords,  # interest-based keywords list
        "popularity": popularity,
        "imdb_rating": imdb_rating,
        "metacritic_rating": metacritic_rating,
        "user_review_summary": user_review_summary,
        "genres": genres,
        "countries_of_origin": countries_of_origin,
        "production_companies": production_companies,
        "filming_locations": filming_locations,
        "languages": languages,
        "budget": budget,
    }

In [120]:
def _plot_edges_to_text(edges):
    """Convert each edge's ``node.plotText.plaidHtml`` to plain text.

    ``edges`` may be ``None`` or any non-list value (treated as empty).
    Entries whose HTML is missing, not a string, or blank are skipped.
    """
    if not isinstance(edges, list):
        return []
    texts = []
    for edge in edges:
        plot_html = _safe_get(edge, ["node", "plotText", "plaidHtml"])
        if isinstance(plot_html, str) and plot_html.strip():
            # The payload is HTML-escaped HTML: unescape first, then strip tags.
            unescaped = html_lib.unescape(plot_html)
            texts.append(BeautifulSoup(unescaped, "html.parser").get_text(" ", strip=True))
    return texts


def extract_summary_attributes(html_text: str) -> dict:
    """Extract plot summaries and synopsis text from a ``plotsummary`` page.

    Args:
        html_text: HTML of an IMDb plot-summary page.

    Returns:
        ``{"plot_summaries": [...], "synopsis": [...]}`` with one plain-text
        string per entry. Either list is empty when the corresponding section
        is absent (previously a missing synopsis raised ``TypeError``).
    """
    soup = BeautifulSoup(html_text, "html.parser")
    nd = _parse_next_data(soup)

    page_props = _safe_get(nd, ["props", "pageProps"], {}) or {}
    data = _safe_get(page_props, ["contentData", "data", "title"], {}) or {}

    return {
        "plot_summaries": _plot_edges_to_text(_safe_get(data, ["plotSummaries", "edges"])),
        "synopsis": _plot_edges_to_text(_safe_get(data, ["plotSynopsis", "edges"])),
    }

# Run the extractor on the plot-summary response stored in `synopsis` by the fetch cell.
extract_summary_attributes(synopsis.text)

{'plot_summaries': ["During her family's move to the suburbs, a sullen 10-year-old girl wanders into a world ruled by gods, witches and spirits, and where humans are changed into beasts.",
  'The fanciful adventures of a ten-year-old girl named Chihiro, who discovers a secret world when she and her family get lost and venture through a hillside tunnel. When her parents undergo a mysterious transformation, Chihiro must fend for herself as she encounters strange spirits, assorted creatures and a grumpy sorceress who seeks to prevent her from returning to the human world.',
  'A young girl named Chihiro becomes trapped in a mysterious place after her parents accidentally enter a seemingly abandoned amusement park. As Chihiro navigates this strange realm, she embarks on a journey to find a way back to her own world. To do so, she must adapt and overcome challenges while displaying resilience and determination. Throughout her adventure, Chihiro encounters various spirits, some friendly and 

In [None]:
def extract_plot_keywords(html_text: str) -> list:
    """Extract the plot keywords from an IMDb ``keywords`` page.

    Args:
        html_text: HTML of an IMDb keywords page.

    Returns:
        A list of keyword strings in page order (a list, not a dict as the
        old annotation claimed). Edges without keyword text are skipped
        rather than producing ``None`` entries.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    nd = _parse_next_data(soup)

    page_props = _safe_get(nd, ["props", "pageProps"], {}) or {}
    edges = _safe_get(page_props, ["contentData", "data", "title", 'keywords', 'edges'], []) or []

    keywords = []
    for edge in edges:
        text = _safe_get(edge, ["node", 'keyword', 'text', 'text'])
        if text:
            keywords.append(text)
    return keywords

['studio ghibli',
 'turned into a pig',
 'parent child relationship',
 'spirit world',
 'child protagonist',
 'falling from height',
 'real name',
 'human becoming an animal',
 'parallel world',
 'magic',
 'flying',
 'young love',
 'contract',
 'personal growth',
 'witch',
 'greed',
 'river god',
 'rescue',
 'gluttony',
 '2d animation',
 'spirit',
 'bathhouse',
 'girl',
 'asian dragon',
 'female protagonist',
 'magical realism',
 'little girl',
 'dragon',
 'ghost',
 'sister sister relationship',
 'friendship',
 'rivalry',
 'tears',
 'tunnel',
 'holding breath',
 'asking for a job',
 'overcoming fear',
 'isekai',
 'surrealism',
 'pig',
 'work',
 '10 year old',
 'car',
 'transformation',
 'crying',
 'diligence',
 '2000s',
 'husband wife relationship',
 'ghost town',
 'cult film']

In [None]:
def extract_parental_guide(html_text: str) -> dict:
    """Pull rating reasons and parents-guide severities from a ``parentalguide`` page.

    Returns a dict with two keys: ``ratingReasons`` (the truthy
    ``node.ratingReason`` values) and ``parentsGuide`` (one
    ``{'category', 'severity'}`` dict per guide category).
    """
    next_data = _parse_next_data(BeautifulSoup(html_text, "html.parser"))

    props = _safe_get(next_data, ["props", "pageProps"], {}) or {}
    title_data = _safe_get(props, ["contentData", "data", "title"], {}) or {}

    # Keep only edges that actually carry a rating reason.
    rating_reasons = []
    for edge in _safe_get(title_data, ["ratingReason", "edges"], []) or []:
        reason = _safe_get(edge, ['node', 'ratingReason'])
        if reason:
            rating_reasons.append(reason)

    categories = _safe_get(title_data, ["parentsGuide", "categories"], []) or []
    guide_entries = [
        {
            'category': _safe_get(entry, ['category', 'text']),
            'severity': _safe_get(entry, ['severity', 'text']),
        }
        for entry in categories
    ]

    return {"ratingReasons": rating_reasons, "parentsGuide": guide_entries}


{'ratingReasons': ['Rated PG for some scary moments'],
 'parentsGuide': [{'category': 'Sex & Nudity', 'severity': 'None'},
  {'category': 'Violence & Gore', 'severity': 'Mild'},
  {'category': 'Profanity', 'severity': 'None'},
  {'category': 'Alcohol, Drugs & Smoking', 'severity': 'Mild'},
  {'category': 'Frightening & Intense Scenes', 'severity': 'Moderate'}]}

In [281]:
def extract_cast_crew(html_text: str) -> dict:
    """Extract directors, writers, producers, composers and cast from a ``fullcredits`` page.

    Args:
        html_text: HTML of an IMDb full-credits page.

    Returns:
        A dict with ``directors``, ``writers``, ``producers`` and
        ``composers`` as sets of names, plus ``cast`` (names) and
        ``characters`` as lists in page order. A missing ``rowTitle`` is
        recorded as ``""``.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    nd = _parse_next_data(soup)

    page_props = _safe_get(nd, ["props", "pageProps"], {}) or {}
    groupings = _safe_get(page_props, ["contentData", "categories"], []) or []

    # Items of each credit group, keyed by the group's name on the page.
    # A later group with the same name replaces an earlier one.
    items_by_group = {}
    cast_split_index = -1

    for grouping in groupings:
        group_name = _safe_get(grouping, ['name'])
        if not isinstance(group_name, str):
            continue
        items = _safe_get(grouping, ['section', 'items'])
        # Anything other than a list cannot be sliced or iterated meaningfully.
        items_by_group[group_name] = items if isinstance(items, list) else []
        if group_name == 'Cast':
            split = _safe_get(grouping, ['section', 'splitIndex'])
            # Compare against the type, not truthiness: `split or -1` would
            # turn a legitimate index of 0 into "no split".
            cast_split_index = split if isinstance(split, int) and split >= 0 else -1

    def _names(group_name):
        """Row titles of one credit group ('' where the title is missing)."""
        return [_safe_get(row, ['rowTitle']) or "" for row in items_by_group.get(group_name, [])]

    directors = set(_names('Director'))
    writers = set(_names('Writers'))
    producers = set(_names('Producers'))
    composers = set(_names('Composer'))

    # Cast is a bit more complicated: when a split index is present, only the
    # entries up to and including it are kept.
    # NOTE(review): treats splitIndex as an inclusive index, as the slice always did - confirm.
    cast_items = items_by_group.get('Cast', [])
    if cast_split_index > -1:
        cast_items = cast_items[:(cast_split_index + 1)]

    cast = []
    characters = []
    for actor in cast_items:
        cast.append(_safe_get(actor, ['rowTitle']) or "")
        roles = _safe_get(actor, ['characters'])
        # `characters` can be absent or null for an actor; skip instead of raising.
        if isinstance(roles, list):
            characters.extend(roles)

    return {
        "directors": directors,
        "writers": writers,
        "cast": cast,
        "characters": characters,
        "producers": producers,
        "composers": composers
    }

In [282]:
# Run the extractor on the full-credits response stored in `cast_crew` by the fetch cell.
extract_cast_crew(cast_crew.text)

{'directors': {'Hayao Miyazaki'},
 'writers': {'Hayao Miyazaki'},
 'cast': ['Rumi Hiiragi',
  'Miyu Irino',
  'Mari Natsuki',
  'Takashi Naitô',
  'Yasuko Sawaguchi',
  'Tatsuya Gashûin',
  'Ryûnosuke Kamiki',
  'Yumi Tamai',
  'Yô Ôizumi',
  'Koba Hayashi',
  'Tsunehiko Kamijô',
  'Takehiko Ono',
  'Bunta Sugawara',
  'Shigeru Wakita',
  'Shirô Saitô',
  'Michiko Yamamoto',
  'Keiko Tsukamoto',
  'Akio Nakamura',
  'Shinji Tokumaru',
  'Kaori Yamagata',
  'Yayoi Kazuki',
  'Masahiro Asano',
  'Kazutaka Hayashida',
  'Ikuko Yamamoto',
  'Mina Meguro',
  'Tetsurô Ishibashi',
  'Katsutomo Shîbara',
  'Shinobu Katabuchi',
  'Noriko Kitou',
  'Naoto Kaji',
  'Yoshitaka Sukegawa',
  'Aki Tachikawa',
  'Noriko Yamaya',
  'Katsuhisa Matsuo',
  'Masayuki Kizu',
  'Yôko Ôno',
  'Sachie Azuma',
  'Ken Yasuda',
  'Shigeyuki Totsugi',
  'Mayumi Sako',
  'Sonoko Soeda',
  'Akiko Tomihira',
  'Minako Masuda',
  'Orika Ono',
  'Rina Yamada',
  'Miwa Takachi',
  'Hiromi Takeuchi',
  'Makiko Oku'],
 'c