In [1]:
import json
import sys
from pathlib import Path

# Make the project root importable so that `implementation.*` imports resolve.
# Notebooks are in implementation/notebooks/, so the project root is two levels
# above the working directory (this relies on the kernel's working directory
# being the notebook's own folder).
# The membership check keeps this cell idempotent: re-running it does not stack
# duplicate entries at the front of sys.path.
project_root = str(Path().resolve().parent.parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from implementation.scraping.gather_data import fetch_and_create_imdb_movie

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Update imdb_vote_count for every movie in saved_imdb_movies.json.
#
# For each stored movie the IMDb main page is fetched and parsed again, the
# movie's "imdb_vote_count" is refreshed in memory, and the whole list is then
# written back to the same JSON file. Per-movie failures are collected in
# `failed_updates` instead of aborting the run.
from implementation.scraping.gather_data import fetch_main_page
from implementation.scraping.extract_website_data import extract_imdb_attributes

# The working directory depends on how the notebook was launched, so probe a few
# relative locations and use the first one that exists.
json_candidates = [
    Path("saved_imdb_movies.json"),
    Path("../saved_imdb_movies.json"),
    Path("../../saved_imdb_movies.json"),
]
json_path = next((path for path in json_candidates if path.exists()), None)
if json_path is None:
    raise FileNotFoundError("Could not find saved_imdb_movies.json from this notebook location")

with open(json_path, "r", encoding="utf-8") as f:
    movies = json.load(f)

updated_count = 0
failed_updates = []

for movie in movies:
    imdb_id = movie.get("id")
    if not imdb_id:
        failed_updates.append({"tmdb_id": movie.get("tmdb_id"), "error": "Missing IMDb id"})
        continue

    try:
        response = fetch_main_page(imdb_id)
        imdb_data = extract_imdb_attributes(response.text)
        vote_count = imdb_data.get("imdb_vote_count")
    except Exception as exc:
        # Best effort: record the failure and keep going with the next movie.
        failed_updates.append({"imdb_id": imdb_id, "error": str(exc)})
        continue

    if vote_count is None:
        # The scrape did not yield a vote count. Keep whatever value is already
        # stored (falling back to 0 only if the movie has none) rather than
        # clobbering a good count with 0 and reporting it as updated.
        movie.setdefault("imdb_vote_count", 0)
        failed_updates.append({"imdb_id": imdb_id, "error": "imdb_vote_count missing from scraped data"})
        continue

    movie["imdb_vote_count"] = vote_count
    updated_count += 1

# Write to a sibling temp file first and then swap it into place, so a failure
# while serializing cannot leave saved_imdb_movies.json truncated.
tmp_path = json_path.with_name(json_path.name + ".tmp")
with open(tmp_path, "w", encoding="utf-8") as f:
    json.dump(movies, f, indent=2, ensure_ascii=False)
tmp_path.replace(json_path)

print(f"Updated imdb_vote_count for {updated_count}/{len(movies)} movies")
if failed_updates:
    print(f"Failed updates: {len(failed_updates)}")
    # Only show a sample so a large failure list does not flood the cell output.
    print(failed_updates[:5])


Updated imdb_vote_count for 50/50 movies


In [None]:
# Fetch all movies from the list of TMDB IDs and upsert them into
# saved_imdb_movies.json.
# NOTE(review): `movies_to_upsert_to_db` is not defined in any cell visible in
# this notebook section — it must exist in the kernel before this cell runs, so
# confirm it is defined for a clean Restart & Run All.
fetched_movies = [fetch_and_create_imdb_movie(movie_id) for movie_id in movies_to_upsert_to_db]

# Load existing movies from saved_imdb_movies.json
json_path = Path("../saved_imdb_movies.json")
with open(json_path, "r", encoding="utf-8") as f:
    existing_movies = json.load(f)

# Create a dictionary keyed by tmdb_id for efficient lookup and updates.
# This allows us to quickly find and replace movies with the same TMDB ID.
movies_dict = {movie["tmdb_id"]: movie for movie in existing_movies}

# Upsert fetched movies: if a movie with the same TMDB ID exists, override it;
# otherwise add it. Only results that actually succeeded are counted.
upserted_count = 0
for result in fetched_movies:
    if result["success"] and result["imdb_movie"]:
        # Convert the movie object to a plain dictionary for JSON serialization
        # (presumably a pydantic model, given `model_dump(mode='json')`).
        movie_dict = result["imdb_movie"].model_dump(mode='json')
        # Upsert: update existing entry or add new one
        movies_dict[movie_dict["tmdb_id"]] = movie_dict
        upserted_count += 1
    else:
        print(f"Failed to fetch movie: {result.get('failure_reason', 'Unknown error')}")

# Convert back to list and save to the JSON file. Write to a sibling temp file
# first and then swap it into place, so a failure while serializing cannot leave
# saved_imdb_movies.json truncated.
updated_movies_list = list(movies_dict.values())
tmp_path = json_path.with_name(json_path.name + ".tmp")
with open(tmp_path, "w", encoding="utf-8") as f:
    json.dump(updated_movies_list, f, indent=2, ensure_ascii=False)
tmp_path.replace(json_path)

# Report the number of movies actually upserted, not the number of fetch attempts.
print(f"Successfully upserted {upserted_count} movies. Total movies in file: {len(updated_movies_list)}")

Successfully upserted 1 movies. Total movies in file: 50
