In [1]:
import sys
import json
from pathlib import Path
from bs4 import BeautifulSoup

# Add parent directory to path to import from implementation package
# Notebooks are in implementation/notebooks/, so we go up two levels to project root
sys.path.insert(0, str(Path().resolve().parent.parent))

from implementation.gather_data import fetch_featured_reviews, fetch_main_page
from implementation.extract_website_data import extract_featured_reviews
from implementation.movie import IMDBMovie
from implementation.schemas import IMDBReviewTheme

In [3]:
# LOAD MOVIES

# Read the saved movie records from the JSON file two directories up.
json_path = Path("../../saved_imdb_movies.json")
with json_path.open("r", encoding="utf-8") as f:
    movies_data = json.load(f)

# Rebuild one IMDBMovie per saved record, passing the record's fields as keyword arguments.
movies = [IMDBMovie(**fields) for fields in movies_data]

In [6]:
import concurrent.futures

def process_movie(movie):
    """Fetch the featured reviews for ``movie`` and attach them to it.

    Calls ``fetch_featured_reviews`` with ``movie.id``, runs
    ``extract_featured_reviews`` on the response text and stores the result on
    ``movie.featured_reviews``.

    Returns:
        ``(movie, response)`` on success, where ``response`` is the object
        returned by ``fetch_featured_reviews``. If any exception is raised the
        error is printed (not re-raised) and ``(None, None)`` is returned.
    """
    # NOTE(review): the recorded output of this cell shows pydantic
    # "Expected `IMDBFeaturedReview` ... input_type=dict" serialization warnings,
    # so the extracted reviews look like plain dicts - confirm whether they
    # should be validated into model instances before assignment.
    try:
        response = fetch_featured_reviews(movie.id)
        movie.featured_reviews = extract_featured_reviews(response.text)
    except Exception as e:
        print(f"Error processing movie {movie.id}: {e}")
        return None, None
    return movie, response

fixed_movies = []
failed_movies = []
# Holds the response of the last successful fetch so that subsequent cells
# can inspect it; stays None only if every fetch failed.
scraped_reviews = None

# Use ThreadPoolExecutor to fetch in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # executor.map yields results in the order of the input iterable, so zipping
    # with `movies` pairs every result with the movie it was computed for.
    results = executor.map(process_movie, movies)

    for original_movie, (movie, reviews) in zip(movies, results):
        if movie is not None:
            fixed_movies.append(movie)
            scraped_reviews = reviews
        else:
            # process_movie returns (None, None) on failure, so record the
            # original movie; appending `movie` here would store None and make
            # the model_dump() call below raise AttributeError.
            failed_movies.append(original_movie)

# Note: the saved file is rewritten with only the successfully updated movies;
# movies whose fetch failed are written to failed_fetches.json instead.
if fixed_movies:
    with open("../../saved_imdb_movies.json", "w", encoding="utf-8") as f:
        json.dump([movie.model_dump() for movie in fixed_movies], f, indent=2, ensure_ascii=False)
if failed_movies:
    with open("../../failed_fetches.json", "w", encoding="utf-8") as f:
        json.dump([movie.model_dump() for movie in failed_movies], f, indent=2, ensure_ascii=False)

print(f"Successfully updated {len(fixed_movies)} movies with featured reviews")
print(f"Failed to update {len(failed_movies)} movies")


Successfully updated 50 movies with featured reviews
Failed to update 0 movies


  PydanticSerializationUnexpectedValue(Expected `IMDBFeaturedReview` - serialized value may not be as expected [field_name='featured_reviews', input_value={'summary': '"Life moves ...#39;t changing at all.'}, input_type=dict])
  PydanticSerializationUnexpectedValue(Expected `IMDBFeaturedReview` - serialized value may not be as expected [field_name='featured_reviews', input_value={'summary': 'Not just a c...a good, genuine laugh.'}, input_type=dict])
  PydanticSerializationUnexpectedValue(Expected `IMDBFeaturedReview` - serialized value may not be as expected [field_name='featured_reviews', input_value={'summary': 'Recaptures "...u won&#39;t regret it.'}, input_type=dict])
  PydanticSerializationUnexpectedValue(Expected `IMDBFeaturedReview` - serialized value may not be as expected [field_name='featured_reviews', input_value={'summary': 'When teen mo... it great.  I love it!'}, input_type=dict])
  PydanticSerializationUnexpectedValue(Expected `IMDBFeaturedReview` - serialized value may 

In [4]:
def _parse_next_data(soup: BeautifulSoup) -> dict:
    script = soup.find("script", id="__NEXT_DATA__", type="application/json")
    if not script or not script.string:
        return {}
    try:
        return json.loads(script.string)
    except json.JSONDecodeError:
        return {}

def _safe_get(obj, path, default=None):
    cur = obj
    for key in path:
        if isinstance(cur, dict) and key in cur:
            cur = cur[key]
        else:
            return default
    return cur

In [5]:
def get_movie_review_themes(movie_id: str) -> list[IMDBReviewTheme]:
    """Fetch a movie's main page and return the review themes found in it.

    The page returned by ``fetch_main_page(movie_id)`` is parsed with
    BeautifulSoup, its ``__NEXT_DATA__`` payload is decoded, and the entries
    under ``props.pageProps.mainColumnData.reviewSummary.themes`` are turned
    into ``IMDBReviewTheme`` objects with lower-cased name and sentiment.

    Themes lacking either a ``label.value`` or a ``sentiment`` are skipped, and
    an empty list is returned when the themes entry is missing or empty.
    """
    page_soup = BeautifulSoup(fetch_main_page(movie_id).text, "html.parser")
    next_data = _parse_next_data(page_soup)

    themes_path = ['props', 'pageProps', 'mainColumnData', 'reviewSummary', 'themes']
    # `or []` also covers a themes entry that is present but null/empty.
    raw_themes = _safe_get(next_data, themes_path, []) or []

    themes = []
    for raw in raw_themes:
        label = _safe_get(raw, ['label', 'value'], None)
        mood = _safe_get(raw, ['sentiment'], None)
        if not (label and mood):
            continue
        themes.append(IMDBReviewTheme(name=label.lower(), sentiment=mood.lower()))
    return themes


In [7]:
import concurrent.futures

def process_movie(movie):
    """Fetch the review themes for ``movie`` and attach them to it.

    Stores the result of ``get_movie_review_themes(movie.id)`` on
    ``movie.review_themes``.

    Returns:
        ``(movie, themes)`` on success. If any exception is raised the error
        is printed (not re-raised) and ``(None, None)`` is returned.
    """
    try:
        themes = get_movie_review_themes(movie.id)
        movie.review_themes = themes
    except Exception as e:
        print(f"Error processing movie {movie.id}: {e}")
        return None, None
    return movie, themes

fixed_movies = []
failed_movies = []
# Note: this cell works with review themes only, so it deliberately leaves
# `scraped_reviews` alone instead of resetting it to None.

# Use ThreadPoolExecutor to fetch in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # executor.map yields results in the order of the input iterable, so zipping
    # with `movies` pairs every result with the movie it was computed for.
    results = executor.map(process_movie, movies)

    for original_movie, (movie, themes) in zip(movies, results):
        if movie is not None:
            fixed_movies.append(movie)
        else:
            # process_movie returns (None, None) on failure, so record the
            # original movie; appending `movie` here would store None and make
            # the model_dump() call below raise AttributeError.
            failed_movies.append(original_movie)

# Note: the saved file is rewritten with only the successfully updated movies;
# movies whose fetch failed are written to failed_fetches.json instead.
if fixed_movies:
    with open("../../saved_imdb_movies.json", "w", encoding="utf-8") as f:
        json.dump([movie.model_dump() for movie in fixed_movies], f, indent=2, ensure_ascii=False)
if failed_movies:
    with open("../../failed_fetches.json", "w", encoding="utf-8") as f:
        json.dump([movie.model_dump() for movie in failed_movies], f, indent=2, ensure_ascii=False)

print(f"Successfully updated {len(fixed_movies)} movies with review themes")
print(f"Failed to update {len(failed_movies)} movies")


Successfully updated 50 movies with featured reviews
Failed to update 0 movies


In [10]:
# Print each movie's title followed by one line per review theme and a blank separator line.
for movie in movies:
    print(movie.title)
    converted_themes = list(map(str, movie.review_themes))
    print("\n".join(converted_themes))
    print()

ferris bueller's day off


zootopia
Attribute: ambitious; audience reception: positive
Attribute: character chemistry; audience reception: positive
Attribute: character development; audience reception: positive
Attribute: comedic brilliance; audience reception: positive
Attribute: cultural diversity; audience reception: positive
Attribute: inspirational; audience reception: positive
Attribute: social commentary; audience reception: positive
Attribute: social issues; audience reception: positive
Attribute: visual artistry; audience reception: positive
Attribute: world-building; audience reception: positive

school of rock


frozen


the princess bride
Attribute: action sequences; audience reception: positive
Attribute: character development; audience reception: positive
Attribute: charming; audience reception: positive
Attribute: comedic brilliance; audience reception: positive
Attribute: fantastical elements; audience reception: positive
Attribute: generational appeal; audience recepti

In [4]:
# Scratch cell: fetch and parse the main page of a single hard-coded movie id.
movie_id = "tt11315808"

# fetch_main_page is imported from implementation.gather_data.
main_scrape = fetch_main_page(movie_id)

# Text of the response, parsed as HTML below.
main_html = main_scrape.text

soup = BeautifulSoup(main_html, "html.parser")
# `nd` is the decoded __NEXT_DATA__ payload, or {} if the script tag is missing or not valid JSON.
nd = _parse_next_data(soup)

In [None]:
# REVIEW THEMES

# Walk the payload in `nd` down to the review-summary themes; a missing or
# null entry collapses to an empty list.
review_themes = (
    _safe_get(nd, ['props', 'pageProps', 'mainColumnData', 'reviewSummary', 'themes'], [])
    or []
)

parsed_review_themes = []
for theme in review_themes:
    name = _safe_get(theme, ['label', 'value'], None)
    sentiment = _safe_get(theme, ['sentiment'], None)
    # Skip themes lacking either a label or a sentiment.
    if not name or not sentiment:
        continue
    parsed_review_themes.append(
        IMDBReviewTheme(name=name.lower(), sentiment=sentiment.lower())
    )

for theme in parsed_review_themes:
    print(theme)

Attribute: cinematography, audience reception: positive
Attribute: performance, audience reception: positive
Attribute: ambitious, audience reception: neutral
Attribute: inner conflict, audience reception: neutral
Attribute: psychological depth, audience reception: neutral
Attribute: social commentary, audience reception: neutral
Attribute: adaptation, audience reception: negative
Attribute: character development, audience reception: negative
Attribute: musical score, audience reception: negative
Attribute: pacing, audience reception: negative


In [None]:
# Inspect the first ten reviews embedded in a scraped reviews page.
# Requires `scraped_reviews` to hold a fetched reviews-page response set by an
# earlier cell.
reviews_html = scraped_reviews.text

soup = BeautifulSoup(reviews_html, "html.parser")
nd = _parse_next_data(soup)

# _parse_next_data returns {} when the payload is missing or invalid, so look
# the list up with _safe_get instead of indexing, which would raise KeyError.
reviews_data = _safe_get(nd, ['props', 'pageProps', 'contentData', 'reviews'], []) or []
if not reviews_data:
    print("No reviews found in the page's __NEXT_DATA__ payload")

for review_object in reviews_data[:10]:
    # Per-review fields are still indexed directly so that a change in the
    # review schema fails loudly rather than printing blanks.
    review_data = review_object['review']
    summary = review_data['reviewSummary']
    text = review_data['reviewText']
    print(f"({summary}) \n{text}")
    print()

(A Parody That Stands The Test of Time!) 
I find it curious that IMDB lists this soley as an adventure fantasy film. It is, partially, but I&#39;ve always viewed it as an amazingly well done parody.<br/><br/>It pokes fun of the fairy tale tropes while at the same time paying tribute to the genre. All the exaggeration about true love, evil princes, and torture chambers is on point. Its a self aware movie that plays up to its its biggest strength, the characters.<br/><br/>Every character in the film is memorable, even down to the preist who only has a few lines. The casting and writing of these characters all came together and it really shows. You just cant help but wish Wallace Shawn survived longer so his loveable villian character could get a little more screentime.<br/><br/>All in all, you should give this film a chance. Its a hilarious yet family friend entertaining ride that still holds up. As you wishhhhh!

(Funny touching love story (not just the fairy tale)) 
A boy (Fred Savage)

In [13]:
# Load the JSON saved in ./nd.json (presumably a dumped __NEXT_DATA__ payload - confirm)
# and print the summary and text of each featured review it contains.
with open('./nd.json', "r", encoding="utf-8") as f:
    scrape_data = json.load(f)

main_column = scrape_data['props']['pageProps']['mainColumnData']
featured_reviews = main_column['featuredReviews']['edges']

print(len(featured_reviews))

for review in featured_reviews:
    node = review['node']
    summary = node['summary']['originalText']
    text = node['text']['originalText']['plaidHtml']
    print(f"({summary}) \n{text}")
    print()

5
(True Love Conquers All) 
Cary Elwes, Mandy Patinkin, and Robin Wright Penn star in this classic fairy tale entitled The Princess Bride. It is based on a novel by William Goldman, who also wrote the screenplay. Director Rob Reiner brings life to this story and effectively evokes the enchanting spirit of the witty 1973 novel.<br/><br/>The movie opens with a sick boy (Fred Savage) who receives a visit from his grandfather (Peter Falk) who intends to read to him from his favorite book. The boy is not exactly pleased to be distracted from his world of video games. However, his mood quickly changes as he and the viewer are transported to a place out of time. We are taken to Florin, a kingdom in an imaginary land, complete with dashing heroes, cowardly princes, rhyming giants, rodents of unusual size, fancy swordfights, and yes . . . even some kissing.<br/><br/>
This fairy tale begins on a farm in the countryside. There lives a beautiful, young woman named Buttercup (Robin Wright Penn) who