# Scraping 20k Movies from IMDb

In [7]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\MODERN\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## Import Modules

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time
import re
from tqdm import tqdm  # Import tqdm for progress bar
from bs4 import BeautifulSoup
import json 

## WebDriver

In [4]:
# Base class for handling common WebDriver functionalities
class BaseScraper:
    """Owns the Chrome WebDriver session used by the scraper subclasses."""

    def __init__(self):
        # The browser is started at construction time and kept on self.driver.
        self.driver = self.init_driver()

    def init_driver(self):
        """Start a Chrome WebDriver with a default Service and default ChromeOptions.

        Returns:
            The newly created ``webdriver.Chrome`` instance.
        """
        return webdriver.Chrome(service=Service(), options=webdriver.ChromeOptions())

    def close_driver(self):
        """Quit the WebDriver session stored on ``self.driver``."""
        self.driver.quit()


## Scraping Movie Details

In [5]:
# MoviesScraper class to fetch movies
class MoviesScraper(BaseScraper):
    """Collect movie IDs and titles from IMDb's feature-film search page.

    The page is opened in the WebDriver inherited from BaseScraper, the
    "50 more" button is clicked ``clicks`` times, and the resulting page
    source is parsed with BeautifulSoup.

    Attributes set in ``__init__``:
        clicks: number of times to click the "50 more" button.
        movie_data: list of ``{'Movie ID': str, 'Title': str}`` dicts,
            filled by ``extract_movie_data``.
    """

    def __init__(self, clicks=2):
        super().__init__()  # Starts the browser (see BaseScraper)
        self.clicks = clicks
        self.movie_data = []

    def fetch_movies(self):
        """Load the search page, expand it, and extract every movie card.

        The driver is always closed when this method finishes, including when
        an exception is raised, so a failed run does not leave a browser open.

        Returns:
            ``self.movie_data`` — the list of extracted movie dicts.
        """
        url = 'https://www.imdb.com/search/title/?title_type=feature'
        try:
            self.driver.get(url)

            # Click on "50 more" button for the specified number of clicks
            with tqdm(total=self.clicks, desc='Loading movies') as pbar:
                for _ in range(self.clicks):
                    # The returned soup is not needed here: the page is
                    # parsed once, after all clicks are done.
                    self.click_see_more_button()
                    pbar.update(1)

            # Give the page extra time to settle; the pause scales with clicks.
            time.sleep(self._calculate_wait_time(self.clicks))

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            # NOTE(review): 'sc-59c7dc1-2' looks like a generated class name —
            # confirm it still matches the movie cards if nothing is extracted.
            movies = soup.select('div.sc-59c7dc1-2')
            self.extract_movie_data(movies)
        finally:
            self.close_driver()

        return self.movie_data

    def click_see_more_button(self):
        """Click the "50 more" button once and wait for new titles to appear.

        The number of elements with class ``ipc-title`` is counted before the
        click and then polled up to 5 times, one second apart, until it grows.

        Returns:
            A BeautifulSoup of the page source after the click, or ``None``
            if any step raised (the error is printed, not re-raised).
        """
        try:
            initial_count = len(self.driver.find_elements(By.CLASS_NAME, 'ipc-title'))

            see_more_button = WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '50 more')]"))
            )
            # Scroll and click through JavaScript rather than a native click.
            self.driver.execute_script("arguments[0].scrollIntoView(); arguments[0].click();", see_more_button)

            # Retry up to 5 times to confirm that new content has been loaded
            for _ in range(5):
                current_count = len(self.driver.find_elements(By.CLASS_NAME, 'ipc-title'))
                if current_count > initial_count:
                    break  # New content detected
                time.sleep(1)  # Brief wait before rechecking

            return BeautifulSoup(self.driver.page_source, 'html.parser')

        except Exception as e:
            # Best-effort: a failed click is reported and the caller carries on.
            print(f"Error occurred: {e}")
            return None

    def extract_movie_data(self, movies):
        """Append one ``{'Movie ID', 'Title'}`` dict per movie card to ``self.movie_data``.

        Args:
            movies: iterable of BeautifulSoup tags, one per movie card.

        A card without a title yields ``'N/A'`` as title; a card without a
        usable ``/title/<id>/`` link yields ``'N/A'`` as movie ID instead of
        raising.
        """
        for movie in movies:
            title_tag = movie.select_one('h3.ipc-title__text')
            link_tag = movie.select_one('a.ipc-title-link-wrapper')

            title = title_tag.text.strip() if title_tag is not None else 'N/A'
            # Strip a leading rank such as "12. " from the title.
            title = re.sub(r'^\d+\.\s*', '', title)

            # The ID is the path segment right after '/title/'. Only touch the
            # link when it exists — dereferencing a missing link is what used
            # to crash this loop.
            href = link_tag.get('href', '') if link_tag is not None else ''
            _, marker, tail = href.partition('/title/')
            movie_id = (tail.split('/')[0] if marker else '') or 'N/A'

            self.movie_data.append({
                'Movie ID': movie_id,
                'Title': title,
            })

    def _calculate_wait_time(self, clicks):
        """
        Calculate an adaptive wait time (in seconds) based on the number of clicks.

        The result is ``5 + 5 * 1.2 ** (clicks // 10)``: a fixed 5-second base
        plus a term that is multiplied by 1.2 for every full 10 clicks.
        """
        base_wait_time = 5  # Base wait time in seconds
        growth_factor = 1.2  # Exponential growth factor
        additional_wait_time = base_wait_time * (growth_factor ** (clicks // 10))  # Increase wait time every 10 clicks

        return base_wait_time + additional_wait_time

    def save_to_json(self, file_name='movies_data.json'):
        """Write ``self.movie_data`` to ``file_name`` as UTF-8, indented JSON."""
        with open(file_name, 'w', encoding='utf-8') as file:
            json.dump(self.movie_data, file, ensure_ascii=False, indent=4)

### Test

In [6]:
# Smoke test: one click on "50 more", then save the result.
scraper = MoviesScraper(clicks=1)

scraper.fetch_movies()
scraper.save_to_json()
# save_to_json() is called with its default file name, movies_data.json.
print("Movies fetched and saved to movies_data.json")

Loading movies: 100%|██████████| 1/1 [00:02<00:00,  2.47s/it]


Movies fetched and saved to movies.json


In [17]:
import os
import json

def check_size(file_name):
    """Print a short size summary of the JSON file ``file_name``.

    Args:
        file_name: path of the JSON file; a relative path is resolved
            against the current working directory.

    Prints the number of elements for a list (plus the size of its first
    element when that element is a dict, list or string), the number of keys
    for a dict, or a notice for any other top-level JSON value.
    """
    # Read the JSON file. The scrapers in this notebook write UTF-8 with
    # ensure_ascii=False, so decode explicitly instead of relying on the
    # platform's default encoding.
    with open(os.path.join(os.getcwd(), file_name), 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Check the data type and its size
    if isinstance(data, list):
        print(f"Số lượng phần tử trong danh sách: {len(data)}")
        # Report the size of the first element only when it has a length;
        # numbers, booleans and null would make len() raise.
        if data and isinstance(data[0], (dict, list, str)):
            print(f"Kích thước của phần tử đầu tiên: {len(data[0])} thuộc tính")
    elif isinstance(data, dict):
        print(f"Số lượng thuộc tính trong từ điển: {len(data)}")
    else:
        print("Dữ liệu không phải là danh sách hoặc từ điển.")


In [18]:
# Report how many entries the saved movie list holds.
check_size('movies_data.json')

Số lượng phần tử trong danh sách: 100
Kích thước của phần tử đầu tiên: 2 thuộc tính


## Scraping Reviews of Each Movie

In [20]:
# ReviewsScraper class to fetch reviews for each movie
class MovieReviewScraper(BaseScraper):
    """Collect user reviews for every movie listed in a JSON file.

    The JSON file must hold a list of dicts with ``'Movie ID'`` and
    ``'Title'`` keys. For each movie the IMDb reviews page is opened in the
    WebDriver inherited from BaseScraper, expanded, and parsed with
    BeautifulSoup.

    Attributes set in ``__init__``:
        json_file_path: path of the movie-list JSON file.
        movie_data: the list loaded from that file.
        movie_reviews: list of ``{'Movie ID': str, 'Reviews': [dict, ...]}``.
        clicks: number of "Load More" clicks made for the current movie.
    """

    # CSS selector and fallback text for each review field, per page layout.
    # "load_more" is the layout parsed when the "Load More" path was used;
    # "all" is the layout built from 'article.user-review-item' elements.
    _REVIEW_FIELDS = {
        "load_more": {
            'Review Summary': ('a.title', 'No summary'),
            'Review': ('div.text.show-more__control', 'No content'),
            'Rating': ('span.rating-other-user-rating span', 'No rating'),
            'Author': ('span.display-name-link a', 'Unknown Author'),
            'Date': ('span.review-date', 'No date'),
        },
        "all": {
            'Review Summary': ('span[data-testid="review-summary"]', 'No summary'),
            'Review': ('div.ipc-html-content-inner-div', 'No content'),
            'Rating': ('span.ipc-rating-star--rating', 'No rating'),
            'Author': ('a[data-testid="author-link"]', 'Unknown Author'),
            'Date': ('li.review-date', 'No date'),
        },
    }

    def __init__(self, json_file_path):
        self.json_file_path = json_file_path  # Path to the JSON file
        # Load the movie list before launching the browser, so a missing or
        # malformed file does not leave a Chrome session running.
        self.movie_data = self._load_movies_from_json()
        super().__init__()  # Starts the browser (see BaseScraper)
        self.movie_reviews = []
        self.clicks = 0

    def _load_movies_from_json(self):
        """Return the parsed content of ``self.json_file_path`` (read as UTF-8)."""
        with open(self.json_file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def fetch_reviews(self):
        """Scrape the reviews of every movie in ``self.movie_data``.

        The driver is always closed when this method finishes, including when
        an exception is raised.

        Returns:
            ``self.movie_reviews`` — one entry per movie that had reviews.
        """
        try:
            for movie in self.movie_data:
                movie_id = movie['Movie ID']
                title = movie['Title']

                # 'N/A' is the placeholder MoviesScraper stores when a movie
                # has no link; there is no reviews page to open for it.
                if movie_id == 'N/A':
                    print(f"No reviews found for {title}.")
                    continue

                # Count clicks per movie. Carrying the counter over from one
                # movie to the next would make the exponential wait below
                # grow without bound over a long run.
                self.clicks = 0

                self.driver.get(f"https://www.imdb.com/title/{movie_id}/reviews")
                self._load_reviews()

                # Let the page settle before reading its source.
                time.sleep(self._calculate_wait_time(10, self.clicks))

                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                self._extract_reviews(soup, movie_id, title)
        finally:
            self.close_driver()

        return self.movie_reviews

    def _load_reviews(self):
        """Expand the reviews page: click 'All', or fall back to 'Load More'.

        Any exception while locating or clicking the 'All' button triggers
        the fallback; it is not re-raised.
        """
        try:
            # NOTE(review): absolute, position-based XPath — it will stop
            # matching if the page layout shifts; confirm against the live page.
            all_reviews_button = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='__next']/main/div/section/div/section/div/div[1]/section[1]/div[3]/div/span[2]/button"))
            )
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", all_reviews_button)
            time.sleep(2)  # Wait for the page to load if needed
            all_reviews_button.click()
        except Exception:
            print("Could not find 'All' button, will try to find 'Load More' button.")
            self._load_more_reviews()

    def _load_more_reviews(self):
        """Click the 'Load More' button until it can no longer be found or clicked.

        Each successful click increments ``self.clicks`` and is followed by a
        pause computed by ``_calculate_wait_time(1, self.clicks)``.
        """
        # total=10 is only a nominal figure; the loop is not bounded by it.
        with tqdm(total=10, desc='Loading More Reviews', leave=False) as pbar:
            while True:
                try:
                    load_more_button = WebDriverWait(self.driver, 5).until(
                        EC.presence_of_element_located((By.XPATH, '//*[@id="load-more-trigger"]'))
                    )
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
                    load_more_button.click()

                    self.clicks += 1
                    pbar.update(1)

                    time.sleep(self._calculate_wait_time(1, self.clicks))

                except Exception:
                    # Best-effort: any failure here is treated as "nothing left to load".
                    print("No more 'Load More' buttons to click.")
                    break

    def _calculate_wait_time(self, base_wait_time, clicks):
        """
        Calculate an adaptive wait time (in seconds) based on the number of clicks.

        The result is ``base_wait_time + base_wait_time * 1.2 ** (clicks // 10)``:
        the second term is multiplied by 1.2 for every full 10 clicks.
        """
        growth_factor = 1.2  # Exponential growth factor
        additional_wait_time = base_wait_time * (growth_factor ** (clicks // 10))  # Increase wait time every 10 clicks

        return base_wait_time + additional_wait_time

    def _extract_reviews(self, soup, movie_id, title):
        """Parse all reviews in ``soup`` and append them to ``self.movie_reviews``.

        Two page layouts are tried in order: 'article.user-review-item'
        first, then 'div.lister-item.mode-detail.imdb-user-review'. If neither
        matches, a message is printed and nothing is appended.
        """
        button_type = "all"
        reviews = soup.select('article.user-review-item')
        if not reviews:
            button_type = "load_more"
            reviews = soup.select('div.lister-item.mode-detail.imdb-user-review')

        if not reviews:
            print(f"No reviews found for {title}.")
            return

        movie_info = {
            'Movie ID': movie_id,
            'Reviews': [self._parse_review(review, button_type) for review in reviews],
        }
        self.movie_reviews.append(movie_info)

        # Count the number of reviews and display it
        num_reviews = len(movie_info['Reviews'])
        print(f"Total number of reviews for '{title}': {num_reviews}")

    @staticmethod
    def _text_or_default(node, selector, default):
        """Return the stripped text of the first match of ``selector`` in ``node``, else ``default``."""
        match = node.select_one(selector)
        return match.get_text(strip=True) if match is not None else default

    def _parse_review(self, review, button_type):
        """
        Extract information from the review and return it as a dictionary.

        Args:
            review: BeautifulSoup tag of a single review.
            button_type: ``"load_more"`` selects the load-more layout's
                selectors; any other value selects the "all" layout's.

        Returns:
            Dict with keys 'Review Summary', 'Review', 'Rating', 'Author' and
            'Date'; a field that is missing gets its fallback text.
        """
        layout = "load_more" if button_type == "load_more" else "all"
        return {
            field: self._text_or_default(review, selector, default)
            for field, (selector, default) in self._REVIEW_FIELDS[layout].items()
        }

    def save_to_json(self, file_name='movies_reviews.json'):
        """Write ``self.movie_reviews`` to ``file_name`` as UTF-8, indented JSON."""
        with open(file_name, 'w', encoding='utf-8') as f:
            json.dump(self.movie_reviews, f, ensure_ascii=False, indent=4)
        print(f"Reviews saved to {file_name}")

### Test 1 movie

#### Test case

In [21]:
import os
# Smoke-test the review scraper against a single, known movie.
data = [{"Movie ID": "tt17526714", "Title": "The Substance"}]

# Write the one-movie fixture to disk so the scraper can load it from a JSON file.
with open('test_movie.json', 'w') as json_file:
    json_file.write(json.dumps(data, indent=4))

# The scraper takes the path of the JSON file; build it from the working directory.
path = os.path.join(os.getcwd(), 'test_movie.json')
scraper = MovieReviewScraper(path)

scraper.fetch_reviews()
scraper.save_to_json()

print("Movies fetched and saved to movies_reviews.json")


Could not find 'All' button, will try to find 'Load More' button.


                                                                     

No more 'Load More' buttons to click.




Total number of reviews for 'The Substance': 433
Reviews saved to movies_reviews.json
Movies fetched and saved to movies_reviews.json


In [26]:
import json
import os

# Load the saved reviews file, decoding it as UTF-8.
with open(os.path.join(os.getcwd(), 'movies_reviews.json'), encoding='utf-8') as file:
    data = json.load(file)

# Report how many reviews were collected for each movie.
for movie in data:
    movie_id, reviews = movie["Movie ID"], movie["Reviews"]
    review_count = len(reviews)
    print(f"Movie ID: {movie_id} có {review_count} review(s).")


Movie ID: tt17526714 có 433 review(s).
