# Scraping 20k Movies from IMDb

In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


## Import Modules

In [2]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.1.2-cp311-cp311-win_amd64.whl.metadata (59 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
    --------------------------------------- 0.3/11.6 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.6 MB 1.5 MB/s eta 0:00:08
   -- ------------------------------------- 0.8/11.6 MB 1.3 MB/s eta 0:00:09
   -- ------------------------------------- 0.8/11.6 MB 1.3 MB/s eta 0:00:09
   --- ------------------------------------ 1.0/11.6 MB 1.2 MB/s eta 0:00:09
   ---- ----------------------------------- 1.3/11.6 MB 1.2 MB/s eta 0:00:09
   ----- 

In [3]:
pip install tqdm

Collecting tqdm
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.5
Note: you may need to restart the kernel to use updated packages.


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time
import re
from tqdm import tqdm  # Import tqdm for progress bar
from bs4 import BeautifulSoup
import json 

## WebDriver

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Base class for handling common WebDriver functionalities
class BaseScraper:
    """Base class that owns a Selenium Chrome WebDriver shared by the scrapers.

    Args:
        headless: When True (the default) Chrome is started with the
            ``--headless=new`` flag, i.e. without a visible window.
    """

    def __init__(self, headless=True):
        self.headless = headless
        self.driver = self.init_driver()

    def init_driver(self):
        """Create and return a Chrome WebDriver configured to reduce memory usage."""
        service = Service()  # Default chromedriver service (no explicit executable path)
        options = webdriver.ChromeOptions()

        # Assigning `options.headless = True` has no effect on current Selenium
        # releases (the property was removed), so request headless mode through
        # the Chrome command-line flag instead.
        if getattr(self, 'headless', True):
            options.add_argument('--headless=new')
        options.add_argument('--disable-extensions')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument("--no-sandbox")  # Use this if you encounter issues

        return webdriver.Chrome(service=service, options=options)

    def close_driver(self):
        """Quit the browser session; calling it again afterwards is a no-op."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None


## Scraping Movie Details

In [62]:
import json
import re
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class MoviesScraper(BaseScraper):
    """Scrape movie IDs and titles from an IMDb title-search results page.

    The results list is expanded by clicking the "50 more" button ``clicks``
    times, processed in batches of ``batch_size`` clicks. After each batch the
    newly loaded result cards are parsed into ``self.movie_data``.

    Args:
        clicks: Total number of "50 more" clicks to perform.
        batch_size: Number of clicks per batch.
        save_file: Stored on the instance; no method of this class reads it.
    """

    SEARCH_URL = 'https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-10-31'
    # NOTE(review): this looks like a generated CSS class name - verify it
    # still matches the result cards before a long run.
    MOVIE_CARD_SELECTOR = 'div.sc-59c7dc1-2'

    def __init__(self, clicks=200, batch_size=20, save_file='movies_data.json'):
        super().__init__()
        self.clicks = clicks
        self.batch_size = batch_size
        self.save_file = save_file
        self.movie_data = []

    def fetch_movies(self):
        """Run the whole scrape and return the list of ``{'Movie ID', 'Title'}`` dicts.

        The driver is always closed, even when the scrape fails part-way.
        """
        try:
            self.driver.get(self.SEARCH_URL)

            total_batches = self.clicks // self.batch_size  # Full batches
            remaining_clicks = self.clicks % self.batch_size  # Clicks left after the full batches

            batch_failed = False
            for batch in range(total_batches):
                try:
                    print(f"Processing batch {batch + 1}/{total_batches}...")
                    self._process_batch(self.batch_size)
                except Exception as e:
                    print(f"Error occurred in batch {batch + 1}: {e}")
                    batch_failed = True
                    break  # stop the process after an error

            # Process any remaining clicks, unless a batch already failed.
            if remaining_clicks > 0 and not batch_failed:
                try:
                    print(f"Processing remaining {remaining_clicks} clicks...")
                    self._process_batch(remaining_clicks)
                except Exception as e:
                    print(f"Error occurred: {e}")
        finally:
            self.close_driver()

        return self.movie_data

    def _select_movie_cards(self):
        """Parse the current page and return every movie result card found on it."""
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        return soup.select(self.MOVIE_CARD_SELECTOR)

    def _process_batch(self, num_clicks):
        """Process a batch with num_clicks of 'see more' clicks."""
        initial_movies = self._select_movie_cards()

        # Extract data for the cards that are already on the page.
        self.extract_movie_data(initial_movies)

        with tqdm(total=num_clicks, desc='Loading movies') as pbar:
            for _ in range(num_clicks):
                self.click_see_more_button()
                pbar.update(1)

        time.sleep(self._calculate_wait_time(num_clicks))

        # Only the cards appended by this batch's clicks are new.
        final_movies = self._select_movie_cards()
        self.extract_movie_data(final_movies[len(initial_movies):])

    def click_see_more_button(self):
        """Click the "50 more" button once and wait briefly for new titles.

        Returns:
            A BeautifulSoup parse of the page after the click, or None when
            the click failed (the error is printed, not raised).
        """
        try:
            initial_count = len(self.driver.find_elements(By.CLASS_NAME, 'ipc-title'))

            see_more_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '50 more')]"))
            )
            self.driver.execute_script("arguments[0].scrollIntoView(); arguments[0].click();", see_more_button)

            # Poll up to 5 times, one second apart, until more titles are present.
            for _ in range(5):
                current_count = len(self.driver.find_elements(By.CLASS_NAME, 'ipc-title'))
                if current_count > initial_count:
                    break
                time.sleep(1)

            return BeautifulSoup(self.driver.page_source, 'html.parser')

        except Exception as e:
            print(f"Error occurred: {e}")
            return None

    def save_partial_data(self, batch_num):
        """Save temporary data after each batch."""
        temp_file_name = f"movies_data_batch_{batch_num}.json"
        with open(temp_file_name, 'w', encoding='utf-8') as file:
            json.dump(self.movie_data, file, ensure_ascii=False, indent=4)
        print(f"Data for batch {batch_num} saved to {temp_file_name}")

    def extract_movie_data(self, movies):
        """Append ID and title of each result card to ``self.movie_data``, skipping known IDs.

        Args:
            movies: Iterable of BeautifulSoup elements, one per result card.
        """
        seen_ids = {movie['Movie ID'] for movie in self.movie_data}

        for movie in movies:
            title_tag = movie.select_one('h3.ipc-title__text')
            link_tag = movie.select_one('a.ipc-title-link-wrapper')

            title = title_tag.text.strip() if title_tag else 'N/A'
            href = link_tag.get('href') if link_tag else None

            # The ID is the path segment right after '/title/'.
            if href and '/title/' in href:
                movie_id = href.split('/title/')[1].split('/')[0]
            else:
                movie_id = 'N/A'

            # Drop the leading rank prefix such as "12. ".
            title = re.sub(r'^\d+\.\s*', '', title)

            if movie_id in seen_ids:
                continue
            # Record the ID immediately so duplicates within this same call
            # are rejected too.
            seen_ids.add(movie_id)
            self.movie_data.append({
                'Movie ID': movie_id,
                'Title': title,
            })

    def _calculate_wait_time(self, clicks):
        """Return a wait in seconds that grows by a factor of 1.2 per 10 clicks."""
        base_wait_time = 5
        growth_factor = 1.2
        additional_wait_time = base_wait_time * (growth_factor ** (clicks // 10))
        return base_wait_time + additional_wait_time


In [67]:
import json
import re
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class MoviesScraper(BaseScraper):
    """Scrape movie IDs and titles from IMDb for a configurable release-date range.

    Args:
        clicks: Number of "50 more" clicks to perform on the results page.
        release_date_from: Start of the release-date range, as used in the search URL.
        release_date_to: End of the release-date range, as used in the search URL.
    """

    # NOTE(review): this looks like a generated CSS class name - verify it
    # still matches the result cards before a long run.
    MOVIE_CARD_SELECTOR = 'div.sc-59c7dc1-2'

    def __init__(self, clicks=200, release_date_from='2024-01-01', release_date_to='2024-10-07'):
        super().__init__()
        self.clicks = clicks
        self.movie_data = []
        self.release_date_from = release_date_from
        self.release_date_to = release_date_to

    def fetch_movies(self):
        """Load the search page, expand it ``clicks`` times and return the movie list.

        The driver is always closed, even when the scrape fails part-way.
        """
        url = f'https://www.imdb.com/search/title/?title_type=feature&release_date={self.release_date_from},{self.release_date_to}'
        try:
            self.driver.get(url)

            # Extract data for the cards present before any click.
            initial_movies = self._select_movie_cards()
            self.extract_movie_data(initial_movies)

            with tqdm(total=self.clicks, desc='Loading movies') as pbar:
                for _ in range(self.clicks):
                    self.click_see_more_button()
                    pbar.update(1)
                    time.sleep(1)  # Optional wait time between clicks

            # After all clicks, only the cards past the initial ones are new.
            final_movies = self._select_movie_cards()
            self.extract_movie_data(final_movies[len(initial_movies):])
        finally:
            self.close_driver()

        return self.movie_data

    def _select_movie_cards(self):
        """Parse the current page and return every movie result card found on it."""
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        return soup.select(self.MOVIE_CARD_SELECTOR)

    def click_see_more_button(self):
        """Click the "50 more" button once and wait briefly for new titles.

        Returns:
            A BeautifulSoup parse of the page after the click, or None when
            the click failed (the error is printed, not raised).
        """
        try:
            initial_count = len(self.driver.find_elements(By.CLASS_NAME, 'ipc-title'))

            see_more_button = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '50 more')]"))
            )
            self.driver.execute_script("arguments[0].scrollIntoView(); arguments[0].click();", see_more_button)

            # Poll up to 5 times, one second apart, until more titles are present.
            for _ in range(5):
                current_count = len(self.driver.find_elements(By.CLASS_NAME, 'ipc-title'))
                if current_count > initial_count:
                    break
                time.sleep(1)

            return BeautifulSoup(self.driver.page_source, 'html.parser')

        except Exception as e:
            print(f"Error occurred: {e}")
            return None

    def extract_movie_data(self, movies):
        """Append ID and title of each result card to ``self.movie_data``, skipping known IDs.

        Args:
            movies: Iterable of BeautifulSoup elements, one per result card.
        """
        seen_ids = {movie['Movie ID'] for movie in self.movie_data}

        for movie in movies:
            title_tag = movie.select_one('h3.ipc-title__text')
            link_tag = movie.select_one('a.ipc-title-link-wrapper')

            title = title_tag.text.strip() if title_tag else 'N/A'
            href = link_tag.get('href') if link_tag else None

            # The ID is the path segment right after '/title/'.
            if href and '/title/' in href:
                movie_id = href.split('/title/')[1].split('/')[0]
            else:
                movie_id = 'N/A'

            # Drop the leading rank prefix such as "12. ".
            title = re.sub(r'^\d+\.\s*', '', title)

            if movie_id in seen_ids:
                continue
            # Record the ID immediately so duplicates within this same call
            # are rejected too.
            seen_ids.add(movie_id)
            self.movie_data.append({
                'Movie ID': movie_id,
                'Title': title,
            })

    def _calculate_wait_time(self, clicks):
        """Return a wait in seconds that grows by a factor of 1.2 per 10 clicks."""
        base_wait_time = 5
        growth_factor = 1.2
        additional_wait_time = base_wait_time * (growth_factor ** (clicks // 10))
        return base_wait_time + additional_wait_time


### Test

In [68]:
# Smoke test: 10 "50 more" clicks with the date-range MoviesScraper.
# NOTE: the batch-based call MoviesScraper(clicks=10, batch_size=4) only works with the
# earlier class definition; the later definition of MoviesScraper does not accept batch_size.

scraper = MoviesScraper(clicks=10, release_date_from='2024-01-01', release_date_to='2024-10-07')
movie_data = scraper.fetch_movies()

Loading movies: 100%|██████████| 10/10 [00:27<00:00,  2.77s/it]


In [8]:
import os
import json

def check_size(data):
    """Print a size and quality summary of scraped data.

    For a list: prints the element count, the number of attributes of the
    first element, any 'Movie ID' values that occur more than once, and every
    (index, key) pair whose value is None, '' or 'N/A'.
    For a dict: prints the number of keys.
    Anything else is reported as unsupported.

    Args:
        data: The scraped data. List elements are expected to be dicts;
            'Movie ID' values must be hashable.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # Check the data type and size
    if isinstance(data, list):
        print(f"Số lượng phần tử trong danh sách: {len(data)}")

        # For a non-empty list, also check for duplicates and missing data
        if data:
            first_element = data[0]
            print(f"Kích thước của phần tử đầu tiên: {len(first_element)} thuộc tính")

            # Check for duplicates (by Movie ID); a single counting pass
            # replaces calling list.count() once per element.
            id_counts = Counter(item.get('Movie ID') for item in data if 'Movie ID' in item)
            duplicates = {movie_id for movie_id, count in id_counts.items() if count > 1}
            if duplicates:
                print(f"Các Movie ID trùng lặp: {duplicates}")
            else:
                print("Không có Movie ID trùng lặp.")

            # Check for missing data
            missing_values = [
                (index, key)
                for index, movie in enumerate(data)
                for key, value in movie.items()
                if value in [None, '', 'N/A']
            ]

            if missing_values:
                print(f"Các giá trị thiếu (index, key): {missing_values}")
            else:
                print("Không có giá trị thiếu.")

    elif isinstance(data, dict):
        print(f"Số lượng thuộc tính trong từ điển: {len(data)}")
    else:
        print("Dữ liệu không phải là danh sách hoặc từ điển.")



In [69]:
# Sanity-check the scraped movie list: element count, duplicate IDs, missing values.
check_size(movie_data)

Số lượng phần tử trong danh sách: 550
Kích thước của phần tử đầu tiên: 2 thuộc tính
Không có Movie ID trùng lặp.
Không có giá trị thiếu.


## Scraping Reviews for Each Movie

In [30]:
# ReviewsScraper class to fetch reviews for each movie
class MovieReviewScraper(BaseScraper):
    """Fetch user reviews for each movie, loading one reviews page per rating value (1-10).

    Every page that yields reviews appends its own
    ``{'Movie ID': ..., 'Reviews': [...]}`` entry to ``self.movie_reviews``.

    Args:
        movie_data: Iterable of dicts with 'Movie ID' and 'Title' keys.
    """

    # Upper bound (seconds) for a single adaptive wait, so that the
    # exponential growth can never stall the scrape.
    MAX_WAIT_SECONDS = 60

    # CSS selectors and fallback text per page layout; keys are the values
    # accepted by _parse_review's ``button_type``.
    _REVIEW_FIELDS = {
        "load_more": {
            'Review Summary': ('a.title', 'No summary'),
            'Review': ('div.text.show-more__control', 'No content'),
            'Rating': ('span.rating-other-user-rating span', 'No rating'),
            'Author': ('span.display-name-link a', 'Unknown Author'),
            'Date': ('span.review-date', 'No date'),
        },
        "all": {
            'Review Summary': ('span[data-testid="review-summary"]', 'No summary'),
            'Review': ('div.ipc-html-content-inner-div', 'No content'),
            'Rating': ('span.ipc-rating-star--rating', 'No rating'),
            'Author': ('a[data-testid="author-link"]', 'Unknown Author'),
            'Date': ('li.review-date', 'No date'),
        },
    }

    def __init__(self, movie_data):
        super().__init__()  # Call the base class constructor
        self.movie_data = movie_data
        self.movie_reviews = []  # List of {'Movie ID', 'Reviews'} entries
        self.clicks = 0  # 'Load More' clicks performed on the current page

    def fetch_reviews(self):
        """Scrape the reviews of every movie and return ``self.movie_reviews``.

        The driver is always closed, even when the scrape fails part-way.
        """
        try:
            for movie in self.movie_data:
                movie_id = movie.get('Movie ID')
                title = movie.get('Title')
                num_reviews = 0
                if movie_id and title:  # Both values are required to build the URL and log
                    for rating_filter in range(1, 11):  # Ratings 1 to 10
                        review_url = f"https://www.imdb.com/title/{movie_id}/reviews?sort=submissionDate&dir=desc&&ratingFilter={rating_filter}&rating={rating_filter}"
                        self.driver.get(review_url)

                        # Count clicks per page: a counter accumulated over the
                        # whole run makes the exponential wait grow without bound.
                        self.clicks = 0
                        self._load_reviews()  # Expand the review list

                        wait_time = self._calculate_wait_time(10, self.clicks)
                        time.sleep(wait_time)

                        soup = BeautifulSoup(self.driver.page_source, 'html.parser')

                        # Extract the reviews of the current page
                        num_reviews += self._extract_reviews(soup, movie_id, title)
                else:
                    print(f"Movie data missing for {movie}. Skipping this movie.")

                print(f"Total number of reviews for '{title}': {num_reviews}")
        finally:
            self.close_driver()

        return self.movie_reviews

    def _load_reviews(self):
        """Click the 'All' reviews button, or fall back to repeated 'Load More' clicks."""
        try:
            all_reviews_button = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='__next']/main/div/section/div/section/div/div[1]/section[1]/div[3]/div/span[2]/button"))
            )
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", all_reviews_button)
            time.sleep(2)  # Wait for the page to load if needed
            all_reviews_button.click()
        except Exception:
            # Could not use the 'All' button; try the 'Load More' button instead.
            self._load_more_reviews()

    def _load_more_reviews(self):
        """Click 'Load More' until the button can no longer be found or clicked."""
        with tqdm(desc='Loading More Reviews', leave=False) as pbar:
            while True:
                try:
                    load_more_button = WebDriverWait(self.driver, 5).until(
                        EC.presence_of_element_located((By.XPATH, '//*[@id="load-more-trigger"]'))
                    )
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
                    load_more_button.click()

                    self.clicks += 1
                    pbar.update(1)

                    time.sleep(self._calculate_wait_time(1, self.clicks))

                except Exception:  # No more 'Load More' buttons to click.
                    break

    def _calculate_wait_time(self, base_wait_time, clicks):
        """
        Calculate an adaptive wait time based on the number of clicks.

        The wait is ``base_wait_time * (1 + 1.2 ** (clicks // 10))`` seconds,
        capped at ``MAX_WAIT_SECONDS``.
        """
        growth_factor = 1.2  # Exponential growth factor
        additional_wait_time = base_wait_time * (growth_factor ** (clicks // 10))  # Grows every 10 clicks

        return min(base_wait_time + additional_wait_time, self.MAX_WAIT_SECONDS)

    def _extract_reviews(self, soup, movie_id, title):
        """Parse the reviews on one page and store them under ``movie_id``.

        Returns:
            The number of reviews found on the page (0 when there are none).
        """
        layout = "all"
        reviews = soup.select('article.user-review-item')
        if not reviews:
            # Fall back to the markup used by the 'Load More' page layout.
            layout = "load_more"
            reviews = soup.select('div.lister-item.mode-detail.imdb-user-review')

        if not reviews:
            print(f"No reviews found for {title}.")
            return 0  # Must be an int: the caller adds it to a running total

        movie_info = {
            'Movie ID': movie_id,
            'Reviews': [self._parse_review(review, layout) for review in reviews],
        }
        self.movie_reviews.append(movie_info)

        return len(movie_info['Reviews'])

    @staticmethod
    def _text_or(node, selector, default):
        """Return the stripped text of the first match of ``selector``, or ``default``."""
        found = node.select_one(selector)
        return found.get_text(strip=True) if found else default

    def _parse_review(self, review, button_type):
        """
        Extract information from the review and return as a dictionary.

        ``button_type`` is "load_more" for the 'Load More' layout; any other
        value selects the 'All' layout.
        """
        layout = "load_more" if button_type == "load_more" else "all"
        return {
            field: self._text_or(review, selector, default)
            for field, (selector, default) in self._REVIEW_FIELDS[layout].items()
        }

    def save_to_json(self):
        """Write ``self.movie_reviews`` to movies_reviews.json as UTF-8 JSON."""
        with open('movies_reviews.json', 'w', encoding='utf-8') as f:
            json.dump(self.movie_reviews, f, ensure_ascii=False, indent=4)
        print("Reviews saved to movies_reviews.json")

In [None]:
# ReviewsScraper class to fetch reviews for each movie
class MovieReviewScraper(BaseScraper):
    """Fetch user reviews (including helpful-vote counts) for each movie.

    One reviews page is loaded per rating value (1-10). All reviews of a
    movie are merged into a single ``{'Movie ID': ..., 'Reviews': [...]}``
    entry of ``self.movie_reviews``.

    Args:
        movie_data: Iterable of dicts with 'Movie ID' and 'Title' keys.
    """

    # Upper bound (seconds) for a single adaptive wait, so that the
    # exponential growth can never stall the scrape.
    MAX_WAIT_SECONDS = 60

    # CSS selectors and fallback text per page layout; keys are the values
    # accepted by _parse_review's ``button_type``.
    _REVIEW_FIELDS = {
        "load_more": {
            'Review Summary': ('a.title', 'No summary'),
            'Review': ('div.text.show-more__control', 'No content'),
            'Rating': ('span.rating-other-user-rating span', 'No rating'),
            'Author': ('span.display-name-link a', 'Unknown Author'),
            'Date': ('span.review-date', 'No date'),
        },
        "all": {
            'Review Summary': ('span[data-testid="review-summary"]', 'No summary'),
            'Review': ('div.ipc-html-content-inner-div', 'No content'),
            'Rating': ('span.ipc-rating-star--rating', 'No rating'),
            'Author': ('a[data-testid="author-link"]', 'Unknown Author'),
            'Date': ('li.review-date', 'No date'),
        },
    }

    def __init__(self, movie_data):
        super().__init__()  # Call the base class constructor
        self.movie_data = movie_data
        self.movie_reviews = []  # One {'Movie ID', 'Reviews'} entry per movie
        self.clicks = 0  # 'Load More' clicks performed on the current page
        self._reviews_by_id = {}  # Movie ID -> its entry in movie_reviews

    def fetch_reviews(self):
        """Scrape the reviews of every movie and return ``self.movie_reviews``.

        The driver is always closed, even when the scrape fails part-way.
        """
        try:
            for movie in self.movie_data:
                movie_id = movie.get('Movie ID')
                title = movie.get('Title')
                num_reviews = 0
                if movie_id and title:  # Both values are required to build the URL and log
                    for rating_filter in range(1, 11):  # Ratings 1 to 10
                        review_url = f"https://www.imdb.com/title/{movie_id}/reviews?sort=submissionDate&dir=desc&&ratingFilter={rating_filter}&rating={rating_filter}"
                        self.driver.get(review_url)

                        # Count clicks per page: a counter accumulated over the
                        # whole run makes the exponential wait grow without bound.
                        self.clicks = 0
                        self._load_reviews()  # Expand the review list

                        wait_time = self._calculate_wait_time(10, self.clicks)
                        time.sleep(wait_time)

                        soup = BeautifulSoup(self.driver.page_source, 'html.parser')

                        # Extract the reviews of the current page
                        num_reviews += self._extract_reviews(soup, movie_id, title)
                else:
                    print(f"Movie data missing for {movie}. Skipping this movie.")

                print(f"Total number of reviews for '{title}': {num_reviews}")
        finally:
            self.close_driver()

        return self.movie_reviews

    def _load_reviews(self):
        """Click the 'All' reviews button, or fall back to repeated 'Load More' clicks."""
        try:
            all_reviews_button = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='__next']/main/div/section/div/section/div/div[1]/section[1]/div[3]/div/span[2]/button"))
            )
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", all_reviews_button)
            time.sleep(2)  # Wait for the page to load if needed
            all_reviews_button.click()
        except Exception:
            # Could not use the 'All' button; try the 'Load More' button instead.
            self._load_more_reviews()

    def _load_more_reviews(self):
        """Click 'Load More' until the button can no longer be found or clicked."""
        with tqdm(desc='Loading More Reviews', leave=False) as pbar:
            while True:
                try:
                    load_more_button = WebDriverWait(self.driver, 5).until(
                        EC.presence_of_element_located((By.XPATH, '//*[@id="load-more-trigger"]'))
                    )
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
                    load_more_button.click()

                    self.clicks += 1
                    pbar.update(1)

                    time.sleep(self._calculate_wait_time(1, self.clicks))

                except Exception:  # No more 'Load More' buttons to click.
                    break

    def _calculate_wait_time(self, base_wait_time, clicks):
        """
        Calculate an adaptive wait time based on the number of clicks.

        The wait is ``base_wait_time * (1 + 1.2 ** (clicks // 10))`` seconds,
        capped at ``MAX_WAIT_SECONDS``.
        """
        growth_factor = 1.2  # Exponential growth factor
        additional_wait_time = base_wait_time * (growth_factor ** (clicks // 10))  # Grows every 10 clicks

        return min(base_wait_time + additional_wait_time, self.MAX_WAIT_SECONDS)

    def _extract_reviews(self, soup, movie_id, title):
        """Parse the reviews on one page and merge them into the movie's entry.

        Returns:
            The number of reviews added by this page (0 when there are none),
            so the caller can sum the per-page results.
        """
        layout = "all"
        reviews = soup.select('article.user-review-item')
        if not reviews:
            # Fall back to the markup used by the 'Load More' page layout.
            layout = "load_more"
            reviews = soup.select('div.lister-item.mode-detail.imdb-user-review')

        if not reviews:
            print(f"No reviews found for {title}.")
            return 0  # Must be an int: the caller adds it to a running total

        parsed_reviews = [self._parse_review(review, layout) for review in reviews]

        # Reuse the movie's existing entry; register a new one only once so
        # the same dict is never appended to movie_reviews twice.
        movie_info = self._reviews_by_id.get(movie_id)
        if movie_info is None:
            movie_info = {
                'Movie ID': movie_id,
                'Reviews': []
            }
            self._reviews_by_id[movie_id] = movie_info
            self.movie_reviews.append(movie_info)

        movie_info['Reviews'].extend(parsed_reviews)
        return len(parsed_reviews)

    @staticmethod
    def _text_or(node, selector, default):
        """Return the stripped text of the first match of ``selector``, or ``default``."""
        found = node.select_one(selector)
        return found.get_text(strip=True) if found else default

    @staticmethod
    def _parse_count(text):
        """Convert a vote-count string to an int, returning 0 when it cannot be parsed.

        Thousands separators are removed. A trailing 'K' or 'M' is treated as
        a x1,000 / x1,000,000 suffix.
        NOTE(review): whether the site abbreviates counts this way is an
        assumption - confirm against real pages.
        """
        cleaned = text.replace(',', '').strip().upper()
        multiplier = 1
        if cleaned.endswith('K'):
            multiplier, cleaned = 1_000, cleaned[:-1]
        elif cleaned.endswith('M'):
            multiplier, cleaned = 1_000_000, cleaned[:-1]
        try:
            return int(float(cleaned) * multiplier)
        except (ValueError, OverflowError):
            return 0

    def _parse_review(self, review, button_type):
        """
        Extract information from the review and return as a dictionary.

        ``button_type`` is "load_more" for the 'Load More' layout; any other
        value selects the 'All' layout. 'Helpful' and 'Not Helpful' default
        to 0 when the vote counts are absent or unparseable.
        """
        layout = "load_more" if button_type == "load_more" else "all"
        parsed = {
            field: self._text_or(review, selector, default)
            for field, (selector, default) in self._REVIEW_FIELDS[layout].items()
        }

        # Extract helpful votes; both counters always get a value.
        found_helpful = 0
        not_helpful = 0
        if layout == "load_more":
            helpful_text = self._text_or(review, 'div.actions.text-muted', '')
            match = re.search(r'([\d,]+) out of ([\d,]+) found this helpful', helpful_text)
            if match:
                found_helpful = self._parse_count(match.group(1))
                not_helpful = self._parse_count(match.group(2)) - found_helpful
        else:
            found_helpful = self._parse_count(self._text_or(review, 'span.ipc-voting__label__count--up', '0'))
            not_helpful = self._parse_count(self._text_or(review, 'span.ipc-voting__label__count--down', '0'))

        parsed['Helpful'] = found_helpful
        parsed['Not Helpful'] = not_helpful
        return parsed

    def save_to_json(self):
        """Write ``self.movie_reviews`` to movies_reviews.json as UTF-8 JSON."""
        with open('movies_reviews.json', 'w', encoding='utf-8') as f:
            json.dump(self.movie_reviews, f, ensure_ascii=False, indent=4)
        print("Reviews saved to movies_reviews.json")

In [59]:
# MovieReviewScraper: fetches the user reviews for each movie
class MovieReviewScraper(BaseScraper):
    """Scrape IMDb user reviews for every movie in ``movie_data``.

    For each movie the reviews page is opened once per rating filter (1-10),
    expanded through the 'All' button or repeated 'Load More' clicks, and the
    resulting HTML is parsed with BeautifulSoup.  Results accumulate in
    ``self.movie_reviews`` as one ``{'Movie ID': ..., 'Reviews': [...]}`` dict
    per movie.

    NOTE(review): ``self.driver`` and ``close_driver()`` are not defined in
    this class; they are presumably provided by ``BaseScraper`` -- confirm.
    """

    # Multipliers for abbreviated vote counts such as "1.2K".
    _COUNT_SUFFIXES = {'K': 1_000, 'M': 1_000_000}

    def __init__(self, movie_data):
        """
        Args:
            movie_data: iterable of dicts that carry 'Movie ID' and 'Title'.
        """
        super().__init__()  # Call the base class constructor
        self.movie_data = movie_data
        self.movie_reviews = []  # One dict per movie: {'Movie ID', 'Reviews'}
        self.clicks = 0  # Running total of 'Load More' clicks (all pages)
        self.is_scraping = True  # Cleared once fetch_reviews() finishes

    def fetch_reviews(self):
        """Scrape all rating-filtered review pages of every movie.

        Movies lacking a 'Movie ID' or 'Title' are reported and skipped.
        The driver is always closed and ``is_scraping`` cleared, even when an
        error interrupts the run.

        Returns:
            ``self.movie_reviews`` (the list is also kept on the instance).
        """
        try:
            for movie in self.movie_data:
                movie_id = movie.get('Movie ID')
                title = movie.get('Title')

                if not (movie_id and title):
                    print(f"Movie data missing for {movie}. Skipping this movie.")
                    continue  # Nothing was scraped, so no per-movie summary either

                total_reviews = 0
                for rating_filter in range(1, 11):  # Rating filters 1 to 10
                    review_url = f"https://www.imdb.com/title/{movie_id}/reviews?sort=submissionDate&dir=desc&&ratingFilter={rating_filter}&rating={rating_filter}"
                    self.driver.get(review_url)

                    # Number of 'Load More' clicks needed for THIS page only.
                    page_clicks = self._load_reviews()

                    # Scale the settle time by the clicks on the current page.
                    # Using the cumulative self.clicks here would make the
                    # exponential back-off grow without bound across movies.
                    time.sleep(self._calculate_wait_time(10, page_clicks))

                    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                    total_reviews += self._extract_reviews(soup, movie_id, title)

                print(f'Movie {movie_id} has {total_reviews} reviews')
        finally:
            # Clear the flag first so it is updated even if closing fails.
            self.is_scraping = False
            self.close_driver()
        return self.movie_reviews

    def _load_reviews(self):
        """Expand the review list on the currently loaded page.

        Tries the 'All' button first; if it cannot be found or clicked,
        falls back to clicking 'Load More' until it stops working.

        Returns:
            Number of 'Load More' clicks performed on this page
            (0 when the 'All' button was used).
        """
        try:
            all_reviews_button = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='__next']/main/div/section/div/section/div/div[1]/section[1]/div[3]/div/span[2]/button"))
            )
            all_reviews_button.click()
        except Exception:
            # Deliberate best-effort: no usable 'All' button on this page,
            # so fall back to the 'Load More' button.
            return self._load_more_reviews()
        return 0

    def _load_more_reviews(self):
        """Click 'Load More' repeatedly until the button is gone or unusable.

        Returns:
            Number of successful clicks on this page.  ``self.clicks`` is
            incremented by the same amount as a running total.
        """
        page_clicks = 0
        with tqdm(desc='Loading More Reviews', leave=False) as pbar:
            while True:
                try:
                    load_more_button = WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.XPATH, '//*[@id="load-more-trigger"]'))
                    )
                    load_more_button.click()
                except Exception:
                    # Any failure to find or click the button is treated as
                    # "no more reviews to load".
                    break

                page_clicks += 1
                self.clicks += 1
                pbar.update(1)

                # Back off based on how much this page has already been expanded.
                time.sleep(self._calculate_wait_time(1, page_clicks))
        return page_clicks

    def _calculate_wait_time(self, base_wait_time, clicks):
        """Return an adaptive wait time (same unit as ``base_wait_time``).

        The result is ``base + base * 1.2 ** (clicks // 10)``: it starts at
        twice the base and the variable part grows by 20% for every 10 clicks.
        """
        growth_factor = 1.2  # Exponential growth factor
        additional_wait_time = base_wait_time * (growth_factor ** (clicks // 10))
        return base_wait_time + additional_wait_time

    def _extract_reviews(self, soup, movie_id, title):
        """Parse all reviews in ``soup`` and append them to the movie's entry.

        An entry for ``movie_id`` is created in ``self.movie_reviews`` if it
        does not exist yet, even when the page holds no reviews.

        Returns:
            Number of reviews parsed from this page (0 if none were found).
        """
        # The markup that matched decides which selector set applies to every
        # review on the page.  Inferring it per review from the presence of a
        # rating element would misparse legacy-layout reviews without a rating.
        button_type = "all"
        reviews = soup.select('article.user-review-item')
        if not reviews:
            button_type = "load_more"
            reviews = soup.select('div.lister-item.mode-detail.imdb-user-review')

        # Reuse the existing entry for this movie, or create one.
        movie_info = next((info for info in self.movie_reviews if info['Movie ID'] == movie_id), None)
        if movie_info is None:
            movie_info = {
                'Movie ID': movie_id,
                'Reviews': []
            }
            self.movie_reviews.append(movie_info)

        if not reviews:
            print(f"No reviews found for {title}.")
            return 0

        movie_info['Reviews'].extend(
            self._parse_review(review, button_type) for review in reviews
        )
        return len(reviews)

    @staticmethod
    def _select_text(node, selector, default):
        """Return the stripped text of the first match of ``selector``, else ``default``."""
        match = node.select_one(selector)
        return match.get_text(strip=True) if match is not None else default

    @classmethod
    def _parse_count(cls, text):
        """Convert a displayed vote count to an int, returning 0 if unparseable.

        Accepts plain digits, thousands separators ("1,234") and abbreviated
        values ("1.2K", "3M") so that one unexpected label cannot abort the
        whole scrape with a ValueError.
        """
        cleaned = text.replace(',', '').strip().upper()
        multiplier = 1
        if cleaned and cleaned[-1] in cls._COUNT_SUFFIXES:
            multiplier = cls._COUNT_SUFFIXES[cleaned[-1]]
            cleaned = cleaned[:-1]
        try:
            # round() avoids float truncation, e.g. 2.3 * 1000 -> 2299.99...
            return int(round(float(cleaned) * multiplier))
        except ValueError:
            return 0

    def _parse_review(self, review, button_type):
        """Extract one review's fields and return them as a dictionary.

        Args:
            review: BeautifulSoup element wrapping a single review.
            button_type: "load_more" for the legacy lister markup, anything
                else for the 'All' (ipc-*) markup.

        Returns:
            Dict with keys 'Review Summary', 'Review', 'Rating', 'Author',
            'Date', 'Helpful' and 'Not Helpful'.  Missing text fields fall
            back to placeholder strings; missing vote counts fall back to 0.
        """
        # Define selectors based on button_type
        if button_type == "load_more":
            selectors = {
                'rating': 'span.rating-other-user-rating span',
                'summary': 'a.title',
                'text': 'div.text.show-more__control',
                'author': 'span.display-name-link a',
                'date': 'span.review-date',
                'helpful': 'div.actions.text-muted',
            }
        else:
            selectors = {
                'rating': 'span.ipc-rating-star--rating',
                'summary': 'span[data-testid="review-summary"]',
                'text': 'div.ipc-html-content-inner-div',
                'author': 'a[data-testid="author-link"]',
                'date': 'li.review-date',
                'helpful_up': 'span.ipc-voting__label__count--up',
                'helpful_down': 'span.ipc-voting__label__count--down',
            }

        review_rating = self._select_text(review, selectors['rating'], 'No rating')
        review_summary = self._select_text(review, selectors['summary'], 'No summary')
        review_text = self._select_text(review, selectors['text'], 'No content')
        author_tag = self._select_text(review, selectors['author'], 'Unknown Author')
        review_date = self._select_text(review, selectors['date'], 'No date')

        # Extract helpful votes
        found_helpful = 0
        not_helpful = 0
        if button_type == "load_more":
            # Legacy markup states the votes as "<up> out of <total> found this
            # helpful"; the counts may contain thousands separators.
            helpful_text = self._select_text(review, selectors['helpful'], '')
            match = re.search(r'([\d,]+) out of ([\d,]+) found this helpful', helpful_text)
            if match:
                found_helpful = self._parse_count(match.group(1))
                not_helpful = self._parse_count(match.group(2)) - found_helpful
        else:
            found_helpful = self._parse_count(self._select_text(review, selectors['helpful_up'], '0'))
            not_helpful = self._parse_count(self._select_text(review, selectors['helpful_down'], '0'))

        # Return the review information in the expected format
        return {
            'Review Summary': review_summary,
            'Review': review_text,
            'Rating': review_rating,
            'Author': author_tag,
            'Date': review_date,
            'Helpful': found_helpful,
            'Not Helpful': not_helpful
        }

    def save_to_json(self):
        """Write ``self.movie_reviews`` to ``movies_reviews.json`` (UTF-8, indented)."""
        with open('movies_reviews.json', 'w', encoding='utf-8') as f:
            json.dump(self.movie_reviews, f, ensure_ascii=False, indent=4)
        print("Reviews saved to movies_reviews.json")


### Test on a single movie

#### Test case

In [44]:
# Single-movie input (same 'Movie ID' / 'Title' keys the scraper reads) for a quick test run.
movie_data_1 = [{'Movie ID': 'tt15435876', 'Title': 'chim canh cut'}]

In [58]:
# Preview the first record of movie_data (defined earlier in the notebook).
movie_data[:1]

[{'Movie ID': 'tt11315808', 'Title': 'Joker: Folie à Deux'}]

In [60]:
# Run the scraper on the first movie only.
scraper = MovieReviewScraper(movie_data=movie_data[:1])
# fetch_reviews() returns the complete list of scraped reviews; the trailing
# semicolon keeps the notebook from echoing that whole list as cell output.
# The results stay available as scraper.movie_reviews.
scraper.fetch_reviews();



                                             

Movie tt11315808 has 1445 reviews
Reviews saved to movies_reviews.json
Movies fetched and saved to movies_reviews.json


In [61]:
# Add up the review counts of every scraped movie entry
total_reviews = sum([len(entry["Reviews"]) for entry in scraper.movie_reviews])

print(f"Total number of reviews: {total_reviews}")

Total number of reviews: 1445


In [47]:
def group_reviews_by_movie(movie_reviews):
    """Merge review entries per movie and drop duplicate reviews.

    Entries sharing a 'Movie ID' are merged into one.  Within a movie, a
    review is a duplicate when its 'Author' was already seen; the first
    occurrence is kept and first-seen order is preserved.  Reviews whose
    author is the scraper's 'Unknown Author' placeholder are compared by
    content (date, summary, text) instead, because the placeholder does not
    identify a person and would otherwise collapse all such reviews into one.

    Args:
        movie_reviews: iterable of ``{'Movie ID': ..., 'Reviews': [...]}``
            dicts; each review must have an 'Author' key.

    Returns:
        Tuple ``(grouped, duplicates_count, duplicates_reviews)``:
        ``grouped`` is a list of ``{'Movie ID', 'Reviews'}`` dicts with the
        unique reviews, ``duplicates_count`` maps movie ID to the number of
        dropped reviews, and ``duplicates_reviews`` maps movie ID to the
        dropped reviews themselves.  The input is not modified.
    """
    placeholder_author = 'Unknown Author'  # Stored by the parser when no author was found

    grouped_reviews = {}  # movie_id -> {'Movie ID', 'Reviews'} with unique reviews
    duplicates_count = {}  # movie_id -> number of duplicates dropped
    duplicates_reviews = {}  # movie_id -> list of the dropped duplicates
    seen_keys = {}  # movie_id -> set of de-duplication keys already kept

    for movie in movie_reviews:
        movie_id = movie["Movie ID"]

        # Initialize the bookkeeping the first time a movie ID shows up
        if movie_id not in grouped_reviews:
            grouped_reviews[movie_id] = {
                "Movie ID": movie_id,
                "Reviews": []
            }
            duplicates_count[movie_id] = 0
            duplicates_reviews[movie_id] = []
            seen_keys[movie_id] = set()

        unique_reviews = grouped_reviews[movie_id]["Reviews"]
        seen = seen_keys[movie_id]

        for review in movie["Reviews"]:
            author = review['Author']
            if author == placeholder_author:
                # Tuple lengths differ from the 1-tuple below, so a content
                # key can never collide with a real author key.
                key = (author, review.get('Date'), review.get('Review Summary'), review.get('Review'))
            else:
                key = (author,)

            if key in seen:
                duplicates_count[movie_id] += 1
                duplicates_reviews[movie_id].append(review)
            else:
                seen.add(key)
                unique_reviews.append(review)

    # Return the grouped reviews, duplicates count, and duplicates reviews
    return list(grouped_reviews.values()), duplicates_count, duplicates_reviews



# De-duplicate the scraped reviews and gather per-movie duplicate statistics
import json

grouping_result = group_reviews_by_movie(scraper.movie_reviews)
unique_movie_reviews, duplicates_count, duplicates_reviews = grouping_result

# Summary: how many duplicates were dropped for each movie
print("Duplicates count:", duplicates_count)

# List every dropped duplicate, grouped by movie
for movie_id, duplicates in duplicates_reviews.items():
    print(f"\nDuplicate reviews for Movie ID {movie_id}:")
    for review in duplicates:
        print(f"- {review['Review Summary']} by {review['Author']} on {review['Date']}")


Duplicates count: {'tt15435876': 0}

Duplicate reviews for Movie ID tt15435876:


In [26]:
# Display the de-duplicated reviews (note: this echoes the entire structure).
unique_movie_reviews

[{'Movie ID': 'tt11315808',
  'Reviews': [{'Review Summary': 'Why Turn Joker 2 Into a Musical?',
    'Review': 'I\'m so irritated. "Joker" was a gritty, intense psychological thriller that explored complex themes of mental illness and societal breakdown. It was a raw, dark film that didn\'t need gimmicks or unnecessary changes to make a statement. So WHY in the world would anyone think it\'s a good idea to turn "Joker 2" into a musical?This is a slap in the face to fans of the original. We didn\'t ask for dancing and singing - we wanted more of the same unsettling atmosphere, not a Broadway-style number in the middle of Gotham\'s madness. I signed up for a continuation of Arthur Fleck\'s descent into chaos, not a flashy musical break that completely destroys the tone.The first "Joker" was a masterpiece. This sequel choice feels like a desperate attempt to be "different" without respecting the core of what made the character and story so compelling. Ridiculous decision.',
    'Rating': 

In [23]:
import json
import os

# Load the saved reviews from the JSON file (UTF-8 encoded)
reviews_path = os.path.join(os.getcwd(), 'movies_reviews.json')
with open(reviews_path, 'r', encoding='utf-8') as file:
    data = json.loads(file.read())

# Report how many reviews are stored for each movie entry
for movie in data:
    movie_id, reviews = movie["Movie ID"], movie["Reviews"]
    review_count = len(reviews)
    print(f"Movie ID: {movie_id} có {review_count} review(s).")


Movie ID: tt11315808 có 334 review(s).
Movie ID: tt11315808 có 107 review(s).
Movie ID: tt11315808 có 132 review(s).
Movie ID: tt11315808 có 122 review(s).
Movie ID: tt11315808 có 122 review(s).
Movie ID: tt11315808 có 107 review(s).
Movie ID: tt11315808 có 98 review(s).
Movie ID: tt11315808 có 126 review(s).
Movie ID: tt11315808 có 107 review(s).
Movie ID: tt11315808 có 184 review(s).
