# Scraping 20k Movies from IMDb

In [7]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\MODERN\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## Import Modules

In [19]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time
import re
from tqdm import tqdm  # Import tqdm for progress bar
from bs4 import BeautifulSoup
import requests
import json 

## WebDriver

In [20]:
# Base class for handling common WebDriver functionalities
class BaseScraper:
    """Owns a single Chrome WebDriver session shared by the scraper subclasses."""

    def __init__(self):
        # The browser is started eagerly so self.driver is usable right away.
        self.driver = self.init_driver()

    def init_driver(self):
        """Start Chrome with a default ``Service`` and default ``ChromeOptions``.

        Returns:
            The newly created ``webdriver.Chrome`` instance.
        """
        return webdriver.Chrome(
            service=Service(),
            options=webdriver.ChromeOptions(),
        )

    def close_driver(self):
        """Quit the WebDriver session stored in ``self.driver``."""
        self.driver.quit()


## Scraping Movie Details

In [30]:
class MoviesScraper(BaseScraper):
    """Collect movie ID, title, year and rating from an IMDb search-results page.

    The page is opened in the WebDriver created by ``BaseScraper``, the
    "50 more" button is pressed ``clicks`` times to grow the list, and the
    final HTML is parsed once with BeautifulSoup.

    Attributes:
        url: Search-results URL to open.
        clicks: Number of times to press the "50 more" button.
        movie_data: List of dicts with the keys 'Movie ID', 'Title', 'Year'
            and 'Rating'; filled by ``extract_movie_data``.
    """

    # Leading "<digits>. " numbering that is stripped from scraped titles.
    _TITLE_PREFIX_RE = re.compile(r'^\d+\.\s*')
    # IMDb title ID inside an href such as "/title/tt1234567/?ref_=...".
    _TITLE_ID_RE = re.compile(r'/title/(tt\d+)')

    def __init__(self, url, clicks=2):
        super().__init__()  # Starts the WebDriver
        self.url = url
        self.clicks = clicks  # Number of times to click the "50 more" button
        self.movie_data = []

    def fetch_movies(self):
        """Load the page, expand it ``self.clicks`` times and extract all movies.

        The driver is quit when this method returns *or raises*, so a failed
        run does not leave a browser process behind. Because the driver is
        closed, the method cannot be called twice on the same instance.
        """
        try:
            self.driver.get(self.url)

            # Click on "50 more" the requested number of times. Only the click
            # itself is needed here; the page is parsed once, after the loop,
            # instead of re-parsing the ever-growing HTML after every click.
            with tqdm(total=self.clicks, desc='Loading movies') as pbar:
                for _ in range(self.clicks):
                    self._click_see_more()
                    pbar.update(1)

            # Give the page extra time to settle; the pause grows with the
            # number of clicks.
            time.sleep(self._calculate_wait_time(self.clicks))

            # NOTE(review): 'sc-59c7dc1-2' looks like a build-generated class
            # name and may change between IMDb deployments — confirm before a
            # long run.
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            self.extract_movie_data(soup.select('div.sc-59c7dc1-2'))
        finally:
            self.close_driver()

    def _click_see_more(self):
        """Click the "50 more" button once and wait briefly for new content.

        Returns:
            True if the click and the follow-up wait succeeded, False if any
            step raised (the error is printed, not propagated, so a single
            failed click does not abort the whole run).
        """
        try:
            see_more_button = WebDriverWait(self.driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '50 more')]"))
            )

            # Scroll to and click the button
            self.driver.execute_script("arguments[0].scrollIntoView();", see_more_button)
            ActionChains(self.driver).move_to_element(see_more_button).click().perform()

            # Wait for content to be present, then pause a moment
            WebDriverWait(self.driver, 3).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'ipc-title'))
            )
            time.sleep(1)
            return True

        except Exception as e:
            print(f"Error occurred: {e}")
            return False

    def click_see_more_button(self):
        """Click "50 more" once and return the re-parsed page.

        Kept for callers that want the parsed HTML after a single click;
        ``fetch_movies`` uses the cheaper ``_click_see_more`` instead.

        Returns:
            A ``BeautifulSoup`` of the current page source, or None if the
            click (or reading the page) failed.
        """
        if not self._click_see_more():
            return None
        try:
            return BeautifulSoup(self.driver.page_source, 'html.parser')
        except Exception as e:
            print(f"Error occurred: {e}")
            return None

    def extract_movie_data(self, movies):
        """Append one record to ``self.movie_data`` per complete movie element.

        An element is skipped when its title, year, rating or link tag is
        missing, or when no ``tt…`` ID can be found in the link's href.

        Args:
            movies: Iterable of BeautifulSoup elements, one per search result.
        """
        for movie in movies:
            title_tag = movie.select_one('h3.ipc-title__text')
            year_tag = movie.select_one('div.sc-ab348ad5-7.cqgETV.dli-title-metadata > span:nth-child(1)')
            rating_tag = movie.select_one('span.ipc-rating-star--rating')
            link_tag = movie.select_one('a.ipc-title-link-wrapper')

            if not (title_tag and year_tag and rating_tag and link_tag):
                continue

            # Pull the ID with a regex so an href without "/title/" (or
            # without an href attribute at all) skips the row instead of
            # raising IndexError/KeyError and losing the whole run.
            id_match = self._TITLE_ID_RE.search(link_tag.get('href') or '')
            if id_match is None:
                continue

            self.movie_data.append({
                'Movie ID': id_match.group(1),
                'Title': self._TITLE_PREFIX_RE.sub('', title_tag.text.strip()),
                'Year': year_tag.text.strip(),
                'Rating': rating_tag.text.strip()
            })

    def to_dataframe(self):
        """Return the collected records as a ``pandas.DataFrame``."""
        return pd.DataFrame(self.movie_data)

    def _calculate_wait_time(self, clicks):
        """Return the post-click settle time in seconds.

        Starts at 2 seconds and adds 2 more for every full 10 clicks.
        """
        base_wait_time = 2  # Base wait time in seconds
        additional_wait_time = (clicks // 10) * 2  # Add 2 seconds every 10 clicks

        return base_wait_time + additional_wait_time

### Test

In [33]:
# Define a sample URL for testing: an IMDb title search restricted to
# feature films (title_type=feature in the query string).
sample_url = "https://www.imdb.com/search/title/?title_type=feature"  # feature-film title search

# Initialize the MoviesScraper with the URL and number of clicks.
# NOTE: clicks=200 is a long run — the recorded execution below took 3h05m
# and produced 9,515 rows.
scraper = MoviesScraper(url=sample_url, clicks=200)

# Fetch movie data (fetch_movies also quits the browser when it is done)
scraper.fetch_movies()

# Convert the collected movie data to a DataFrame and display
movies_df = scraper.to_dataframe()
print("Movies DataFrame:")
print(movies_df)

Loading movies: 100%|██████████| 200/200 [3:05:27<00:00, 55.64s/it]   


Movies DataFrame:
        Movie ID                 Title  Year Rating
0     tt11315808   Joker: Folie à Deux  2024    5.3
1     tt17526714         The Substance  2024    7.9
2     tt10128846           Megalopolis  2024    5.0
3     tt14257582                 Wolfs  2024    6.5
4      tt6263850  Deadpool & Wolverine  2024    7.9
...          ...                   ...   ...    ...
9510  tt12447796             Unwelcome  2022    5.3
9511   tt9860728      Falling Inn Love  2019    5.7
9512   tt0080798                Gloria  1980    7.1
9513   tt4392454            13 Cameras  2015    5.2
9514   tt0412536  Brideshead Revisited  2008    6.6

[9515 rows x 4 columns]


In [34]:
# (rows, columns) of the scraped movie table
movies_df.shape

(9515, 4)

In [36]:
# Persist the scraped movies as 'movies_df.csv' in the working directory.
# NOTE: with pandas' default index=True the row index is written as an
# unnamed first column.
movies_df.to_csv('movies_df.csv')

## Scraping Reviews for Each Movie

In [None]:
class MovieReviewScraper(BaseScraper):
    """Scrape user reviews for every movie in a DataFrame.

    For each row, the page ``https://www.imdb.com/title/<Movie ID>/reviews``
    is opened, the review list is expanded (via an 'All' button, or by
    repeatedly clicking 'Load More' when that button cannot be used), and
    every review found is stored as a dict in ``self.movie_reviews``.

    Two page markups are handled: ``article.user-review-item`` elements
    (parsed as type "all") and ``div.lister-item.mode-detail.imdb-user-review``
    elements (parsed as type "load_more").

    Args:
        movie_data: DataFrame with at least the columns 'Movie ID' and 'Title'.
        max_load_more_clicks: Optional upper bound on 'Load More' clicks per
            movie. ``None`` (the default) keeps clicking until the button can
            no longer be found or clicked.
    """

    # (output field, fallback text) -> CSS selector, per review markup.
    _FIELD_DEFAULTS = {
        'rating': 'No rating',
        'summary': 'No summary',
        'text': 'No content',
        'author': 'Unknown Author',
        'date': 'No date',
    }
    _SELECTORS = {
        'load_more': {
            'rating': 'span.rating-other-user-rating span',
            'summary': 'a.title',
            'text': 'div.text.show-more__control',
            'author': 'span.display-name-link a',
            'date': 'span.review-date',
        },
        'all': {
            'rating': 'span.ipc-rating-star--rating',
            'summary': 'span[data-testid="review-summary"]',
            'text': 'div.ipc-html-content-inner-div',
            'author': 'a[data-testid="author-link"]',
            'date': 'li.review-date',
        },
    }

    def __init__(self, movie_data, max_load_more_clicks=None):
        super().__init__()  # Starts the WebDriver
        self.movie_data = movie_data
        self.max_load_more_clicks = max_load_more_clicks
        self.movie_reviews = []

    def fetch_reviews(self):
        """Visit each movie's review page and collect its reviews.

        The driver is quit when this method returns *or raises*; reviews
        gathered before a failure remain available in ``self.movie_reviews``.
        """
        try:
            # Only two columns are needed, so iterate them directly.
            for movie_id, title in zip(self.movie_data['Movie ID'], self.movie_data['Title']):
                self.driver.get(f"https://www.imdb.com/title/{movie_id}/reviews")

                # Expand the review list ('All' button, else 'Load More')
                self._load_reviews()

                # Wait for a moment to ensure all reviews are fully loaded
                time.sleep(10)

                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                self._extract_reviews(soup, movie_id, title)
        finally:
            self.close_driver()

    def _load_reviews(self):
        """Click the 'All' reviews button; fall back to 'Load More' on any failure."""
        try:
            # NOTE(review): this absolute XPath depends on the exact page
            # structure and will stop matching if the layout shifts.
            all_reviews_button = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='__next']/main/div/section/div/section/div/div[1]/section[1]/div[3]/div/span[2]/button"))
            )
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", all_reviews_button)
            time.sleep(2)  # Wait for the page to load if needed
            all_reviews_button.click()
        except Exception:
            print("Could not find 'All' button, will try to find 'Load More'")
            self._load_more_reviews()

    def _load_more_reviews(self):
        """Click 'Load More' until it fails or ``max_load_more_clicks`` is reached."""
        limit = self.max_load_more_clicks
        clicks_done = 0
        # total=None gives a plain counter when the number of clicks is unknown.
        with tqdm(total=limit, desc='Loading More Reviews', leave=False) as pbar:
            while limit is None or clicks_done < limit:
                try:
                    load_more_button = WebDriverWait(self.driver, 5).until(
                        EC.presence_of_element_located((By.XPATH, '//*[@id="load-more-trigger"]'))
                    )
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
                    time.sleep(2)  # Wait a moment for the page to load
                    load_more_button.click()
                    clicks_done += 1
                    pbar.update(1)
                    time.sleep(2)  # Wait a moment before clicking again
                except Exception:
                    # Any failure (button absent, not clickable, ...) ends the loop.
                    print("No more 'Load More' buttons to click.")
                    break

    def _extract_reviews(self, soup, movie_id, title):
        """Find review elements in ``soup`` and parse each one.

        ``article.user-review-item`` elements are tried first; if none exist,
        ``div.lister-item.mode-detail.imdb-user-review`` elements are used.
        Prints a message and returns when neither selector matches.
        """
        reviews = soup.select('article.user-review-item')
        review_type = "all"

        if not reviews:
            reviews = soup.select('div.lister-item.mode-detail.imdb-user-review')
            review_type = "load_more"

        if not reviews:
            print("No reviews to load.")
            return

        for review in reviews:
            self._parse_review(review, movie_id, title, review_type)

    @staticmethod
    def _text_or_default(node, selector, default):
        """Return the stripped text of the first ``selector`` match, else ``default``."""
        match = node.select_one(selector)
        return match.get_text(strip=True) if match else default

    def _parse_review(self, review, movie_id, title, review_type):
        """Extract one review's fields and append them to ``self.movie_reviews``.

        Args:
            review: BeautifulSoup element for a single review.
            movie_id: IMDb ID stored with the review.
            title: Movie title stored with the review.
            review_type: "load_more" selects the ``lister-item`` selectors;
                any other value selects the ``user-review-item`` selectors.
        """
        selectors = self._SELECTORS.get(review_type, self._SELECTORS['all'])
        # Each selector is evaluated once; missing elements get the fallback text.
        fields = {
            name: self._text_or_default(review, selectors[name], default)
            for name, default in self._FIELD_DEFAULTS.items()
        }

        self.movie_reviews.append({
            'Movie ID': movie_id,
            'Title': title,
            'Review Summary': fields['summary'],
            'Review': fields['text'],
            'Rating': fields['rating'],
            'Author': fields['author'],
            'Date': fields['date']
        })

    def to_dataframe(self):
        """Return the collected reviews as a ``pandas.DataFrame``."""
        return pd.DataFrame(self.movie_reviews)


### Test 1 movie

#### Test case

In [221]:
# Create a one-row DataFrame with the same columns MoviesScraper produces,
# so the review scraper can be tried on a single title.
data = {
    'Movie ID': ['tt6263850'],  # IMDb title ID; interpolated into the /title/<id>/reviews URL
    'Title': ['Deadpool & Wolverine'],
    'Year': ['2024'],
    'Rating': ['7.9']
}
df = pd.DataFrame(data)

# Initialize and run the scraper (fetch_reviews also closes the browser when it finishes)
scraper = MovieReviewScraper(df)
scraper.fetch_reviews()

# Convert the collected reviews to a DataFrame and display
reviews_df = scraper.to_dataframe()
print(reviews_df)


Could not find 'All' button, will try to find 'Load More'
Clicked 'Load More' button (1/3)
Clicked 'Load More' button (2/3)
Clicked 'Load More' button (3/3)
     Movie ID                 Title                                     Review Summary                                             Review     Rating                  Author               Date
0   tt6263850  Deadpool & Wolverine                          Till you're 90 Wolverine!  Hugh Jackman is the perfect Wolverine. What a ...          9           omar-d-sheikh       25 July 2024
1   tt6263850  Deadpool & Wolverine  "You were always the wrong one, till you weren...  What a crazy blast ! Bonkers !!Sooo !...\nWhat...          9             valmont1702       24 July 2024
2   tt6263850  Deadpool & Wolverine               The Bloodiest Mayhem Of The Century!  We've waited so long for this moment, and it w...          8        MiroslavKyuranov       24 July 2024
3   tt6263850  Deadpool & Wolverine                                  Easter

In [219]:
# Full text of the review stored under index label 0
reviews_df.loc[0, 'Review']

"Hugh Jackman is the perfect Wolverine. What a fun movie. I like the dialogue and clever quips with f bombs sprinkled in. It's definitely not taking itself too seriously. There is tons of fun cameos I didn't expect! I normally watch spoiler videos ahead of time, but I didn't on this occasion and I'm glad I didn't because there was some oh snap moments! It's a very good action packed fun film. The breaking the Fox jokes and speaking to the camera jokes are too funny. I can definitely see more sequels for these two on the horizon. They are promoting this movie hard I just watched these two on Hot Ones eating chicken wings They make a dynamic duo. Until your 90 Wolverine ...until your 90 lol."