### Jacob Kopec and Nico Morys
### Data Wrangling Project

### This code is for scraping the data of the Top 1000 Highest-Grossing Movies of All Time from IMDb

#### The data was extracted from this website https://www.imdb.com/list/ls098063263/

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
import time
import random
import pandas as pd
import re
pd.set_option('display.max_rows', None)

In [6]:
# Initialize WebDriver and open the URL
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.imdb.com/list/ls098063263/")

In [7]:
def scroll_and_scrape_imdb(driver):
    # Initialize data lists
    titles, release_years, ratings, gross_earnings, directors = [], [], [], [], []

    def scroll_to_bottom(driver):
        #Scroll the page to the very bottom to ensure all content is loaded.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # Wait for content to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def ensure_unique_movies(current_titles, scraped_titles):
        #Ensure the scraped titles are unique to avoid duplicates.
        unique_indexes = [i for i, title in enumerate(scraped_titles) if title not in current_titles]
        return unique_indexes

    # First, scroll to the bottom of the page to load all movies
    print("Scrolling to the bottom of the page to load all movies...")
    scroll_to_bottom(driver)
    print("Finished scrolling to the bottom. Starting scraping...")

    page_number = 1
    total_movies_scraped = 0

    while total_movies_scraped < 1000:
        print(f"Scraping Page {page_number}...")

        # Find all movie elements on the current page
        movie_elements = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section/div[2]/ul/li')

        if not movie_elements:
            print("No more movie elements found. Ending scrape.")
            break

        # Temporary lists for this round of scraping
        temp_titles, temp_release_years, temp_ratings, temp_gross_earnings, temp_directors = [], [], [], [], []

        for movie_element in movie_elements:
            try:
                # Title
                title_element = movie_element.find_element(By.XPATH, ".//h3[@class='ipc-title__text']")
                temp_titles.append(title_element.text)

                # Release Year
                try:
                    year_element = movie_element.find_element(By.XPATH, ".//span[contains(@class, 'dli-title-metadata-item') and (starts-with(text(), '20') or starts-with(text(), '19'))]")
                    temp_release_years.append(year_element.text.strip())
                except Exception:
                    temp_release_years.append("N/A")

                # Rating (e.g., PG, PG-13, etc.)
                try:
                    metadata_element = movie_element.find_element(By.XPATH, ".//div/div/div/div[1]/div[2]/div[2]")
                    metadata_text = metadata_element.text.split("\n")
                    valid_ratings = ["PG", "PG-13", "R", "TV-PG", "Not Rated", "G", "Approved", "TV-Y7", "TV-MA"]
                    movie_rating = next((item for item in metadata_text if item in valid_ratings), "N/A")
                    temp_ratings.append("Not Rated" if movie_rating in ["N/A", "", None] else movie_rating)
                except Exception:
                    temp_ratings.append("N/A")

                # Gross Earnings
                try:
                    gross_element = movie_element.find_element(By.XPATH, ".//span[contains(text(),'Worldwide Lifetime Gross:')]")
                    cleaned_gross = re.search(r'\$([\d,]+)', gross_element.text)
                    temp_gross_earnings.append(cleaned_gross.group(1).replace(",", "") if cleaned_gross else "N/A")
                except Exception:
                    temp_gross_earnings.append("N/A")

                # Directors
                try:
                    director_elements = movie_element.find_elements(By.XPATH, ".//a[@class='ipc-link ipc-link--base dli-director-item']")
                    directors_list = [director.text for director in director_elements]
                    temp_directors.append(", ".join(directors_list))  # Join all directors with a comma
                except Exception:
                    temp_directors.append("N/A")

            except Exception as e:
                print(f"Error while scraping a movie: {e}")

        # Filter only unique movies
        unique_indexes = ensure_unique_movies(titles, temp_titles)
        titles.extend([temp_titles[i] for i in unique_indexes])
        release_years.extend([temp_release_years[i] for i in unique_indexes])
        ratings.extend([temp_ratings[i] for i in unique_indexes])
        gross_earnings.extend([temp_gross_earnings[i] for i in unique_indexes])
        directors.extend([temp_directors[i] for i in unique_indexes])

        # Update the total count of movies scraped
        total_movies_scraped = len(titles)

        print(f"Total unique movies scraped: {total_movies_scraped}")

        # Break the loop if we've scraped 1000 movies or no more movies are loading
        if total_movies_scraped >= 1000:
            break

        # Wait a random amount of time before scrolling again
        wait_time = random.randint(5, 10)
        print(f"Waiting {wait_time} seconds before next scroll...")
        time.sleep(wait_time)

        # Increment the page number
        page_number += 1

    # Split rank and title into separate columns
    ranks = [int(title.split(". ")[0]) if ". " in title else None for title in titles]
    cleaned_titles = [title.split(". ", 1)[1] if ". " in title else title for title in titles]

    # Create a DataFrame from the scraped data
    imdb_data = pd.DataFrame({
        'rank': ranks,
        'movie_name': cleaned_titles,
        'release_year': pd.to_numeric(release_years, errors='coerce').astype('int'),
        'audience_rating': pd.Categorical(ratings),
        'gross_earnings': pd.to_numeric(gross_earnings, errors='coerce').astype('int64'),
        'director(s)': pd.Series(directors, dtype='string')
    })

    return imdb_data

# Usage
imdb_df = scroll_and_scrape_imdb(driver)
print(imdb_df.head())


Scrolling to the bottom of the page to load all movies...
Finished scrolling to the bottom. Starting scraping...
Scraping Page 1...
Total unique movies scraped: 250
Waiting 8 seconds before next scroll...
Scraping Page 2...
Total unique movies scraped: 250
Waiting 6 seconds before next scroll...


KeyboardInterrupt: 

In [None]:
display(imdb_df)

In [None]:
# Save the data to a CSV
imdb_df.to_csv("IMDbHG_raw.csv", index=False)

In [None]:
imdb_df.dtypes