### Jacob Kopec and Nico Morys
### Data Wrangling Project

### The code below can be used to scrape IMDb's Top 1000 Highest Rated Movies of All Time list

#### The link to the list is here: https://www.imdb.com/search/title/?groups=top_1000&count=100&sort=user_rating,desc

In [1]:
#importing pandas and selenium
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By # used to import different ways to access data in the XML or HTML file
from selenium.webdriver.chrome.service import Service # no longer need to download a driver file, use service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager # used to manage the Chrome driver to emulate a Chrome web browser
from selenium.common.exceptions import ElementClickInterceptedException
import time
import random

In [9]:
#path to the ChromeDriver executable (personal to the original author's computer;
#point it at the chromedriver on your own machine before running)
chromedriver_path = r'C://Users//nicom//.wdm//drivers//chromedriver//win64//chromedriver.exe'

service = Service(executable_path=chromedriver_path)

#initializes the browser
browser = webdriver.Chrome(service=service)

#creates empty lists to store the data
Name = []
Date = []
ARating = []

#url for the site we were asked to scrape
url = "https://www.imdb.com/search/title/?groups=top_1000&count=100&sort=user_rating,desc"

#starts page number counter at 1
page_num = 1

#the try/finally makes sure the Chrome window and the chromedriver process are shut
#down when scraping ends, whether it finished normally or raised an error
try:
    browser.get(url)

    #loop to scrape through pages
    while True:
        #picks the English ordinal suffix (1st, 2nd, 3rd, 4th, ..., 11th-13th, 21st, ...)
        if 11 <= page_num % 100 <= 13:
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(page_num % 10, "th")
        print(f"Scraping the {page_num}{suffix} set of 100...")

        #waits for a random time of 1 or 2 seconds
        wait_time = random.randint(1, 2)
        time.sleep(wait_time)

        #clears the previous data from the lists; every pass re-reads all the movies
        #currently present on the page, so keeping the old values would duplicate them
        Name.clear()
        Date.clear()
        ARating.clear()

        #scrolls to the bottom of the page until the page height stops changing
        last_height = browser.execute_script("return document.body.scrollHeight")
        while True:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            new_height = browser.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        #scrapes name
        name_elements = browser.find_elements(By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li/div/div/div/div[1]/div[2]/div[1]/a/h3')
        Name.extend(element.text for element in name_elements)

        #scrapes date
        date_elements = browser.find_elements(By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li/div/div/div/div[1]/div[2]/div[2]/span[1]')
        Date.extend(element.text for element in date_elements)

        #finds all div elements with the specific class that contains movie metadata
        movie_metadata_elements = browser.find_elements(By.CSS_SELECTOR, "div.sc-300a8231-6.dBUjvq.dli-title-metadata")

        #iterates through each movie metadata element
        for movie_metadata in movie_metadata_elements:
            #finds all span elements inside the current div
            arating_elements = movie_metadata.find_elements(By.CSS_SELECTOR, "span.sc-300a8231-7.eaXxft.dli-title-metadata-item")

            #the third span is the rating; a missing or blank third span becomes "Not Rated"
            arating = arating_elements[2].text.strip() if len(arating_elements) >= 3 else ""
            ARating.append(arating or "Not Rated")

        #hits the "100 more" button if there is one; if not, it ends the while loop
        try:
            #locates the 100 more button using the XPath from the website
            more_button = browser.find_element(By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button/span/span')

            #clicks the 100 more button
            more_button.click()

            #increments the set of 100 number
            page_num += 1

            #waits for a short time before continuing to the next set of 100
            time.sleep(5)

        except NoSuchElementException:
            print("100 more button not found")
            break  #exits the loop if the button is not found

        except ElementClickInterceptedException:
            print("Could not click '100 more' button")
            break  #exits the loop if the button cannot be clicked
finally:
    #closes every browser window and ends the driver session
    browser.quit()

#reports how many values were collected in each list; the three counts have to be
#equal for the lists to be combined into one dataframe below
print(
    f"Length of Name: {len(Name)}",
    f"Length of Date: {len(Date)}",
    f"Length of ARating: {len(ARating)}",
    sep="\n",
)

#builds the dataframe, one column per scraped list, under the column names we want
IMDbT1 = pd.DataFrame(
    {
        "Movie Name": Name,
        "Release Date": Date,
        "Audience Rating": ARating,
    }
)

#writes the untouched scraped data to a csv file, without the index column
#NOTE: the file name is spelled "IMBb" here, unlike the "IMDbT1_raw.csv" used by the later save cell
IMDbT1.to_csv("IMBbT1_raw.csv", index=False, encoding="utf-8")

Scraping the 1st set of 100...
Scraping the 2nd set of 100...
Scraping the 3rd set of 100...
Scraping the 4th set of 100...
Scraping the 5th set of 100...
Scraping the 6th set of 100...
Scraping the 7th set of 100...
Scraping the 8th set of 100...
Scraping the 9th set of 100...
Scraping the 10th set of 100...
100 more button not found
Length of Name: 1000
Length of Date: 1000
Length of ARating: 1000


In [10]:
#prints the dataframe's column index so the column names can be verified by eye
#before the cleaning step refers to them
print(IMDbT1.columns)

Index(['Movie Name', 'Release Date', 'Audience Rating'], dtype='object')


In [11]:
#strips any potential whitespace from column names
IMDbT1.columns = IMDbT1.columns.str.strip()

#splits Movie Name (scraped as "<rank>. <title>") into Rank and Movie Name at the first ". "
#the period is escaped because pandas treats a pattern longer than one character as a
#regular expression, where a bare "." would match ANY character followed by a space
split_names = IMDbT1['Movie Name'].str.split(r'\. ', n=1, expand=True)

#assigns the split columns to the DataFrame
IMDbT1['Rank'] = split_names[0].str.strip()  # The rank (before the period)
IMDbT1['Movie_Name'] = split_names[1].str.strip()  # The movie name (after the period)

#drops the original Movie Name column
IMDbT1.drop('Movie Name', axis=1, inplace=True)

#reorders columns to Rank, Movie Name, Release Date, and Audience Rating
IMDbT1 = IMDbT1[['Rank', 'Movie_Name', 'Release Date', 'Audience Rating']]

#renames the columns here to snake case
IMDbT1 = IMDbT1.rename(columns={
    'Rank': 'rank',
    'Movie_Name': 'movie_name',
    'Release Date': 'release_date',
    'Audience Rating': 'audience_rating',
})

#converts data types; the int64 conversions raise an error if any rank or release date
#is not a plain whole number, which flags rows that were scraped or split incorrectly
IMDbT1 = IMDbT1.astype({
    'rank': 'int64',
    'movie_name': 'string',
    'release_date': 'int64',
    'audience_rating': 'string'
})

In [12]:
#saves dataframe to a csv file (utf-8 encoded, without the index column)
#NOTE: when the cells are run in order this is the cleaned dataframe (rank split out,
#snake-case columns, converted dtypes), even though the file name ends in "_raw"
IMDbT1.to_csv("IMDbT1_raw.csv", encoding = "utf-8", index = False)