In [1]:
# Importing dependencies
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup as bs
import pandas as pd
import time

In [2]:
# Defining a function which will take the results of the web scrape and store as a dataframe
def create_df(titles, ratings):
    """Build a ranked DataFrame from parallel sequences of titles and ratings.

    Args:
        titles: Sequence of movie titles; a title's position determines its rank.
        ratings: Sequence of ratings aligned position-by-position with ``titles``.

    Returns:
        A new DataFrame with the columns "Rank", "Title" and "Rating" (in that
        order) on the default 0-based index. "Rank" is the 1-based position of
        each row in the input sequences.
    """
    # reset_index() exposes the 0-based row position as an "index" column
    df = pd.DataFrame({"Title": titles, "Rating": ratings}).reset_index()

    # Shift the position to a 1-based rank and give the column its final name
    df["index"] = df["index"] + 1
    df = df.rename(columns={"index": "Rank"})

    # "Rank" intentionally stays a regular column: the notebook later renames it
    # and exports with index=False. The former bare `df.set_index("Rank")` call
    # discarded its result and therefore never changed the frame, so it is dropped.
    return df

In [3]:
# Defining a function which will scrape the page that we are visiting (on IMDB the pages we are interested in have the same HTML structure)
def scrape_page():
    """Return the chart table from the page currently loaded in the browser.

    Reads the HTML from the module-level ``browser`` object, parses it with
    BeautifulSoup's 'html.parser', and looks up the first ``<table>`` whose
    class is "chart".

    Returns:
        The matching table element, or None when the page has no such table.
    """
    parsed_page = bs(browser.html, 'html.parser')
    return parsed_page.find("table", class_="chart")

In [4]:
# Driver settings handed to splinter: the chromedriver executable is referenced by bare name
executable_path = dict(executable_path='chromedriver')

# Start a visible (non-headless) Chrome session that the following cells drive
browser = Browser('chrome', headless=False, **executable_path)

In [5]:
# Defining the URL of the IMDB top chart page and opening it in the browser
url = "https://www.imdb.com/chart/top/"
browser.visit(url)

# Fixed 2-second pause before the next cell reads the page HTML
# NOTE(review): presumably gives the page time to finish loading; a fixed delay does not guarantee it
time.sleep(2)

In [6]:
# Scraping the page currently open in the browser (the top chart): scrape_page() parses
# browser.html and returns the <table class="chart"> element, or None if it is absent
table = scrape_page()

In [7]:
# Collect every title cell (<td class="titleColumn">) in the chart table
titles = table.find_all("td", class_="titleColumn")

# Build the list of title names from the text of the first link in each cell
title_list = [title_cell.find("a").get_text() for title_cell in titles]

# Confirming that all 250 titles were found
len(title_list)

250

In [8]:
# Collect every rating cell (<td class="ratingColumn">) in the chart table
ratings = table.find_all("td", class_="ratingColumn")

# Keep only the cells at even offsets and read the text of their <strong> tag
# NOTE(review): presumably each movie contributes two "ratingColumn" cells and only the first holds the score; confirm against the page markup
rating_list = [rating_cell.find("strong").get_text() for rating_cell in ratings[::2]]

# Confirming that all 250 ratings were found
len(rating_list)

250

In [9]:
# Creating the dataframe to store the top movies; create_df() pairs each title with its
# rating and adds a 1-based "Rank" column taken from the position in the lists
top_movies_df = create_df(title_list, rating_list)

In [10]:
# Using splinter to click the Most Popular Movies link on the current page
# NOTE(review): the link text is passed with its surrounding spaces, presumably to match the page markup exactly; confirm
# NOTE(review): newer splinter releases appear to deprecate click_link_by_text in favour of
# browser.links.find_by_text(...).click(); confirm against the installed version
browser.click_link_by_text(" Most Popular Movies ")

# Waiting for the page to load completely (fixed 2-second pause, not a readiness check)
time.sleep(2)



In [11]:
# Scraping the page now open in the browser (reached via the Most Popular Movies link);
# `table` is rebound to this page's <table class="chart"> element, or None if it is absent
table = scrape_page()

In [12]:
# Collect every title cell (<td class="titleColumn">) in the chart table
titles = table.find_all("td", class_="titleColumn")

# Build the list of title names from the text of the first link in each cell
title_list = [title_cell.find("a").get_text() for title_cell in titles]

# Confirming that all 100 titles were found
len(title_list)

100

In [13]:
# Find the rating cells (<td class="ratingColumn">) within the table
ratings = table.find_all("td", class_ = "ratingColumn")

# rating_list collects the ratings that are present, in page order;
# missing_index records the 0-based movie position of every cell without a rating
rating_list = []
missing_index = []

# Only the cells at even offsets are inspected, so `position` counts movies, not cells
for position, rating_cell in enumerate(ratings[::2]):
    strong_tag = rating_cell.find("strong")
    if strong_tag is None:
        # No <strong> tag means no rating for this movie. Checking for None explicitly
        # (instead of a bare `except:`) lets any unrelated error surface normally.
        missing_index.append(position)
    else:
        rating_list.append(strong_tag.get_text())

# Reporting how many ratings were found and how many are missing
print(f"Ratings found: {len(rating_list)}\nRatings missing: {len(missing_index)}")

Ratings found: 87
Ratings missing: 13


In [14]:
# Adding blank (None) values to the rating list where the rating could not be found, so
# that rating_list lines up position-by-position with title_list again.
# The missing positions were recorded in ascending order, so by the time a position is
# processed every earlier gap has already been filled and insert() lands on the right slot.
# int() normalises the recorded position to an integer, which list.insert() requires.
for index in missing_index:
    rating_list.insert(int(index), None)

# Confirming the list now has 100 values
len(rating_list)

100

In [15]:
# Creating the dataframe to store the most popular movies; create_df() pairs each title with
# its rating (None where missing) and adds a 1-based "Rank" column from the list position
most_popular_df = create_df(title_list, rating_list)

In [16]:
# Giving the "Rank" column of each dataframe a name that says what the rank is based on
top_movies_df = top_movies_df.rename(columns={"Rank": "Rating Rank"})
most_popular_df = most_popular_df.rename(columns={"Rank": "Popularity Rank"})

In [17]:
# Writing each dataframe to its own CSV file in the working directory;
# index=False leaves the 0-based row index out of the files
top_movies_df.to_csv(path_or_buf="top_rated_movies.csv", index=False)
most_popular_df.to_csv(path_or_buf="most_popular_movies.csv", index=False)

In [18]:
# Ending the browser session opened earlier in the notebook; nothing below needs the browser
browser.quit()