## 1. Import the libraries

In [31]:
import requests
import time
import re
from bs4 import BeautifulSoup 
import pandas as pd

## 2. Scrape the webpage

Once the libraries have been loaded, the code below will loop through the URLs specified in the list *web_urls*, extract the relevant information, and populate it into lists.

In [32]:
# Define the URLs to loop over. The first page uses a different query string
# from the later pages, so it is written out on its own; the remaining pages
# differ only in their `start` offset (101, 201, ..., 901).
web_urls = ['https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating']
web_urls += [
    'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start='
    + str(first_result)
    + '&ref_=adv_nxt'
    for first_result in range(101, 902, 100)
]

In [33]:
# Prepare one empty list per output column; the scraping loop appends to these.
title, year, runtime, ratings = [], [], [], []
votes, movie_summary, genre, certification = [], [], [], []

In [34]:
# Loop over each URL, fetch the page and parse the HTML.
#
# NOTE(review): every column list is filled from its own find_all() pass, so
# rows line up only if each movie on the page has every field — confirm the
# list lengths match before building the dataframe.
for url in web_urls:
    # The timeout keeps a stalled connection from hanging the cell, and
    # raise_for_status() stops on an HTTP error instead of parsing an error page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Source the Title and Year of the movie
    movies = soup.find_all('h3', class_='lister-item-header')
    for movie in movies:
        movie_title = movie.find('a')
        title.append(movie_title.get_text())
        movie_year_element = movie.find('span', class_='lister-item-year text-muted unbold')
        movie_year = movie_year_element.get_text()
        # Keep only the first run of digits found in the year text.
        movie_year_clean = re.findall(r'\d+', movie_year)
        year.append(movie_year_clean[0])

    # Source the Runtime of the movie
    runs = soup.find_all('span', class_='runtime')
    for run in runs:
        movie_runtime_text = run.get_text()
        # Every run of digits in the runtime text is appended as an int.
        for minutes in re.findall(r'\d+', movie_runtime_text):
            runtime.append(int(minutes))

    # Source the IMDB Rating of the movie
    imdb_ratings = soup.find_all('div', class_='inline-block ratings-imdb-rating')
    for element in imdb_ratings:
        ratings.append(float(element.get('data-value')))

    # Source the Votes of the movie
    vote_bar = soup.find_all('p', class_='sort-num_votes-visible')
    for vote in vote_bar:
        votes_string = vote.get_text('', strip=True)
        # The vote count is the first '|'-separated field, after its ':' label;
        # this holds whether or not further fields follow it.
        vote_num = votes_string.split('|')[0]
        vote_num_num = vote_num.split(':')[1]
        votes.append(int(vote_num_num.replace(',', '')))

    # Source the Summary, Certification and Genre of the movie.
    # The matched paragraphs are consumed in alternation: odd positions are
    # read as the "certification|runtime|genre" line, even positions as the summary.
    description = soup.find_all('p', class_='text-muted')
    for position, element in enumerate(description, start=1):
        text = element.get_text('', strip=True)
        if position % 2 == 0:
            movie_summary.append(text)
        else:
            parts = text.split('|')
            if len(parts) >= 3:
                genre_item = parts[2]
                cert = parts[0]
            else:
                # Fewer than three fields: record placeholders for both columns.
                genre_item = 'N/A'
                cert = 'N/A'
            genre.append(genre_item)
            certification.append(cert)

## 3. Generate the dataframe

In [35]:
# Assemble the scraped lists into a single dataframe, one column per list.
# Dict insertion order fixes the column order.
imdb_movies = pd.DataFrame({
    'Title': title,
    'Year': year,
    'Certification': certification,
    'Runtime': runtime,
    'Genre': genre,
    'Summary': movie_summary,
    'Votes': votes,
    'Ratings': ratings,
})

In [36]:
# Preview the first five rows of the scraped table.
imdb_movies.head(n=5)

Unnamed: 0,Title,Year,Certification,Runtime,Genre,Summary,Votes,Ratings
0,The Shawshank Redemption,1994,MA,142,Drama,Two imprisoned men bond over a number of years...,2164040,9.3
1,The Godfather,1972,R,175,"Crime, Drama",The aging patriarch of an organized crime dyna...,1485485,9.2
2,The Dark Knight,2008,M,152,"Action, Crime, Drama",When the menace known as the Joker wreaks havo...,2138907,9.0
3,The Godfather: Part II,1974,M,202,"Crime, Drama",The early life and career of Vito Corleone in ...,1034512,9.0
4,The Lord of the Rings: The Return of the King,2003,M,201,"Adventure, Drama, Fantasy",Gandalf and Aragorn lead the World of Men agai...,1536487,8.9


In [37]:
# Preview the last five rows of the scraped table.
imdb_movies.tail(n=5)

Unnamed: 0,Title,Year,Certification,Runtime,Genre,Summary,Votes,Ratings
995,Munich,2005,MA15+,164,"Biography, Crime, Drama",Based on the true story of the Black September...,201658,7.5
996,Kiss Kiss Bang Bang,2005,MA15+,103,"Action, Comedy, Crime",A murder mystery brings together a private eye...,205502,7.5
997,Gangs of New York,2002,MA,167,"Crime, Drama","In 1862, Amsterdam Vallon returns to the Five ...",380693,7.5
998,RoboCop,1987,R,102,"Action, Crime, Sci-Fi","In a dystopic and crime-ridden Detroit, a term...",219216,7.5
999,The Fly,1986,R,96,"Drama, Horror, Sci-Fi",A brilliant but eccentric scientist begins to ...,148821,7.5


In [38]:
# Save the table as CSV for later analysis. The row index is written too,
# as the first (unnamed) column of the file.
imdb_movies.to_csv(path_or_buf='imdb_movies.csv', index=True)