In [1]:
#Import necessary libraries 
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import time

#normally, the '.' in regex represents ALL characters except \n, but re.DOTALL includes \n as well
re.DOTALL

re.DOTALL

## Experiment - Comparing Regex and Beautiful Soup for iMDB Scraping

In [2]:
def extract_imdb_data(type):
    startTime = time.time()
    
    rankings = []
    titles = []
    years = []
    ratings = []
    
    if type == "movies":
        html = requests.get('https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm').text
    elif type == "tv":
        html = requests.get('https://www.imdb.com/chart/tvmeter/?ref_=nv_tvv_mptv').text
    else:
        raise ValueError("Must be movies or tv")
    soup = BeautifulSoup(html, 'lxml')
    movie_boxes = soup.find_all('tr')
    movie_boxes = movie_boxes[1:]

    for movie_box in movie_boxes:
        movie = movie_box.find("td", "titleColumn")

        title = movie.find("a").text
        titles.append(title)

        year = movie.find("span", "secondaryInfo").text[1:-1]
        years.append(year)

        rating = movie_box.find("td", "ratingColumn").text.strip()
        if rating == "":
            rating = 0
        ratings.append(float(rating))
    
    dict = {'Title': titles, 'Year': years, 'Rating': ratings} 
    df = pd.DataFrame(dict)
    
#     df = df.drop(df[df.Rating == 0].index)
    df.index = np.arange(1, len(df)+1)
    
    endTime = (time.time() - startTime)
    
    print("Code Executed in", endTime, "seconds")
    return df

In [3]:
tvs = extract_imdb_data("tv")
movies = extract_imdb_data("movies")

Code Executed in 1.4616799354553223 seconds
Code Executed in 1.5651764869689941 seconds


In [4]:
tvs

Unnamed: 0,Title,Year,Rating
1,Yellowstone,2018,8.7
2,Arcane: League of Legends,2021,9.4
3,Dexter: New Blood,2021,9.1
4,Succession,2018,8.7
5,Narcos: México,2018,8.4
...,...,...,...
96,Riverdale,2017,6.7
97,Shetland,2013,8.2
98,I Know What You Did Last Summer,2021,5.3
99,The Flash,2014,7.6


In [5]:
movies

Unnamed: 0,Title,Year,Rating
1,Eternals,2021,6.8
2,Dune: Part One,2021,8.2
3,The Harder They Fall,2021,6.6
4,Red Notice,2021,6.4
5,No Time to Die,2021,7.4
...,...,...,...
96,Scream,2022,0.0
97,Joker,2019,8.4
98,1917,2019,8.3
99,Reminiscence,2021,5.9


In [250]:
rating_pattern = re.compile(r'(?<=>Eternals)((.|\n)*)(?=class="secondaryInfo">)')

ratings_list = []

ratings = rating_pattern.finditer(html)
for rating in ratings:
    print(rating)
    rating_item = rating.group(0)
    ratings_list.append(rating_item)
    
print(len(ratings_list))

<re.Match object; span=(93976, 263759), match='</a>\n        <span class="secondaryInfo">(2021)<>
1


In [6]:
def extract_imdb_data(type):
    
    startTime = time.time()
    
    if type == "movies":
        html = requests.get('https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm').text
    elif type == "tv":
        html = requests.get('https://www.imdb.com/chart/tvmeter/?ref_=nv_tvv_mptv').text
    else:
        raise ValueError("Must be movies or tv")

    html = requests.get('https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm').text
    
    # REGEX pattern for matching movie titles
    title_pattern = re.compile(r'(?<= >)(.*)(?=</a>\n        <span class="secondaryInfo">)')

    titles_list = []

    titles = title_pattern.finditer(html)
    for title in titles:
    #     print(title)
        title_item = title.group(0)
        titles_list.append(title_item)

    # print(titles_list)
    
    # REGEX pattern for matching movie ratings
    rating_pattern = re.compile(r'(?<=user ratings">)(.*)(?=</strong>)')

    ratings_list = []

    ratings = rating_pattern.finditer(html)
    for rating in ratings:
    #     print(rating)
        rating_item = rating.group(0)
        ratings_list.append(rating_item)

    # print(ratings_list)
    
    # REGEX pattern for matching movie release years
    year_pattern = re.compile(r'(?<=<span class="secondaryInfo">)(.*)(?=</span>)')

    years_list = []

    years = year_pattern.finditer(html)
    for year in years:
        year_item = year.group(0)[1:-1]
        years_list.append(year_item)

    # print(years_list)

    dict = {'Title': titles_list, 'Year': years_list} 
    df = pd.DataFrame(dict)

    #df = df.drop(df[df.Rating == 0].index)
    df.index = np.arange(1, len(df)+1)

    endTime = (time.time() - startTime)    
    print("Code Executed in", endTime, "seconds")
    
    return(df)

In [7]:
tvs = extract_imdb_data("tv")
movies = extract_imdb_data("movies")

Code Executed in 2.763698101043701 seconds
Code Executed in 2.8685131072998047 seconds


In [8]:
tvs

Unnamed: 0,Title,Year
1,Eternals,2021
2,Dune: Part One,2021
3,The Harder They Fall,2021
4,Red Notice,2021
5,No Time to Die,2021
...,...,...
96,Scream,2022
97,Joker,2019
98,1917,2019
99,Reminiscence,2021


In [9]:
movies

Unnamed: 0,Title,Year
1,Eternals,2021
2,Dune: Part One,2021
3,The Harder They Fall,2021
4,Red Notice,2021
5,No Time to Die,2021
...,...,...
96,Scream,2022
97,Joker,2019
98,1917,2019
99,Reminiscence,2021
