In [1]:
# Purpose: Web scraping Metacritic with Python

# urllib3 is a powerful, user-friendly HTTP client for Python.
import urllib3

# certifi is a Python package that provides Mozilla's carefully curated collection of Root Certificates 
# for validating the trustworthiness of SSL certificates while verifying the identity of TLS hosts.
import certifi

# scrape parameters: which results page to fetch, and for which release year
page = 1
year = 2014

In [2]:
# one PoolManager serves the requests: it takes care of connection pooling,
# thread-safety, and certificate management, with certifi supplying the CA bundle path
http = urllib3.PoolManager(
    ca_certs=certifi.where(),
)

# address of the listing page to scrape, built from the year and page set earlier
link = f"https://www.metacritic.com/browse/movie/all/all/{year}/metascore/?page={page}"

In [3]:
# request the page and get the response, which is an HTTPResponse object
# the response object has data, status, and headers attributes
# a browser-like User-Agent header is sent along with the request
response = http.request('GET', link, headers={'User-Agent': 'Mozilla/5.0'})
datastring = response.data.decode("utf-8")

# print the characters fetched and the status code
print(f"Fetched {len(datastring)} characters from {link}: {response.status}")

# fail fast on an HTTP error status: the body of an error response is not the listing
# page the later cells parse, so continuing would scrape nothing useful and still
# write a spreadsheet from it
if response.status >= 400:
    raise RuntimeError(f"Request for {link} failed with HTTP status {response.status}")

Fetched 367175 characters from https://www.metacritic.com/browse/movie/all/all/2014/metascore/?page=1: 200


In [4]:
# re is a module that provides regular expression matching operations
import re

In [5]:
# pattern capturing the value of each data-title attribute on a div (the movie title)
movie_title = re.compile(r'<div data-title="(.*?)\"')
# the pattern has a single capture group, so findall() yields a list of the captured strings
movie_title_matches = re.findall(movie_title, datastring)

In [6]:
# compile() returns a Regex pattern object
# the pattern we want to match is a release date in the HTML: the text inside a
# <span class="u-text-uppercase"> element, with the whitespace around it skipped.
# the capture is lazy, like the other patterns in this notebook, so a match ends at the
# earliest closing </span> that fits; a greedy (.*) would instead run on to the last
# fitting </span> on the same line and would keep trailing spaces in the captured date
release_date = re.compile(r'<span class="u-text-uppercase">\s+(.*?)\s+<\/span>')
dates = release_date.findall(datastring)


In [7]:
# pattern capturing what follows "Metascore" plus one whitespace character
# inside a div's title attribute, up to the closing quote
metascore = re.compile(r'<div title="Metascore\s(.*?)"')
metascore_matches = [found.group(1) for found in metascore.finditer(datastring)]

In [8]:
# the pattern we want to match is a thumbnail URL in the HTML
thumbnail = re.compile(r'<img src="(.*?)" height')

# the URLs are read out of HTML attributes, where "&" is written as "&amp;"; turn it back.
# a comprehension is used (its variable does not leak into the notebook namespace) and the
# variable is not called `link`: `link` holds the page URL, and overwriting it here would
# make a re-run of the request cell fetch a thumbnail instead of the listing page
fixed = [
    thumbnail_url.replace("&amp;", "&")
    for thumbnail_url in thumbnail.findall(datastring)
]
thumbnail_matches = fixed

In [9]:
# pattern capturing the text that opens the <span> directly inside each
# c-finderProductCard_description div, up to the next "<"
description = re.compile(r'<div class="c-finderProductCard_description"><span>(.*?)<')
description_matches = re.findall(description, datastring)

In [10]:
# report how many movie titles were matched
found_message = f"Found {len(movie_title_matches)} matches"
print(found_message)

Found 24 matches


In [11]:
# pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
# built on top of the Python programming language.
# openpyxl is a Python library to read/write Excel 2010 xlsx/xlsm/xltx/xltm files.
import pandas as pd
import openpyxl

In [12]:
# assemble the scraped lists into a single dataframe (a spreadsheet-like table),
# one column per list; the dict's insertion order fixes the column order

df = pd.DataFrame({
    'Movie Title': movie_title_matches,
    'Release Date': dates,
    'Metascore': metascore_matches,
    'Thumbnail': thumbnail_matches,
    'Description': description_matches,
})

In [13]:
# save the dataframe as an Excel workbook; the relative path puts it in the
# current working directory
# (to inspect the table here instead of opening the file, print df in a cell)
df.to_excel(excel_writer="newoutput.xlsx")