In [13]:
# Purpose: Web scraping Metacritic with Python

# urllib3 is a powerful, user-friendly HTTP client for Python.
import urllib3

# certifi is a Python package that provides Mozilla's carefully curated collection of Root Certificates 
# for validating the trustworthiness of SSL certificates while verifying the identity of TLS hosts.
import certifi

# Release year and results page that select which listing page is fetched.
year = 2014
page = 1

# the link to the page we want to scrape
link = f"https://www.metacritic.com/browse/movie/all/all/{year}/metascore/?page={page}"

# Create a PoolManager, which handles connection pooling and certificate management.
# - cert_reqs is spelled out so that TLS verification against certifi's CA bundle
#   is enforced even on urllib3 releases that do not verify by default.
# - the timeout keeps a stalled server from hanging this cell indefinitely;
#   without it urllib3 falls back to the (unbounded) socket default.
http = urllib3.PoolManager(
    cert_reqs="CERT_REQUIRED",
    ca_certs=certifi.where(),
    timeout=urllib3.Timeout(connect=5.0, read=30.0),
)

# Request the page with a browser-like User-Agent header. The result is an
# HTTPResponse object exposing data (raw bytes), status, and headers.
response = http.request("GET", link, headers={"User-Agent": "Mozilla/5.0"})

# Decode the raw body once, here at the I/O boundary; everything after this
# point works on text.
datastring = response.data.decode("utf-8")

# print the characters fetched and the status code
print(f"Fetched {len(datastring)} characters from {link}: {response.status}")

# re provides regular expression matching operations.
import re

# Compiled pattern with a single capture group: the text that follows
# `<div data-title="` up to the next double quote (non-greedy).
movie_title = re.compile(r'<div data-title="(.*?)\"')

# Scan the page for non-overlapping matches and keep the captured text of
# each one, giving a list of strings in document order.
matches = [hit.group(1) for hit in movie_title.finditer(datastring)]

# Report how many titles were found.
title_count = len(matches)
print(f"Found {title_count} matches")

Fetched 363654 characters from https://www.metacritic.com/browse/movie/all/all/2014/metascore/?page=1: 200
Found 24 matches
