# Importing Modules, Setting Up Regular Expressions and HTTP Pool

In [50]:
import re
import urllib3
import certifi
import json
import pymongo
import pandas
import time

# Setting up all relevant regular expressions
# Score_regex source: Stuart Ketcham from DA320
title_regex = re.compile(r"class=\"title\"><h3(.+)</h3>")
date_regex = re.compile(r"class=\"clamp-details\">\s+<span>(.+)</span>")
description_regex = re.compile(r"<div class=\"summary\">\s*([\S\s]+?)\s*<\/div>")
score_regex = re.compile(r">(.*)<\/div>\s+<\/a>\s+<\/div>\s+<s")
image_regex = re.compile(r"<a href=\"/movie/.*\"><img src=\"(.*)\" alt=\"")

# Setting up an HTTP pool for connections
http = urllib3.PoolManager(ca_certs=certifi.where())

# Connecting to MongoDB

In [51]:
# Loading the json file that contains my secret MongoDB connection string
with open ('/Users/hong.vince/Desktop/DA320/credentials.json') as u:
    data = json.load(u)

    secret_key = data['my-secret-key']

# Connecting to the database using known good certificates
client = pymongo.MongoClient(secret_key, tlsCAFile=certifi.where())

# Fetching my database titled "DA320"
da320_database = client['DA320']

# Accessing my collections from my "DA320" database
allCollections = da320_database['metacritic']

# Creating the Web Scraping Function

In [52]:
def metacritic_scraper(year: int, page: int) -> pandas.DataFrame:

    # Fetching webpage
    url = f"https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected={year}&sort=desc&view=detailed&page={page}"
    response = http.request('GET', url, headers={'User-Agent': 'Mozilla/5.0'})
    datastring = str(response.data, "utf-8")

    #Executing all the regular expressions
    titles = title_regex.findall(datastring)
    dates = date_regex.findall(datastring)
    descriptions = description_regex.findall(datastring)
    scores = score_regex.findall(datastring)
    images = image_regex.findall(datastring)

    # Returning a unified collection
    dataset = {"title": titles, "date": dates, "description": descriptions, "score": scores, "image": images}
    return pandas.DataFrame(dataset)
   

# Looping Through Years and Pages

In [53]:
# Writing a CSV file with the data
for year in range (2000, 2023):
    page = 0 
    print(f"Collecting data for {year} page {page}...")

    # Retry a page multiple times if necessary
    while True:
        data = metacritic_scraper(year, page)

        # Stop when we reach a page with zero rows
        if len(data) == 0:
            break

        # Converting the dataframe into a list of movies to insert into MongoDB
        movies_to_insert =[]
        for row in data.itertuples():
            movie = {
                "title": row.title,
                "release_date": row.date,
                "description": row.description,
                "metascore": row.score,
                "image_url": row.image,
            }
            movies_to_insert.append(movie)

        # Insert records into MongoDB
        print(f"Inserting {len(movies_to_insert)} movies for the year {year} page {page}")
        allCollections.insert_many(movies_to_insert)
        page = page + 1

Collecting data for 2000 page 0...
Inserting 100 movies for the year 2000 page 0
Inserting 100 movies for the year 2000 page 1
Inserting 100 movies for the year 2000 page 2
Inserting 65 movies for the year 2000 page 3
Collecting data for 2001 page 0...
Inserting 100 movies for the year 2001 page 0
Inserting 100 movies for the year 2001 page 1
Inserting 100 movies for the year 2001 page 2
Inserting 85 movies for the year 2001 page 3
Collecting data for 2002 page 0...
Inserting 100 movies for the year 2002 page 0
Inserting 100 movies for the year 2002 page 1
Inserting 100 movies for the year 2002 page 2
Inserting 100 movies for the year 2002 page 3
Inserting 30 movies for the year 2002 page 4
Collecting data for 2003 page 0...
Inserting 100 movies for the year 2003 page 0
Inserting 100 movies for the year 2003 page 1
Inserting 100 movies for the year 2003 page 2
Inserting 100 movies for the year 2003 page 3
Inserting 9 movies for the year 2003 page 4
Collecting data for 2004 page 0...
In