# Scraping

This notebook contains the code written to scrape Metacritic for game details.

In [3]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import warnings
from openpyxl import load_workbook



In [121]:
# Probe request against the first listing page, used to inspect the markup.
# The request does not work without a User-agent header (presumably the site
# rejects the default python-requests agent -- TODO confirm).
# Both names are defined here so this cell runs on a fresh kernel instead of
# relying on values left behind by cells further down.
user_agent = {'User-agent': 'Mozilla/5.0'}
url = "https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?view=condensed&page=0"
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
meta = requests.get(url, headers = user_agent, timeout = 30)
meta = BeautifulSoup(meta.text)

In [141]:
# Collect the name tags and relative URLs of every game on the listing pages.
# This header is required, otherwise the request won't work.
user_agent = {'User-agent': 'Mozilla/5.0'}
urls = []
names = []
# Metacritic indexes pages from 0. There are currently 192 pages of games, so the
# valid indices are 0..191; hard-coding the count is easier than scraping it.
num_pages = 192
# These classes contain the games: four chunks of information separated by ads.
wrapper_classes = [
    "browse_list_wrapper one browse-list-large",
    "browse_list_wrapper two browse-list-large",
    "browse_list_wrapper three browse-list-large",
    "browse_list_wrapper four browse-list-large",
]
for page in range(num_pages):
    page_url = f"https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?view=condensed&page={page}"
    # A timeout keeps one stalled connection from hanging the whole crawl.
    meta = requests.get(page_url, headers = user_agent, timeout = 30)
    meta = BeautifulSoup(meta.text)
    games = meta.find_all("div", {"class": wrapper_classes})

    # Iterate over whatever chunks the page actually has; indexing a fixed four
    # would raise IndexError on a short (e.g. final) page.
    for chunk in games:
        names.extend(chunk.find_all("h3"))
        # Extract just the url
        urls.extend(a["href"] for a in chunk.find_all("a", {"class" : "title"}, href = True))
    # Pause between requests to stay polite to the server.
    time.sleep(0.5)
# Change to string objects, remove html tags
names = [name.text for name in names]

In [142]:
# The h3 tag is used both for game names and for the 'User Score' label, so the
# label entries have to be removed from the scraped names.
# Build a filtered list instead of popping while indexing: popping shifts the
# remaining elements, which skips entries and eventually indexes past the end.
names = [name for name in names if name != 'User Score']

In [144]:
# zip() stops at the shorter input without complaint, which would pair names with
# the wrong URLs. Fail loudly instead of building a misaligned table.
if len(names) != len(urls):
    raise ValueError(f"Scraped {len(names)} names but {len(urls)} URLs; the lists cannot be paired reliably.")
games_df = pd.DataFrame(list(zip(names, urls)), columns = ["Name", "URL"])


In [145]:
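# Uncomment to cache the scraped name/URL index on disk; the next section reloads it from allgames.csv.
#games_df.to_csv(r'allgames.csv', index = False)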

## Scraping Individual game pages

We could get a few more dimensions (ESRB descriptors and some developer credits) by also scraping each game's details page. I don't think these dimensions are interesting enough to warrant a rewrite, though.

In [31]:
# If from saved csv
games_df = pd.read_csv("allgames.csv")

# Site root; the URL column holds paths relative to it.
main = "https://www.metacritic.com"

# Columns filled in later by the per-game scrape.
detail_columns = [
    "Metascore",
    "User_score",
    "Summary",
    "Publisher",
    "Release_Date",
    "Developer",
    "Genres",
    "ESRB",
    "Number_of_Players",
    "Platform",
]
# Start every column as missing, but with object dtype: the scrape stores strings
# (and a list for Genres), which a float64 NaN column cannot hold without a dtype
# change and the warnings that come with it.
for column in detail_columns:
    games_df[column] = pd.Series(np.nan, index = games_df.index, dtype = object)


In [19]:
# Set up excel writer. gamedata.xlsx must already exist (created by hand with the column headers).
# Can't do csv because the format of some of the dimensions breaks it

# Snapshot of what the workbook currently holds.
reader = pd.read_excel(r"gamedata.xlsx")

# Open in append mode so the existing workbook is loaded rather than overwritten.
# The default mode 'w' truncates the file, and assigning writer.book by hand is
# rejected by recent pandas versions.
# NOTE: nothing is written to disk until writer.close() is called.
writer = pd.ExcelWriter("gamedata.xlsx", engine = "openpyxl", mode = "a")

In [32]:
# Iterate through entries in games_df. The run is long and the machine has crashed
# before it finished, so every row is appended to gamedata.csv as soon as it is done.
user_agent = {'User-agent': 'Mozilla/5.0'}

# Columns filled in by this loop.
scraped_columns = [
    "Metascore",
    "User_score",
    "Summary",
    "Publisher",
    "Release_Date",
    "Developer",
    "Genres",
    "ESRB",
    "Number_of_Players",
    "Platform",
]
# Make sure the target columns can hold strings and lists. Together with the .at
# assignment below this removes the need to silence every warning globally, which
# the chained assignment into "Genres" used to require.
for column in scraped_columns:
    games_df[column] = games_df[column].astype(object)


def _text_or_nan(extract):
    """Return extract(), or NaN when the page lacks the element it looks up.

    A missing element shows up as AttributeError (find() returned None) or
    IndexError (too few matches); anything else is left to propagate.
    """
    try:
        return extract()
    except (AttributeError, IndexError):
        return np.nan


def _parse_game_page(page):
    """Extract the scraped fields from one parsed game page as a {column: value} dict.

    Raises IndexError when the two score anchors or the summary_details list are
    missing; the caller reports that as a failed row. Every other field is
    optional and falls back to NaN.
    """
    scores = page.find_all("a", {"class": "metascore_anchor"})
    details = page.find_all("ul", {"class": "summary_details"})
    # Three sections are labelled this way. The second is the summary, which is
    # taken from the expanded blurb instead, so only the first is used here.
    top = details[0].find_all("span", {"class": "data"})

    return {
        "Metascore": scores[0].text.replace("\n", ""),
        "User_score": scores[1].text.replace("\n", ""),
        "Summary": _text_or_nan(lambda: page.find("span", {"class": "blurb blurb_expanded"}).text),
        # The publisher text is padded with runs of spaces for some reason.
        "Publisher": _text_or_nan(lambda: top[0].text.replace("\n", "").replace("  ", "")),
        "Release_Date": _text_or_nan(lambda: top[1].text.replace("\n", "")),
        "Developer": _text_or_nan(
            lambda: page.find("li", {"class": "summary_detail developer"}).find("a", {"class" : "button"}).text
        ),
        "Genres": _text_or_nan(
            lambda: [
                genre.text
                for genre in page.find("li", {"class": "summary_detail product_genre"}).find_all("span", {"class": "data"})
            ]
        ),
        "ESRB": _text_or_nan(
            lambda: page.find("li", {"class": "summary_detail product_rating"}).find("span", {"class": "data"}).text
        ),
        # This data doesn't exist for some games
        "Number_of_Players": _text_or_nan(
            lambda: page.find("li", {"class": "summary_detail product_players"}).find("span", {"class": "data"}).text
        ),
        "Platform": _text_or_nan(
            lambda: page.find("span", {"class": "platform"}).text.replace("\n", "").replace("  ", "").replace("\t", "")
        ),
    }


for i in range(len(games_df)):
    try:
        url = main + games_df["URL"][i]
        # Retrieve html. The timeout keeps one stalled connection from hanging the run.
        meta = requests.get(url, headers = user_agent, timeout = 30)
        meta = BeautifulSoup(meta.text)

        # Parse everything first so a failing page leaves the row untouched.
        for column, value in _parse_game_page(meta).items():
            # .at writes a single cell, which is what lets the genre list be stored as one value.
            games_df.at[i, column] = value

    except Exception:
        # Exception (not a bare except) so that interrupting the kernel still stops the loop.
        print(f"Error on row {i}")
    # Append the row to the csv file, even when scraping failed, so the file keeps one line per game
    games_df.iloc[[i]].to_csv(r'gamedata.csv', header = False, index = False, mode = 'a')
    # Avoid DOS
    time.sleep(0.5)

# csv gets messed up because of the commas and escape chars


Error on row 1713
Error on row 3620
Error on row 4335
Error on row 10116
Error on row 11094
Error on row 13387
Error on row 13519
Error on row 14700
Error on row 15184
Error on row 15185
Error on row 15268
Error on row 16693
Error on row 16723
Error on row 18262


In [26]:
# Write the first ten rows to a semicolon-separated sample file.
games_df.head(10).to_csv("test.csv", sep=";")

In [16]:
# Show the second row as a one-row frame (the same shape that is appended to the csv).
games_df.iloc[1].to_frame().T

Unnamed: 0,Name,URL,Metascore,User_score,Summary,Publisher,Release_Date,Developer,Genres,ESRB,Number_of_Players,Platform
1,Tony Hawk's Pro Skater 2,/game/playstation/tony-hawks-pro-skater-2,98,7.4,The soundtrack of my youth. Just the music mak...,Activision,"Sep 20, 2000",Neversoft Entertainment,"[Sports, Alternative, Skateboarding]",T,1-2,PlayStation
