# Data Acquisition - Crawling
*We acquire our data from* [Metacritic's Best Games of All Time](https://www.metacritic.com/browse/games/score/metascore/all/all/filtered)

In [9]:
from bs4 import BeautifulSoup 
from collections import defaultdict
import requests
import pandas as pd
import numpy as np
import requests
import time

**Dictionary for all the info we are scraping from MetaCritic for each game**

In [10]:
# One empty list per output column; the crawl appends one value per scraped game to each list.
# NOTE: the name shadows the builtin `dict`, but the other cells refer to it, so it is kept as is.
dict = {
    columnName: []
    for columnName in (
        # Listing-page attributes
        "Index", "Title", "Release Date", "Platform",
        # Critic score attributes
        "MetaScore", "NumberOfCritics",
        "PositiveMetaScore", "MixedMetaScore", "NegativeMetaScore",
        # User score attributes
        "UserScore", "NumberOfUsers",
        "PositiveUserScore", "MixedUserScore", "NegativeUserScore",
        # Awards and rankings
        "Awards",
        # Attributes taken from the game's details table
        "Rating", "OfficialSite", "Developer", "# Players", "Genres",
        "ESRB Descriptors", "Connectivity", "# Online Players",
        "OnlineModes", "OfflineModes", "Resolution", "SpecialControllers", "Sound",
        "Compatibility", "Customization",
        "SplitScreenOnlinePlayers", "SplitScreenOfflinePlayers",
    )
}

In [11]:
# Path of the first listing page, relative to the site root
pageURL = "/browse/games/score/metascore/all/all/filtered"
baseURL = "https://www.metacritic.com"

# Browser-like User-Agent that is sent with every request of the crawl
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/74.0.3729.169 Safari/537.36"
    )
}

# Download and parse the first listing page; the main crawling cell starts from this soup
response = requests.get(baseURL + pageURL, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

**Getting details from the games detail table**

In [12]:
def fillGameDetails(soupForGamePage,detailsDict):
    """Fetch the "details" page of a game and append its table values to detailsDict.

    soupForGamePage -- parsed main page of the game; the link to the details page is read from it.
    detailsDict     -- mapping of column name -> list; the details columns get one new value each.

    Returns True when the details were appended. Returns False when the expected markup
    is missing (for example when the site answers with a blank page); in that case
    detailsDict is left exactly as it was, so the caller can safely retry.
    """
    try:
        time.sleep(0.2)
        detailsLink = (soupForGamePage.find('div',attrs={'id':'main','class':'col main_col'})
                       .find('li',attrs={'class':'nav nav_details'})
                       .find('a'))
        gameDetailsURL = baseURL + detailsLink.attrs['href']
        responseForGameDetails = requests.get(gameDetailsURL ,headers=headers)
        soupForGameDetails = BeautifulSoup(responseForGameDetails.content ,'html.parser')
        detailSections = (soupForGameDetails.find('div',attrs={'id':'main','class':'col main_col'})
                          .find_all('div',attrs={'class':'product_details'}))
        # The second "product_details" block holds the table; a blank page has fewer blocks
        # and raises IndexError here, which is a "page is missing" case just like AttributeError.
        gameDetailsDataTable = detailSections[1].find_all('tr')
    except (AttributeError, IndexError, KeyError):
        return False

    # Remember the column lengths so that a failure in the middle of the table can be undone.
    # Without this, a retry would append the already-parsed values a second time.
    lengthsBefore = {column: len(values) for column, values in detailsDict.items()}
    try:
        getTextFromTable(gameDetailsDataTable,detailsDict)
    except AttributeError:
        for column, values in detailsDict.items():
            del values[lengthsBefore[column]:]
        return False
    return True

In [13]:
#The attributes in the table are different for each game, so we keep a mapping of every label we know
#to its output column and add "NaN" for each column whose label is not present for the current game.
#Labels are stored without surrounding whitespace and without the trailing ':'.
_DETAIL_LABEL_TO_COLUMN = {
    "Rating": "Rating",
    "Official Site": "OfficialSite",
    "Developer": "Developer",
    "Number of Players": "# Players",
    "Genre(s)": "Genres",
    "ESRB Descriptors": "ESRB Descriptors",
    "Connectivity": "Connectivity",
    "Number of Online Players": "# Online Players",
    "Online Modes": "OnlineModes",
    "Offline Modes": "OfflineModes",
    "Resolution": "Resolution",
    "Special Controllers": "SpecialControllers",
    "Sound": "Sound",
    "Compatibility": "Compatibility",
    "Customization": "Customization",
    "Split Screen Online Players": "SplitScreenOnlinePlayers",
    "Split Screen Offline Players": "SplitScreenOfflinePlayers",
}


def getTextFromTable(tableSoup,detailsDict):
    """Append one value per details column to detailsDict for a single game.

    tableSoup   -- iterable of table rows; each row is queried with find('th') for the
                   label and find('td') for the value.
    detailsDict -- mapping of column name -> list; every details column receives exactly
                   one new entry: the text of the matching row, or "NaN" when the table
                   has no row for that column.

    The whole table is read before anything is appended, so the columns always grow
    together. Rows without a label or value cell, rows with an unknown label and
    repeated labels (the first occurrence wins) are ignored.
    """
    foundValues = {}
    for rowInTable in tableSoup:
        labelCell = rowInTable.find('th')
        valueCell = rowInTable.find('td')
        if labelCell is None or valueCell is None:
            continue

        # Normalise the label so stray whitespace or a missing/extra ':' does not hide a row
        label = labelCell.get_text().strip().rstrip(":").strip()
        columnName = _DETAIL_LABEL_TO_COLUMN.get(label)
        if columnName is None or columnName in foundValues:
            continue

        value = valueCell.get_text()
        if columnName == "Genres":
            # The genre list is spread over several lines; squeeze it into one compact string
            value = value.replace(" ","").replace("\r","").replace("\n","")
        foundValues[columnName] = value

    for columnName in _DETAIL_LABEL_TO_COLUMN.values():
        detailsDict[columnName].append(foundValues.get(columnName, "NaN"))

**Acquiring the MetaScore and UserScore attributes and their details:**
- MetaScore:
    - Number of critics
    - Amount of positive reviews
    - Amount of mixed reviews
    - Amount of negative reviews
- UserScore:
    - Number of users
    - Amount of positive reviews
    - Amount of mixed reviews
    - Amount of negative reviews

In [14]:
def _readScoreSummary(scoreWrapper, scoreClassPrefix, scoreIsInsideSpan):
    """Read (score, number of reviewers, positive, mixed, negative) from a score details module.

    Returns None when any part of the expected markup is missing, so the caller never
    ends up with a partially read summary.
    """
    try:
        scoreBox = scoreWrapper.find('div',attrs={'class': lambda e: e.startswith(scoreClassPrefix) if e else False})
        if scoreIsInsideSpan:
            scoreBox = scoreBox.find('span')
        score = scoreBox.get_text()
        numberOfReviewers = scoreWrapper.find('strong').get_text().strip()
        countItems = scoreWrapper.find_all('li',attrs={'class':'score_count'})
        # The first three items are the positive, mixed and negative counts, in that order
        positive, mixed, negative = [countItems[position].find('span',attrs={'class':'count'}).get_text()
                                     for position in range(3)]
    except (AttributeError, IndexError):
        return None
    return (score, numberOfReviewers, positive, mixed, negative)


def getScoreDetails(soupForGamePage, dict):
    """Append the critic (MetaScore) and user (UserScore) review details of one game.

    soupForGamePage -- parsed main page of the game; the links to both review pages are read from it.
    dict            -- mapping of column name -> list; each of the ten score columns gets exactly
                       one new value. Nothing is appended until both review pages were parsed,
                       so the score columns always stay aligned with each other.

    Raises AttributeError when the game page has no link to a review page
    (the main crawling loop treats that as "skip this game").
    """
    time.sleep(0.2)
    #Getting 2 urls , First is for Metascore review page and the Second is for userscore review page
    metascoreURL = baseURL + soupForGamePage.find('div',attrs={'class':"details main_details"}).find('a').attrs['href']
    userscoreURL = baseURL + soupForGamePage.find('div',attrs={'class':"details side_details"}).find('a').attrs['href']

    #Getting metascore details :
    responseFromMetascorePage = requests.get(metascoreURL ,headers=headers)
    soupForMetaScoreDetails = BeautifulSoup(responseFromMetascorePage.content ,'html.parser')
    scoreWrapper = soupForMetaScoreDetails.find('div',attrs={'class':'module score_details_module'})
    metaSummary = _readScoreSummary(scoreWrapper, 'metascore_w', True)
    if metaSummary is None:
        # The data from the site is missing so we use the value from the game before because the score should be identical.
        # For the very first game there is no previous value, so the score is marked as missing as well.
        previousMetaScore = dict["MetaScore"][-1] if dict["MetaScore"] else "NaN"
        metaSummary = (previousMetaScore, "NaN", "NaN", "NaN", "NaN")

    #Getting userscore details :
    responseForUserScorePage = requests.get(userscoreURL ,headers=headers)
    soupForUserScoreDetails = BeautifulSoup(responseForUserScorePage.content ,'html.parser')
    scoreWrapper2 = soupForUserScoreDetails.find('div',attrs={'class':'module score_details_module'})
    userSummary = _readScoreSummary(scoreWrapper2, 'metascore_w user', False)
    if userSummary is None:
        # This time we can't use the value from the game before this one because the values can be different
        userSummary = ("NaN", "NaN", "NaN", "NaN", "NaN")

    metaColumns = ("MetaScore", "NumberOfCritics", "PositiveMetaScore", "MixedMetaScore", "NegativeMetaScore")
    userColumns = ("UserScore", "NumberOfUsers", "PositiveUserScore", "MixedUserScore", "NegativeUserScore")
    for columnName, value in zip(metaColumns + userColumns, metaSummary + userSummary):
        dict[columnName].append(value)

**Acquiring Awards attribute**

In [15]:
def getAwardAndRankings(soupForGamePage, dict):
    """Append the awards and rankings of one game to dict["Awards"].

    All ranking titles found on the game page are stripped and joined with " || ".
    When the game has no awards box, or the box lists no titles, "NaN" is appended instead.
    """
    try:
        rankingsModule = soupForGamePage.find('div',attrs={'class':'module list_rankings contain_module'})
        rankingsTable = rankingsModule.find('div',attrs={'class':'body'}).find('table',attrs={'class':'rankings'})
        awardTitles = [
            titleCell.get_text().strip()
            for titleCell in rankingsTable.find_all('div',attrs={'class':'ranking_title'})
        ]
    except AttributeError:
        # Part of the awards box is missing from the page - same as a game without awards
        awardTitles = []

    dict["Awards"].append(" || ".join(awardTitles) if awardTitles else "NaN")

**Main code cell of the crawling**

In [None]:
#There were 191 listing pages in total when this was written, 100 games on each page.
#The number is only an upper bound - the crawl also stops as soon as a page has no "next" link.
MAX_PAGES = 191
#How many times the details page of one game is requested before giving up on that game
MAX_DETAIL_TRIES = 10


def _columnLengths(table):
    """Return the current length of every column list of the table."""
    return {columnName: len(values) for columnName, values in table.items()}


def _trimColumns(table, lengths):
    """Cut every column list of the table back to the length recorded in lengths."""
    for columnName, values in table.items():
        del values[lengths[columnName]:]


for pageNumber in range(MAX_PAGES):
    contentWarper = soup.find('div',attrs={'class':'title_bump'})
    if contentWarper is None:
        # The site returned a page without the games listing - nothing more to crawl
        break
    browse_list_wrapperElements = contentWarper.find_all('div',attrs={'class':'browse_list_wrapper'})

    #Together the browse_list_wrapperElements of a page contain its 100 games
    for item in browse_list_wrapperElements:
        for gameRow in item.find_all('td',attrs={'class':'clamp-summary-wrap'}):
            time.sleep(1)
            # Every column has to grow by exactly one value per game, otherwise the DataFrame can't be built
            lengthsBeforeGame = _columnLengths(dict)
            try:
                gamePageURL = baseURL + gameRow.find('a',attrs={'class':'title'}).attrs['href']
                responseForGamePage = requests.get(gamePageURL ,headers=headers)
                soupForGamePage = BeautifulSoup(responseForGamePage.content ,'html.parser')

                #Scraping for Scores and Award attributes
                getScoreDetails(soupForGamePage, dict)
                getAwardAndRankings(soupForGamePage, dict)

                # After some time of scraping, the site started to return "blank pages", so we try several times
                # before giving up on the game. After a one-day break from scraping no blank pages were
                # returned anymore, but we still decided to keep the code.
                detailsFilled = False
                for attempt in range(MAX_DETAIL_TRIES):
                    lengthsBeforeTry = _columnLengths(dict)
                    if fillGameDetails(soupForGamePage ,dict):
                        detailsFilled = True
                        break
                    # Drop anything a failed attempt may have left behind before trying again
                    _trimColumns(dict, lengthsBeforeTry)

                if detailsFilled:
                    dict["Index"].append(gameRow.find('span',attrs={'class':'title numbered'}).get_text().strip())
                    dict["Title"].append(gameRow.find('h3').get_text())
                    dict["Platform"].append(gameRow.find('span',attrs={'class':'data'}).text.strip())
                    dict["Release Date"].append(gameRow.find('div',attrs={'class':'clamp-details'}).find_all('span')[-1].get_text())

            except (AttributeError, IndexError):
                # Part of the expected markup is missing - skip this game
                pass
            finally:
                # Keep only complete rows: if the game was skipped, failed half-way or the crawl was
                # interrupted, remove everything that was appended for it.
                gameIsComplete = all(len(values) == lengthsBeforeGame[columnName] + 1
                                     for columnName, values in dict.items())
                if not gameIsComplete:
                    _trimColumns(dict, lengthsBeforeGame)

    # Go through the next 100 games in the next page
    nextPageLink = contentWarper.find('a',attrs={"class":"action","rel":"next"})
    if nextPageLink is None:
        # The last page has no "next" link
        break
    pageURL = nextPageLink.attrs['href']
    response = requests.get(baseURL + pageURL ,headers=headers)
    soup = BeautifulSoup(response.content ,'html.parser')

Building the dataframe after acquiring the necessary data and writing it to a csv

In [16]:
# Build the table from the scraped columns and turn the "NaN" placeholder strings into real missing values
dataFrame = pd.DataFrame(dict).replace(to_replace="NaN", value=np.nan)

# Persist the result so that later work does not need to crawl the site again
dataFrame.to_csv("GamesDataFrame.csv")