In [1]:
import json
import os
import time

import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from tqdm import tqdm

# Call RAWG API
    1. Access API
    2. Compile the necessary data
    3. Save the data

In [2]:
# Base URL of the RAWG games endpoint; call_api() appends the
# ?page=<number>&page_size=<number> query string to it.
# example: requests.get('https://api.rawg.io/api/games?page=1&page_size=40').json()
rawg_api = 'https://api.rawg.io/api/games' # ?page=<number>&page_size=<number>

In [3]:
def get_genres(result):
    '''
    Collect the genre names of a single game.

    Input:
        result: one entry of the RAWG video game API 'results' list; its
                'genres' value is a list of dictionaries with a 'name' key
    Returns:
        list of the game's genre names, in the order the API listed them
    '''
    genre_names = []
    for entry in result['genres']:
        genre_names.append(entry['name'])
    return genre_names

In [4]:
def list_games(json, min_ratings_count=3):
    '''
    Extract the fields of interest from one page of RAWG results.

    Input:
        json: one decoded RAWG games API response, a dictionary with a
              'results' list. (The parameter name hides the json module,
              but only inside this function.)
        min_ratings_count: a game is kept only when its 'ratings_count' is
              strictly greater than this value (default 3)
    Returns:
        A list with one dictionary per kept game containing the game's id,
        name, rating, ratings breakdown, release date, metacritic score,
        ratings count, and genres
    '''
    # fields copied unchanged from the API result, in output order
    copied_fields = ('id', 'name', 'rating', 'ratings', 'released',
                     'metacritic', 'ratings_count')

    games = []
    for result in json['results']:
        games_dict = {field: result[field] for field in copied_fields}
        games_dict['genres'] = get_genres(result)

        # only keep games that enough people have rated
        if games_dict['ratings_count'] > min_ratings_count:
            games.append(games_dict)
    return games

In [5]:
def to_json(game, filename):
    '''
    Append one record to a JSON-lines file.

    Input:
        game: dictionary containing the data to write to the file
        filename: name and path of the target file WITHOUT the extension;
                  '.json' is added here
    Returns:
        None. The record is appended as a single JSON line, so calling this
        repeatedly grows the file.
    '''
    target = '{}.json'.format(filename)
    with open(target, 'a') as out:
        serialized = json.dumps(game)
        out.write(serialized + '\n')
    return None

In [6]:
def call_api(api, pages, filename):
    '''
    Download pages of games from the RAWG API and append them to a file.

    Input:
        api: rawg api url
        pages: two-item sequence (first_page, stop_page); every page from
               first_page up to but NOT including stop_page is requested
        filename: the filename and path (without '.json') to save the games to
    Returns:
        list of the game dictionaries that failed to save to json. The games
        that did save are appended to '<filename>.json', one JSON object per
        line, each holding the game's id, name, rating, ratings breakdown,
        release date, metacritic score, ratings count and genres.
    '''
    not_added = []

    # make an api call and extract what you need
    for page in tqdm(range(pages[0], pages[1])):
        # named `payload` so the imported json module is not shadowed
        payload = requests.get(f'{api}?page={page}&page_size=40').json()
        for game in list_games(payload):
            # The try sits inside the loop so that one game failing to save
            # does not silently skip the remaining games of the page.
            try:
                to_json(game, filename)
            except Exception:  # if failed, keep the game to address later
                not_added.append(game)
        time.sleep(1)  # wait one second before requesting the next page
    return not_added

In [7]:
# to_json() appends to raw_data.json, so running this cell a second time would
# duplicate every record (and repeat a multi-hour download). Only call the API
# when the raw dump does not exist yet; delete the file to force a fresh scrape.
if os.path.exists('raw_data.json'):
    not_added = []
else:
    not_added = call_api(rawg_api, [1, 8300], 'raw_data')

100%|████████████████████████████████████████████████████████████████████████████| 8299/8299 [4:37:58<00:00,  2.01s/it]


In [7]:
# raw_data.json holds one JSON object per line (written by to_json), hence lines=True
df = pd.read_json('raw_data.json', lines=True)
df.head()

Unnamed: 0,id,name,rating,ratings,released,metacritic,ratings_count,genres
0,3498,Grand Theft Auto V,4.48,"[{'id': 5, 'title': 'exceptional', 'count': 16...",2013-09-17,96.0,2758,"[Action, Shooter]"
1,4200,Portal 2,4.61,"[{'id': 5, 'title': 'exceptional', 'count': 16...",2011-04-19,95.0,2405,"[Shooter, Puzzle]"
2,3328,The Witcher 3: Wild Hunt,4.68,"[{'id': 5, 'title': 'exceptional', 'count': 20...",2015-05-18,93.0,2552,[RPG]
3,5679,The Elder Scrolls V: Skyrim,4.39,"[{'id': 5, 'title': 'exceptional', 'count': 10...",2011-11-11,,2015,"[Action, RPG]"
4,12020,Left 4 Dead 2,4.08,"[{'id': 4, 'title': 'recommended', 'count': 76...",2009-11-17,89.0,1431,"[Action, Shooter]"


In [8]:
# summary statistics of the numeric columns
print(f'{df.describe()}\n')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; formatting it into a string would print the text "None".
df.info()
print()
print(f"released NA {df['released'].isna().sum()}\n")
print(f"metacritic NA {df['metacritic'].isna().sum()}\n")

                  id        rating   metacritic  ratings_count
count   11338.000000  11338.000000  2302.000000   11338.000000
mean    34876.510496      2.721947    73.806255      59.900423
std     56948.401972      1.503848    10.530756     172.539700
min         2.000000      0.000000    24.000000       4.000000
25%     10611.250000      2.030000    68.000000       6.000000
50%     19579.500000      3.250000    75.000000      13.000000
75%     38285.250000      3.880000    81.000000      37.000000
max    394092.000000      5.000000    97.000000    3121.000000

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11338 entries, 0 to 11337
Data columns (total 8 columns):
id               11338 non-null int64
name             11338 non-null object
rating           11338 non-null float64
ratings          11338 non-null object
released         11128 non-null object
metacritic       2302 non-null float64
ratings_count    11338 non-null int64
genres           11338 non-null object
dtypes: float

In [9]:
# view which games are missing release dates, ordered by how many ratings each game received
df[df.released.isna()].sort_values(by='ratings_count', ascending=False).head(25)

Unnamed: 0,id,name,rating,ratings,released,metacritic,ratings_count,genres
226,5687,The Playroom,2.21,"[{'id': 1, 'title': 'skip', 'count': 210, 'per...",,,441,[]
548,42309,Dead Island: Epidemic,2.2,"[{'id': 1, 'title': 'skip', 'count': 63, 'perc...",,,136,[Arcade]
794,42391,Nosgoth,2.54,"[{'id': 1, 'title': 'skip', 'count': 40, 'perc...",,,97,"[Action, Shooter]"
1491,10096,3DMark,3.43,"[{'id': 4, 'title': 'recommended', 'count': 44...",,,79,[]
1247,23576,GOD EATER RESURRECTION,3.2,"[{'id': 3, 'title': 'meh', 'count': 28, 'perce...",,,71,[Action]
2516,9551,Deep Rock Galactic,4.14,"[{'id': 4, 'title': 'recommended', 'count': 40...",,,61,"[Action, Indie]"
2104,10156,Aliens: Colonial Marines Collection,2.6,"[{'id': 1, 'title': 'skip', 'count': 21, 'perc...",,45.0,54,[Action]
4212,5673,Terminator Salvation,2.7,"[{'id': 3, 'title': 'meh', 'count': 21, 'perce...",,,40,"[Action, Shooter]"
807,18785,Sam & Max 303: They Stole Max's Brain!,2.76,"[{'id': 1, 'title': 'skip', 'count': 13, 'perc...",,,38,[Adventure]
3325,11587,Kenshi,4.16,"[{'id': 5, 'title': 'exceptional', 'count': 16...",,,38,"[Action, RPG, Strategy, Simulation, Indie]"


# Cleaning the dataset
    1. Filling or removing missing values
    2. Save the new dataset to a new file

In [10]:
# check which game is missing release date and manually fill those with over 30 votes

# Manually looked-up release dates, keyed by the row label of the raw data frame.
_RELEASE_DATE_FIXES = {
    11169: '2001-12-19',  # star wars: obi wan
    1247: '2015-10-29',   # god eater resurrection
    2516: '2018-02-28',   # deep rock galactic
    2104: '2013-02-12',   # aliens: Colonial Marines Collection
    4212: '2009-05-01',   # terminator Salvation
    3325: '2018-12-06',   # kenshi
    807: '2010-06-22',    # Sam & Max 303: They Stole Max's Brain!
    812: '2010-07-20',    # Sam & Max: Beyond the Alley of the Dolls
    814: '2010-04-02',    # Sam & Max: The Penal Zone
    811: '2010-08-30',    # Sam & Max: The City That Dares Not Sleep
    813: '2010-05-18',    # Sam & Max: The Tomb of Sammun-Mak
    2396: '2013-02-21',   # Sword of the Stars: The Pit
    2399: '2017-04-12',   # SpellForce 2 Anniversary Edition
    4281: '2018-05-23',   # raft
    5799: '2019-03-29',   # Unheard
    1167: '2011-06-25',   # Hector: Episode 2
}


def add_release_dates(df):
    '''
    This function fills specific observations with specific release dates.

    Input:
        df: Original raw data dataframe. It is modified in place.
    Output:
        The same dataframe object with the known release dates filled in and
        every row that still has no release date removed.

    Row labels that are not present in df (for example because the row was
    filtered out earlier) are skipped. Assigning to a missing label would
    otherwise append a new, mostly empty row to the dataframe.
    '''
    for row_label, release_date in _RELEASE_DATE_FIXES.items():
        if row_label in df.index:
            df.at[row_label, 'released'] = release_date

    # most of the games with higher vote count that are missing release date are closed,
    # suspended or aren't rated.
    df.dropna(subset=['released'], inplace=True)
    return df

# drop games that do not contain genres; .copy() makes the result an independent
# frame, because add_release_dates() edits the frame it receives in place
df = df[df['genres'].map(len) > 0].copy()

# add some release dates
df = add_release_dates(df)

# assess the NaN value situation
print(f"Missing metacritic values: {df['metacritic'].isna().sum()}\n")
# DataFrame.info() prints its report itself and returns None, so call it directly
df.info()
print()

Missing metacritic values: 8264

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10549 entries, 0 to 11337
Data columns (total 8 columns):
id               10549 non-null int64
name             10549 non-null object
rating           10549 non-null float64
ratings          10549 non-null object
released         10549 non-null object
metacritic       2285 non-null float64
ratings_count    10549 non-null int64
genres           10549 non-null object
dtypes: float64(2), int64(2), object(4)
memory usage: 741.7+ KB
None/n


## Filling the metacritic scores for as many games as possible
    1. Retrieve the names of all games without metacritic score
    2. Scrape the missing values

In [11]:
# names of all the games that have no metacritic score
no_metacritic = np.array(df.loc[df['metacritic'].isna(), 'name'])

In [12]:
# scrape the missing metacritic scores using selenium

# last word of the score badge's class attribute, in the order they are checked:
# good reviews, no review yet, mixed reviews, bad reviews
_METASCORE_STATES = ('positive', 'tbd', 'mixed', 'negative')


def _find_metascore(driver, name):
    '''Search the open metacritic page for `name`; return the first score text or "Not Found".'''
    # NOTE: implicitly_wait() sets how long element lookups keep polling before
    # they give up; it does not pause the script.
    # find the search bar, enter the game name, and click search
    driver.find_element_by_id('primary_search_box').send_keys(f'{name}')
    driver.implicitly_wait(2)
    driver.find_element_by_id('primary_menu_item_enter_search').click()

    # filter out non games
    driver.implicitly_wait(5)
    parent = driver.find_element_by_class_name("filter_area")
    driver.implicitly_wait(2)
    parent.find_elements_by_class_name("title")[2].click()

    for state in _METASCORE_STATES:
        matches = driver.find_elements_by_xpath(
            f'(.//span[@class = "metascore_w medium game {state}"][1])'
        )
        if matches:
            return matches[0].text
    return "Not Found"


def scrape_metacritic(names, filename):
    '''
    Input:
        names: list of games to scrape their score from metacritic
        filename: a string for saving the json file (without '.json')
    Returns:
        list of one-entry dictionaries {game name: score}. The score is the
        text scraped from the site (a number or 'tbd'), "Not Found" when no
        score badge was on the results page, or "Selenium Fail" when the
        lookup raised an error. Every dictionary is also passed to to_json()
        with `filename` as soon as it is scraped.
    '''
    scores = []
    for name in tqdm(names):
        # a fresh browser is opened for every game
        driver = webdriver.Chrome()
        try:
            driver.implicitly_wait(4)
            driver.get("https://www.metacritic.com/game")
            driver.implicitly_wait(15)
            try:
                score = _find_metascore(driver, name)
            except Exception:
                # `Exception` (not a bare except) so Ctrl-C still stops the run
                score = "Selenium Fail"
        finally:
            # quit() for each driver; the former single quit() after the loop
            # failed on an empty `names` and left earlier drivers running
            driver.quit()

        game = {name: score}
        to_json(game, filename)
        scores.append(game)

    return scores
# scrape one slice of the names per run; last scraped 5000:5400
metacritic_scores = scrape_metacritic(no_metacritic[5000:5400], 'metacritic_scores')

100%|██████████████████████████████████████████████████████████████████████████████| 400/400 [2:34:09<00:00, 23.12s/it]


In [13]:
# inspect the scores returned by the scrape above
metacritic_scores

[{'Unruly Heroes': '75'},
 {"The Bard's Tale IV: Barrows Deep": 'tbd'},
 {"Rock n' Roll Racing": '80'},
 {'Dino Crisis 2': '86'},
 {'Freeman: Guerrilla Warfare': 'tbd'},
 {'SENRAN KAGURA Peach Beach Splash': 'tbd'},
 {'Teenage Mutant Ninja Turtles: Mutants in Manhattan': 'tbd'},
 {'Mass Effect 2: Arrival': '63'},
 {'DAH! Path of the Furon': 'tbd'},
 {'BorderZone': 'tbd'},
 {'A Land Fit For Heroes': 'tbd'},
 {'Disney•Pixar Toy Story Mania!': 'Not Found'},
 {'Basement': 'tbd'},
 {'Hell`s Little Story': 'Not Found'},
 {'Rocketbirds 2 Evolution': 'tbd'},
 {'INVASION!': '87'},
 {'Reflex Arena': 'Not Found'},
 {'NFL Blitz': '85'},
 {'THE LAST BLADE 2': '77'},
 {'Wanted Corp.': 'tbd'},
 {'Pilgrims': '79'},
 {'The Shapeshifting Detective': '77'},
 {'Love Letter': 'tbd'},
 {'UNO & Friends': 'tbd'},
 {'Slayer Of Traitors': 'tbd'},
 {'Switchblade (2018)': 'Not Found'},
 {'Neo cab': '76'},
 {'Midnight Deluxe': 'tbd'},
 {'Scream Collector': 'tbd'},
 {'Transport Defender': 'tbd'},
 {'Rogue Trooper R

In [25]:
# look up a single game name by its position in no_metacritic
no_metacritic[1976]

'Rusty Lake Paradise'

In [None]:
def reshape_metacritic_df(filename):
    '''
    Load the scraped metacritic scores into a single-column dataframe.

    Input:
        filename: path of a JSON-lines file in which every line is a
                  one-entry object {game name: scraped score}
    Returns:
        dataframe indexed by game name with one column, 'metacritic'.
        Scores that can be read as a number are stored as floats; any other
        value (for example 'tbd' or 'Not Found') is kept as it was scraped.
        When a game appears on several lines the last line wins.
    '''
    def as_number(value):
        # numeric score text such as '75' becomes 75.0; anything else is kept
        try:
            return float(value)
        except (TypeError, ValueError):
            return value

    # Read the file line by line instead of building a games-by-games table and
    # taking its diagonal; this also tolerates a name being scraped twice.
    scores = {}
    with open(filename, 'r') as file:
        for line in file:
            if line.strip():
                scores.update(json.loads(line))

    column = pd.Series({name: as_number(score) for name, score in scores.items()},
                       dtype=object)
    return pd.DataFrame({'metacritic': column})

def extract_selenium_fails(filename):
    '''
    List the games whose metacritic score could not be scraped.

    Input:
        filename: file name for a json containing games and their respective
                  metacritic scores, one {game name: score} object per line
    Returns:
        numpy array containing only the names of the games whose recorded
        score is 'Not Found' or 'Selenium Fail'
    '''
    failure_markers = ('Not Found', 'Selenium Fail')

    with open(filename, 'r') as handle:
        records = [json.loads(line) for line in handle]

    failed_names = [
        name
        for record in records
        for name, score in record.items()
        if score in failure_markers
    ]
    return np.array(failed_names)



In [None]:
def fill_metacritic(df, filename, column):
    '''
    Fill missing metacritic scores with the scores scraped into a file.

    Input:
        df: dataframe of games with a 'name' column and the score column
        filename: path of the JSON-lines file holding the scraped scores
        column: name of the score column in df whose gaps are filled
    Returns:
        (filled, selenium_fails)
        filled: a new dataframe indexed by game name in which the missing
                values of `column` are filled from the scraped scores; df
                itself is left untouched
        selenium_fails: numpy array of the game names whose scrape did not
                find a score, to try again later
    '''
    # single-column df with all the scraped data, indexed by game name
    scores_column = reshape_metacritic_df(filename)

    # get the games that did not scrape the first time to try again later.
    # This helper reads the scores file itself, so it is given the filename.
    selenium_fails = extract_selenium_fails(filename)

    # Keep only real numbers: 'tbd', 'Not Found' and 'Selenium Fail' become NaN
    # so the score column stays numeric after the fill.
    scraped = pd.to_numeric(scores_column['metacritic'], errors='coerce')
    # the fill aligns on game name, which needs unique names on the scraped side
    scraped = scraped[~scraped.index.duplicated(keep='last')]

    # fill the missing results from the scraped scores by index
    filled = df.set_index('name')
    filled[column] = filled[column].fillna(scraped)
    return filled, selenium_fails

# fill the missing scores from metacritic_scores.json and collect the names to scrape again
df, rescraping = fill_metacritic(df, 'metacritic_scores.json', 'metacritic')