# 01_Getting_Data

This notebook gathers data using GiantBomb's API.  
* Scrape game names and IDs using the `games` endpoint, filtered by console
* Scrape individual game content for each game from previous scrape
    * Want the `description`, `genres`, `themes`, `producers`, `devs`, maybe even the `people` involved
* Scrape all user reviews for console
    * Want the `score`, `username`, and `content` of review


## Imports, Constants, logging functions

In [1]:
from bs4 import BeautifulSoup
import requests
import json
import csv
import time

In [2]:
# NOTE(review): the API key is committed in plain text.  Consider loading it
# from an environment variable or an untracked config file, and rotating it.
key = 'f05d9dbc286a3eb1f52355ea793a3e96c66a347f'  # GiantBomb API access key

# Platform id used to filter the `games` endpoint.  Only PlayStation games are
# scraped for now; this will be expanded on in the future.
platform = '146'

# Request headers passed to `requests.get` for every API call.
headers = {'User-agent': "LookAYoYo's open source recommender system"}


In [3]:
# Logging helpers used while scraping.  The log is useful for estimating the
# percentage completed, recording errors, and generally monitoring how well
# the scrape is going.
def log_init():
    """Start a fresh `log.txt` in the working directory.

    Any existing `log.txt` is truncated, then a single header line is written.
    """
    with open('log.txt', 'w') as log_file:
        # print() appends the trailing newline for us.
        print('Beginning Log', file=log_file)
def log_write(text):
    """Append one line to `log.txt` in the working directory.

    Parameters
    ----------
    text : object
        Value to log; it is formatted with an f-string, so any object works.

    The file is opened as UTF-8 explicitly.  Game names can contain non-ASCII
    characters, and relying on the platform default encoding can raise
    UnicodeEncodeError in the middle of a long scrape.
    """
    with open('log.txt', 'a', encoding='utf-8') as log:
        log.write(f'{text}\n')

## Games
* The API can retrieve up to 100 games per request; however, to get all the info for a game, each game must be requested one at a time.  
* This series of functions is used to build `games_list`, a preliminary list of game names and their guids, for any given platform

In [4]:
def getURL(offset = 0, platform_id = None):
    """Build the `games` endpoint URL for one page of results.

    Parameters
    ----------
    offset : int, optional
        Value of the `offset` query parameter (index of the first result on
        the requested page).  Defaults to 0.
    platform_id : str, optional
        Platform id to filter by.  When omitted, the module-level `platform`
        constant is used, which is what this function always did before.

    Returns
    -------
    str
        Request URL asking for the `name` and `guid` of each game, sorted by
        number of user reviews (descending), in JSON format.
    """
    if platform_id is None:
        platform_id = platform
    games_field_list = ['name', 'guid']
    return (
        'https://www.giantbomb.com/api/games/'
        f'?api_key={key}&format=json&offset={offset}&platforms={platform_id}'
        f'&sort=number_of_user_reviews:desc&field_list={",".join(games_field_list)}'
    )

def getJsonFromResponse(url):
    """Make the API request for `url` and return the decoded JSON response.

    The raw response body is run through BeautifulSoup's lxml parser, and the
    text of the resulting <body> element is what gets JSON-decoded.

    NOTE(review): because the payload goes through an HTML parser first, any
    markup embedded in the JSON (e.g. in a game's description) is presumably
    stripped before decoding -- confirm whether downstream code relies on that
    before switching to a direct JSON decode.
    """
    response = requests.get(url, headers=headers)
    body = BeautifulSoup(response.content, "lxml").find('body')
    return json.loads(body.text)

def isDone(json_response):
    """Tell whether a paginated API response is the last page.

    Parameters
    ----------
    json_response : dict
        Decoded API response; must contain `number_of_page_results`, `offset`
        and `number_of_total_results`.

    Returns
    -------
    bool
        True once the results seen so far (this page's offset plus the number
        of results on this page) reach the total number of results.

    The comparison is `>=` rather than `>`: on the final page the two sides
    are exactly equal, so a strict comparison would only report completion
    after one or more additional, empty pages had been requested.
    """
    results_seen = json_response['offset'] + json_response['number_of_page_results']
    return results_seen >= json_response['number_of_total_results']

def getAllGames(platform):
    """Scrape the preliminary list of games (name and guid) page by page.

    Resets `log.txt`, then requests pages from the `games` endpoint, advancing
    the offset by 100 each time, until `isDone` reports the last page.  The
    offset, API status, running list length and elapsed time are logged after
    every page.

    NOTE(review): the `platform` argument is not actually used -- the URL is
    built by `getURL(offset)`, which reads the module-level `platform`
    constant.  Passing a different platform id here has no effect.

    Returns
    -------
    list
        The concatenated `results` entries of every page requested.
    """
    log_init()
    started = time.time()
    collected = []
    offset = 0
    while True:
        page = getJsonFromResponse(getURL(offset))
        collected.extend(page['results'])
        log_write(f'offset = {offset}')
        log_write(f'status = {page["error"]}')
        log_write(f'list_length = {len(collected)}')
        log_write(f'runtime = {time.time() - started} seconds')
        if isDone(page):
            break
        offset += 100
    return collected

In [6]:
# Sanity check: show the URL that is requested for the first page (offset 0).
print(getURL())

https://www.giantbomb.com/api/games/?api_key=f05d9dbc286a3eb1f52355ea793a3e96c66a347f&format=json&offset=0&platforms=146&sort=number_of_user_reviews:desc&field_list=name,guid


#### Now we retrieve the `games_list` and save it 

In [5]:
# Run the paginated scrape; note that getAllGames() starts by resetting log.txt.
games_list = getAllGames(platform)

In [8]:
# Save the scraped name/guid list to games_list.json.
with open("games_list.json", 'w+') as games_file:
    games_file.write(json.dumps(games_list))

# Game Metadata
* Now that we have the list of games to scrape, we can go through one-by-one to get all the metadata we need. 
* Each game will be saved to its own file as it gets scraped.  Gotta keep that memory available.  

In [7]:
def getMetaURL(guid):
    """Return the API request URL for the metadata of the game with this guid.

    Parameters
    ----------
    guid : str
        The game's guid, as found in the entries of `games_list`.
    """
    # The final project only used:
    #   concepts, developers, genres, themes, publishers
    # The other fields were meant for some potential NLP algorithms that
    # didn't seem relevant in the end.
    requested_fields = ",".join((
        'aliases', 'concepts', 'deck', 'developers',
        'franchises', 'genres', 'similar_games',
        'themes', 'description', 'publishers',
    ))
    endpoint = f'https://www.giantbomb.com/api/game/{guid}/'
    return f'{endpoint}?api_key={key}&format=json&field_list={requested_fields}'


def _names_or_blank(results, field):
    """Return the `name` of every entry in `results[field]`, or '' if unavailable."""
    try:
        return [entry['name'] for entry in results[field]]
    except (KeyError, TypeError):
        # KeyError: the field (or an entry's `name`) is missing.
        # TypeError: the field is null / not a list of dicts.
        # '' is kept as the fallback value because that is what the scrape
        # has always stored for a missing field.
        return ''


def getGameMetaData(url):
    """Request a single game's metadata and return it as a flat dictionary.

    Parameters
    ----------
    url : str
        Request URL for one game (see `getMetaURL`).

    Returns
    -------
    dict
        `aliases`, `deck` and `description` are copied from the response as
        is.  `concepts`, `developers`, `franchises`, `publishers`, `genres`,
        `similar_games`, `people` and `themes` are each a list of names, or
        '' when the field is missing or null in the response.

    Raises
    ------
    KeyError
        If the response lacks `error`, `results`, or one of the fields that
        are copied as is.

    The API status (`error` field) is written to the log.

    NOTE(review): `people` is not in the field list built by `getMetaURL`, so
    it presumably always comes back as '' unless the URL is changed.
    """
    json_response = getJsonFromResponse(url)
    log_write(f'status = {json_response["error"]}')
    results = json_response['results']

    ret_dict = {}
    ret_dict['aliases'] = results['aliases']
    ret_dict['concepts'] = _names_or_blank(results, 'concepts')
    ret_dict['deck'] = results['deck']
    # A missing `people` field used to blank out `publishers` instead of
    # setting `people`; every list field now only ever writes its own key.
    for field in ('developers', 'franchises', 'publishers', 'genres',
                  'similar_games', 'people', 'themes'):
        ret_dict[field] = _names_or_blank(results, field)
    ret_dict['description'] = results['description']
    return ret_dict

def getAllMetaData(games):
    """Scrape and save the metadata of every game in `games`.

    Parameters
    ----------
    games : list of dict
        Each entry must contain the game's `guid` and `name` (the shape of
        the entries in `games_list`).

    For each game the metadata is requested, merged with the game's own
    entry, and written to `./game_meta_data/meta_data_<guid>.json`, so only
    one game is held in memory at a time.  Progress and runtime are written
    to `log.txt`, which is reset at the start.  The function sleeps for two
    seconds after every game.
    """
    import os  # local import so this cell does not depend on the import cell

    output_dir = './game_meta_data'
    # Create the output directory up front; otherwise the first open() below
    # fails with FileNotFoundError after a request has already been spent.
    os.makedirs(output_dir, exist_ok=True)

    log_init()
    start = time.time()
    for i, game in enumerate(games):
        guid = game['guid']
        name = game['name']
        url = getMetaURL(guid)
        log_write(f'Game = {name}, {url}')
        log_write(f'{100*i/len(games)}%')
        metadata = getGameMetaData(url)
        # Add the name/guid from the games list to the scraped metadata.
        metadata.update(game)
        log_write(f'runtime = {time.time() - start} seconds')
        log_write('')
        with open(f'{output_dir}/meta_data_{metadata["guid"]}.json', 'w') as f:
            json.dump(metadata, f)
        time.sleep(2)
        

In [None]:
# Scrape and save the metadata for every game in games_list (one request per
# game, with a two-second pause after each).
getAllMetaData(games_list)