In [1]:
from bs4 import BeautifulSoup
import requests
import json
import requests
import csv
import time
import pandas as pd

In [2]:
def log_init():
    with open('log.txt', 'w') as log:
            log.write('Beginning Log\n')
def log_write(text):
    with open('log.txt', 'a') as log:
            log.write(f'{text}\n')

In [3]:
def extract_data_from_console(console):
    log_init()
    log_write('Start Extracting metadata from console')
    start_time = time.time()
    total = len(console)
    game_list = []
    for game in console:
        new_game_data = extract_data_from_game(game)
        game_list.append(new_game_data)
        log_write(f'Runtime: {time.time()-start_time} ')
        log_write(f'Finished {int(game["GID"])+1}/{total}')
        
    log_write(f"FINISHED IN {time.time()-start_time} SECONDS.")
    return game_list


def extract_data_from_game(game):
    log_write(f'Extracting data from {game["title"]}')
    #generate url from game object
    url = "http://www.metacritic.com" + game['page'] + "/details"
    
    # get page request and soup object
    headers = {'User-agent': f'1.{game["title"]}'}
    res = requests.get(url, headers=headers)
    
    log_write(f"Accessing page at {url} with status_code {res.status_code}")
    
    if(res.status_code > 300):
        log_write("Couldn't access page. ")
        return game
    
    page_source = res.content
    soup = BeautifulSoup(page_source, 'html.parser')
    #Get meta data
    return scrape_meta_data(soup)
    
def scrape_meta_data(soup):
    meta = {}
    soup.find()
    summary_soup = soup.find('div', attrs={'class': 'product_summary'})
    product_soup = soup.find('table', attrs={'cellspacing':'0'})

    try:
        meta['publisher'] = soup.find('li', attrs={'class': 'publisher'}).find('a').text.strip()
    except:
        meta['publisher'] = None
    try:
        meta['release_date'] = soup.find('li', attrs={'class': 'release_data'}).find('span', attrs={'class' : 'data'}).text
    except:
        meta['release_date'] = None
    try:
        meta['summary'] = summary_soup.find('span', attrs={'class':'data'}).text
    except:
        meta['summary'] = None
    try:
        meta['rating'] = product_soup.find(text='Rating:').parent.parent.find('td').text
    except:
        meta['rating'] = None
    try:
        meta['developer'] = product_soup.find(text='Developer:').parent.parent.find('td').text
    except:
        meta['developer'] = None
    try:
        meta['genres'] = product_soup.find(text='Genre(s):').parent.parent.find('td').text.strip().replace("  ", "")
    except:
        meta['genres'] = None
    try:
        meta['online'] = (product_soup.find(text='Number of Online Players:').parent.parent.find('td').text == 'No Online Multiplayer')*1
    except:
        meta['online'] = None
    try:
        meta['num_credits'] = len(soup.find('table', attrs={'class':'credits'}).find_all('tr'))-1
    except:
        meta['num_credits'] = None
    return meta

In [4]:

with open('ps4.csv', 'r') as g:
    read = csv.DictReader(g)
    ps4 = [dict(row) for row in read]
        

In [5]:
ps4[0:5]

[{'GID': '0',
  'critic_score': '97',
  'page': '/game/playstation-4/grand-theft-auto-v',
  'title': 'Grand Theft Auto V',
  'user_score': '8.3'},
 {'GID': '1',
  'critic_score': '95',
  'page': '/game/playstation-4/the-last-of-us-remastered',
  'title': 'The Last of Us Remastered',
  'user_score': '9.1'},
 {'GID': '2',
  'critic_score': '94',
  'page': '/game/playstation-4/god-of-war',
  'title': 'God of War',
  'user_score': '9.2'},
 {'GID': '3',
  'critic_score': '94',
  'page': '/game/playstation-4/xcom-2-war-of-the-chosen',
  'title': 'XCOM 2: War of the Chosen',
  'user_score': '6.2'},
 {'GID': '4',
  'critic_score': '93',
  'page': '/game/playstation-4/persona-5',
  'title': 'Persona 5',
  'user_score': '9.1'}]

In [6]:
new_ps4 = extract_data_from_console(ps4)

for g1, g2 in zip(ps4, new_ps4):
    g1.update(g2)

ps4[0:10]

[{'GID': '0',
  'critic_score': '97',
  'developer': 'Rockstar North',
  'genres': 'Action Adventure,Modern,Open-World',
  'num_credits': 16,
  'online': 0,
  'page': '/game/playstation-4/grand-theft-auto-v',
  'publisher': 'Rockstar Games',
  'rating': 'M',
  'release_date': 'Nov 18, 2014',
  'summary': 'The sprawling sun-soaked metropolis of Los Santos is chock full of self-help coaches, starlets and C-List celebrities, once on top of the media world, now struggling to stay relevant in time of economic malaise and lowest-common-denominator reality TV. Amidst this madness, three unique criminals plan their own chances of survival and success: Franklin, a street-level hustler in search of opportunities for serious money; Michael, an ex-con whose "retirement" is a less rosy than he hoped it would be; and Trevor, a violent dude driven by the chance for a quick high and the next big score. Nearly out of options, the crew risks it all in a series of daring and dangerous heists that could s

In [7]:
fields = ps4[0].keys()

with open('ps4.csv', 'w') as g:
    dw = csv.DictWriter(g, fieldnames=fields)
    dw.writeheader()
    dw.writerows(ps4)

UnicodeEncodeError: 'charmap' codec can't encode characters in position 100-101: character maps to <undefined>

In [None]:
df = pd.read_csv('ps4.csv')
df.loc[0, 'summary']