In [13]:
from bs4 import BeautifulSoup
import requests
import json
import time
import csv

In [2]:
def log_init():
    with open('log.txt', 'w') as log:
            log.write('Beginning Log\n')
def log_write(text):
    with open('log.txt', 'a') as log:
            log.write(f'{text}\n')

In [3]:
with open('ps4.csv', 'r') as f:
    read = csv.DictReader(f)
    ps4 = [dict(row) for row in read]

# Review scraping functions
#### things to add:
* include url in each review dict, (and maybe status code)
* handle 429 better


In [6]:
def extract_reviews_by_console(console, filename):
    start = time.time()
    log_init()
    with open(filename, 'w') as f:
        reviews = []
        i = 0
        for game in console:
            log_write(f"Starting scraping reviews for {game['title']}")
            game_reviews = extract_reviews_from_game(game)
            [review.update({'GID': game['GID']}) for review in game_reviews]
            reviews.append(game_reviews)
            if i ==0:
                fields=game_reviews[0].keys()
                dw = csv.DictWriter(f, fieldnames=fields)
                dw.writeheader()
            dw.writerows(game_reviews)
            i +=1
            log_write(f"Finished Scraping reviews for {game['title']}")
            log_write(f"Scraped {game['GID']}/{len(console)}")
            log_write(f"Elapsed Time: {time.time() - start}")
        log_write(f"Scraped all reviews in {time.time() - start} seconds")
        return reviews

In [7]:
def extract_reviews_from_game(game, agent= 0):
    review_list = []
    #generate url from game object
    url = "http://www.metacritic.com" + game['page'] + "/user-reviews"
    # get page request and soup object
    headers = {'User-agent': f'1.{game["title"]}'}
    res = requests.get(url, headers=headers)
    if(res.status_code > 300):
        log_write(f"Could not reach game: {url}, status code: {res.status_code}")
        if agent > 3:
            return review_list
        else:
            log_write(f"Trying again: {agent}")
            time.sleep(5)
            extract_reviews_from_page(url, game, agent+1)
    
    page_source = res.content
    soup = BeautifulSoup(page_source, 'html.parser')
    try:
        num_pages = soup.find('li', attrs={'class': 'last_page'}).find('a').text
    except:
        num_pages = 1
    for i in range(int(num_pages)):
        time.sleep(1)
        new_url = f"{url}?page={i}"
        revs = extract_reviews_from_page(new_url, game)
        review_list.extend(revs)
    return review_list

In [8]:
def extract_reviews_from_page(url, game, agent = 0):
    
    review_list = []
    
    # get page request and soup object
    headers = {'User-agent': f"{game['title']} . {url[-1]} . {agent}"}
    res = requests.get(url, headers=headers)
    log_write(f"Accessing page at {url}")
    if(res.status_code > 300):
        log_write(f"Could not reach page: {url}, status code: {res.status_code}")
        if agent > 3:
            return review_list
        else:
            time.sleep(5)
            extract_reviews_from_page(url, game, agent+1)
    
    page_source = res.content
    soup = BeautifulSoup(page_source, 'html.parser')
    
    #save a list of reviews
    try:
        reviews = soup.find('ol', attrs={'class' : 'user_reviews'}).find_all('div', attrs={'class': 'review_content'})
    except:
        log_write(f"No reviews found on page: {url}")
        return review_list
    # loop through all reviews:
    for review in reviews:
        # extract the text, the review score, and the author, and date
        try:
            review_list.append(extract_single_review(review))
        except: 
            print(review)
    return review_list
    
       

In [9]:
def extract_single_review(soup):
    review = {}
    review['author'] = soup.find('a').text
    review['date'] = soup.find('div', attrs={'class':'date'}).text
    review['score'] = soup.find('div', attrs={'class': 'metascore_w'}).text
    review['blurb'] = extract_text(soup.find('div', attrs={'class': 'review_body'}))
    return review

In [10]:
def extract_text(review):
    if(not review.find('span', attrs={'class': 'blurb_expanded'})):
        return review.find('span').text
    else:
        return review.find('span', attrs={'class': 'blurb_expanded'}).text

In [None]:
reviews = extract_reviews_by_console(ps4, 'reviews.csv')

In [15]:
headers = {'User-agent': 'alsdkjflkjasdflk'}
res = requests.get('http://www.metacritic.com/game/playstation-4/the-witcher-3-wild-hunt/user-reviews?page=5', headers=headers)

In [28]:
res.ok

True

In [29]:
res.ok

True

In [31]:
soup = BeautifulSoup(res.content)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html5lib")

  markup_type=markup_type))
