In [1]:
from bs4 import BeautifulSoup
import requests
import selenium
from selenium import webdriver
import json

In [20]:
with open('games.txt', 'r') as f:
    games = json.load(f)

In [99]:
games

{'ps4': {'Grand Theft Auto V': {'critic_score': '97',
   'page': '/game/playstation-4/grand-theft-auto-v',
   'title': 'Grand Theft Auto V',
   'user_score': '8.3'},
  'The Last of Us Remastered': {'critic_score': '95',
   'page': '/game/playstation-4/the-last-of-us-remastered',
   'title': 'The Last of Us Remastered',
   'user_score': '9.1'},
  'God of War': {'critic_score': '94',
   'page': '/game/playstation-4/god-of-war',
   'title': 'God of War',
   'user_score': '9.2'},
  'XCOM 2: War of the Chosen': {'critic_score': '94',
   'page': '/game/playstation-4/xcom-2-war-of-the-chosen',
   'title': 'XCOM 2: War of the Chosen',
   'user_score': '6.2'},
  'Persona 5': {'critic_score': '93',
   'page': '/game/playstation-4/persona-5',
   'title': 'Persona 5',
   'user_score': '9.1'},
  'Metal Gear Solid V: The Phantom Pain': {'critic_score': '93',
   'page': '/game/playstation-4/metal-gear-solid-v-the-phantom-pain',
   'title': 'Metal Gear Solid V: The Phantom Pain',
   'user_score': '8.2

In [24]:
sample_url = 'http://www.metacritic.com/game/playstation-4/life-is-strange/user-reviews?page=1'

In [25]:
headers = {'User-agent': 'Maxine .1'}
res = requests.get(sample_url, headers=headers)
print(f"Accessing page at {sample_url}")
print(res.status_code)
page_source = res.content

Accessing page at http://www.metacritic.com/game/playstation-4/life-is-strange/user-reviews?page=1
200


## ok so looking through the html, I see that ones that need to be expanded are in the html even without being expanded, so we don't need to click. However, I figured out how to click.  

* span class='blurb blurb_expanded'
* the issue is that reviews without a need for collapse don't have this span at all

In [26]:
soup = BeautifulSoup(page_source, 'html.parser')

In [27]:
reviews = soup.find_all('div', attrs={'class': 'review_body'})

## Here is a review that needed to be expanded

In [28]:
reviews[0]

<div class="review_body">
<strong class="bold">
            This review contains spoilers<span class="toggle_text_visibility">, click expand to view</span>.
        </strong>
<span class="inline_expand_collapse inline_collapsed" id="review_blurb_6435927"><span class="blurb blurb_collapsed"></span><span class="blurb blurb_expanded">This game is heart breaking
<br/>For me personally, i can't relate to the character until around episode 3 where it all starts to get good, and at that point on, this game was amazing.
<br/>It does not have the best start, Episode 1 is Ok, And Episode 2 is great, but the real moment of this game is episode 3 and 4, Extremely amazing
<br/> <br/>This game teaches us, that sometimes, what matter are the journey we go through.
<br/>A True Roller Coaster of feeling!</span><span class="blurb_etc">…</span> <a class="toggle_expand_collapse toggle_expand" href="/game/playstation-4/life-is-strange/user-reviews?page=1&amp;user_review_id=6435927" rel="nofollow">Expand</a

In [29]:
# This has quotes for some reason
reviews[0].find('span', attrs={'class': 'blurb_expanded'}).text

"This game is heart breaking\rFor me personally, i can't relate to the character until around episode 3 where it all starts to get good, and at that point on, this game was amazing.\rIt does not have the best start, Episode 1 is Ok, And Episode 2 is great, but the real moment of this game is episode 3 and 4, Extremely amazing\r This game teaches us, that sometimes, what matter are the journey we go through.\rA True Roller Coaster of feeling!"

## Here is a review that didn't need to be expanded

In [30]:
reviews[1]

<div class="review_body">
<span>I am a strong man, never have wanted to cry and and rip my heart out more tha when I played this game. I was so emotionally  invested  in a game Icared more for them than me. Artistic  and beautiful  beyond words. Am must play for all story  seeking gamers</span>
</div>

In [32]:
review_text = []

for review in reviews:
    review_text.append(extract_text(review))

AttributeError: 'NoneType' object has no attribute 'text'

In [35]:
review_text[-1]

'Excelente , la música es increíble y la historia  me atrapo en el primer momento, hay escenas que sacan muchas lagrimas en el transcurso del juego 10/10 el juego'

In [36]:
reviews[-1]

<div class="review_body">
                                Despite its obvious flaws the reactive story and compelling characters make this one of the best storytelling experiences of the current generation.
                            </div>

# Writing a function

In [155]:
def extract_reviews_from_page(url):
    review_list = []
    
    # get page request and soup object
    headers = {'User-agent': 'game["title"]'}
    res = requests.get(url, headers=headers)
    print(f"Accessing page at {url}")
    print(res.status_code)
    if(res.status_code > 300):
        return
    
    page_source = res.content
    soup = BeautifulSoup(page_source, 'html.parser')
    
    #save a list of reviews
    reviews = soup.find('ol', attrs={'class' : 'user_reviews'}).find_all('div', attrs={'class': 'review_content'})
    
    # loop through all reviews:
    for review in reviews:
        # extract the text, the review score, and the author, and date
        try:
            review_list.append(extract_single_review(review))
        except: 
            print(review)
    return review_list
    
       

In [156]:
def extract_reviews_from_game(game):
    review_list = []
    #generate url from game object
    url = "http://www.metacritic.com" + game['page'] + "/user-reviews"
    print(url)
    # get page request and soup object
    headers = {'User-agent': f'1.{game["title"]}'}
    res = requests.get(url, headers=headers)
    print(f"Accessing page at {url}")
    print(res.status_code)
    if(res.status_code > 300):
        return
    
    page_source = res.content
    soup = BeautifulSoup(page_source, 'html.parser')
    num_pages = soup.find('li', attrs={'class': 'last_page'}).find('a').text
    for i in range(int(num_pages)):
        new_url = f"{url}?page={i}"
        review_list.extend(extract_reviews_from_page(new_url))
    return review_list

In [157]:
def extract_text(review):
    if(not review.find('span', attrs={'class': 'blurb_expanded'})):
        return review.find('span').text
    else:
        return review.find('span', attrs={'class': 'blurb_expanded'}).text

In [158]:
def extract_single_review(soup):
    review = {}
    review['author'] = soup.find('a').text
    review['date'] = soup.find('div', attrs={'class':'date'}).text
    review['score'] = soup.find('div', attrs={'class': 'metascore_w'}).text
    review['blurb'] = extract_text(soup.find('div', attrs={'class': 'review_body'}))
    return review

In [159]:
def extract_reviews_by_console(console):
    reviews = []
    for title in games[console]:
        game = games[title]
        reviews.extend(extract_reviews_from_game(game))
        

In [161]:
gow = extract_reviews_from_game(games['ps4']['God of War'])

http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews
Accessing page at http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews
200
Accessing page at http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews?page=0
200
Accessing page at http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews?page=1
200
Accessing page at http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews?page=2
200
Accessing page at http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews?page=3
200
Accessing page at http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews?page=4
200
Accessing page at http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews?page=5
200
Accessing page at http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews?page=6
200
Accessing page at http://www.metacritic.com/game/playstation-4/god-of-war/user-reviews?page=7
200
Accessing page at http://www.metacritic.com/game/playsta

In [163]:
gow[0:5]

[{'author': 'Beatensloth',
  'blurb': "Amazing game haven't played something like this in a long time. Great story, awesome fights immersive visuals and sound design. It's definitely deserves game of the year title.",
  'date': 'May  1, 2018',
  'score': '10'},
 {'author': 'VikramSundar',
  'blurb': "Incredible game. I don't finish much of the video games that I start, but with God of War I could simply not put down the controller. I finished the game in two days playing 10+ hours straight each day and I'm still not tired of playing the game. The graphics and details are impeccable. The story and mythology are well-written and incorporated amazingly into Kratos' continuing story. i highly recommend playing this game!",
  'date': 'May  1, 2018',
  'score': '10'},
 {'author': 'Backoff',
  'blurb': "If u have Ps4 u will like this but if u have no Ps4 U can watch only gameplay and trailer.... lol\rBest Game Series Game : GOD OF WAR\r5 years... Fcking long time and play only 48 hours no no 