In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os, time


# Scraping Notebook
This notebook is used to scrape and compile all data needed for the project. First, it will scrape the top 15 games of each genre from steampowered.com with their game id's, then use those ids to gather review data from SteamWorks API

In [270]:
# # Gathering 15 games and id numbers from each genre

# #each url is formatted the same for the top rated games, so creating a list of the genres to cycle through
# genre_list = ['Action', 'Adventure', 'Casual', 'Indie', 'Massively%20Multiplayer',
#              'Racing', 'RPG','Simulation', 'Sports', 'Strategy' ]


# games = {}
# #cycle through each genre, pull 15 games
# for genre in genre_list:
#     response = requests.get('https://store.steampowered.com/tags/en/{}/#p=0&tab=TopRated'.format(genre))
#     page = response.text
#     soup = BeautifulSoup(page, 'html5lib')
#     table = soup.find('div', id = 'TopRatedRows')
#     rows = [row for row in table.find_all('a')]
    
#     #for each genre, get the 15 titles and ids
#     for row in rows:
#         title = (row.find(class_='tab_item_name').text)
#         app_id = row.get('href').split('/')[4]
    
#         games[title] = (genre, app_id)
        
#     # adding second page for each genre, making it 30 games per genre
#     # so even though the "#p=" signifies page number, it always starts on the first page, regardless of number passsed
#     # this means I will have to use selenium, or stick to 15 games.
#     # lets try selenim
    
# #     response = requests.get('https://store.steampowered.com/tags/en/{}/#p=1&tab=TopRated'.format(genre))
# #     page = response.text
# #     soup = BeautifulSoup(page, 'html5lib')
# #     table = soup.find('div', id = 'TopRatedRows')
# #     rows = [row for row in table.find_all('a')]
    
# #     #for each genre, get the 15 titles and ids
# #     for row in rows:
# #         title = (row.find(class_='tab_item_name').text)
# #         app_id = row.get('href').split('/')[4]
    
# #         games[title] = (genre, app_id)
    
    

# Increasing Games
I wanted a larger corpus. To increase from 15 to 30 games I thought would be as easy as changine the page number in the url; however, while p=0 switches to p=1 when navigating the site, it always loads the first page regardless of what number requests pulls. So I am switching over to selenium, and manually clicking the second page button. Re-writing the above code to be compatible with selenium.

In [3]:
chromedriver = "/Applications/chromedriver" 
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)

#initializing the page, but not pulling any information
driver.get('https://store.steampowered.com')



# Gathering 15 games and id numbers from each genre

#each url is formatted the same for the top rated games, so creating a list of the genres to cycle through
genre_list = ['Action', 'Adventure', 'Casual', 'Indie', 'Massively%20Multiplayer',
             'Racing', 'RPG','Simulation', 'Sports', 'Strategy' ]


games = {}

#cycle through each genre, pull 15 games
for genre in genre_list:
    driver.get('https://store.steampowered.com/tags/en/{}/#p=0&tab=TopRated'.format(genre))
    time.sleep(2)
    
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    table = soup.find('div', id = 'TopRatedRows')
    rows = [row for row in table.find_all('a')]
    
    #for each genre, get the 15 titles and ids
    for row in rows:
        title = (row.find(class_='tab_item_name').text)
        app_id = row.get('href').split('/')[4]
    
        games[title] = (genre, app_id)
        
    # going to second page of 15 titles
    button = driver.find_element_by_xpath('//*[@id="TopRated_links"]/span[2]')
    button.click()   
    
    #needed to have it pauce or the second page_source never loaded.
    time.sleep(2)
    
    #repeats above process
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    table = soup.find('div', id = 'TopRatedRows')
    rows2 = [row for row in table.find_all('a')]
    
    #for each genre, get the 15 titles and ids
    for row in rows2:
        title = (row.find(class_='tab_item_name').text)
        app_id = row.get('href').split('/')[4]
    
        games[title] = (genre, app_id)




In [4]:
driver.quit()

In [5]:
len(games)

241

In [280]:
#games is a dictionary, with game titles as keys, and rpg and gameid as values

#games

In [6]:
df = pd.DataFrame(games).T

In [7]:
df.reset_index(inplace=True)
df.rename(columns = {'index':'title',
                         0:'genre',
                         1:'appid'}, inplace = True)

In [8]:
df

Unnamed: 0,title,genre,appid
0,Counter-Strike: Global Offensive,Action,730
1,Grand Theft Auto V,Action,271590
2,Tom Clancy's Rainbow Six® Siege,Action,359550
3,Left 4 Dead 2,Action,550
4,Warframe,Action,230410
...,...,...,...
236,ENDLESS™ Space 2,Strategy,392110
237,Shadow Tactics: Blades of the Shogun,Strategy,418240
238,Total War: THREE KINGDOMS,Strategy,779340
239,Crusader Kings III,Strategy,1158310


In [9]:
df.title.nunique() #all unique

241

In [12]:
# cannot for the life of me figure out why it pulls increasingly less from each genre. Future work
df.genre.value_counts()

Sports                     30
Strategy                   30
RPG                        27
Massively%20Multiplayer    26
Simulation                 24
Indie                      23
Casual                     23
Adventure                  22
Racing                     18
Action                     18
Name: genre, dtype: int64

# Pulling reviews for each game
the max pull per game is 100 reviews per request, setting to maximum for largest possible corpus

In [13]:
def get_reviews(idnum):
    reviews=[]
    response = requests.get('http://store.steampowered.com/appreviews/{}?json=1&num_per_page=100'.format(idnum))
    temp=response.json()
    #cursor = temp['cursor']   #a unique code per pull, signifying what the next batch of reviews are
    for i in range(len(temp['reviews'])):
        reviews.append(temp['reviews'][i]['review'])
    return reviews

In [14]:
df['reviews'] = df.appid

In [15]:
df.reviews = df.reviews.apply(get_reviews)

In [16]:
df.head(1)

Unnamed: 0,title,genre,appid,reviews
0,Counter-Strike: Global Offensive,Action,730,[Your team in every random competitive game:\n...


In [17]:
df.iloc[0]['reviews'][0]

'Your team in every random competitive game:\n\n- You\n- Russian Guy\n- Another Russian Guy\n- A 6 year old kid who slept with your mother\n- Russian Guy who speaks English\n\n10/10'

In [18]:
len(df.iloc[0]['reviews']) #great success

100

# Expanding Data Frame
using pd.explode to expand the list of reviews captured for every game into its own row in the dataframe

In [19]:
df.head()

Unnamed: 0,title,genre,appid,reviews
0,Counter-Strike: Global Offensive,Action,730,[Your team in every random competitive game:\n...
1,Grand Theft Auto V,Action,271590,"[Trevor Philips Enterprises 🥵, After Ep*c Game..."
2,Tom Clancy's Rainbow Six® Siege,Action,359550,[> Play as French guy with giant shield\n> Sto...
3,Left 4 Dead 2,Action,550,"[h, Every steam library needs L4D2. Seriously?..."
4,Warframe,Action,230410,[look at my hours]


In [76]:
# # this allows me to pull which row from game dataframe the review is in, provided it is unaltered
# tester = df.iloc[0]['reviews'][0]

# for i in range(len(df)-1):
#     if tester in df.reviews[i]:
#         print(df.iloc[i].title)

In [20]:
## working code
finaldf = df.explode('reviews').reset_index()

In [21]:
finaldf.shape

(10836, 5)

In [23]:
finaldf.drop(columns=['index'],inplace=True)

In [24]:
finaldf.head(1)

Unnamed: 0,title,genre,appid,reviews
0,Counter-Strike: Global Offensive,Action,730,Your team in every random competitive game:\n\...


In [25]:
finaldf.title.value_counts()

It Takes Two                             100
Monster Hunter: World                    100
PlanetSide 2                             100
Hollow Knight                            100
Terraria                                 100
                                        ... 
Legends of IdleOn - Idle MMO               1
Later Alligator                            1
Dota Underlords                            1
Aokana - Four Rhythms Across the Blue      1
The Jackbox Party Pack 6                   1
Name: title, Length: 241, dtype: int64

In [29]:
finaldf.drop_duplicates(subset='reviews', inplace=True)

In [30]:
finaldf.shape # unsure how duplicate reviews are appearing, but this iteration lost a lot less

(10180, 4)

In [31]:
finaldf.to_csv(r'/users/michaelharnett/desktop/metis/projects/steam2_redux/data/finaldf.csv', index = False)

# Mini appendix. 
It took an ardous two days to find pd.explode, and to also use the correct syntax. The following is a graveyard of failed attempts

# oh wait, there's zip!
should be able to zip each review to the title and genre when creating the list, making a larger data frame wher each review is its own thing

In [162]:
df['reviews2']= df.appid

In [163]:
df.head()

Unnamed: 0,title,genre,appid,reviews,reviews2
0,Counter-Strike: Global Offensive,Action,730,[Your team in every random competitive game:\n...,730
1,Grand Theft Auto V,Action,271590,"[Trevor Philips Enterprises 🥵, After Ep*c Game...",271590
2,Tom Clancy's Rainbow Six® Siege,Action,359550,[> Play as French guy with giant shield\n> Sto...,359550
3,Left 4 Dead 2,Action,550,"[h, why did they leave me for dead for thats n...",550
4,Warframe,Action,230410,"[look at my hours, look at my hours]",230410


In [174]:
test_dict = {}

def get_reviews_new(idnum):
    reviews=[]
    response = requests.get('http://store.steampowered.com/appreviews/{}?json=1'.format(idnum))
    temp=response.json()
    cursor = temp['cursor']
    for i in range(len(temp['reviews'])):
        #games_dict=dict(zip(idnum,(temp['reviews'][i]['review'])))
        reviews.append(temp['reviews'][i]['review'])
    response = requests.get('http://store.steampowered.com/appreviews/{}?json=1{}'.format(idnum, cursor))
    temp = response.json()
    for i in range(len(temp['reviews'])):
        games_dict=dict(zip(idnum,(temp['reviews'][i]['review'])))             
        #reviews.append(temp['reviews'][i]['review'])
    return games_dict

In [180]:
idlist[0]

'730'

In [186]:
df.reviews2.apply(get_reviews_new)

0                         {'7': 'H', '3': 'e', '0': 'l'}
1      {'2': 'v', '7': 'e', '1': 'r', '5': 'y', '9': ...
2               {'3': 'I', '5': 'e', '9': 'u', '0': 'd'}
3                                   {'5': 'r', '0': 'e'}
4      {'2': 'l', '3': 'o', '0': 'a', '4': 'k', '1': ...
                             ...                        
120    {'4': 'E', '2': 'l', '7': 'c', '5': 'e', '0': ...
121    {'5': 'v', '9': 'k', '4': 'a', '7': 'e', '0': ...
122    {'3': 'e', '9': 'f', '4': 't', '6': 'r', '0': ...
123    {'3': 'W', '2': 'h', '4': 'a', '7': 't', '0': ...
124    {'6': 'i', '4': 'h', '5': 's', '7': ' ', '0': ...
Name: reviews2, Length: 125, dtype: object

In [187]:
games_dict

NameError: name 'games_dict' is not defined

In [189]:
get_reviews_new(idlist[0])

{'7': 'H', '3': 'e', '0': 'l'}

In [236]:
#get_reviews(idlist[0])

In [191]:
df2 = pd.DataFrame()

In [192]:
df2

In [195]:
df.explode(df.reviews.str.split(','))

ValueError: column must be a scalar

In [211]:
df.reviews.explode()

0      Your team in every random competitive game:\n\...
0                                Pay $15 to unlock level
0                 counter-fart: globally offensive scent
0      This game saved my life.\n\nI am 27.\n\nMy ex-...
0      Absolutely garbage servers and net-code for pu...
                             ...                        
124                                Damnn this game 10/10
124    10/10 a very fun gameplay, so many different b...
124    Easily the best and most diverse card builder ...
124    A phenomenal card game. Probably the best roug...
124    This game is an easy way to blow through an en...
Name: reviews, Length: 4120, dtype: object

In [258]:
## working code
explodeddf = df.explode('reviews')

In [259]:
explodeddf.to_csv(r'/users/michaelharnett/desktop/metis/projects/steam2_redux/data/explodedf.csv')

In [222]:
len(megalist2)

4120

In [260]:
df.iloc[0]['reviews'][0]

'Your team in every random competitive game:\n\n- You\n- Russian Guy\n- Another Russian Guy\n- A 6 year old kid who slept with your mother\n- Russian Guy who speaks English\n\n10/10'

In [224]:
megalist3 = []

for rows in df.reviews:
    for items in rows:
        megalist3.append(items)

In [225]:
len(megalist3)

4120

In [227]:
df.iloc[0]['reviews'][0]

'Your team in every random competitive game:\n\n- You\n- Russian Guy\n- Another Russian Guy\n- A 6 year old kid who slept with your mother\n- Russian Guy who speaks English\n\n10/10'