In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json

# Scraping Notebook
This notebook is used to scrape and compile all data needed for the project. First, it will scrape the top 15 games of each genre from steampowered.com with their game id's, then use those ids to gather review data from SteamWorks API

In [27]:
# Gathering 15 games and id numbers from each genre

#each url is formatted the same for the top rated games, so creating a list of the genres to cycle through
genre_list = ['Action', 'Adventure', 'Casual', 'Indie', 'Massively%20Multiplayer',
             'Racing', 'RPG','Simulation', 'Sports', 'Strategy' ]

total_list = []
games = {}
#cycle through each genre
for genre in genre_list:
    response = requests.get('https://store.steampowered.com/tags/en/{}/#p=0&tab=TopRated'.format(genre))
    page = response.text
    soup = BeautifulSoup(page, 'html5lib')
    table = soup.find('div', id = 'TopRatedRows')
    rows = [row for row in table.find_all('a')]
    
    #for each genre, get the 15 titles and ids
    for row in rows:
        title = (row.find(class_='tab_item_name').text)
        app_id = row.get('href').split('/')[4]
    
        games[title] = (genre, app_id)
    
    

In [29]:
len(games)

125

In [33]:
#games is a dictionary, with game titles as keys, and rpg and gameid as values

games

{'Counter-Strike: Global Offensive': ('Action', '730'),
 'Grand Theft Auto V': ('Action', '271590'),
 "Tom Clancy's Rainbow Six® Siege": ('Action', '359550'),
 'Left 4 Dead 2': ('Action', '550'),
 'Warframe': ('Action', '230410'),
 'Portal 2': ('Action', '620'),
 'Borderlands 2': ('RPG', '49520'),
 'PAYDAY 2': ('Action', '218620'),
 'Hades': ('RPG', '1145360'),
 'Counter-Strike': ('Action', '10'),
 'Dying Light': ('Action', '239140'),
 'Destiny 2': ('Action', '1085660'),
 'Tomb Raider': ('Adventure', '203160'),
 'Risk of Rain 2': ('Action', '632360'),
 'Counter-Strike: Source': ('Action', '240'),
 'Terraria': ('Adventure', '105600'),
 'The Witcher® 3: Wild Hunt': ('RPG', '292030'),
 'The Forest': ('Adventure', '242760'),
 'ARK: Survival Evolved': ('Adventure', '346110'),
 'Red Dead Redemption 2': ('Adventure', '1174180'),
 'Slime Rancher': ('Adventure', '433340'),
 'Sea of Thieves': ('Adventure', '1172620'),
 'Totally Accurate Battle Simulator': ('Simulation', '508440'),
 "Don't Starve

In [37]:
df = pd.DataFrame(games).T

In [41]:
df

Unnamed: 0,0,1
Counter-Strike: Global Offensive,Action,730
Grand Theft Auto V,Action,271590
Tom Clancy's Rainbow Six® Siege,Action,359550
Left 4 Dead 2,Action,550
Warframe,Action,230410
...,...,...
Factorio,Strategy,427520
Total War: WARHAMMER II,Strategy,594570
Hearts of Iron IV,Strategy,394360
STAR WARS™ Empire at War - Gold Pack,Strategy,32470


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 125 entries, Counter-Strike: Global Offensive to Slay the Spire
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       125 non-null    object
 1   1       125 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB


In [46]:
df[0].value_counts()

Sports                     15
Strategy                   15
Simulation                 15
Massively%20Multiplayer    13
RPG                        12
Adventure                  12
Action                     12
Casual                     11
Indie                      10
Racing                     10
Name: 0, dtype: int64

In [48]:
df[df[0] == "Indie"]

Unnamed: 0,0,1
Don't Starve,Indie,219740
The Binding of Isaac: Rebirth,Indie,250900
Hollow Knight,Indie,367520
Dead Cells,Indie,588650
FTL: Faster Than Light,Indie,212680
Beat Saber,Indie,620980
Grimm,Indie,252150
Helltaker,Indie,1289310
To the Moon,Indie,206440
The Binding of Isaac,Indie,113200


# Pulling reviews for each game
each api pull retrieves 20 commens, but adding the cursor from the first pull, it will take the next 20. This will get 40 reviews per game.

In [73]:
def get_reviews(idnum):
    reviews=[]
    response = requests.get('http://store.steampowered.com/appreviews/{}?json=1'.format(idnum))
    temp=response.json()
    cursor = temp['cursor']
    for i in range(len(temp['reviews'])):
        reviews.append(temp['reviews'][i]['review'])
    response = requests.get('http://store.steampowered.com/appreviews/{}?json=1{}'.format(idnum, cursor))
    temp = response.json()
    for i in range(len(temp['reviews'])):
        reviews.append(temp['reviews'][i]['review'])
    return reviews

In [74]:
df['idnum'] = df[1]

In [77]:
df.idnum = df.idnum.apply(get_reviews)

In [79]:
df.rename(columns = {'idnum':'reviews'}, inplace=True)

In [99]:
df.head(1)

Unnamed: 0,0,1,reviews
Counter-Strike: Global Offensive,Action,730,[Your team in every random competitive game:\n...


In [84]:
len(df.iloc[1]['reviews'])

TypeError: 'int' object is not subscriptable

In [85]:
df.iloc[0]['reviews'][0]

'Your team in every random competitive game:\n\n- You\n- Russian Guy\n- Another Russian Guy\n- A 6 year old kid who slept with your mother\n- Russian Guy who speaks English\n\n10/10'

In [86]:
rando =' Your team in every random competitive game:\n\n- You\n- Russian Guy\n- Another Russian Guy\n- A 6 year old kid who slept with your mother\n- Russian Guy who speaks English\n\n10/10'

In [87]:
rando in df

False

In [88]:
rando in df.reviews

False

# Trying to seperate list of reviews but keep game name

In [119]:
for i in range(len(df)):
    if rando in df.iloc[i]['reviews']:
        print('true')
#     else:
#         print('not true')

In [101]:
rando in list(df.iloc[0]['reviews'])

False

In [107]:
df = df.reset_index()
df.rename(columns={'index':'title'}, inplace=True)

In [108]:
df.head(1)

Unnamed: 0,title,0,1,reviews
0,Counter-Strike: Global Offensive,Action,730,[Your team in every random competitive game:\n...


In [109]:
megadict={}

def megadicter(list):
    for items in list:
        megadict[df.title]=(items)

In [110]:
df.reviews.apply(megadicter)

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [112]:
for rows in df:
    megadict[df.title]=(df.reviews)

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [113]:
tester = enumerate(df.iloc[0]['reviews'])

TypeError: 'enumerate' object is not subscriptable

In [123]:
megalist=[]

def megalister(list):
    for items in list:
        megalist.append(items)

In [124]:
df.head(1)

Unnamed: 0,title,0,1,reviews
0,Counter-Strike: Global Offensive,Action,730,[Your team in every random competitive game:\n...


In [125]:
df.reviews.apply(megalister)

0      None
1      None
2      None
3      None
4      None
       ... 
120    None
121    None
122    None
123    None
124    None
Name: reviews, Length: 125, dtype: object

In [126]:
# list of 4k reviews, but no game title attached. 

en(megalist)

4120

In [129]:
tester = megalist[0]

In [134]:
len(df.reviews)

125

In [139]:
tester in df.reviews[124]

False

In [146]:
# this allows me to pull which row from game dataframe the review is in

for i in range(len(df)-1):
    if tester in df.reviews[i]:
        print(i)

0


In [148]:
megalist.to_csv(r'users/michaelharnett/desktop/metis/projects/steam2_redux/megalist.csv')

AttributeError: 'list' object has no attribute 'to_csv'

In [150]:
df.head()

Unnamed: 0,title,0,1,reviews
0,Counter-Strike: Global Offensive,Action,730,[Your team in every random competitive game:\n...
1,Grand Theft Auto V,Action,271590,"[Trevor Philips Enterprises 🥵, After Ep*c Game..."
2,Tom Clancy's Rainbow Six® Siege,Action,359550,[> Play as French guy with giant shield\n> Sto...
3,Left 4 Dead 2,Action,550,"[h, why did they leave me for dead for thats n..."
4,Warframe,Action,230410,"[look at my hours, look at my hours]"


In [156]:
df.to_csv(r'/users/michaelharnett/desktop/metis/projects/steam2_redux/megalist.csv', index=False)