In [1]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def get_reviews(appid, params, timeout=30):
    """Fetch one page of reviews for a Steam app from the store's appreviews endpoint.

    Args:
        appid: Steam application id; a str or an int (it is converted with
            ``str`` before being appended to the URL).
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a
            stalled connection raises instead of blocking indefinitely.

    Returns:
        The response body decoded from JSON.
    """
    url_start = 'https://store.steampowered.com/appreviews/'
    response = requests.get(
        url=url_start + str(appid),  # str() so integer app ids do not raise TypeError
        params=params,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    return response.json()  # return data extracted from the json response

In [3]:
def get_n_reviews(appid, n=100):
    """Collect up to ``n`` English reviews for one Steam app, following the paging cursor.

    Args:
        appid: Steam application id, passed through to ``get_reviews``.
        n: Maximum number of reviews to return.

    Returns:
        A list with at most ``n`` review dicts, taken from the ``reviews``
        key of each response. The list is shorter when a page comes back
        with fewer reviews than requested or when a response lacks the
        ``reviews`` / ``cursor`` keys.
    """
    reviews = []
    cursor = '*'  # initial cursor; replaced by the cursor of each response
    params = {  # https://partner.steamgames.com/doc/store/getreviews
        'json': 1,
        'filter': 'all',  # sort by: recent, updated, all (helpfulness)
        'language': 'english',  # https://partner.steamgames.com/doc/store/localization
        'day_range': 9223372036854775807,  # shows reviews from all time
        'review_type': 'all',  # all, positive, negative
        'purchase_type': 'all',  # all, non_steam_purchase, steam
    }
    remaining = n
    while remaining > 0:
        page_size = min(100, remaining)  # 100 is the max possible reviews in one request
        params['cursor'] = cursor.encode()  # for pagination
        params['num_per_page'] = page_size

        response = get_reviews(appid, params)
        # .get instead of indexing: a response without these keys ends the
        # loop instead of raising KeyError and aborting the caller's scrape.
        batch = response.get('reviews') or []
        next_cursor = response.get('cursor')

        reviews += batch[:remaining]
        remaining -= len(batch)

        # A short page means there is nothing more to fetch; a missing cursor
        # means there is nothing to continue from.
        if len(batch) < page_size or not next_cursor:
            break
        cursor = next_cursor

    return reviews

In [12]:
def get_n_appids(n=100, filter_by='topsellers'):
    """Scrape up to ``n`` app ids from the Steam store search listing.

    Search pages are requested one after another until ``n`` distinct app
    ids have been collected or a page contributes nothing new.

    Args:
        n: Maximum number of app ids to return.
        filter_by: Value placed in the ``filter`` query parameter of the
            search URL.

    Returns:
        A list of at most ``n`` app id strings, in the order first seen.
    """
    appids = []
    seen = set()
    url = f'https://store.steampowered.com/search/?category1=998&filter={filter_by}&page='
    page = 0

    # Loop on what has actually been collected rather than on an assumed
    # number of rows per page, so short pages do not cut the result short.
    while len(appids) < n:
        page += 1
        response = requests.get(url=url+str(page), headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        count_before = len(appids)
        for row in soup.find_all(class_='search_result_row'):
            appid = row.get('data-ds-appid')
            # Skip rows without the attribute (indexing would raise KeyError)
            # and ids already collected from an earlier row or page.
            if not appid or appid in seen:
                continue
            seen.add(appid)
            appids.append(appid)
        # A page that adds nothing new means the listing is exhausted or
        # repeating; stop instead of requesting further pages.
        if len(appids) == count_before:
            break

    return appids[:n]

In [17]:
# Scrape 1000 app ids, pull up to 100 reviews for each, and keep only the
# `review` and `voted_up` columns of the combined table.
appids = get_n_appids(1000)
reviews = []
for appid in appids:
    reviews.extend(get_n_reviews(appid, 100))
df = pd.DataFrame(reviews)[['review', 'voted_up']]
df

Unnamed: 0,review,voted_up
0,I wanted to wait until I had 100 hours into th...,True
1,"I don't know how these devs did it, but I have...",True
2,"Has more game play, less bugs, and is polished...",True
3,I am very impressed with this game. Its worth...,True
4,Imagine if Rust and Runescape had a baby (with...,True
...,...,...
90250,got earraped at the start 5/5 would recommend,True
90251,Best remake of the time <3,True
90252,its like 60 seconds.,True
90253,uhhh for some reason i got this game for free ...,True


In [18]:
# Fraction (not count) of rows for each distinct `voted_up` value.
df['voted_up'].value_counts(normalize=True)

True     0.801573
False    0.198427
Name: voted_up, dtype: float64

In [20]:
# Save the table without the index column. The target folder is created
# first so the write does not fail when `data/` is missing.
output_path = Path('data/reviews.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)