This notebook covers the Steam Reviews portion of the Steam API download. It downloads the review summary for every game entry on Steam as of 2/6/2024 and exports it to a CSV file for further use.

In [None]:
# standard library imports
import csv
import datetime as dt
import json
import os
import statistics
import time

# third-party imports
import numpy as np
import pandas as pd
import requests
from requests.exceptions import SSLError

# customisations - ensure tables show all columns
pd.set_option("display.max_columns", 100)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def get_request(url,parameters=None, steamspy=False):
    """Return the JSON-decoded response of a GET request, retrying on failure.

    SSL errors and error-status responses are retried indefinitely (after a
    short wait), except when ``steamspy`` is set.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request
    steamspy : bool
        when True, an error-status response is treated as the end of the
        available pages and the string "stop" is returned instead of retrying

    Returns
    -------
    json_data
        json-formatted response (dict-like); the string "stop" (see
        ``steamspy``); or None when the response body is not valid JSON
    """
    # Retry with a loop instead of recursion so that a long outage cannot
    # exhaust the interpreter's recursion limit.
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        except SSLError as s:
            print('SSL Error:', s)

            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)
            continue

        # A requests Response is truthy only for a non-error HTTP status.
        if response:
            try:
                return response.json()
            except ValueError:
                # Body was not valid JSON: report "no data" explicitly.
                return None

        # We do not know how many pages steamspy has, so an error response is
        # used as the signal to stop paging.
        if steamspy:
            return "stop"

        # An error status usually means too many requests. Wait and try again.
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

Define the data download functions

In [None]:
def get_app_data(app_list, start, stop, parser, pause):
    """Return list of app data generated from parser.

    app_list : pandas Series of appids; rows start..stop-1 are processed
    start, stop : positional slice bounds into app_list
    parser : function called with one appid, returning the data for that app
    pause : seconds to sleep after each parser call

    Apps whose parser call raises an ordinary exception are reported and
    skipped, so the returned list can be shorter than stop - start.
    KeyboardInterrupt and SystemExit are not caught, so a long download can
    still be interrupted.
    """
    app_data = []

    # iterate through each row of app_list, confined by start and stop
    for index, appid in app_list[start:stop].items():
        print('Current index: {}'.format(index), end='\r')

        # retrieve app data for a row, handled by supplied parser, and append to list
        try:
            app_data.append(parser(appid))
        except Exception as exc:
            # best effort: report why this app failed and carry on
            print("Error with {}: {!r}".format(appid, exc))
        time.sleep(pause) # prevent overloading api with requests

    return app_data


def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Process app data in batches, writing directly to file.

    After every batch the rows are appended to the data file and the index of
    the next unprocessed row is written to the index file, so an interrupted
    run can be resumed by passing that index as ``begin``.

    parser : custom function to format request
    app_list : sequence of appids handed to get_app_data (the notebook passes
        a pandas Series)
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store the next index to process
    columns : column names for file (other keys in the parsed data are ignored)

    Keyword arguments:

    begin : starting index (get from index_filename, default 0)
    end : index to finish, exclusive (defaults to end of app_list)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (default 1)

    returns: none
    """
    print('Starting at index {}:\n'.format(begin))

    # by default, process all apps in app_list
    if end == -1:
        end = len(app_list)

    # generate array of batch begin and end points
    batches = np.arange(begin, end, batchsize)
    batches = np.append(batches, end)

    data_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)

    apps_written = 0
    batch_times = []

    for i in range(len(batches) - 1):
        start_time = time.time()

        # plain ints keep the slice bounds and the stored index free of numpy types
        start = int(batches[i])
        stop = int(batches[i+1])

        app_data = get_app_data(app_list, start, stop, parser, pause)

        # writing app data to file
        with open(data_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

            # short countdown so the user knows not to interrupt during the write
            for j in range(3,0,-1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)

            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        apps_written += len(app_data)

        # writing last index to file
        with open(idx_path, 'w') as f:
            print(stop, file=f)

        # logging time taken
        end_time = time.time()
        time_taken = end_time - start_time

        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)

        # batches still to run after this one, times the average batch duration
        est_remaining = (len(batches) - i - 2) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} apps written'.format(apps_written))

In [None]:
def reset_index(download_path, index_filename):
    """Reset index in file to 0.

    Overwrites (or creates) the index file so that it contains just "0".
    """
    rel_path = os.path.join(download_path, index_filename)

    # context manager guarantees the handle is flushed and closed
    with open(rel_path, 'w') as f:
        f.write("0")
        

def get_index(download_path, index_filename):
    """Read the stored index from the index file.

    Only the first line of the file is read and converted to an int.
    A missing file yields 0.
    """
    index_path = os.path.join(download_path, index_filename)

    try:
        handle = open(index_path, 'r')
    except FileNotFoundError:
        return 0

    with handle:
        first_line = handle.readline()
        return int(first_line)


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0.

    When index is 0 any existing file of that name is overwritten with a
    header row built from ``columns``; for any other index nothing is done.
    """
    if index == 0:
        rel_path = os.path.join(download_path, filename)

        # utf-8 matches the encoding used when rows are appended later
        with open(rel_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()

Steam Reviews

In [None]:
def parse_steamreviews_request(appid):
    """Parser to handle Steam store review summary data for one app.

    Requests the appreviews endpoint with num_per_page=0 and returns a copy of
    the response's 'query_summary' dict with an extra 'appid' key.

    Raises ValueError when the response carries no 'query_summary'.
    """
    url = "https://store.steampowered.com/appreviews/" + str(appid)
    parameters = {"json": 1, "num_per_page": "0", "language": "all", "purchase_type": "all"}
    json_data = get_request(url, parameters)

    # fail with a readable message instead of a TypeError/KeyError on bad responses
    if not isinstance(json_data, dict) or 'query_summary' not in json_data:
        raise ValueError("no review summary returned for appid {}".format(appid))

    # copy so the decoded response is not mutated
    summary = dict(json_data['query_summary'])
    summary["appid"] = appid
    return summary


# set files and columns
download_path = '../data/download'
steamreviews_data = 'steamreviews_data.csv'
steamreviews_index = 'steamreviews_index.txt'

steamreviews_columns = [
    'appid', 'review_score', 'review_score_desc', 'total_positive', 'total_negative', 'total_reviews'
]

# WARNING: restarting resets the stored index to 0, which makes
# prepare_data_file below overwrite any previously downloaded review data.
# Set restart_download to False to resume from the index saved by an earlier
# (possibly interrupted) run instead.
restart_download = True
if restart_download:
    reset_index(download_path, steamreviews_index)
index = get_index(download_path, steamreviews_index)

# Wipe data file if index is 0
prepare_data_file(download_path, steamreviews_data, index, steamreviews_columns)

# the list of appids comes from the app data file in the same download folder
full_steam_ids = pd.read_csv(os.path.join(download_path, "steam_app_data.csv"))

# pause=0: no delay is added between requests
process_batches(
    parser=parse_steamreviews_request,
    app_list=full_steam_ids["steam_appid"],
    download_path=download_path,
    data_filename=steamreviews_data,
    index_filename=steamreviews_index,
    columns=steamreviews_columns,
    begin=index,
    end=len(full_steam_ids),
    batchsize=300,
    pause=0
)

  full_steam_ids=pd.read_csv("../data/download/steam_app_data.csv")


Starting at index 0:

Exported lines 0-299 to steamreviews_data.csv. Batch 0 time: 0:02:11 (avg: 0:02:11, remaining: 11:51:56)
Exported lines 300-599 to steamreviews_data.csv. Batch 1 time: 0:02:18 (avg: 0:02:14, remaining: 12:08:50)
Exported lines 600-899 to steamreviews_data.csv. Batch 2 time: 0:02:10 (avg: 0:02:13, remaining: 11:58:41)
Exported lines 900-1199 to steamreviews_data.csv. Batch 3 time: 0:02:10 (avg: 0:02:12, remaining: 11:52:37)
Exported lines 1200-1499 to steamreviews_data.csv. Batch 4 time: 0:02:15 (avg: 0:02:13, remaining: 11:53:18)
Exported lines 1500-1799 to steamreviews_data.csv. Batch 5 time: 0:02:10 (avg: 0:02:12, remaining: 11:48:31)
Exported lines 1800-2099 to steamreviews_data.csv. Batch 6 time: 0:02:08 (avg: 0:02:11, remaining: 11:43:09)
Exported lines 2100-2399 to steamreviews_data.csv. Batch 7 time: 0:02:07 (avg: 0:02:11, remaining: 11:38:13)
Exported lines 2400-2699 to steamreviews_data.csv. Batch 8 time: 0:02:08 (avg: 0:02:11, remaining: 11:34:08)
Export

In [None]:
# Load the exported review summaries, using appid as the row index.
reviews_csv_path = "../data/download/steamreviews_data.csv"
steamreviews = pd.read_csv(reviews_csv_path, index_col="appid")

In [None]:
# Summarise the loaded review table: row count, column dtypes and non-null counts.
steamreviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98151 entries, 10 to 2830670
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   review_score       98151 non-null  int64 
 1   review_score_desc  98151 non-null  object
 2   total_positive     98151 non-null  int64 
 3   total_negative     98151 non-null  int64 
 4   total_reviews      98151 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 4.5+ MB


In [None]:
# How many apps share each total_reviews value (most frequent values first).
steamreviews["total_reviews"].value_counts()

total_reviews
0         23763
1          3730
2          3112
3          2888
4          2563
          ...  
16253         1
650144        1
27230         1
23957         1
2469          1
Name: count, Length: 5437, dtype: int64

In [None]:
# How many apps carry each textual rating label (most frequent labels first).
steamreviews["review_score_desc"].value_counts()

review_score_desc
No user reviews            23763
Very Positive              14193
Mixed                      13322
Positive                   11592
Mostly Positive             9500
1 user reviews              3730
2 user reviews              3112
3 user reviews              2888
4 user reviews              2563
5 user reviews              2371
Mostly Negative             2174
6 user reviews              2138
7 user reviews              1894
8 user reviews              1738
9 user reviews              1636
Overwhelmingly Positive     1160
Negative                     311
Very Negative                 53
Overwhelmingly Negative       13
Name: count, dtype: int64

In [None]:
# How many apps carry each numeric review_score.
# NOTE(review): in the saved output the count for score 0 (45833) equals the
# combined count of the "No user reviews" and "N user reviews" labels, so 0
# appears to mean "too few reviews for a rating" - confirm against the API docs.
steamreviews["review_score"].value_counts()

review_score
0    45833
8    14193
5    13322
7    11592
6     9500
4     2174
9     1160
3      311
2       53
1       13
Name: count, dtype: int64