# Steam Review Collector

Simple functions for collecting game reviews via the Steam Web API.

# API description

The Steam Web API terms of use allow a maximum of 100,000 requests per day (https://steamcommunity.com/dev/apiterms).

Review API:
https://partner.steamgames.com/doc/store/getreviews

## Response:
- success - 1 if the query was successful
- query_summary - Returned in the first request
    - num_reviews - The number of reviews returned in this response
    - review_score - The review score
    - review_score_desc - The description of the review score
    - total_positive - Total number of positive reviews
    - total_negative - Total number of negative reviews
    - total_reviews - Total number of reviews matching the query parameters
- cursor - The value to pass into the next request as the cursor to retrieve the next batch of reviews

### Reviews
- recommendationid - The unique id of the recommendation
- author
    - steamid - the user’s SteamID
    - num_games_owned - number of games owned by the user
    - num_reviews - number of reviews written by the user
    - playtime_forever - lifetime playtime tracked in this app
    - playtime_last_two_weeks - playtime tracked in the past two weeks for this app
    - playtime_at_review - playtime when the review was written
    - last_played - time the user last played (unix timestamp)
- language - language the user indicated when authoring the review
- review - text of written review
- timestamp_created - date the review was created (unix timestamp)
- timestamp_updated - date the review was last updated (unix timestamp)
- voted_up - true means it was a positive recommendation
- votes_up - the number of users that found this review helpful
- votes_funny - the number of users that found this review funny
- weighted_vote_score - helpfulness score
- comment_count - number of comments posted on this review
- steam_purchase - true if the user purchased the game on Steam
- received_for_free - true if the user checked a box saying they got the app for free
- written_during_early_access - true if the user posted this review while the game was in Early Access
- developer_response - text of the developer response, if any
- timestamp_dev_responded - Unix timestamp of when the developer responded, if applicable

In [1]:
import os
import json
import requests
import pandas as pd
import numpy as np
from datetime import datetime

import urllib.parse

In [99]:
# Single request: ask for exactly one recent English review of app 424370,
# starting from the initial cursor '*'.
r_string = (
    'https://store.steampowered.com/appreviews/424370'
    '?json=1&num_per_page=1&filter=recent&language=english&cursor=*'
)
r = requests.get(r_string)

In [100]:
# Check the HTTP status code of the single request made above
# (the notebook displays the value of this expression).
r.status_code

200

In [2]:
def fetch_steam_reviews(app_id, filt='recent', num_per_page=100, language='english', cursor='*', meta=False, timeout=30):
    """Fetch one page of reviews for a game from the Steam store review endpoint.

    Implemented according to the Steam Web API specification for fetching game reviews.
    https://partner.steamgames.com/doc/store/getreviews

    Each returned review is flattened and sanitised: the author statistics are moved to the
    top level of the review, while the recommendation id, the author object (including the
    steam id) and any developer response fields are removed.

    Args:
        app_id: Steam id of the game, found via Steam Store Page of a game.
        filt: Optional;  How the reviews are ordered.
        num_per_page: Optional;  The number of results to return for each query, 100 is maximum.
        language: Optional;  Language filter for the reviews.
        cursor: Optional;  The cursor used to fetch the next set of reviews. It is inserted into
            the URL verbatim, so any value other than the initial '*' must already be URL-encoded
            by the caller (e.g. with urllib.parse.quote_plus).
        meta: Optional;  Used to return additional meta information, a summary of the reviews of the game.
        timeout: Optional;  Seconds to wait for the server before the request is aborted.

    Returns:
        A dictionary with the keys 'meta' (the query summary, or None when meta is False),
        'reviews' (list of flattened review dictionaries) and 'cursor' (the raw, not URL-encoded,
        cursor for the next page).

    Raises:
        requests.HTTPError: If the status code of the response is not 200.
        ValueError: If the payload does not report success.
    """

    r_string = f'https://store.steampowered.com/appreviews/{app_id}?json=1&num_per_page={num_per_page}&filter={filt}&language={language}&cursor={cursor}'
    # Without a timeout a stalled connection would block the collection forever.
    r = requests.get(r_string, timeout=timeout)

    if r.status_code != requests.codes.ok:
        # raise_for_status only raises for 4xx/5xx codes; raise explicitly for every other
        # unexpected code so the function never silently returns None.
        r.raise_for_status()
        raise requests.HTTPError(f'Unexpected status code {r.status_code} for {r_string}', response=r)

    payload = r.json()

    # Fail with a clear message instead of an opaque KeyError further down.
    if payload.get('success') != 1:
        raise ValueError(f"Something was wrong with retrieving the payload. (success={payload.get('success')!r})")

    # Save meta data.
    metadata = payload['query_summary'] if meta else None

    review_list = []
    for review in payload['reviews']:
        # Detach the author object; this also drops the identifiable steam id.
        author = review.pop('author')

        # Move information about the author to the top level of the review.
        review['num_games_owned'] = author['num_games_owned']
        review['num_reviews'] = author['num_reviews']

        # Playtime statistics are not always present; default to no playtime.
        review['playtime_forever'] = author.get('playtime_forever', 0)
        review['playtime_last_two_weeks'] = author.get('playtime_last_two_weeks', 0)
        review['playtime_at_review'] = author.get('playtime_at_review', 0)
        # A missing timestamp is marked with NaN rather than a fake value.
        review['last_played'] = author.get('last_played', np.nan)

        # Remove identifiable information.
        review.pop('recommendationid', None)

        # Remove developer response and response date if existing.
        review.pop('developer_response', None)
        review.pop('timestamp_dev_responded', None)

        review_list.append(review)

    # Cursor for the next page of reviews.
    rec_cursor = payload['cursor']

    return {'meta': metadata, 'reviews': review_list, 'cursor': rec_cursor}

In [109]:
# Example: request zero reviews per page so that only the review summary
# (meta data) of the game is of interest.
res = fetch_steam_reviews(app_id='424370', meta=True, cursor='*', num_per_page=0)

In [3]:
def process_n_reviews(app_id, n_calls, output_folder, init_cursor=False, **kwargs):
    """Run a bounded number of review queries and store each page as a parquet file.

    Uses fetch_steam_reviews to run at most n_calls queries, following the cursor returned by
    each query, in order to collect a larger dataset. The bound is used to safely query the API
    without overusing it according to https://steamcommunity.com/dev/apiterms.
    Every page is written to 'review_<app_id>_<suffix>.parquet' and logged as one line in the
    tab-separated file 'metadata_<app_id>.txt' in the output folder.

    Args:
        app_id: Steam id of the game, found via Steam Store Page of a game.
        n_calls: Maximum number of queries to the API.
        output_folder: Folder where parquet files are stored.
        init_cursor: Optional;  The cursor used for the first query. It is passed on verbatim,
            '*' is used when it is not given.
        **kwargs: Optional;  For future use, currently ignored.

    Returns:
        N/A

    Raises:
        IOError if output folder is not found.
    """

    # Check output folder.
    if not os.path.exists(output_folder):
        raise IOError(f"Specified output path does not exist! {output_folder}")

    # Continue numbering after the highest suffix already used for this app. Only exact
    # 'review_<app_id>_<number>.parquet' names are considered, so files of other apps whose id
    # merely starts with app_id are ignored and gaps in the numbering cannot cause an existing
    # file to be overwritten. Not checking for duplicate information!
    prefix = f'review_{app_id}_'
    extension = '.parquet'
    used_suffixes = []
    for existing in os.listdir(output_folder):
        if existing.startswith(prefix) and existing.endswith(extension):
            stem = existing[len(prefix):-len(extension)]
            if stem.isdecimal():
                used_suffixes.append(int(stem))
    suffix = max(used_suffixes, default=0) + 1

    # Check if metadata file exists, create it with a header line otherwise.
    meta_fullpath = os.path.join(output_folder, f'metadata_{app_id}.txt')
    if not os.path.isfile(meta_fullpath):
        with open(meta_fullpath, 'w') as meta_handle:
            meta_handle.write('Timestamp\tIteration\tCurrent Cursor\tNext Cursor\tFilename\n')

    # Initialized to * for the first fetch as specified in the API documents.
    next_cursor = init_cursor if init_cursor else '*'

    # Iterate using the cursor from previous query
    for n in range(n_calls):
        res = fetch_steam_reviews(app_id, cursor=next_cursor, meta=False, filt='recent', num_per_page=100, language='english')

        # The returned cursor is raw; it has to be URL-encoded before it can be sent back.
        encoded_cursor = urllib.parse.quote_plus(res['cursor'])

        # Stop if the cursor did not advance or the page is empty, then there is no more data
        # to collect (an empty page would also lack the columns converted below).
        if next_cursor == encoded_cursor or not res['reviews']:
            print("No more data to fetch, closing!")
            break

        # Create df and convert the unix timestamps to datetimes
        df = pd.DataFrame.from_dict(res['reviews'])

        # last_played may be missing (NaN), which is coerced to NaT.
        df['last_played'] = pd.to_datetime(df['last_played'], unit='s', errors='coerce')
        df['timestamp_created'] = pd.to_datetime(df['timestamp_created'], unit='s')
        df['timestamp_updated'] = pd.to_datetime(df['timestamp_updated'], unit='s')

        # Cast dtypes
        df = df.convert_dtypes()
        df['weighted_vote_score'] = df['weighted_vote_score'].astype('float32') # This will change the value, but not significantly.

        # Write to disk
        filename = f'review_{app_id}_{suffix}.parquet'
        path = os.path.join(output_folder, filename)
        df.to_parquet(path)

        # Update metadata file
        with open(meta_fullpath, 'a') as meta_handle:
            meta_handle.write(f"{datetime.now()}\t{n}\t{next_cursor}\t{encoded_cursor}\t{filename}\n")

        # Step variables
        next_cursor = encoded_cursor
        suffix += 1
        
        
        

In [112]:
# Collect the validation data set: at most 700 queries for app 424370.
process_n_reviews(app_id='424370', n_calls=700, output_folder='D:\\data\\validation', init_cursor='*')

No more data to fetch, closing!


In [None]:
# Example for reading one of the stored parquet files into a pandas dataframe.
# process_n_reviews names its output 'review_<app_id>_<suffix>.parquet'.
dg = pd.read_parquet('D:\\data\\validation\\review_424370_5.parquet')

In [None]:
# Print a summary of the dataframe loaded in the previous cell.
dg.info()

In [2]:
def merge_data(input_folder):
    """Merge all parquet files in a folder into a single parquet file.

    After multiple data files are saved in a folder they can be merged to a single file for
    easier handling upon analysis. The result is written to 'review_merged.parquet' inside the
    input folder. A merged file left over from an earlier run is not read again, so running the
    merge repeatedly does not duplicate rows.

    Args:
        input_folder: Folder where parquet files are stored.

    Returns:
        N/A

    Raises:
        IOError if input folder is not found.
        IOError if no data is imported with the specified input folder.
    """

    # Check input folder.
    if not os.path.exists(input_folder):
        raise IOError(f"Specified input path does not exist! {input_folder}")

    merged_name = 'review_merged.parquet'

    # Collect the content of all data files; sorted so the row order is reproducible.
    frames = []
    for file in sorted(os.listdir(input_folder)):
        # Skip other file types and the output of a previous merge.
        if not file.endswith(".parquet") or file == merged_name:
            continue

        df_tmp = pd.read_parquet(os.path.join(input_folder, file))

        # Empty files contribute nothing to the merged data.
        if not df_tmp.empty:
            frames.append(df_tmp)

    if not frames:
        raise IOError('Error, the dataframe was empty!')

    # A single concatenation replaces the repeated DataFrame.append calls, which copied the
    # accumulated data on every iteration and are no longer available in pandas 2.
    df_out = pd.concat(frames, ignore_index=True)
    df_out.to_parquet(os.path.join(input_folder, merged_name))

In [3]:
# Example: combine every parquet file of the test/train folder into one file.
merge_data(input_folder='D:\\data\\test_train')

# Test data extraction: GTA V

In [6]:
# Collect the test/train data set: at most 3000 queries for app 271590.
process_n_reviews(app_id='271590', n_calls=3000, output_folder='D:\\data\\test_train', init_cursor='*')

No more data to fetch, closing!
