This Notebook used code written by Nik Davis. This code will gather data from the Steam Store API using python. Please check out his amazing blog post! https://nik-davis.github.io/posts/2019/steam-data-collection/

In [39]:
# standard library imports
import csv
import datetime as dt
import json
import os
import statistics
import time

# third-party imports
import numpy as np
import pandas as pd
import requests
from requests.exceptions import SSLError

# customisations - ensure tables show all columns
pd.set_option("display.max_columns", 100)

In [45]:
import seaborn as sns
import matplotlib.pyplot as plt

In [46]:
def get_request(url,parameters=None, steamspy=False):
    """Return json-formatted response of a get request using optional parameters.
    
    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request
    
    Returns
    -------
    json_data
        json-formatted response (dict-like)
    """
    try:
        response = requests.get(url=url, params=parameters)
    except SSLError as s:
        print('SSL Error:', s)
        
        for i in range(5, 0, -1):
            print('\rWaiting... ({})'.format(i), end='')
            time.sleep(1)
        print('\rRetrying.' + ' '*10)
        
        # recursively try again
        return get_request(url, parameters, steamspy)
    
    if response:
        try:
            return response.json()
        except:
            False
    else:
        # We do not know how many pages steamspy has... and it seems to work well, so we will use no response to stop.
        if steamspy:
            return "stop"
        else :
            # response is none usually means too many requests. Wait and try again 
            print('No response, waiting 10 seconds...')
            time.sleep(10)
            print('Retrying.')
            return get_request(url, parameters, steamspy)

Generating List of App IDs

In [4]:
url = "https://steamspy.com/api.php"
parameters = {"request": "all"}

# request 'all' from steam spy and parse into dataframe
json_data = get_request(url, parameters=parameters)
steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')

# generate sorted app_list from steamspy data
app_list = steam_spy_all[['appid', 'name']].sort_values('appid').reset_index(drop=True)

# export disabled to keep consistency across download sessions
# app_list.to_csv('../data/download/app_list.csv', index=False)

# instead read from stored csv
#app_list = pd.read_csv('../data/download/app_list.csv')

# display first few rows
app_list.head()

Unnamed: 0,appid,name
0,10,Counter-Strike
1,20,Team Fortress Classic
2,30,Day of Defeat
3,40,Deathmatch Classic
4,50,Half-Life: Opposing Force


In [5]:
app_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   appid   1000 non-null   int64 
 1   name    1000 non-null   object
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


Defining Download Logic

In [26]:
def get_app_data(start, stop, parser, pause):
    """Return list of app data generated from parser.
    
    parser : function to handle request
    """
    app_data = []
    
    # iterate through each row of app_list, confined by start and stop
    for index, row in app_list[start:stop].iterrows():
        print('Current index: {}'.format(index), end='\r')
        
        appid = row['appid']
        name = row['name']

        # retrive app data for a row, handled by supplied parser, and append to list
        data = parser(appid, name)
        app_data.append(data)

        time.sleep(pause) # prevent overloading api with requests
    
    return app_data


def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Process app data in batches, writing directly to file.
    
    parser : custom function to format request
    app_list : dataframe of appid and name
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file
    
    Keyword arguments:
    
    begin : starting index (get from index_filename, default 0)
    end : index to finish (defaults to end of app_list)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (defualt 1)
    
    returns: none
    """
    print('Starting at index {}:\n'.format(begin))
    
    # by default, process all apps in app_list
    if end == -1:
        end = len(app_list) + 1
    
    # generate array of batch begin and end points
    batches = np.arange(begin, end, batchsize)
    batches = np.append(batches, end)
    
    apps_written = 0
    batch_times = []
    
    for i in range(len(batches) - 1):
        start_time = time.time()
        
        start = batches[i]
        stop = batches[i+1]
        
        app_data = get_app_data(start, stop, parser, pause)
        
        rel_path = os.path.join(download_path, data_filename)
        
        # writing app data to file
        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            
            for j in range(3,0,-1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)
            
            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')
            
        apps_written += len(app_data)
        
        idx_path = os.path.join(download_path, index_filename)
        
        # writing last index to file
        with open(idx_path, 'w') as f:
            index = stop
            print(index, file=f)
            
        # logging time taken
        end_time = time.time()
        time_taken = end_time - start_time
        
        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)
        
        est_remaining = (len(batches) - i - 2) * mean_time
        
        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))
        
        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))
            
    print('\nProcessing batches complete. {} apps written'.format(apps_written))

In [27]:
def reset_index(download_path, index_filename):
    """Reset index in file to 0."""
    rel_path = os.path.join(download_path, index_filename)
    
    with open(rel_path, 'w') as f:
        print(0, file=f)
        

def get_index(download_path, index_filename):
    """Retrieve index from file, returning 0 if file not found."""
    try:
        rel_path = os.path.join(download_path, index_filename)

        with open(rel_path, 'r') as f:
            index = int(f.readline())
    
    except FileNotFoundError:
        index = 0
        
    return index


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0."""
    if index == 0:
        rel_path = os.path.join(download_path, filename)

        with open(rel_path, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()

Downloading Steam Data

In [28]:
def getAppListBatch(url, parameters):
    json_data = get_request(url, parameters=parameters)
    steam_id = pd.DataFrame.from_dict(json_data["response"]["apps"])
    try:
        more_results = json_data["response"]["have_more_results"]
        last_appid =  json_data["response"]["last_appid"]
    except:
        more_results = False
        last_appid = False
    return more_results, steam_id, last_appid

def get_update_ids_old(updatedlist, oldlist):
    updatedlist['key1'] = 1
    oldlist['key2'] = 1
    updatedlist = pd.merge(updatedlist, oldlist, right_on=['steam_appid','name'],left_on=['appid','name'], how = 'outer')
    updatedlist = updatedlist[~(updatedlist.key2 == updatedlist.key1)]
    updatedlist = updatedlist.drop(['key1','key2','steam_appid'], axis=1)
    return updatedlist

def get_update_ids(idList, oldFullList):
    #We are going to forget about names and only care about IDs.
    idList = idList["appid"]
    oldFullList = oldFullList["steam_appid"]
    oldFullList.columns = ["appid"]
    updatedList = pd.concat([idList, oldFullList])
    updatedList = updatedList.drop_duplicates(keep=False)
    updatedList = updatedList.reset_index(drop=True)
    return updatedList

In [29]:
def getAppList():
    with open('../api/steam_key.txt') as f:
        key = f.read()

    url = "https://api.steampowered.com/IStoreService/GetAppList/v1/?"
    parameters = {"key": key}
    more_results = True
    begin = True
    # from the request we get the more_results flag and also the last_appid, so we use them for the next requests.
    while (more_results):
        more_results, steam_ids, last_appid = getAppListBatch(url, parameters)
        parameters["last_appid"] = last_appid
        if (begin):
            steam_allids = steam_ids
            begin = False
        else:
            steam_allids = pd.concat([steam_allids, steam_ids])
    return steam_allids
# request 'all' from steam spy and parse into dataframe

In [34]:

def parse_steam_request(appid, name):
    """Unique parser to handle data from Steam Store API.
    
    Returns : json formatted data (dict-like)
    """
    with open('../api/steam_key.txt') as f:
        key = f.read()
        
    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid, "key": key}
    
    json_data = get_request(url, parameters=parameters)
    json_app_data = json_data[str(appid)]
    
    if json_app_data['success']:
        data = json_app_data['data']
    else:
        data = {'steam_appid': appid}
        
    return data


# Set file parameters
download_path = '../data/download/'
steam_app_data = 'steam_app_data.csv'
steam_app_data_delta = 'steam_app_data_delta.csv'
steam_index = 'steam_index.txt'

steam_columns = [
    'type', 'name', 'steam_appid', 'required_age', 'is_free', 'controller_support',
    'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame',
    'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements',
    'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice',
    'developers', 'publishers', 'demos', 'price_overview', 'packages', 'package_groups',
    'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots',
    'movies', 'recommendations', 'achievements', 'release_date', 'support_info',
    'background', 'content_descriptors'
]

# Overwrites last index for demonstration (would usually store highest index so can continue across sessions)
if (os.path.isfile(download_path+steam_app_data_delta) == False):
    reset_index(download_path, steam_index)

# Retrieve last index downloaded from file
index = get_index(download_path, steam_index)

# Wipe or create data file and write headers if no previous  data
if (os.path.isfile(download_path+steam_app_data) == False):
    prepare_data_file(download_path, steam_app_data, index, steam_columns)
    
# Wipe or create data file delta and write headers if index is 0
if (os.path.isfile(download_path+steam_app_data_delta) == False):
    prepare_data_file(download_path, steam_app_data_delta, index, steam_columns)
    
    
# Here we get the list of appids from steam
full_steam_ids = getAppList()

# Here we get the real list of ids not yet in our dataframe. If this is the first time we are downloading the data, we can skip
# This step and instead use the full app_list.
try:
    oldlist = pd.read_csv('../data/download/steam_app_data.csv', usecols = ['name','steam_appid'])
    steam_ids = get_update_ids(full_steam_ids, oldlist)
except FileNotFoundError:
    print("Pre-existing file not found. First time downloading full app data from steam. This will take a while.\n")
    steam_ids = full_steam_ids

In [35]:
print("New IDs detected: "+str(len(steam_ids)))

New IDs detected: 97470


In [41]:
# I separated the long process to be able to debug it better.
# Set end and chunksize for demonstration - remove to run through entire app list
# Here by default we passed "app_list" that contained all the information and saved it, now we will modify it a bit
# And add pre-processing and post-processing
print("Adding "+str(len(steam_ids))+" new ids.\n")
process_batches(
    parser=parse_steam_request,
    app_list=steam_ids,
    download_path=download_path,
    data_filename=steam_app_data_delta,
    index_filename=steam_index,
    columns=steam_columns,
    begin=745,
    #end=10,
    #batchsize=5
)

try:
    oldlist = pd.read_csv('../data/download/steam_app_data.csv')
    # We change the old file to backup, so remove any backup named this way before...
    os.rename('../data/download/steam_app_data.csv', '../data/download/steam_app_data_backup.csv')
    newlist = pd.read_csv('../data/download/steam_app_data_delta.csv')
    oldlist = pd.concat([oldlist, newlist], ignore_index=True)
    oldlist.to_csv('../data/download/steam_app_data.csv', index=False)
except FileNotFoundError:
    os.rename('../data/download/steam_app_data_delta.csv', '../data/download/steam_app_data.csv')

Adding 97470 new ids.

Starting at index 745:

Exported lines 745-844 to steam_app_data_delta.csv. Batch 0 time: 0:02:36 (avg: 0:02:36, remaining: 1 day, 17:52:52)
Exported lines 845-944 to steam_app_data_delta.csv. Batch 1 time: 0:02:38 (avg: 0:02:37, remaining: 1 day, 18:06:14)
Exported lines 945-1044 to steam_app_data_delta.csv. Batch 2 time: 0:01:26 (avg: 0:02:13, remaining: 1 day, 11:44:28)
Exported lines 1045-1144 to steam_app_data_delta.csv. Batch 3 time: 0:00:02 (avg: 0:01:40, remaining: 1 day, 2:52:43)
Exported lines 1145-1244 to steam_app_data_delta.csv. Batch 4 time: 0:00:02 (avg: 0:01:21, remaining: 21:33:39)
Exported lines 1245-1344 to steam_app_data_delta.csv. Batch 5 time: 0:00:02 (avg: 0:01:07, remaining: 18:00:56)
Exported lines 1345-1444 to steam_app_data_delta.csv. Batch 6 time: 0:00:02 (avg: 0:00:58, remaining: 15:29:00)
Exported lines 1445-1544 to steam_app_data_delta.csv. Batch 7 time: 0:00:02 (avg: 0:00:51, remaining: 13:35:02)
Exported lines 1545-1644 to steam_a

FileExistsError: [WinError 183] Cannot create a file when that file already exists: '../data/download/steam_app_data.csv' -> '../data/download/steam_app_data_backup.csv'

In [42]:
steam_app_data = pd.read_csv('../data/download/steam_app_data.csv')

In [44]:


steam_app_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 39 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   type                     988 non-null    object 
 1   name                     1010 non-null   object 
 2   steam_appid              1010 non-null   int64  
 3   required_age             988 non-null    float64
 4   is_free                  988 non-null    object 
 5   controller_support       400 non-null    object 
 6   dlc                      563 non-null    object 
 7   detailed_description     986 non-null    object 
 8   about_the_game           986 non-null    object 
 9   short_description        986 non-null    object 
 10  fullgame                 2 non-null      object 
 11  supported_languages      986 non-null    object 
 12  header_image             988 non-null    object 
 13  website                  868 non-null    object 
 14  pc_requirements         

Downloading SteamSpy Data

In [14]:
def parse_steamspy_request(appid, name):
    """Parser to handle SteamSpy API data."""
    url = "https://steamspy.com/api.php"
    parameters = {"request": "appdetails", "appid": appid}
    
    json_data = get_request(url, parameters)
    return json_data


# set files and columns
download_path = '../data/download'
steamspy_data = 'steamspy_data.csv'
steamspy_index = 'steamspy_index.txt'

steamspy_columns = [
    'appid', 'name', 'developer', 'publisher', 'score_rank', 'positive',
    'negative', 'userscore', 'owners', 'average_forever', 'average_2weeks',
    'median_forever', 'median_2weeks', 'price', 'initialprice', 'discount',
    'languages', 'genre', 'ccu', 'tags'
]

reset_index(download_path, steamspy_index)
index = get_index(download_path, steamspy_index)

# Wipe data file if index is 0
prepare_data_file(download_path, steamspy_data, index, steamspy_columns)

process_batches(
    parser=parse_steamspy_request,
    app_list=app_list,
    download_path=download_path, 
    data_filename=steamspy_data,
    index_filename=steamspy_index,
    columns=steamspy_columns,
    begin=index,
    batchsize=10,
    pause=0.3
)

Starting at index 0:

Exported lines 0-9 to steamspy_data.csv. Batch 0 time: 0:00:10 (avg: 0:00:10, remaining: 0:17:26)
Exported lines 10-19 to steamspy_data.csv. Batch 1 time: 0:00:10 (avg: 0:00:10, remaining: 0:17:17)
Exported lines 20-29 to steamspy_data.csv. Batch 2 time: 0:00:12 (avg: 0:00:11, remaining: 0:17:42)
Exported lines 30-39 to steamspy_data.csv. Batch 3 time: 0:00:11 (avg: 0:00:11, remaining: 0:17:40)
Exported lines 40-49 to steamspy_data.csv. Batch 4 time: 0:00:11 (avg: 0:00:11, remaining: 0:17:29)
Exported lines 50-59 to steamspy_data.csv. Batch 5 time: 0:00:12 (avg: 0:00:11, remaining: 0:17:34)
Exported lines 60-69 to steamspy_data.csv. Batch 6 time: 0:00:10 (avg: 0:00:11, remaining: 0:17:07)
Exported lines 70-79 to steamspy_data.csv. Batch 7 time: 0:00:10 (avg: 0:00:11, remaining: 0:16:42)
Exported lines 80-89 to steamspy_data.csv. Batch 8 time: 0:00:10 (avg: 0:00:11, remaining: 0:16:25)
Exported lines 90-99 to steamspy_data.csv. Batch 9 time: 0:00:11 (avg: 0:00:11, 