In [1]:
import csv
import datetime as dt
import json
import os
import statistics
import time
import numpy as np
import pandas as pd
import requests

# customisations - ensure tables show all columns
# Use the fully qualified option name: the bare "max_columns" pattern matches more
# than one option on recent pandas versions and is rejected as ambiguous.
pd.set_option("display.max_columns", 100)

# Steam API Data Collection

For this project I will be creating my own dataset so that I have the most current, up-to-date data. It is built from SteamSpy's API and Steam's API. I am currently using SteamSpy's API to create an index file so that I can make sure I gather all the data I need from Steam's API. From Steam's API I am looking to use the category data and how many times each game was recommended to others. In the future I want to look at SteamSpy's data on how many users own each app and the price of each game.

I will design an API call function, download the information as JSON, and then save it as a CSV file for easier access.

In [2]:
# create a function for my get request
def get_request(url, parameters=None):
    """Return json-formatted response of a get request using optional parameters.

    The request is retried until it succeeds: after an SSL error the function
    waits 5 seconds, and after an unsuccessful HTTP response it waits 10 seconds.
    Retries are done in a loop (not recursively) so that a long outage cannot
    exhaust the interpreter's recursion limit.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request

    Returns
    -------
    json_data
        json-formatted response (dict-like)
    """
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        except requests.exceptions.SSLError as s:
            # SSLError is referenced through the requests package because it is
            # not imported as a bare name anywhere in this notebook
            print('SSL Error:', s)

            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)

            continue

        if response:
            return response.json()

        # a Response object is falsy when its status code signals an error
        # (4xx/5xx); this usually means too many requests. Wait and try again
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

Here I am gathering the data from SteamSpy and creating my index file, which I will compare against the downloaded data to make sure I have everything I need to run my analysis.

In [3]:
# SteamSpy endpoint and the query asking for its complete app listing
url = "https://steamspy.com/api.php"
parameters = dict(request="all")

# fetch the listing and turn it into a dataframe with one row per top-level key
json_data = get_request(url, parameters=parameters)
steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')

steam_spy_all.head()

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu
570,570,Dota 2,Valve,Valve,,1534249,317680,0,"200,000,000 .. 500,000,000",37316,1718,727,1044,0,0,0,852995
730,730,Counter-Strike: Global Offensive,"Valve, Hidden Path Entertainment",Valve,,5941536,786910,0,"50,000,000 .. 100,000,000",28971,789,6384,243,0,0,0,874053
440,440,Team Fortress 2,Valve,Valve,,846180,57412,0,"50,000,000 .. 100,000,000",9747,1384,302,336,0,0,0,108900
1063730,1063730,New World,Amazon Games,Amazon Games,,163364,74344,0,"50,000,000 .. 100,000,000",9802,1680,4099,1087,1999,3999,50,127379
271590,271590,Grand Theft Auto V,Rockstar North,Rockstar Games,,1228710,213334,0,"20,000,000 .. 50,000,000",14465,605,6234,143,2998,2998,0,140671


In [4]:
# build the app index from the steamspy data, ordered by appid
app_list = (
    steam_spy_all[['appid', 'name']]
    .sort_values('appid')
    .reset_index(drop=True)
)

# export intentionally left disabled so every download session uses the same index
# app_list.to_csv('./data/app_list.csv', index=False)

# use the stored copy of the index rather than the freshly generated one
app_list = pd.read_csv('./data/app_list.csv')

# preview the first few rows
app_list.head()

Unnamed: 0,appid,name
0,10,Counter-Strike
1,20,Team Fortress Classic
2,30,Day of Defeat
3,40,Deathmatch Classic
4,50,Half-Life: Opposing Force


Here I am creating the functions that fetch app data for the apps in my previously saved index and then process that data in batches, making sure I get data for every app I already have in the index.

In [5]:
def _collect_app_data(apps, start, stop, parser, pause):
    """Run parser over rows start..stop-1 of apps and return the list of results."""
    app_data = []

    # iterate through each row of apps, confined by start and stop
    for index, row in apps.iloc[start:stop].iterrows():
        print('Current index: {}'.format(index), end='\r')

        # retrieve app data for a row, handled by supplied parser, and append to list
        app_data.append(parser(row['appid'], row['name']))

        time.sleep(pause)  # prevent overloading api with requests

    return app_data


def get_app_data(start, stop, parser, pause):
    """Return list of app data generated from parser.

    Reads rows start..stop-1 of the module-level ``app_list`` dataframe.

    start : position of the first row to process
    stop : position one past the last row to process
    parser : function to handle request, called as parser(appid, name)
    pause : seconds to sleep after each parser call
    """
    return _collect_app_data(app_list, start, stop, parser, pause)


def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Process app data in batches, writing directly to file.

    After every batch the rows are appended to the data file and the position
    of the next unprocessed row is written to the index file, so an interrupted
    download can be resumed by passing that stored value as ``begin``.

    parser : custom function to format request, called as parser(appid, name)
    app_list : dataframe of appid and name (this dataframe is the one iterated)
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file

    Keyword arguments:

    begin : starting index (get from index_filename, default 0)
    end : index to finish, exclusive (defaults to end of app_list; values past
          the end of app_list are clamped to it)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (default 1)

    returns: none
    """
    print('Starting at index {}:\n'.format(begin))

    # by default, process all apps in app_list; never run past its last row so
    # that no empty trailing batch is processed and the stored index stays valid
    total_apps = len(app_list)
    if end == -1 or end > total_apps:
        end = total_apps

    # (start, stop) pairs with stop exclusive; the final batch may be shorter
    batches = [(start, min(start + batchsize, end))
               for start in range(begin, end, batchsize)]

    data_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)

    apps_written = 0
    batch_times = []

    for i, (start, stop) in enumerate(batches):
        start_time = time.time()

        app_data = _collect_app_data(app_list, start, stop, parser, pause)

        # writing app data to file
        with open(data_path, 'a', newline='', encoding='utf-8') as f:
            # keys of a record that are not listed in columns are dropped
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

            # short countdown so the user knows not to interrupt during the write
            for j in range(3, 0, -1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)

            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        apps_written += len(app_data)

        # writing last index to file (position of the next row to process)
        with open(idx_path, 'w') as f:
            print(stop, file=f)

        # logging time taken
        time_taken = time.time() - start_time

        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)

        # batches still to run after this one, at the average pace so far
        est_remaining = (len(batches) - i - 1) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} apps written'.format(apps_written))

I created some helper functions so that if there is a problem I can easily troubleshoot by finding the index my batches stopped at, and reset the index if it is corrupted so I can try again.

In [6]:
def reset_index(download_path, index_filename):
    """Overwrite the index file so that it contains 0."""
    index_path = os.path.join(download_path, index_filename)

    # the file is created if missing and truncated otherwise
    with open(index_path, 'w') as handle:
        handle.write('0\n')
        

def get_index(download_path, index_filename):
    """Read the stored index from file; a missing file counts as index 0."""
    index_path = os.path.join(download_path, index_filename)

    try:
        with open(index_path, 'r') as handle:
            # only the first line is read and parsed as an integer
            return int(handle.readline())
    except FileNotFoundError:
        return 0


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0.

    download_path : directory holding the data file
    filename : name of the data file inside download_path
    index : stored download index; any value other than 0 leaves the file untouched
    columns : column names written as the header row

    An existing file is overwritten when index is 0. The header is written as
    UTF-8 so that it uses the same encoding as the rows later appended to the
    file by process_batches.
    """
    if index != 0:
        return

    rel_path = os.path.join(download_path, filename)

    with open(rel_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()

Here is my function to parse each response from the Steam API, followed by the start of my Steam data download.

In [7]:
def parse_steam_request(appid, name):
    """Unique parser to handle data from Steam Store API.

    appid : id of the app to look up
    name : app name, used only for the fallback record

    Returns : json formatted data (dict-like). When the API reports no success
    for the app, or the payload does not have the expected structure, a minimal
    record {'name': name, 'steam_appid': appid} is returned instead so the app
    still gets a row in the output.
    """
    url = "https://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}

    json_data = get_request(url, parameters=parameters)

    # the payload is expected to be keyed by the appid as a string; guard against
    # a null body or a missing key instead of raising TypeError/KeyError
    app_entry = json_data.get(str(appid)) if isinstance(json_data, dict) else None

    if isinstance(app_entry, dict) and app_entry.get('success') and 'data' in app_entry:
        return app_entry['data']

    return {'name': name, 'steam_appid': appid}


# File locations for the Steam store download
download_path = './data'
steam_app_data = 'steam_app_data.csv'
steam_index = 'steam_index.txt'

# Fields of each app record that are written to the csv (in this order)
steam_columns = [
    'type',
    'name',
    'steam_appid',
    'required_age',
    'is_free',
    'controller_support',
    'dlc',
    'detailed_description',
    'about_the_game',
    'short_description',
    'fullgame',
    'supported_languages',
    'header_image',
    'website',
    'pc_requirements',
    'mac_requirements',
    'linux_requirements',
    'legal_notice',
    'drm_notice',
    'ext_user_account_notice',
    'developers',
    'publishers',
    'demos',
    'price_overview',
    'packages',
    'package_groups',
    'platforms',
    'metacritic',
    'reviews',
    'categories',
    'genres',
    'screenshots',
    'movies',
    'recommendations',
    'achievements',
    'release_date',
    'support_info',
    'background',
    'content_descriptors',
]

# Demonstration only: force the stored index back to 0
# (normally the highest index is kept so a download can continue across sessions)
reset_index(download_path, steam_index)

# Read back the index to start from
index = get_index(download_path, steam_index)

# With index 0 this wipes/creates the data file and writes the header row
prepare_data_file(download_path, steam_app_data, index, steam_columns)

# Kick off the download
process_batches(
    parser=parse_steam_request,
    app_list=app_list,
    download_path=download_path,
    data_filename=steam_app_data,
    index_filename=steam_index,
    columns=steam_columns,
    begin=index,
    batchsize=5,
)

Starting at index 0:

Exported lines 0-4 to steam_app_data.csv. Batch 0 time: 0:00:07 (avg: 0:00:07, remaining: 0:24:52)
Exported lines 5-9 to steam_app_data.csv. Batch 1 time: 0:00:07 (avg: 0:00:07, remaining: 0:24:23)
Exported lines 10-14 to steam_app_data.csv. Batch 2 time: 0:00:07 (avg: 0:00:07, remaining: 0:24:13)
Exported lines 15-19 to steam_app_data.csv. Batch 3 time: 0:00:08 (avg: 0:00:07, remaining: 0:24:19)
Exported lines 20-24 to steam_app_data.csv. Batch 4 time: 0:00:08 (avg: 0:00:08, remaining: 0:24:31)
Exported lines 25-29 to steam_app_data.csv. Batch 5 time: 0:00:08 (avg: 0:00:08, remaining: 0:24:34)
Exported lines 30-34 to steam_app_data.csv. Batch 6 time: 0:00:07 (avg: 0:00:08, remaining: 0:24:23)
Exported lines 35-39 to steam_app_data.csv. Batch 7 time: 0:00:08 (avg: 0:00:08, remaining: 0:24:18)
Exported lines 40-44 to steam_app_data.csv. Batch 8 time: 0:00:07 (avg: 0:00:08, remaining: 0:24:03)
Exported lines 45-49 to steam_app_data.csv. Batch 9 time: 0:00:08 (avg: 0

In [8]:
# preview the first five rows of the downloaded Steam data
pd.read_csv('./data/steam_app_data.csv').head(5)

Unnamed: 0,type,name,steam_appid,required_age,is_free,controller_support,dlc,detailed_description,about_the_game,short_description,fullgame,supported_languages,header_image,website,pc_requirements,mac_requirements,linux_requirements,legal_notice,drm_notice,ext_user_account_notice,developers,publishers,demos,price_overview,packages,package_groups,platforms,metacritic,reviews,categories,genres,screenshots,movies,recommendations,achievements,release_date,support_info,background,content_descriptors
0,game,Counter-Strike,10,0,False,,,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,,"English<strong>*</strong>, French<strong>*</st...",https://cdn.akamai.steamstatic.com/steam/apps/...,,{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...",,,,['Valve'],['Valve'],,"{'currency': 'CAD', 'initial': 1149, 'final': ...","[574941, 7]","[{'name': 'default', 'title': 'Buy Counter-Str...","{'windows': True, 'mac': True, 'linux': True}","{'score': 88, 'url': 'https://www.metacritic.c...",,"[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}]","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...",,{'total': 124964},,"{'coming_soon': False, 'date': '1 Nov, 2000'}","{'url': 'http://steamcommunity.com/app/10', 'e...",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [2, 5], 'notes': 'Includes intense vio..."
1,game,Team Fortress Classic,20,0,False,,,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...,,"English, French, German, Italian, Spanish - Sp...",https://cdn.akamai.steamstatic.com/steam/apps/...,,{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...",,,,['Valve'],['Valve'],,"{'currency': 'CAD', 'initial': 569, 'final': 5...",[29],"[{'name': 'default', 'title': 'Buy Team Fortre...","{'windows': True, 'mac': True, 'linux': True}",,,"[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}]","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...",,{'total': 4848},,"{'coming_soon': False, 'date': '1 Apr, 1999'}","{'url': '', 'email': ''}",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [2, 5], 'notes': 'Includes intense vio..."
2,game,Day of Defeat,30,0,False,,,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,,"English, French, German, Italian, Spanish - Spain",https://cdn.akamai.steamstatic.com/steam/apps/...,http://www.dayofdefeat.com/,{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...",,,,['Valve'],['Valve'],,"{'currency': 'CAD', 'initial': 569, 'final': 5...",[30],"[{'name': 'default', 'title': 'Buy Day of Defe...","{'windows': True, 'mac': True, 'linux': True}","{'score': 79, 'url': 'https://www.metacritic.c...",,"[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}]","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...",,{'total': 3294},,"{'coming_soon': False, 'date': '1 May, 2003'}","{'url': '', 'email': ''}",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [], 'notes': None}"
3,game,Deathmatch Classic,40,0,False,,,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,,"English, French, German, Italian, Spanish - Sp...",https://cdn.akamai.steamstatic.com/steam/apps/...,,{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...",,,,['Valve'],['Valve'],,"{'currency': 'CAD', 'initial': 569, 'final': 5...",[31],"[{'name': 'default', 'title': 'Buy Deathmatch ...","{'windows': True, 'mac': True, 'linux': True}",,,"[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}]","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...",,{'total': 1608},,"{'coming_soon': False, 'date': '1 Jun, 2001'}","{'url': '', 'email': ''}",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [], 'notes': None}"
4,game,Half-Life: Opposing Force,50,0,False,,,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,,"English, French, German, Korean",https://cdn.akamai.steamstatic.com/steam/apps/...,,{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...",,,,['Gearbox Software'],['Valve'],,"{'currency': 'CAD', 'initial': 569, 'final': 5...",[32],"[{'name': 'default', 'title': 'Buy Half-Life: ...","{'windows': True, 'mac': True, 'linux': True}",,,"[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}]","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...",,{'total': 12910},,"{'coming_soon': False, 'date': '1 Nov, 1999'}","{'url': 'https://help.steampowered.com', 'emai...",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [], 'notes': None}"


I am doing the same thing as above, except for the SteamSpy data.

In [9]:
def parse_steamspy_request(appid, name):
    """Fetch the SteamSpy 'appdetails' record for one app.

    name is accepted so the signature matches the other parser, but it is not
    used. The decoded response is returned unchanged.
    """
    query = {"request": "appdetails", "appid": appid}
    return get_request("https://steamspy.com/api.php", query)


# File locations for the SteamSpy download
download_path = './data'
steamspy_data = 'steamspy_data.csv'
steamspy_index = 'steamspy_index.txt'

# Fields of each SteamSpy record that are written to the csv (in this order)
steamspy_columns = [
    'appid',
    'name',
    'developer',
    'publisher',
    'score_rank',
    'positive',
    'negative',
    'userscore',
    'owners',
    'average_forever',
    'average_2weeks',
    'median_forever',
    'median_2weeks',
    'price',
    'initialprice',
    'discount',
    'languages',
    'genre',
    'ccu',
    'tags',
]

# Force the stored index back to 0, then read it back as the starting point
reset_index(download_path, steamspy_index)
index = get_index(download_path, steamspy_index)

# With index 0 this wipes/creates the data file and writes the header row
prepare_data_file(download_path, steamspy_data, index, steamspy_columns)

# Run the download with a shorter pause between requests than the default
process_batches(
    parser=parse_steamspy_request,
    app_list=app_list,
    download_path=download_path,
    data_filename=steamspy_data,
    index_filename=steamspy_index,
    columns=steamspy_columns,
    begin=index,
    batchsize=5,
    pause=0.3,
)

Starting at index 0:

Exported lines 0-4 to steamspy_data.csv. Batch 0 time: 0:00:06 (avg: 0:00:06, remaining: 0:18:48)
Exported lines 5-9 to steamspy_data.csv. Batch 1 time: 0:00:06 (avg: 0:00:06, remaining: 0:18:58)
Exported lines 10-14 to steamspy_data.csv. Batch 2 time: 0:00:06 (avg: 0:00:06, remaining: 0:18:43)
Exported lines 15-19 to steamspy_data.csv. Batch 3 time: 0:00:06 (avg: 0:00:06, remaining: 0:18:33)
Exported lines 20-24 to steamspy_data.csv. Batch 4 time: 0:00:06 (avg: 0:00:06, remaining: 0:18:27)
Exported lines 25-29 to steamspy_data.csv. Batch 5 time: 0:00:06 (avg: 0:00:06, remaining: 0:18:21)
Exported lines 30-34 to steamspy_data.csv. Batch 6 time: 0:00:06 (avg: 0:00:06, remaining: 0:18:20)
Exported lines 35-39 to steamspy_data.csv. Batch 7 time: 0:00:06 (avg: 0:00:06, remaining: 0:18:12)
Exported lines 40-44 to steamspy_data.csv. Batch 8 time: 0:00:06 (avg: 0:00:06, remaining: 0:18:05)
Exported lines 45-49 to steamspy_data.csv. Batch 9 time: 0:00:06 (avg: 0:00:06, re

In [10]:
# preview the first five rows of the downloaded steamspy data
pd.read_csv('./data/steamspy_data.csv').head(5)

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,,201186,5199,0,"10,000,000 .. 20,000,000",12777,145,228,10,999,999,0,"English, French, German, Italian, Spanish - Sp...",Action,13205,"{'Action': 5426, 'FPS': 4831, 'Multiplayer': 3..."
1,20,Team Fortress Classic,Valve,Valve,,5836,933,0,"5,000,000 .. 10,000,000",272,0,13,0,499,499,0,"English, French, German, Italian, Spanish - Sp...",Action,67,"{'Action': 754, 'FPS': 312, 'Multiplayer': 264..."
2,30,Day of Defeat,Valve,Valve,,5248,569,0,"5,000,000 .. 10,000,000",401,4,15,4,499,499,0,"English, French, German, Italian, Spanish - Spain",Action,100,"{'FPS': 791, 'World War II': 255, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,,1962,437,0,"100,000 .. 200,000",249,0,10,0,499,499,0,"English, French, German, Italian, Spanish - Sp...",Action,2,"{'Action': 631, 'FPS': 142, 'Classic': 109, 'M..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,14879,756,0,"5,000,000 .. 10,000,000",396,43,252,43,499,499,0,"English, French, German, Korean",Action,78,"{'FPS': 893, 'Action': 335, 'Classic': 262, 'S..."
