In [2]:
# standard library imports
import csv
import datetime as dt
import json
import os
import statistics
import time

# third-party imports
import numpy as np
import pandas as pd
import requests

# customisations - ensure tables show up to 100 columns
# Use the fully-qualified option name: option keys are matched as patterns, and
# the bare "max_columns" is ambiguous on newer pandas (it also matches
# styler.render.max_columns), which makes set_option raise OptionError.
pd.set_option("display.max_columns", 100)

In [3]:
def get_request(url, parameters=None):
    """Return json-formatted response of a get request using optional parameters.

    The request is retried until it succeeds; there is no retry limit.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request

    Returns
    -------
    json_data
        json-formatted response (dict-like)

    Notes
    -----
    An SSL error is retried after a 5 second countdown. A falsy response
    (requests evaluates a response with a 4xx/5xx status as False) is
    retried after 10 seconds.
    """
    # loop instead of recursing so a long outage cannot exhaust the call stack
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        except requests.exceptions.SSLError as s:
            # the exception class has to be reached through the requests module:
            # a bare `SSLError` is not imported anywhere and would raise NameError
            print('SSL Error:', s)

            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)

            # try again
            continue

        if response:
            return response.json()

        # a falsy response usually means too many requests. Wait and try again
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

In [4]:
# SteamSpy endpoint and the query asking for its 'all' listing
url = "https://steamspy.com/api.php"
parameters = dict(request="all")

# fetch the listing; orient='index' turns each top-level key of the response into a row
json_data = get_request(url, parameters=parameters)
steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')

# keep only appid and name, ordered by appid, with a fresh 0..n-1 index
app_list = (
    steam_spy_all[['appid', 'name']]
    .sort_values(by='appid')
    .reset_index(drop=True)
)

# export disabled to keep consistency across download sessions
# app_list.to_csv('../data/download/app_list.csv', index=False)

# instead read from stored csv
# app_list = pd.read_csv('../data/download/app_list.csv')

# preview the first few rows
app_list.head()

Unnamed: 0,appid,name
0,10,Counter-Strike
1,20,Team Fortress Classic
2,30,Day of Defeat
3,40,Deathmatch Classic
4,50,Half-Life: Opposing Force


In [5]:
# show the raw SteamSpy table with every column (the rendered output elides the middle rows)
steam_spy_all

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu
570,570,Dota 2,Valve,Valve,,1321903,253139,0,"100,000,000 .. 200,000,000",39055,1770,1116,633,0,0,0,562749
730,730,Counter-Strike: Global Offensive,"Valve, Hidden Path Entertainment",Valve,,5216334,704261,0,"50,000,000 .. 100,000,000",31637,995,8587,296,0,0,0,779300
578080,578080,PUBG: BATTLEGROUNDS,"KRAFTON, Inc.","KRAFTON, Inc.",,1007031,826655,0,"50,000,000 .. 100,000,000",25924,715,12170,290,2999,2999,0,340972
440,440,Team Fortress 2,Valve,Valve,,763236,51008,0,"50,000,000 .. 100,000,000",9363,2165,338,464,0,0,0,89777
304930,304930,Unturned,Smartly Dressed Games,Smartly Dressed Games,,412459,38885,0,"20,000,000 .. 50,000,000",10178,5170,335,1706,0,0,0,27810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513780,513780,Turbo Pug DX,"SnowFlame, Back To Basics Gaming",Back To Basics Gaming,,1329,375,0,"500,000 .. 1,000,000",226,0,226,0,99,99,0,2
414530,414530,Skyforge,Allods Team,MY.GAMES,,2253,1565,0,"500,000 .. 1,000,000",428,0,107,0,0,0,0,89
977950,977950,A Dance of Fire and Ice,7th Beat Games,"7th Beat Games, indienova",,26657,1818,0,"500,000 .. 1,000,000",474,19,283,27,599,599,0,2734
301750,301750,RADical ROACH Remastered,DL Softworks,DL Softworks,,579,495,0,"500,000 .. 1,000,000",459,31,385,31,0,0,0,2


In [6]:
# number of rows in app_list (one appid/name pair per row)
len(app_list)

1000

In [None]:
def get_app_data(start, stop, parser, pause):
    """Collect the parser's result for every app in one slice of app_list.

    Rows are taken from the module-level ``app_list`` dataframe, which is
    expected to have 'appid' and 'name' columns.

    start : first row position of the slice (inclusive)
    stop : last row position of the slice (exclusive)
    parser : function called as parser(appid, name) to handle the request
    pause : seconds to sleep after each call to parser

    returns: list holding one parser result per row, in row order
    """
    collected = []

    # only the rows between start and stop are visited
    batch = app_list[start:stop]

    for position, app in batch.iterrows():
        # '\r' keeps the progress message on a single, overwritten line
        print('Current index: {}'.format(position), end='\r')

        # the supplied parser retrieves the data for this app
        collected.append(parser(app['appid'], app['name']))

        # prevent overloading api with requests
        time.sleep(pause)

    return collected


def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Process app data in batches, writing directly to file.

    After every batch the rows are appended to the data file and the index
    file is overwritten with the position the next run should start from.

    parser : custom function to format request
    app_list : dataframe of appid and name
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file

    Keyword arguments:

    begin : starting index (get from index_filename, default 0)
    end : index to finish, exclusive (default -1 means the end of app_list;
          values beyond the end of app_list are clamped to its length)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (default 1)

    returns: none

    NOTE: get_app_data slices the module-level app_list rather than the
    app_list argument given here, so both must refer to the same data.
    """
    print('Starting at index {}:\n'.format(begin))

    # by default, process all apps in app_list. The last valid stop is
    # len(app_list) itself: going one past it would add an empty trailing
    # batch (or over-report the exported range) and store a resume index
    # that is one too high.
    total_apps = len(app_list)
    if end == -1 or end > total_apps:
        end = total_apps

    # batch boundaries: every batch start, closed off by the overall end
    boundaries = list(range(begin, end, batchsize))
    boundaries.append(end)
    batch_ranges = list(zip(boundaries, boundaries[1:]))

    # output locations do not change between batches
    rel_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)

    apps_written = 0
    batch_times = []

    for i, (start, stop) in enumerate(batch_ranges):
        start_time = time.time()

        app_data = get_app_data(start, stop, parser, pause)

        # writing app data to file (append mode, so earlier batches are kept)
        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

            # short countdown so the user knows not to interrupt during the write
            for j in range(3,0,-1):
                print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                time.sleep(0.5)

            writer.writerows(app_data)
            print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        apps_written += len(app_data)

        # writing last index to file
        with open(idx_path, 'w') as f:
            print(stop, file=f)

        # logging time taken
        end_time = time.time()
        time_taken = end_time - start_time

        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)

        # batches still to run after this one, at the average pace so far
        est_remaining = (len(batch_ranges) - i - 1) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} apps written'.format(apps_written))

In [11]:
# scratch check of the batch boundaries: evenly spaced starts, closed off by `end`
begin = 0
batchsize = 100
end = len(app_list) + 1  # one past the number of rows in app_list

batches = np.append(np.arange(begin, end, batchsize), end)

In [12]:
# inspect the computed batch boundaries
print(batches)

[   0  100  200  300  400  500  600  700  800  900 1000 1001]


In [25]:
# each consecutive pair of boundaries is one batch, so there are len(batches) - 1 iterations
for i in range(len(batches) - 1):
    print(i)

0
1
2
3
4
5
6
7
8
9
10


In [23]:
# trial run of the pre-write countdown: the trailing '\r' with end='' moves the
# cursor back to the start of the line, so each message overwrites the previous
# one and only the final "(1)" remains in the output
for j in range(3,0,-1):
    print("About to write data, don't stop script! ({})\r".format(j), end='')
    time.sleep(0.5)

About to write data, don't stop script! (1)