## Build Larger Training Set

Now that this project looks somewhat promising. Let's build a proper dataset. The Steam API makes this extremely painful.

I don't ever want to have to do this again in my life, so we should try to pull down everything we could ever want and then just use that in the future.

In [1]:
import os
import json
import time
import datetime
import requests
from os.path import exists

In [2]:
if 'STEAM_API_KEY' not in os.environ:
  print("No API Key :(")
else:
  print("Found API Key.")
  STEAM_API_KEY = os.environ['STEAM_API_KEY']

Found API Key.


In [3]:
base_url = 'https://api.steampowered.com'

# The maximum number of matches the API will return at once.
num_matches_per_request = 100

def __request(method, path, **kwargs):
  url = base_url + path
  kwargs.setdefault('params', dict()).update(key=STEAM_API_KEY)

  max_tries = 5
  num_tries = 0
  while num_tries < max_tries:
    try:
      response = requests.request(method, url, **kwargs)
      if response.status_code == 429:
        print("429. Backing off.")
        print(response.headers)
        time.sleep(15)
        num_tries += 1
        continue
      response_json = response.json()
      return response_json
    except:
      print("Error on request.")
    
    num_tries += 1
    time.sleep(5)


def get_match_history_by_seq_num(seq_num, num_matches, **params):
  path = '/IDOTA2Match_570/GetMatchHistoryBySequenceNum/V001'
  params.update(start_at_match_seq_num=seq_num)
  params.update(matches_requested=num_matches)
  return __request('get', path, params=params)

def get_most_recent_matches():
  path = '/IDOTA2Match_570/GetMatchHistory/V001'
  return __request('get', path)

def get_most_recent_seq_number():
  response = get_most_recent_matches()
  most_recent_matches = response['result']
  
  max_seq_num = -1
  for match in most_recent_matches:
    seq_num = match['match_seq_num']
    max_seq_num = max(match_seq_num, max_seq_num)
    
  return max_seq_num


In [11]:
bulk_download_path = "dota_2_bulk.tsv"

# We start at a sequence number from August 7, 2021 and work forward.
current_seq_num = 5125140861

if exists(bulk_download_path):
  print("Bulk file found. Scanning for last (largest) sequence number.")
  max_seq_num = current_seq_num
  with open(bulk_download_path, 'r') as bulk_file:
    for line in bulk_file:
        match_seq_num, _ = line.split('\t')
        max_seq_num = max(max_seq_num, int(match_seq_num))
        
  current_seq_num = max_seq_num

else:
  print("No Dota 2 Bulk File. Creating...")
  # Create the file, find the latest seq_num
  with open(bulk_download_path, 'w') as bulk_file:
    # Just create the file
    pass

print("Starting at: ", current_seq_num)
num_iters = 0
max_seq_num = current_seq_num

with open(bulk_download_path, 'a') as bulk_file:
  while True:
    if num_iters % 100 == 0:
      print("Iterations: ", num_iters, ". seq_num: ", current_seq_num)

    # Don't hit the API too frequently.
    time.sleep(1.6)
    response = get_match_history_by_seq_num(current_seq_num, num_matches_per_request)

    if response is None:
      print("Skipping. Response was None...")
      continue

    matches = response['result']['matches']

    if len(matches) != num_matches_per_request:
        print("Problem. Expected: " + num_matches_per_request + " matches. Actual: ", len(matches))
        print(response)
        break

    for match in matches:
        if match['human_players'] != 10:
            # We only want "real" games of Dota so we're ignoring
            # games without 10 human players.
            continue

        try:
          match_seq_num = match['match_seq_num']

          str_json = json.dumps(match)
          print(str_json)

          if "\t" in str_json:
            print("TAB DETECTED IN JSON!")
            print(str_json)
            continue

          output_str = str(match_seq_num) + "\t" + str_json + "\n"
          bulk_file.write(output_str)

          max_seq_num = max(max_seq_num, match_seq_num)
        except Exception as e:
          print(e)
          print("Error:", match)
          break


    current_seq_num = max_seq_num + 1
    num_iters += 1
    
    print("exiting")
    break

print("Complete.")

No Dota 2 Bulk File. Creating...
Starting at:  5125140861
Iterations:  0 . seq_num:  5125140861
{"players": [{"account_id": 363849371, "player_slot": 0, "hero_id": 72, "item_0": 0, "item_1": 156, "item_2": 141, "item_3": 154, "item_4": 116, "item_5": 160, "backpack_0": 0, "backpack_1": 379, "backpack_2": 0, "item_neutral": 573, "kills": 7, "deaths": 3, "assists": 7, "leaver_status": 0, "last_hits": 681, "denies": 7, "gold_per_min": 768, "xp_per_min": 747, "level": 28, "net_worth": 31437, "aghanims_scepter": 1, "aghanims_shard": 0, "moonshard": 0}, {"account_id": 221883921, "player_slot": 1, "hero_id": 49, "item_0": 141, "item_1": 116, "item_2": 108, "item_3": 600, "item_4": 112, "item_5": 48, "backpack_0": 41, "backpack_1": 0, "backpack_2": 0, "item_neutral": 311, "kills": 6, "deaths": 6, "assists": 12, "leaver_status": 0, "last_hits": 482, "denies": 9, "gold_per_min": 679, "xp_per_min": 667, "level": 28, "net_worth": 31318, "aghanims_scepter": 0, "aghanims_shard": 1, "moonshard": 0}, 

Not sure what that last error was. We should update the script to handle that in the future. For now I have over 5 million games of Dota downloaded so I'm going to leave it.