## Build Larger Training Set

Now that this project looks somewhat promising, let's build a proper dataset. The Steam API makes this extremely painful.

I don't ever want to have to do this again in my life, so we should try to pull down everything we could ever want and then just use that in the future.

In [1]:
import os
import json
import time
import datetime
import requests
from os.path import exists

In [1]:
import tqdm

In [None]:
for tqdm.tqdm_notebook()

In [2]:
# Load the Steam Web API key from the environment. When the variable is
# missing only a notice is printed and STEAM_API_KEY is left unbound.
try:
  STEAM_API_KEY = os.environ['STEAM_API_KEY']
except KeyError:
  print("No API Key :(")
else:
  print("Found API Key.")

Found API Key.


In [3]:
# Root URL that every Steam Web API endpoint path is appended to.
base_url = "https://api.steampowered.com"

# Page size used for match-history requests: the API returns at most this
# many matches per call.
num_matches_per_request = 100

def __request(method, path, **kwargs):
  """Send a request to the Steam Web API and return the decoded JSON body.

  The API key is added to the query parameters automatically. The request is
  attempted up to five times; after an HTTP 429 response or a failed
  request/JSON decode the function waits 60 seconds before the next attempt.

  Args:
    method: HTTP method name handed to `requests.request` (e.g. 'get').
    path: Endpoint path appended to `base_url`.
    **kwargs: Extra keyword arguments forwarded to `requests.request`. A
      30 second `timeout` is applied unless the caller supplies one.

  Returns:
    The parsed JSON response, or None when every attempt failed.
  """
  url = base_url + path

  # Work on a copy so a params dict owned by the caller is never modified
  # (and never ends up carrying the API key).
  params = dict(kwargs.get('params') or {})
  params['key'] = STEAM_API_KEY
  kwargs['params'] = params
  # Without a timeout a stalled connection would block the download forever.
  kwargs.setdefault('timeout', 30)

  max_tries = 5
  retry_delay_secs = 60
  for attempt in range(1, max_tries + 1):
    try:
      response = requests.request(method, url, **kwargs)
      if response.status_code == 429:
        print("429. Backing off.")
        print(response.headers)
      else:
        return response.json()
    except Exception as e:
      # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
      # stops the download. Only the exception type is printed because the
      # message of a requests error can contain the full URL, API key included.
      print("Error on request.")
      print(type(e).__name__)

    # No point in waiting once the final attempt has failed.
    if attempt < max_tries:
      time.sleep(retry_delay_secs)

  return None


def get_match_history_by_seq_num(seq_num, num_matches, **params):
  """Fetch a page of matches starting at a given match sequence number.

  Args:
    seq_num: Sent as the `start_at_match_seq_num` query parameter.
    num_matches: Sent as the `matches_requested` query parameter.
    **params: Additional query parameters; the two above take precedence.

  Returns:
    Whatever `__request` returns for the GetMatchHistoryBySequenceNum call.
  """
  query = dict(
    params,
    start_at_match_seq_num=seq_num,
    matches_requested=num_matches,
  )
  return __request(
    'get',
    '/IDOTA2Match_570/GetMatchHistoryBySequenceNum/V001',
    params=query,
  )

def get_most_recent_matches():
  """Call the GetMatchHistory endpoint with no extra query parameters.

  Returns:
    Whatever `__request` returns for that call.
  """
  endpoint = '/IDOTA2Match_570/GetMatchHistory/V001'
  response = __request('get', endpoint)
  return response

def get_most_recent_seq_number():
  """Return the largest `match_seq_num` among the most recent matches.

  Returns:
    The highest sequence number found in the GetMatchHistory response, or -1
    when the request failed or the response lists no matches.
  """
  response = get_most_recent_matches()
  if response is None:
    # The request helper gives up with None after repeated failures.
    return -1

  # The match list lives under result -> matches; iterating `result` itself
  # would walk the dictionary's keys rather than the matches.
  result = response.get('result') or {}
  most_recent_matches = result.get('matches') or []

  return max(
    (match['match_seq_num'] for match in most_recent_matches),
    default=-1,
  )


In [4]:
# Remove unnecessary fields from game result. 
# We do this to minimize the space needed to store everything.
def delete_if_present(json, key):
  """Remove `key` from the dictionary `json`; do nothing if it is absent."""
  # pop() with a default never raises for a missing key.
  json.pop(key, None)
    
# Match-level fields dropped before a match is stored.
_MATCH_FIELDS_TO_DROP = (
  'tower_status_radiant',
  'tower_status_dire',
  'barracks_status_radiant',
  'barracks_status_dire',
  'positive_votes',
  'negative_votes',
  'radiant_score',
  'dire_score',
  'picks_bans',
)

# Per-player fields dropped before a match is stored.
_PLAYER_FIELDS_TO_DROP = (
  'item_0',
  'item_1',
  'item_2',
  'item_3',
  'item_4',
  'item_5',
  'backpack_0',
  'backpack_1',
  'backpack_2',
  'item_neutral',
  'kills',
  'deaths',
  'assists',
  'last_hits',
  'denies',
  'gold_per_min',
  'level',
  'net_worth',
  'aghanims_scepter',
  'aghanims_shard',
  'moonshard',
  'xp_per_min',
  'additional_units',
)


def clean_match(match):
  """Strip unneeded fields from a match dictionary, in place.

  Removes the keys in `_MATCH_FIELDS_TO_DROP` from `match` and the keys in
  `_PLAYER_FIELDS_TO_DROP` from every entry of `match['players']`. Keys that
  are not present are ignored, and a match without a 'players' entry is
  accepted (only its match-level fields are cleaned).

  Args:
    match: Dictionary describing one match; modified in place.

  Returns:
    None.
  """
  for field in _MATCH_FIELDS_TO_DROP:
    match.pop(field, None)

  # .get() so a match that carries no player list does not raise KeyError.
  for player in match.get('players', ()):
    for field in _PLAYER_FIELDS_TO_DROP:
      player.pop(field, None)

In [None]:
bulk_download_path = "dota_2_bulk.tsv"

# We start at a sequence number from August 7, 2021 and work forward.
current_seq_num = 5125140861


def find_resume_seq_num(path, default_seq_num):
  """Return the sequence number at which downloading should resume.

  Each non-blank line of the file at `path` is expected to start with an
  integer match sequence number followed by a tab.

  Args:
    path: Path of an existing bulk TSV file.
    default_seq_num: Sequence number to use when the file holds no records;
      also a lower bound for the result.

  Returns:
    One more than the largest saved sequence number, so the match that is
    already on disk is not downloaded and appended a second time; never less
    than `default_seq_num`.

  Raises:
    ValueError: If a line does not start with an integer.
  """
  largest_saved = None
  with open(path, 'r') as saved_file:
    for line in saved_file:
      # Only the first column matters; maxsplit keeps this cheap for the
      # long JSON payload that follows the tab.
      seq_field = line.split('\t', 1)[0].strip()
      if not seq_field:
        continue
      seq = int(seq_field)
      if largest_saved is None or seq > largest_saved:
        largest_saved = seq

  if largest_saved is None:
    return default_seq_num
  return max(default_seq_num, largest_saved + 1)


if exists(bulk_download_path):
  print("Bulk file found. Scanning for last (largest) sequence number.")
  current_seq_num = find_resume_seq_num(bulk_download_path, current_seq_num)
else:
  print("No Dota 2 Bulk File. Creating...")
  # Create an empty file so that later runs take the resume path above.
  with open(bulk_download_path, 'w') as bulk_file:
    pass

print("Starting at: ", current_seq_num)
num_iters = 0
max_seq_num = current_seq_num

# Download loop: repeatedly fetch a page of matches starting at
# current_seq_num and append every 10-human match to the bulk file as
# "<match_seq_num>\t<json>\n". The loop ends when the API returns a page
# that does not contain exactly num_matches_per_request matches.
with open(bulk_download_path, 'a') as bulk_file:
  while True:
    if num_iters % 100 == 0:
      print("Iterations: ", num_iters, ". seq_num: ", current_seq_num)

    # Don't hit the API too frequently.
    time.sleep(2)
    response = get_match_history_by_seq_num(current_seq_num, num_matches_per_request)

    if response is None:
      print("Skipping. Response was None...")
      time.sleep(300)
      continue

    try:
      matches = response['result']['matches']
    except (KeyError, TypeError) as e:
      print("Bad error. This would normally break everything...")
      print(e)
      print(response)
      # Just wait 5 minutes before going again.
      time.sleep(300)
      continue

    if len(matches) != num_matches_per_request:
      # The counts are ints, so they must be converted before concatenation;
      # otherwise this report itself raises TypeError.
      print("Problem. Expected: " + str(num_matches_per_request)
            + " matches. Actual: " + str(len(matches)))
      print(response)
      break

    for match in matches:
      try:
        match_seq_num = match['match_seq_num']
        # Advance the cursor for every match seen, not only the ones written,
        # so filtered or malformed matches are not requested again.
        max_seq_num = max(max_seq_num, match_seq_num)

        if match['human_players'] != 10:
          # We only want "real" games of Dota so we're ignoring
          # games without 10 human players.
          continue

        # Remove unneeded information to save space.
        clean_match(match)
        str_json = json.dumps(match)
      except (KeyError, TypeError, ValueError) as e:
        # A single malformed match is reported and skipped; the rest of the
        # page is still processed.
        print(e)
        print("Error:", match)
        continue

      if "\t" in str_json:
        # A raw tab would corrupt the two-column TSV layout.
        print("TAB DETECTED IN JSON!")
        print(str_json)
        continue

      # Write errors (e.g. a full disk) are deliberately not caught: the
      # script stops instead of advancing past matches it could not save.
      bulk_file.write(str(match_seq_num) + "\t" + str_json + "\n")

    current_seq_num = max_seq_num + 1
    num_iters += 1


print("Complete.")

#### Not sure what that last error was. We should update the script to handle that in the future. For now I have over 5 million games of Dota downloaded so I'm going to leave it.

#### Problem at: 1264575 perhaps. We'll have to look into it.