# Clean Training Set

Now that we've downloaded a bunch of Dota 2 games, let's clean up the dataset and format it so it's easier for us to train our model on it.

In [1]:
import datetime
import json
import os
import re
import time
from os.path import exists

import pandas as pd
import requests
import tqdm
import tqdm.notebook

In [4]:
# Stored outside of repo because it's 140GB
# NOTE(review): this is a bare relative filename, so it resolves against the
# notebook's working directory -- confirm how the file is made available there.
path = 'dota_2_bulk.tsv'

In [5]:
# How many games (lines) do we have?
# Use a context manager so the handle on the bulk file is closed as soon as
# counting finishes instead of lingering until garbage collection.
with open(path) as file:
  num_games = sum(1 for _ in file)
num_games

44567619

In [6]:
# Let's see what a single raw example looks like.
# Each line is "<match_seq_num>\t<match JSON>"; only the first line is printed.
with open(path) as file:
  # tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
  for line in tqdm.notebook.tqdm(file):
    # Split on the first tab only so a tab inside the JSON cannot break unpacking.
    seq_num, raw_json = line.split('\t', 1)
    print(seq_num)
    print(raw_json)
    break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm.tqdm_notebook(file):


0it [00:00, ?it/s]

5125140861
{"players": [{"account_id": 363849371, "player_slot": 0, "hero_id": 72, "leaver_status": 0}, {"account_id": 221883921, "player_slot": 1, "hero_id": 49, "leaver_status": 0}, {"account_id": 90082776, "player_slot": 2, "hero_id": 51, "leaver_status": 0}, {"account_id": 48735159, "player_slot": 3, "hero_id": 123, "leaver_status": 0}, {"account_id": 89222641, "player_slot": 4, "hero_id": 29, "leaver_status": 0}, {"account_id": 303331256, "player_slot": 128, "hero_id": 10, "leaver_status": 0}, {"account_id": 316988377, "player_slot": 129, "hero_id": 88, "leaver_status": 0}, {"account_id": 158626280, "player_slot": 130, "hero_id": 53, "leaver_status": 0}, {"account_id": 32095155, "player_slot": 131, "hero_id": 13, "leaver_status": 0}, {"account_id": 88154342, "player_slot": 132, "hero_id": 68, "leaver_status": 0}], "radiant_win": false, "duration": 3123, "pre_game_duration": 90, "start_time": 1628306396, "match_id": 6123629491, "match_seq_num": 5125140861, "cluster": 121, "first_bl

In [22]:
# Steam encodes the ten player slots as:
#  Players 0 - 4 -> player slots 0 - 4
#  Players 5 - 9 -> player slots 128 - 132
def mapPlayerSlotToPosition(slot):
  """Translate a Steam player slot into a contiguous position.

  Slots up to and including 4 are returned unchanged; any larger slot is
  shifted down by 123, so 128 - 132 become 5 - 9.
  """
  second_team_shift = 123  # 128 - 5
  return slot if slot <= 4 else slot - second_team_shift

In [29]:
# Now read each line, parse the JSON and rip out the information we're interested in.
# Each kept game becomes one row:
#   [hero0..hero9, account0..account9, start_time, radiant_win]
# Lines that cannot be parsed, or that do not describe exactly 10 players,
# are reported and counted in num_failures instead of aborting the run.

# The default account number for anonymous accounts is 4294967295.
# Some games are missing the account number. I don't know why.
# We will default to the anonymous account number.
ANONYMOUS_ACCOUNT_ID = 4294967295
NUM_PLAYERS = 10

num_failures = 0
training_set = []

with open(path) as file:
  # tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
  for line in tqdm.notebook.tqdm(file, total=num_games):
    try:
      # Split inside the try (and only on the first tab) so a malformed line
      # is counted as a failure rather than crashing the whole loop.
      seq_num, raw_json = line.split('\t', 1)
      match = json.loads(raw_json)
      players = match['players']

      if len(players) != NUM_PLAYERS:
        print("Weird. There weren't 10 players!")
        print(raw_json)
        num_failures += 1
        continue

      # Indexed by position (0 - 9), filled in from each player's slot.
      hero_ids = [None] * NUM_PLAYERS
      account_ids = [None] * NUM_PLAYERS

      for player in players:
        position = mapPlayerSlotToPosition(player['player_slot'])
        hero_ids[position] = player['hero_id']
        account_ids[position] = player.get('account_id', ANONYMOUS_ACCOUNT_ID)

      start_time = match['start_time']
      # Convert to float target.
      radiant_win = float(match['radiant_win'])
      training_set.append(hero_ids + account_ids + [start_time, radiant_win])

    except Exception as e:
      # Report the problem once, together with the offending line, and move on.
      # Printing `line` (not raw_json) avoids showing a stale payload when the
      # split itself was what failed.
      print("Error:", e)
      print(line)
      num_failures += 1

print("Num failures:", num_failures)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm.tqdm_notebook(file):


0it [00:00, ?it/s]

Weird. There weren't 10 players!
{"players": [{"account_id": 4294967295, "player_slot": 0, "team_number": 0, "team_slot": 0, "hero_id": 84, "leaver_status": 0}, {"account_id": 4294967295, "player_slot": 1, "team_number": 0, "team_slot": 1, "hero_id": 120, "leaver_status": 0}, {"account_id": 4294967295, "player_slot": 2, "team_number": 0, "team_slot": 2, "hero_id": 93, "leaver_status": 0}, {"account_id": 169760865, "player_slot": 3, "team_number": 0, "team_slot": 3, "hero_id": 68, "leaver_status": 0}, {"account_id": 4294967295, "player_slot": 4, "team_number": 0, "team_slot": 4, "hero_id": 94, "leaver_status": 0}, {"account_id": 4294967295, "player_slot": 128, "team_number": 1, "team_slot": 0, "hero_id": 29, "leaver_status": 0}, {"account_id": 4294967295, "player_slot": 129, "team_number": 1, "team_slot": 1, "hero_id": 34, "leaver_status": 1}, {"account_id": 33583200, "player_slot": 130, "team_number": 1, "team_slot": 2, "hero_id": 83, "leaver_status": 0}, {"account_id": 429660332, "pla

In [30]:
# Every input line should either have produced a row or been counted as a failure.
# NOTE(review): the original note here worried about ~1 million missing rows; the
# recorded outputs show 44567619 lines vs 44567617 rows, so a gap that large
# would have to come from upstream of this notebook -- TODO confirm.
assert len(training_set) + num_failures == num_games
len(training_set)

44567617

In [31]:
# Column layout mirrors each training_set row: ten hero ids, ten account ids,
# then the match start time and the float win label.
hero_columns = [f'hero{i}' for i in range(10)]
account_columns = [f'account{i}' for i in range(10)]
df = pd.DataFrame(
    training_set,
    columns=hero_columns + account_columns + ['start_time', 'radiant_win'],
)
df.head()

Unnamed: 0,hero0,hero1,hero2,hero3,hero4,hero5,hero6,hero7,hero8,hero9,...,account2,account3,account4,account5,account6,account7,account8,account9,start_time,radiant_win
0,72,49,51,123,29,10,88,53,13,68,...,90082776,48735159,89222641,303331256,316988377,158626280,32095155,88154342,1628306396,0.0
1,62,25,2,67,48,32,54,71,94,9,...,4294967295,4294967295,4294967295,446047086,4294967295,234734286,397697516,4294967295,1628306898,1.0
2,42,32,14,35,26,79,110,70,11,93,...,119282046,415299772,4294967295,4294967295,4294967295,116157013,4294967295,4294967295,1628308993,1.0
3,25,18,5,75,11,22,1,62,119,35,...,112087465,279783318,4294967295,227499573,4294967295,146968160,4294967295,367997523,1628309609,0.0
4,47,14,8,112,68,67,40,29,126,55,...,142724295,4294967295,129748676,176466774,140994337,149039489,139291477,341953990,1628308251,1.0


In [33]:
# Show the frame; the recorded output is truncated to the first and last rows.
df

Unnamed: 0,hero0,hero1,hero2,hero3,hero4,hero5,hero6,hero7,hero8,hero9,...,account2,account3,account4,account5,account6,account7,account8,account9,start_time,radiant_win
0,72,49,51,123,29,10,88,53,13,68,...,90082776,48735159,89222641,303331256,316988377,158626280,32095155,88154342,1628306396,0.0
1,62,25,2,67,48,32,54,71,94,9,...,4294967295,4294967295,4294967295,446047086,4294967295,234734286,397697516,4294967295,1628306898,1.0
2,42,32,14,35,26,79,110,70,11,93,...,119282046,415299772,4294967295,4294967295,4294967295,116157013,4294967295,4294967295,1628308993,1.0
3,25,18,5,75,11,22,1,62,119,35,...,112087465,279783318,4294967295,227499573,4294967295,146968160,4294967295,367997523,1628309609,0.0
4,47,14,8,112,68,67,40,29,126,55,...,142724295,4294967295,129748676,176466774,140994337,149039489,139291477,341953990,1628308251,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44567612,81,64,50,8,121,41,128,47,62,99,...,4294967295,160759704,87751664,182772900,4294967295,4294967295,4294967295,4294967295,1633627465,1.0
44567613,4,11,33,5,53,34,35,27,84,54,...,4294967295,4294967295,125292860,239269679,124672157,4294967295,4294967295,4294967295,1633628011,1.0
44567614,80,50,12,18,47,114,25,62,26,10,...,4294967295,4294967295,4294967295,171870635,4294967295,4294967295,4294967295,136257026,1633628587,1.0
44567615,73,51,56,103,104,4,8,15,25,9,...,4294967295,4294967295,4294967295,1003672236,184006594,4294967295,135128663,487842291,1633627862,0.0


In [32]:
# Persist the cleaned rows as CSV; the DataFrame index is not written out.
df.to_csv(path_or_buf="training_set_large.csv", index=False)