# Clean Training Set

Now that we've downloaded a bunch of Dota 2 games, let's clean up the dataset and format it so it's easier for us to train our model on it.

In [1]:
import os
import re
import json
import time
import tqdm
import datetime
import requests
import pandas as pd
from os.path import exists

In [2]:
# Stored outside of repo because it's 30GB
path = '../data/dota_2_bulk.tsv'

In [3]:
# How many games (lines) do we have?
num_games = sum(1 for line in open(path))
num_games

4212632

In [61]:
# Now read each line, parse the JSON and rip out the information we're interested in.

num_failures = 0
training_set = []

with open(path) as file:
  for line in tqdm.tqdm_notebook(file):
    seq_num, raw_json = line.split('\t')
    # TODO(joshvarty): Remove once we dump JSON correctly.
    # Some team names have quotations in them. We'll just skip them for now.
    if '\"' in raw_json:
      continue
    # When I called str(json) when saving this file it used single quotations.
    # Apparently this is not valid JSON. We'll cross our fingers and hope we can just
    # replace them with double quotes without any problems.
    raw_json = raw_json.replace("\'", "\"")
    # We also have to replace Python's True/False with true/false
    raw_json = raw_json.replace("True", "true")
    raw_json = raw_json.replace("False", "false")
    try:
      match = json.loads(raw_json)
      players = match['players']
      hero0 = players[0]['hero_id']
      hero1 = players[1]['hero_id']
      hero2 = players[2]['hero_id']
      hero3 = players[3]['hero_id']
      hero4 = players[4]['hero_id']
      hero5 = players[5]['hero_id']
      hero6 = players[6]['hero_id']
      hero7 = players[7]['hero_id']
      hero8 = players[8]['hero_id']
      hero9 = players[9]['hero_id']
      # Convert to float target.
      radiant_win = float(match['radiant_win'])
      training_set.append((hero0, hero1, hero2, hero3, hero4,
                    hero5, hero6, hero7, hero8, hero9,
                    radiant_win))
    except Exception as e:
      print(raw_json)
      print(e)
      
      _, original = line.split('\t')
      print(original)
      num_failures +=1
      
print("Num failures:" , num_failures)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm.tqdm_notebook(file):


0it [00:00, ?it/s]

{"players": [{"account_id": 4294967295, "player_slot": 0, "hero_id": 20, "item_0": 92, "item_1": 21, "item_2": 218, "item_3": 60, "item_4": 180, "item_5": 229, "backpack_0": 22, "backpack_1": 573, "backpack_2": 0, "item_neutral": 354, "kills": 7, "deaths": 4, "assists": 19, "leaver_status": 0, "last_hits": 34, "denies": 1, "gold_per_min": 318, "xp_per_min": 396, "level": 16, "net_worth": 9469, "aghanims_scepter": 0, "aghanims_shard": 0, "moonshard": 0}, {"account_id": 4294967295, "player_slot": 1, "hero_id": 14, "item_0": 1, "item_1": 90, "item_2": 214, "item_3": 206, "item_4": 0, "item_5": 38, "backpack_0": 0, "backpack_1": 0, "backpack_2": 0, "item_neutral": 356, "kills": 6, "deaths": 1, "assists": 26, "leaver_status": 0, "last_hits": 67, "denies": 3, "gold_per_min": 371, "xp_per_min": 585, "level": 21, "net_worth": 10864, "aghanims_scepter": 0, "aghanims_shard": 0, "moonshard": 0}, {"account_id": 1105812798, "player_slot": 2, "hero_id": 99, "item_0": 154, "item_1": 242, "item_2": 69

In [63]:
# Not sure why we're missing like 1 million rows...
len(training_set)

4212425

In [68]:
df = pd.DataFrame(training_set, 
             columns=['hero0', 'hero1', 'hero2', 'hero3', 'hero4',
                      'hero5', 'hero6', 'hero7', 'hero8', 'hero9',
                      'radiant_win'])
df.head()

Unnamed: 0,hero0,hero1,hero2,hero3,hero4,hero5,hero6,hero7,hero8,hero9,radiant_win
0,23,26,36,42,119,74,104,100,54,47,True
1,126,8,30,17,14,39,26,123,2,54,False
2,40,4,52,8,93,44,108,5,59,58,True
3,47,101,42,109,1,81,67,35,28,50,True
4,26,14,76,70,81,23,58,54,64,34,True


In [69]:
df.to_csv("training_set_large.csv", index=False)