# Compress Data

Currently I have downloaded 24.7 million Dota 2 game summaries. This ends up taking 140GB of space on my desktop. I probably can't scale more than 10x higher than this.

In order to work around this, we need to start dropping information that we know we will *never* need. Since I want to predict the outcome of a game within ~1 minute of it starting, let's drop anything that is guaranteed not to be present at this time.

I have already corrected the bulk download script to drop things, but let's go through `dota_2_bulk.tsv` and drop unnecessary fields from it as well.

In [37]:
import os
import re
import json
import time
import tqdm
import datetime
import requests
import pandas as pd
from os.path import exists

In [38]:
# Stored outside of repo because it's 140GB
path = '../data/dota_2_bulk.tsv'
clean_path = '../data/dota_2_bulk_clean.tsv'

In [39]:
# Remove unnecessary fields from game result. 
# We do this to minimize the space needed to store everything.
def delete_if_present(json, key):
  if key in json:
    del json[key]
    
def clean_match(match):
  delete_if_present(match, 'tower_status_radiant')
  delete_if_present(match, 'tower_status_dire')
  delete_if_present(match, 'barracks_status_radiant')
  delete_if_present(match, 'barracks_status_dire')
  delete_if_present(match, 'positive_votes')
  delete_if_present(match, 'negative_votes')
  delete_if_present(match, 'radiant_score')
  delete_if_present(match, 'dire_score')
  delete_if_present(match, 'picks_bans')
  for player in match['players']:
    delete_if_present(player, 'item_0')
    delete_if_present(player, 'item_1')
    delete_if_present(player, 'item_2')
    delete_if_present(player, 'item_3')
    delete_if_present(player, 'item_4')
    delete_if_present(player, 'item_5')
    delete_if_present(player, 'backpack_0')
    delete_if_present(player, 'backpack_1')
    delete_if_present(player, 'backpack_2')
    delete_if_present(player, 'item_neutral')
    delete_if_present(player, 'kills')
    delete_if_present(player, 'deaths')
    delete_if_present(player, 'assists')
    delete_if_present(player, 'last_hits')
    delete_if_present(player, 'denies')
    delete_if_present(player, 'gold_per_min')
    delete_if_present(player, 'level')
    delete_if_present(player, 'net_worth')
    delete_if_present(player, 'aghanims_scepter')
    delete_if_present(player, 'aghanims_shard')
    delete_if_present(player, 'moonshard')
    delete_if_present(player, 'xp_per_min')
    delete_if_present(player, 'additional_units')  

In [41]:
with open(clean_path, 'w') as clean_file:
  with open(path) as file:
    for line in tqdm.tqdm_notebook(file):
      seq_num, raw_json = line.split('\t')
      try:
        # Load the original match with unnecessary info.
        match = json.loads(raw_json)
        # Clean it by removing unnecessary info.
        clean_match(match)
        # Turn it back into a string.
        clean_json = json.dumps(match)
        # Create a single line for the clean file
        output_str = str(seq_num) + "\t" + clean_json + "\n"
        clean_file.write(output_str)
      except Exception as e:
        print(raw_json)
        print(e)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm.tqdm_notebook(file):


0it [00:00, ?it/s]

In [13]:
match = json.loads(raw_json)

In [14]:
match

{'players': [{'account_id': 363849371,
   'player_slot': 0,
   'hero_id': 72,
   'item_0': 0,
   'item_1': 156,
   'item_2': 141,
   'item_3': 154,
   'item_4': 116,
   'item_5': 160,
   'backpack_0': 0,
   'backpack_1': 379,
   'backpack_2': 0,
   'item_neutral': 573,
   'kills': 7,
   'deaths': 3,
   'assists': 7,
   'leaver_status': 0,
   'last_hits': 681,
   'denies': 7,
   'gold_per_min': 768,
   'xp_per_min': 747,
   'level': 28,
   'net_worth': 31437,
   'aghanims_scepter': 1,
   'aghanims_shard': 0,
   'moonshard': 0},
  {'account_id': 221883921,
   'player_slot': 1,
   'hero_id': 49,
   'item_0': 141,
   'item_1': 116,
   'item_2': 108,
   'item_3': 600,
   'item_4': 112,
   'item_5': 48,
   'backpack_0': 41,
   'backpack_1': 0,
   'backpack_2': 0,
   'item_neutral': 311,
   'kills': 6,
   'deaths': 6,
   'assists': 12,
   'leaver_status': 0,
   'last_hits': 482,
   'denies': 9,
   'gold_per_min': 679,
   'xp_per_min': 667,
   'level': 28,
   'net_worth': 31318,
   'aghanims_s

In [27]:
def delete_if_present(json, key):
  if key in json:
    del json[key]

In [28]:
'duration' in match

True

In [29]:
match['duration']

3123

In [31]:
match

{'players': [{'account_id': 363849371,
   'player_slot': 0,
   'hero_id': 72,
   'leaver_status': 0,
   'xp_per_min': 747},
  {'account_id': 221883921,
   'player_slot': 1,
   'hero_id': 49,
   'leaver_status': 0,
   'xp_per_min': 667},
  {'account_id': 90082776,
   'player_slot': 2,
   'hero_id': 51,
   'leaver_status': 0,
   'xp_per_min': 424},
  {'account_id': 48735159,
   'player_slot': 3,
   'hero_id': 123,
   'leaver_status': 0,
   'xp_per_min': 500},
  {'account_id': 89222641,
   'player_slot': 4,
   'hero_id': 29,
   'leaver_status': 0,
   'xp_per_min': 653},
  {'account_id': 303331256,
   'player_slot': 128,
   'hero_id': 10,
   'leaver_status': 0,
   'xp_per_min': 700},
  {'account_id': 316988377,
   'player_slot': 129,
   'hero_id': 88,
   'leaver_status': 0,
   'xp_per_min': 552},
  {'account_id': 158626280,
   'player_slot': 130,
   'hero_id': 53,
   'leaver_status': 0,
   'xp_per_min': 557},
  {'account_id': 32095155,
   'player_slot': 131,
   'hero_id': 13,
   'leaver_st

In [32]:
delete_if_present(match, 'duration')
delete_if_present(match, 'tower_status_radiant')
delete_if_present(match, 'tower_status_dire')
delete_if_present(match, 'barracks_status_radiant')
delete_if_present(match, 'barracks_status_dire')
delete_if_present(match, 'positive_votes')
delete_if_present(match, 'negative_votes')
delete_if_present(match, 'radiant_score')
delete_if_present(match, 'dire_score')
delete_if_present(match, 'picks_bans')

In [33]:
for player in match['players']:
  delete_if_present(player, 'item_0')
  delete_if_present(player, 'item_1')
  delete_if_present(player, 'item_2')
  delete_if_present(player, 'item_3')
  delete_if_present(player, 'item_4')
  delete_if_present(player, 'item_5')
  delete_if_present(player, 'backpack_0')
  delete_if_present(player, 'backpack_1')
  delete_if_present(player, 'backpack_2')
  delete_if_present(player, 'item_neutral')
  delete_if_present(player, 'kills')
  delete_if_present(player, 'deaths')
  delete_if_present(player, 'assists')
  delete_if_present(player, 'last_hits')
  delete_if_present(player, 'denies')
  delete_if_present(player, 'gold_per_min')
  delete_if_present(player, 'level')
  delete_if_present(player, 'net_worth')
  delete_if_present(player, 'aghanims_scepter')
  delete_if_present(player, 'aghanims_shard')
  delete_if_present(player, 'moonshard')
  delete_if_present(player, 'xp_per_min')
  delete_if_present(player, 'additional_units')


In [34]:
match

{'players': [{'account_id': 363849371,
   'player_slot': 0,
   'hero_id': 72,
   'leaver_status': 0},
  {'account_id': 221883921,
   'player_slot': 1,
   'hero_id': 49,
   'leaver_status': 0},
  {'account_id': 90082776,
   'player_slot': 2,
   'hero_id': 51,
   'leaver_status': 0},
  {'account_id': 48735159,
   'player_slot': 3,
   'hero_id': 123,
   'leaver_status': 0},
  {'account_id': 89222641,
   'player_slot': 4,
   'hero_id': 29,
   'leaver_status': 0},
  {'account_id': 303331256,
   'player_slot': 128,
   'hero_id': 10,
   'leaver_status': 0},
  {'account_id': 316988377,
   'player_slot': 129,
   'hero_id': 88,
   'leaver_status': 0},
  {'account_id': 158626280,
   'player_slot': 130,
   'hero_id': 53,
   'leaver_status': 0},
  {'account_id': 32095155,
   'player_slot': 131,
   'hero_id': 13,
   'leaver_status': 0},
  {'account_id': 88154342,
   'player_slot': 132,
   'hero_id': 68,
   'leaver_status': 0}],
 'radiant_win': False,
 'pre_game_duration': 90,
 'start_time': 16283063

In [12]:
match

{'radiant_win': False,
 'duration': 3123,
 'pre_game_duration': 90,
 'start_time': 1628306396,
 'match_id': 6123629491,
 'match_seq_num': 5125140861,
 'tower_status_radiant': 1540,
 'tower_status_dire': 1792,
 'barracks_status_radiant': 51,
 'barracks_status_dire': 50,
 'cluster': 121,
 'first_blood_time': 48,
 'lobby_type': 1,
 'human_players': 10,
 'leagueid': 13230,
 'positive_votes': 0,
 'negative_votes': 0,
 'game_mode': 2,
 'flags': 1,
 'engine': 1,
 'radiant_score': 25,
 'dire_score': 24,
 'radiant_team_id': 8124804,
 'radiant_name': 'Squid team',
 'radiant_logo': 4294967295,
 'radiant_team_complete': 0,
 'dire_team_id': 8396813,
 'dire_name': "I'm Bored",
 'dire_logo': 1777210068128158776,
 'dire_team_complete': 1,
 'radiant_captain': 90082776,
 'dire_captain': 303331256}