In [35]:
import seaborn as sns
import pandas as pd
import pylab as plt
import numpy as np
import steam.api
from steam.api import interface
import time
import os
import glob
import json
import tensorflow as tf
import sys

#dont keep api key in github!!!
# The key is read from a text file outside the repository; only its first line is used.
api_file = os.path.join('..','..','steamapi','steam_api_key.txt')
with open(api_file, 'r') as fin:
    # readline() keeps the trailing newline, which would otherwise become part
    # of the key, so strip surrounding whitespace before registering it.
    steam.api.key.set(fin.readline().strip())

In [10]:
# App-id strings appended to Steam Web API interface names in this notebook,
# e.g. 'IDOTA2Match_' + dota2_id.
#dota2_beta should be used for testing in order to not aggressively make API requests
dota2_id = '570'
# NOTE(review): dota2_beta_id is not referenced by any other cell shown in this notebook.
dota2_beta_id = '205790'

# Get 500 last matches to get approximate hero ratios
Care is needed here so that the test dataset does not overlap with the training data. My strategy to eliminate overlap was to start collecting match data for the training set only for matches after the oldest match in the test set. To get around the 500-match retrieval limit, I save 500 matches at a time to a .csv whose filename contains the start and end match_id. This means I can get 500, wait a bit, then get 500 more that are guaranteed not to overlap.

How long do I have to wait before I can get 500 more? This is unclear, and I was unable to find documentation. Thanks, Valve.

In [22]:
#get matches data from dota2 API and put IDs into a dataframe
num_matches_to_request = 500

#put a start match_id if I'm trying to get more matches after getting 500
#if trying to load starting from a match_id previously obtained and saved
#use arg: start_at_match_id = old_match_list[-1]-1
# NOTE(review): only this first request filters on hero_id = 121; the follow-up
# requests below do not. Kept unchanged -- confirm whether the filter is intended.
matches = interface('IDOTA2Match_' + dota2_id).GetMatchHistory(hero_id = 121,
                                                               game_mode = 1,
                                                               skill_level = 3,
                                                               min_players = 10,
                                                               matches_requested = 100)

#@TODO: check if lobby_type is public/ranked matching (0 or 7) before appending
match_list = [match['match_id'] for match in matches['result']['matches']]

#check that match request returned any matches before trying to get more
if not match_list:
    print('No matches returned\nWait a bit and retry\n\n    GabeN Bless')

# Keep requesting until num_matches_to_request ids have been collected. Each
# follow-up request starts just below the last id already collected, so batches
# cannot overlap. The loop is skipped when the first request returned nothing
# (there is no id to continue from) and stops early on an empty batch.
while match_list and len(match_list) < num_matches_to_request:

    #sleep so we don't overload the servers
    time.sleep(1)

    matches = interface('IDOTA2Match_' + dota2_id).GetMatchHistory(start_at_match_id = match_list[-1]-1,
                                                                   game_mode = 1,
                                                                   skill_level = 3,
                                                                   min_players = 10,
                                                                   matches_requested = 100)

    #@TODO: check if lobby_type is public/ranked matching (0 or 7) before appending
    new_match_ids = [match['match_id'] for match in matches['result']['matches']]
    if not new_match_ids:
        # Nothing more was returned; asking again would repeat the same request forever.
        break
    match_list.extend(new_match_ids)


dota2_test_df = pd.DataFrame()
dota2_test_df['match_id'] = match_list

In [12]:
#get match details from match_ids obtained
#add radiant_win column
#add heroes split between dire and radiant

radiant_win_column = []
radiant_hero_column = []
dire_hero_column = []
match_details = []

for match_id in dota2_test_df['match_id']:
    time.sleep(1)
    # Keep the response for this match in its own variable: match_details is the
    # list of all responses and cannot be indexed with 'result'.
    details = interface('IDOTA2Match_' + dota2_id).GetMatchDetails(match_id = str(match_id))
    match_details.append(details)

    radiant_win_column.append(details['result']['radiant_win'])

    radiant_hero_row = []
    dire_hero_row = []
    for player in details['result']['players']:
        #First bit of 8-bit integer player_slot is 1 if the player was on dire
        #so if player was on dire then player_slot >= 128
        if player['player_slot'] < 128:
            radiant_hero_row.append(player['hero_id'])
        else:
            dire_hero_row.append(player['hero_id'])

    radiant_hero_column.append(radiant_hero_row)
    dire_hero_column.append(dire_hero_row)

In [13]:
# Store the match outcome; multiplying the array by 1 is kept from the original
# (presumably to turn boolean flags into 0/1 integers).
dota2_test_df['radiant_win'] = np.array(radiant_win_column) * 1

# Convert the per-match hero lists once, then copy them out one slot (column) at a time.
radiant_matrix = np.array(radiant_hero_column)
dire_matrix = np.array(dire_hero_column)
for slot in range(len(radiant_matrix[0])):
    dota2_test_df['radiant_heroes%d' % slot] = radiant_matrix[:, slot]
    dota2_test_df['dire_heroes%d' % slot] = dire_matrix[:, slot]

In [28]:
#save data to csv file so that we have it on disk
# The file is named "<first match_id>-<last match_id>.csv". Positional .iloc
# lookups are used so the name is correct however many rows were collected
# (a hard-coded row 499 fails or mislabels the file when the count is not 500).
test_match_ids = dota2_test_df['match_id']
os.makedirs('dota2_test_data', exist_ok=True)  # to_csv does not create missing folders
dota2_test_df.to_csv(os.path.join('dota2_test_data',
                                  str(test_match_ids.iloc[0])+
                                  '-'+
                                  str(test_match_ids.iloc[-1])+'.csv'))

# Load test data that is newest for match_id checking

In [127]:
#Run this if the test data is needed to allow for proper test_data collection
#load newest file so that last_match_id is accurate? Unclear if this is the best strategy...
#maybe should look at loading all then looking at match_id
dota2_data_files = glob.glob(os.path.join('dota2_test_data','*.csv'))
if not dota2_data_files:
    raise FileNotFoundError('No .csv files found in dota2_test_data')
dota2_data_files.sort(key=os.path.getmtime)

# The list is sorted by ascending modification time, so the newest file is the
# LAST entry (index 0 would be the oldest file).
dota2_test_df_from_file = pd.read_csv(dota2_data_files[-1], usecols=range(1,13))

#purge 0 values from hero_id
# NOTE(review): only the radiant_heroes0 column is checked for zeros here.
dota2_test_df_from_file = dota2_test_df_from_file[dota2_test_df_from_file.radiant_heroes0 != 0]

#make old_match_list for easy checking of match_id
old_match_list = list(dota2_test_df_from_file['match_id'])

# Get 500 matches of each hero id (and trim duplicates)

In [12]:
# Ask the IEconDOTA2 interface for the hero listing; the match-collection cell
# below reads hero_list['result']['count'] to decide how many hero ids to loop over.
hero_list = interface('IEconDOTA2_'+dota2_id).GetHeroes(itemizedonly = True)
#print(hero_list)

## Get match_id of all the matches

In [13]:
#get matches data from dota2 API and put IDs into a dataframe
num_matches_to_request = 500

# Loop positions whose hero id is replaced by a different id (same mapping as
# the original if/elif chain).
# NOTE(review): the comment below names ids 105 and 116-118 as unoccupied, but the
# mapping remaps 105, 115 and 116 -- confirm which ids are really unused.
hero_id_remap = {105: 121, 115: 120, 116: 119}

match_list = []
#Loop through all hero_id values
#There are 116 heroes, but ids 105, 116-118 are not occupied
for i in range(1,hero_list['result']['count']+1):

    #better option is to build a vocabulary
    j = hero_id_remap.get(i, i)

    #Get first match for each hero out of loop so that we can pull more than 100 games per hero
    #by utilizing start_at_match_id option
    matches = interface('IDOTA2Match_' + dota2_id).GetMatchHistory(hero_id = j,
                                                                   game_mode = 1,
                                                                   skill_level = 3,
                                                                   min_players = 10,
                                                                   matches_requested = 100)

    #@TODO: check if lobby_type is public/ranked matching (0 or 7) before appending
    # Ids are gathered per hero so that the emptiness check and the
    # start_at_match_id cursor never use ids that belong to a previous hero.
    hero_match_list = [match['match_id'] for match in matches['result']['matches']]

    if not hero_match_list:
        if not match_list:
            # Nothing has been returned at all yet: stop and retry later.
            print('No matches returned\nWait a bit and retry\n\n    GabeN Bless')
            break
        # Earlier heroes returned data, so only this hero id is skipped.
        print('No matches returned for hero_id ' + str(j))
        continue

    # The first request above already counts towards num_matches_to_request.
    while len(hero_match_list) < num_matches_to_request:
        #sleep so we don't overload the servers
        time.sleep(1)

        matches = interface('IDOTA2Match_' + dota2_id).GetMatchHistory(hero_id = j,
                                                                       start_at_match_id = hero_match_list[-1]-1,
                                                                       game_mode = 1,
                                                                       skill_level = 3,
                                                                       min_players = 10,
                                                                       matches_requested = 100)

        #@TODO: check if lobby_type is public/ranked matching (0 or 7) before appending
        new_match_ids = [match['match_id'] for match in matches['result']['matches']]
        if not new_match_ids:
            # No further matches for this hero; move on to the next one.
            break
        hero_match_list.extend(new_match_ids)

    match_list.extend(hero_match_list)

## Purge duplicates

In [18]:
# dict keys are unique and keep first-seen order, so a round trip through
# dict.fromkeys drops repeated match ids while preserving the collection order.
unique_match_ids = list(dict.fromkeys(match_list))
dota2_training_df = pd.DataFrame({'match_id': unique_match_ids})

dota2_training_df.describe()

Unnamed: 0,match_id
count,16901.0
mean,4259853000.0
std,14961.71
min,4259789000.0
25%,4259845000.0
50%,4259855000.0
75%,4259863000.0
max,4259895000.0


## Get match details for matches

In [19]:
# Accumulators filled by the match-detail loop in the next cell.
#initialize data columns if you're starting a match_detail getting batch
#if trying to finish one that broke unexpectedly then don't!
# (re-running this cell discards everything collected so far)
radiant_win_column = []   # one radiant_win value per match
radiant_hero_column = []  # one list of radiant hero ids per match
dire_hero_column = []     # one list of dire hero ids per match
match_details = []        # GetMatchDetails responses, later exported as JSON

In [20]:
#get match details from match_ids obtained
#add radiant_win column
#add heroes split between dire and radiant

# Every fully processed match adds exactly one entry to radiant_win_column, so
# its length is the position to continue from. With freshly initialised lists
# this is 0 (a full run); after a run that stopped on an error, re-running this
# cell continues with the first unprocessed match instead of starting over.
start_index = len(radiant_win_column)

for i,match_id in enumerate(dota2_training_df['match_id'].iloc[start_index:], start_index):
    time.sleep(1)
    details = interface('IDOTA2Match_' + dota2_id).GetMatchDetails(match_id = str(match_id))

    try:
        radiant_win = details['result']['radiant_win']
        players = details['result']['players']
    except Exception:
        # Exception (rather than a bare except) still reports failed lookups
        # but lets a manual kernel interrupt propagate.
        print('server error at match_id: ' + str(match_id))
        print('element:' + str(i))
        break

    radiant_hero_row = []
    dire_hero_row = []
    for player in players:
        #First bit of 8-bit integer player_slot is 1 if the player was on dire
        #so if player was on dire then player_slot >= 128
        if player['player_slot'] < 128:
            radiant_hero_row.append(player['hero_id'])
        else:
            dire_hero_row.append(player['hero_id'])

    # Record the match only once everything was read successfully, so the four
    # lists always stay the same length (a failed response is never stored).
    match_details.append(details)
    radiant_win_column.append(radiant_win)
    radiant_hero_column.append(radiant_hero_row)
    dire_hero_column.append(dire_hero_row)

In [21]:
# Row count of dota2_training_df (one row per match_id); the column lists added
# in the next section must have this same length.
print(len(dota2_training_df['match_id']))

16901


## Process and save data

In [22]:
# Store the match outcome; multiplying the array by 1 is kept from the original
# (presumably to turn boolean flags into 0/1 integers).
dota2_training_df['radiant_win'] = np.array(radiant_win_column) * 1

# Convert the per-match hero lists once, then copy them out one slot (column)
# at a time. The slot count is read from the second row, as in the original.
radiant_matrix = np.array(radiant_hero_column)
dire_matrix = np.array(dire_hero_column)
for slot in range(len(radiant_matrix[1])):
    print(slot)
    dota2_training_df['radiant_heroes%d' % slot] = radiant_matrix[:, slot]
    dota2_training_df['dire_heroes%d' % slot] = dire_matrix[:, slot]

0
1
2
3
4


In [23]:
#purge 0 values from hero_id
# Keeps only rows whose radiant_heroes0 value is non-zero. The surviving rows
# keep their original index labels, so label 0 may no longer exist afterwards.
# NOTE(review): only radiant_heroes0 is checked; the other hero columns are not filtered here.
dota2_training_df = dota2_training_df[dota2_training_df.radiant_heroes0 != 0]

In [24]:
#save data to csv file so we have it on disk
# The file is named "<first match_id>-<last match_id>.csv". Rows may have been
# filtered out of dota2_training_df, so index label 0 is not guaranteed to
# exist; positional .iloc lookups always pick the first and last remaining rows.
training_match_ids = dota2_training_df['match_id']
os.makedirs('dota2_training_data', exist_ok=True)  # to_csv does not create missing folders
dota2_training_df.to_csv(os.path.join('dota2_training_data',
                                      str(training_match_ids.iloc[0])+
                                      '-'+
                                      str(training_match_ids.iloc[-1])+
                                      '.csv'))

## Export data as json

In [25]:
# Name the JSON file after the first and last match_id in the training frame.
# Positional .iloc lookups are used because rows may have been filtered out of
# the frame, so index label 0 is not guaranteed to exist.
training_match_ids = dota2_training_df['match_id']
outfile = os.path.join('dota2_training_data',
                       'match_id'+
                       str(training_match_ids.iloc[0])+
                       '-'+
                       str(training_match_ids.iloc[-1])+
                       '.json')

# Convert every stored response to a plain dict before serialising.
# (json is already imported in the notebook's first cell.)
match_details_dict_list = [dict(match_detail) for match_detail in match_details]
with open(outfile, 'w') as fout:
    json.dump(match_details_dict_list, fout)

# Build TFRecords file
Puts all training data in the dota2_training_data folder into one TFRecords file.
This is done to handle cleanly the features that are numpy arrays (radiant_heroes and dire_heroes).

In [26]:
# Read every saved training csv and stack them into a single frame.
# The previous `type(dota2_df)==None` test was never true and only worked
# because pd.concat silently drops None entries; reading all files into a list
# and concatenating once is both correct and avoids repeated copying.
training_csv_files = glob.glob(os.path.join('dota2_training_data','*.csv'))
if not training_csv_files:
    raise FileNotFoundError('No .csv files found in dota2_training_data')

dota2_df = pd.concat([pd.read_csv(filename, usecols=range(1,13)) for filename in training_csv_files],
                     ignore_index=True)

# Shuffle the rows. NOTE(review): no random seed is set, so the order differs between runs.
dota2_df = dota2_df.reindex(np.random.permutation(dota2_df.index))
dota2_df.describe()

Unnamed: 0,match_id,radiant_win,radiant_heroes0,dire_heroes0,radiant_heroes1,dire_heroes1,radiant_heroes2,dire_heroes2,radiant_heroes3,dire_heroes3,radiant_heroes4,dire_heroes4
count,43031.0,43031.0,43031.0,43031.0,43031.0,43031.0,43031.0,43031.0,43031.0,43031.0,43031.0,43031.0
mean,4257821000.0,0.550394,50.944412,51.307104,51.610885,51.351235,51.121819,51.459692,51.3065,51.440798,50.961911,51.092817
std,2660839.0,0.49746,34.766818,34.724615,34.778497,34.733521,34.758653,34.756512,34.753597,34.826556,34.817146,34.660759
min,4252092000.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,4258257000.0,0.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
50%,4258288000.0,1.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0
75%,4259850000.0,1.0,82.0,83.0,83.0,83.0,82.0,83.0,83.0,83.0,82.0,82.0
max,4259895000.0,1.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0


In [27]:
def _int64_feature(value):
    """Wrap a single value in a tf.train.Feature backed by an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
def _bytes_feature(value):
    """Wrap a single value in a tf.train.Feature backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

In [28]:
def convert_to_tfrecords(filename, examples, targets):
    """Write hero line-ups and match outcomes to a single TFRecords file.

    Args:
        filename: path of the TFRecords file to create.
        examples: DataFrame with 'radiant_heroes' and 'dire_heroes' columns whose
            entries are numpy arrays (one array per match).
        targets: DataFrame with a 'radiant_win' column, in the same row order
            as examples.

    One tf.train.Example is written per row, with the raw bytes of the two hero
    arrays under 'train/radiant_heroes' and 'train/dire_heroes' and the outcome
    under 'train/targets'. The bytes are in each array's own dtype, so a reader
    has to decode them with that same dtype.
    """
    # open the TFRecords file
    writer = tf.python_io.TFRecordWriter(filename)

    # try/finally makes sure the file is closed even if a row fails to serialize
    try:
        radiant_hero_array = np.array(examples['radiant_heroes'])
        dire_hero_array = np.array(examples['dire_heroes'])
        target_array = np.array(targets['radiant_win'])

        for i in range(len(radiant_hero_array)):
            # report progress every 5000 rows
            if not i % 5000:
                print('Train data: %d/%d' % (i, len(examples)))
                sys.stdout.flush()

            # Raw bytes of this row's hero arrays; tobytes() replaces the
            # deprecated tostring() and returns the same bytes.
            radiant_heroes = radiant_hero_array[i].tobytes()
            dire_heroes = dire_hero_array[i].tobytes()
            target = target_array[i]

            # Create a feature
            feature = {'train/radiant_heroes': _bytes_feature(tf.compat.as_bytes(radiant_heroes)),
                       'train/dire_heroes': _bytes_feature(tf.compat.as_bytes(dire_heroes)),
                       'train/targets': _int64_feature(target)}
            # Create an example protocol buffer
            example = tf.train.Example(features=tf.train.Features(feature=feature))

            # Serialize to string and write on the file
            writer.write(example.SerializeToString())
    finally:
        writer.close()
        sys.stdout.flush()

In [30]:
def preprocess_features(dota2_df):
    """Build the feature frame used by the model from the raw match frame.

    Each team's five per-slot hero columns are gathered into one array per
    match, so the heroes of a team are handled as a single array-valued feature
    rather than five separate ones (similar in spirit to the movie review text
    analysis example in the Google ML Crash Course).

            Args:
                dota2_df: Dataframe containing dota2 training and test data
            returns:
                processed_df: pandas DataFrame containing only feature columns
                    ('radiant_heroes' and 'dire_heroes')
    """
    # Earlier experiments that are not used: the ten hero columns as individual
    # features, and per-team synthetic features built as the product of the
    # five hero ids.
    team_features = [('radiant_heroes', 'radiant_heroes%d'),
                     ('dire_heroes', 'dire_heroes%d')]

    processed_df = pd.DataFrame()
    for feature_name, column_pattern in team_features:
        slot_columns = [column_pattern % slot for slot in range(5)]
        team_matrix = np.array(dota2_df.loc[:, slot_columns])
        # list() splits the 2-D matrix into one 1-D array per match (row)
        processed_df[feature_name] = list(team_matrix)

    return processed_df
    
def preprocess_targets(dota2_df):
    """Extract the model target from the raw match frame.

            Args:
                dota2_df: Dataframe containing dota2 training and test data
            returns:
                target_df: pandas DataFrame containing only the target column
                    ('radiant_win'), with the same index as dota2_df
    """
    # Selecting with a one-element list yields a DataFrame; copy() makes the
    # result independent of the input frame.
    return dota2_df.loc[:, ['radiant_win']].copy()

In [36]:
# Write the combined training frame to dota2_training_data/dota2_training_data.tfrecords:
# features and targets are both derived from the same dota2_df, so their rows line up.
convert_to_tfrecords(os.path.join('dota2_training_data', 'dota2_training_data.tfrecords'),
                     preprocess_features(dota2_df),
                     preprocess_targets(dota2_df))

Train data: 0/43031
Train data: 1000/43031
Train data: 2000/43031
Train data: 3000/43031
Train data: 4000/43031
Train data: 5000/43031
Train data: 6000/43031
Train data: 7000/43031
Train data: 8000/43031
Train data: 9000/43031
Train data: 10000/43031
Train data: 11000/43031
Train data: 12000/43031
Train data: 13000/43031
Train data: 14000/43031
Train data: 15000/43031
Train data: 16000/43031
Train data: 17000/43031
Train data: 18000/43031
Train data: 19000/43031
Train data: 20000/43031
Train data: 21000/43031
Train data: 22000/43031
Train data: 23000/43031
Train data: 24000/43031
Train data: 25000/43031
Train data: 26000/43031
Train data: 27000/43031
Train data: 28000/43031
Train data: 29000/43031
Train data: 30000/43031
Train data: 31000/43031
Train data: 32000/43031
Train data: 33000/43031
Train data: 34000/43031
Train data: 35000/43031
Train data: 36000/43031
Train data: 37000/43031
Train data: 38000/43031
Train data: 39000/43031
Train data: 40000/43031
Train data: 41000/43031
Train