In [1]:
import seaborn as sns
import pandas as pd
import pylab as plt
import numpy as np
from steam import WebAPI
import time
import os
import glob
import json
import tensorflow as tf
import sys
from IPython import display
import requests
#dont keep api key in github!!!
api_file = os.path.join('..','..','apikeys','steam_api_key.txt')
with open(api_file, 'r') as fin:
    api = WebAPI(key=fin.readline())

In [2]:
#declare interface urls
#dota2_beta should be used for testing in order to not aggressively make API requests
dota2_id = '570'
dota2_beta_id = '205790'

# Get 500 last matches to get approximate hero ratios
Need to be careful here so that test dataset doesn't have crossover with training data. My strategy to eliminate overlap was to start collecting match data for the training set for matches after the oldest match in the test set. To get around the 500 match retrieval limit I save 500 at a time to a .csv with a filename that has the start and end match_id. This means I can get 500, wait a bit, then get 500 more that are guarenteed to not have overlap. 

How long do I have to wait to get 500 more?? Unclear and was unable to find documentation. Thanks Valve.

In [None]:
#get matches data from dota2 API and put IDs into a dataframe
num_requested = 1000
skill = 3
match_list = []

#put a start match_id if I'm trying to get more matches after getting 500
#if trying to load starting from a match_id previously obtained and saved
#use arg: start_at_match_id = old_match_list[-1]-1
matches = api.call('IDOTA2Match_' + dota2_id +'.GetMatchHistory',
                                            hero_id = 121,
                                            game_mode = 1,
                                            skill = skill,
                                            min_players = 10,
                                            matches_requested = 100)

#@TODO: check if lobby_type is public/ranked matching (0 or 7) before appending
[match_list.append(match['match_id']) for match in matches['result']['matches']]

current_num = 100

while current_num <= num_requested:
    
    #check that match request returned any matches before trying to get more
    if len(match_list)==0:
        print('No matches returned\nWait a bit and retry\n\n    GabeN Bless')
    
    #sleep so we don't overload the servers
    time.sleep(1)
                
    current_num += 100
    matches = api.call('IDOTA2Match_' + dota2_id+'.GetMatchHistory',
                                            start_at_match_id = match_list[-1]-1,
                                            game_mode = 1,
                                            skill = skill,
                                            min_players = 10,
                                            matches_requested = 100)

    #@TODO: check if lobby_type is public/ranked matching (0 or 7) before appending
    [match_list.append(match['match_id']) for match in matches['result']['matches']]

    
dota2_test_df = pd.DataFrame()
dota2_test_df['match_id'] = match_list

In [None]:
#get match details from match_ids obtained
#add radiant_win column
#add heroes split between dire and radiant

radiant_win_column = []
radiant_hero_column = []
dire_hero_column = []
match_details = []

for match_id in dota2_test_df['match_id']:
    time.sleep(1)
    match_details.append(api.call('IDOTA2Match_' + dota2_id+'.GetMatchDetails',
                                  match_id = str(match_id)))
    radiant_win_column.append(match_details[-1]['result']['radiant_win'])
    
    radiant_hero_row = []
    dire_hero_row = []
    for player in match_details[-1]['result']['players']:
        #First bit of 8-bit integer player_slot is 1 if the player was on dire
        #so if player was on dire then player_slot > 128
        if player['player_slot']/128<1:
            radiant_hero_row.append(player['hero_id'])
        else:
            dire_hero_row.append(player['hero_id'])
    
    radiant_hero_column.append(radiant_hero_row)
    dire_hero_column.append(dire_hero_row)

In [None]:
#Add radiant_win column to Dataframe
dota2_test_df['radiant_win'] = 1*np.array(radiant_win_column)

#Add dire and radiant heroes to DataFrame under their own columns
for i in range(len(np.array(radiant_hero_column)[0])):
    dota2_test_df['radiant_heroes'+str(i)] = np.array(radiant_hero_column)[:,i]
    dota2_test_df['dire_heroes'+str(i)] = np.array(dire_hero_column)[:,i]

In [None]:
#save data to csv file so that we have it on disk
dota2_test_df.to_csv(os.path.join('dota2_test_data',
                                        str(dota2_test_df['match_id'][0])+
                                        '-'+
                                        str(dota2_test_df['match_id'][499])+'.csv'))

# Load test data that is newest for match_id checking

In [None]:
#Run this if the test data is needed to allow for proper test_data collection
#load newest file so that last_match_id is accurate? Unclear if this is the best strategy... 
#maybe should look at loading all then looking at match_id
dota2_data_files = glob.glob(os.path.join('dota2_test_data','*.csv'))
dota2_data_files.sort(key=os.path.getmtime)

dota2_test_df_from_file = pd.read_csv(dota2_data_files[0], usecols=range(1,13))
dota2_test_df_from_file.style

#purge 0 values from hero_id
dota2_test_df_from_file = dota2_test_df_from_file[dota2_test_df_from_file.radiant_heroes0 != 0]

#make old_match_list for easy checking of match_id
old_match_list = list(dota2_test_df_from_file['match_id'])

# Get 500 matches of each hero id (and trim duplicates)

In [None]:
hero_list = api.call('IEconDOTA2_'+dota2_id+'.GetHeroes',itemizedonly = True)
hero_list=pd.DataFrame(hero_list['result']['heroes'])

## Get match_id of all the matches
this uses GetMatchHistory which is a bit limits match getting to 500 per hero (about 2e4) and sometimes stops returning matches

In [None]:
#get matches data from dota2 API and put IDs into a dataframe
num_matches_to_request = 500
skill = 3
match_list = []
most_current_matches = True

#Loop through all hero_id values
#use vocabulary generated by an api call above to get all hero id's
for i in hero_list['id']:
    
    #Get first match for each hero out of loop so that we can pull more than 100 games per hero
    #by utilizing start_at_match_id option
    if most_current_matches==True:
        matches = api.call('IDOTA2Match_' + dota2_id+'.GetMatchHistory',hero_id = i,
                                                                   game_mode = 1,
                                                                   skill = skill,
                                                                   min_players = 10,
                                                                   matches_requested = 100)
    else:
        files=glob.glob(os.path.join('Dota_data','skill_level='+str(skill),'*.csv'))
        files.sort(key=os.path.getmtime)
        start_match = pd.read_csv(files[-1], usecols=[1])['match_id'].min()-1
        print(start_match)
        matches = api.call('IDOTA2Match_' + dota2_id+'.GetMatchHistory',hero_id=i,
                                                            start_at_match_id=start_match,
                                                            game_mode=1,
                                                            skill=skill,
                                                            min_players=10,
                                                            matches_requested=100)
    #Append matches to matchlist
    #if lobby_type is public/ranked matching (0 or 7)
    for match in matches['result']['matches']:
        lobby_type=match['lobby_type']
        if lobby_type==0 or lobby_type==7 or lobby_type==5 or lobby_type==2:
            match_list.append(match['match_id'])
            
    current_num = 1

    if len(match_list)==0:
        print('No matches returned\nWait a bit and retry\n\n    GabeN Bless')
        break
    
    while current_num <= num_matches_to_request:
        #sleep so we don't overload the servers
        time.sleep(1)

        current_num += 100
        matches = api.call('IDOTA2Match_' + dota2_id+'.GetMatchHistory',hero_id = i,
                                                        start_at_match_id = match_list[-1]-1,
                                                        game_mode = 1,
                                                        skill = skill,
                                                        min_players = 10,
                                                        matches_requested = 100)
        
        #Append matches to matchlist
        #if lobby_type is public/ranked matching (0 or 7)
        for match in matches['result']['matches']:
            lobby_type=match['lobby_type']
            if lobby_type==0 or lobby_type==7 or lobby_type==5 or lobby_type==2:
                match_list.append(match['match_id'])


## Purge duplicates and add to dataframe

In [None]:
#Make dataframe of match_id values using dict.fromkeys(list) to remove duplicates
dota2_df = pd.DataFrame()
dota2_df['match_id'] = list(dict.fromkeys(match_list))

dota2_df.describe()

## Save match id list

In [None]:
dota2_df.to_csv(os.path.join('Dota_data',
                                      'skill_level='+str(skill),
                                      'saved'+time.strftime('%Y-%m-%d')+'.csv'))

# Get match details for matches
this is a poor way to do it because it needs many calls to get the data. GetMatchHistoryBySequenceNum is nice but there are no filters so the usable matches are ~1/500.

## Load files of match_ids

In [None]:
dota_df = None

files=glob.glob(os.path.join('Dota_data','skill_level=*','*.csv'))
files.sort(key=os.path.getmtime)
#print(files)
for filename in files[:]:
    print(filename)
    if type(dota_df)==None:
        dota_df = pd.read_csv(filename, usecols=[1])
        dota_df['skill_level'] = np.ones(len(dota_df['match_id']))*int(filename[-21])
    else:
        dota_df_new = pd.read_csv(filename, usecols=[1])
        dota_df_new['skill_level'] = np.ones(len(dota_df_new['match_id']))*int(filename[-21])
        dota_df = pd.concat([dota_df, dota_df_new],
                                                    ignore_index=True)

dota_df.describe()

## Get the details

In [None]:
#initialize data columns if you're starting a match_detail getting batch
#if trying to finish one that broke unexpectedly then don't!
radiant_win_column = []
radiant_xp_column = []; dire_xp_column = []
radiant_gold_column = []; dire_gold_column = []
radiant_healing_column = []; dire_healing_column = []
radiant_towerdamage_column = []; dire_towerdamage_column =[]
radiant_kills_column = []; dire_kills_column = []
radiant_hero_column = []; dire_hero_column = []
match_id_list=[]
positive_votes_column=[]; negative_votes_column=[]
duration_column=[]
skill_column=[]

In [None]:
#get match details from match_ids obtained
#add radiant_win column
#add heroes split between dire and radiant

for i,match_id in enumerate(dota_df[:]['match_id']):
    time.sleep(1)
    
    try:
        match_details = api.call('IDOTA2Match_' + dota2_id+'.GetMatchDetails'
                                  ,match_id = str(match_id))
    except:
        print('server error at match_id: ' + str(match_id))
        print('element:',i,' match_id:',match_id)
        time.sleep(10)
        continue
         
    dire_xp = 0; radiant_xp = 0    
    dire_gold = 0; radiant_gold = 0
    dire_healing = 0; radiant_healing = 0
    dire_kills = 0; radiant_kills = 0
    dire_towerdamage = 0; radiant_towerdamage = 0
    dire_healing = 0; radiant_healing = 0
    radiant_hero_row = []; dire_hero_row = []
    #get heroes, xp, and gold for features, target, target
    for player in match_details['result']['players']:
        #First bit of 8-bit integer player_slot is 1 if the player was on dire
        #so if player was on dire then player_slot > 128
        if player['player_slot']/128<1:
            radiant_hero_row.append(player['hero_id'])
            radiant_xp+=player['xp_per_min']*match_details['result']['duration']/60
            radiant_gold+=player['gold_spent']
            radiant_kills+=player['kills']
            radiant_healing+=player['hero_healing']
            radiant_towerdamage+=player['tower_damage']
        else:
            dire_hero_row.append(player['hero_id'])
            dire_xp+=player['xp_per_min']*match_details['result']['duration']/60
            dire_gold+=player['gold_spent']
            dire_kills+=player['kills']
            dire_healing+=player['hero_healing']
            dire_towerdamage+=player['tower_damage']
    
    skill_column = int(dota_df['skill_level'][i])
    radiant_win_column.append(match_details['result']['radiant_win'])

    dire_xp_column.append(dire_xp)
    radiant_xp_column.append(radiant_xp)
    radiant_gold_column.append(radiant_gold)
    dire_gold_column.append(dire_gold)
    radiant_kills_column.append(radiant_kills)
    dire_kills_column.append(dire_kills)
    #radiant healing appears to be bugged as of 20190609, but my code is identical to dire?
    radiant_healing_column.append(radiant_healing)
    dire_healing_column.append(dire_healing)
    radiant_towerdamage_column.append(radiant_towerdamage)
    dire_towerdamage_column.append(dire_towerdamage)
    
    positive_votes_column.append(match_details['result']['positive_votes'])
    negative_votes_column.append(match_details['result']['negative_votes'])
    duration_column.append(match_details['result']['duration'])               

    radiant_hero_column.append(radiant_hero_row)
    dire_hero_column.append(dire_hero_row)
    match_id_list.append(match_id)
    if len(radiant_win_column)!= len(match_id_list):
        print('length mismatch!')
        break

In [None]:
print(len(radiant_hero_column))
print(len(dota_df['match_id']))
print(len(match_id_list))

### Run if the match_detail fetching broke and the df length is different from the data column lengths

In [None]:
match_id_list = []
match_details = list(match_details)
radiant_win_column = []
radiant_xp_column = []
dire_xp_column = []
radiant_gold_column = []
dire_gold_column = []
radiant_hero_column = []
dire_hero_column = []

for i,match in enumerate(match_details):
    
    match_id_list.append(match['result']['match_id'])
    radiant_win_column.append(match['result']['radiant_win'])
    
    dire_xp = 0; radiant_xp = 0    
    dire_gold = 0; radiant_gold = 0
    radiant_hero_row = []; dire_hero_row = []
    #get heroes, xp, and gold for features, target, target
    for player in match_details[i]['result']['players']:
        #First bit of 8-bit integer player_slot is 1 if the player was on dire
        #so if player was on dire then player_slot > 128
        if player['player_slot']/128<1:
            radiant_hero_row.append(player['hero_id'])
            radiant_xp+=player['xp_per_min']*match_details[i]['result']['duration']/60
            for item_index in range(5):
                radiant_gold+=item_df.loc[item_df['id']==player['item_'+str(item_index)]]['cost']
        else:
            dire_hero_row.append(player['hero_id'])
            dire_xp+=player['xp_per_min']*match_details[i]['result']['duration']/60
            for item_index in range(5):
                dire_gold+=item_df.loc[item_df['id']==player['item_'+str(item_index)]]['cost']

    dire_xp_column.append(dire_xp)
    radiant_xp_column.append(radiant_xp)
    radiant_gold_column.append(radiant_gold)
    dire_gold_column.append(dire_gold)
    radiant_hero_column.append(radiant_hero_row)
    dire_hero_column.append(dire_hero_row)
dota2_training_df = pd.DataFrame()
dota2_training_df['match_id'] = match_id_list

## Process and save data

In [None]:
dota2_training_df = pd.DataFrame()
#Add radiant_win column to DataFrame
dota2_training_df['radiant_win'] = 1*np.array(radiant_win_column)

#add xp and gold columns to DataFrame
dota2_training_df['radiant_xp'] = np.array(radiant_xp_column)
dota2_training_df['dire_xp'] = np.array(dire_xp_column)
dota2_training_df['radiant_gold'] = np.array(radiant_gold_column)
dota2_training_df['dire_gold'] = np.array(dire_gold_column)

#add kills, healing, towerdamage, duration, and positive/negative votes
dota2_training_df['radiant_kills'] = np.array(radiant_kills_column)
dota2_training_df['dire_kills'] = np.array(dire_kills_column)
dota2_training_df['radiant_healing'] = np.array(radiant_healing_column)
dota2_training_df['dire_healing'] = np.array(dire_healing_column)
dota2_training_df['radiant_towerdamage'] = np.array(radiant_towerdamage_column)
dota2_training_df['dire_towerdamage'] = np.array(dire_towerdamage_column)
dota2_training_df['radiant_healing'] = np.array(positive_votes_column)
dota2_training_df['radiant_healing'] = np.array(negative_votes_column)
dota2_training_df['duration'] = np.array(duration_column.append)
dota2_training_df['negative_votes'] = np.array(negative_votes_column)
dota2_training_df['positive_votes'] = np.array(positive_votes_column)

#add match_id
dota2_training_df['match_id'] = match_id_list

#Add dire and radiant heroes to DataFrame under their own columns
for i in range(len(np.array(radiant_hero_column)[0])):
    print(i)
    dota2_training_df['radiant_heroes'+str(i)] = np.array(radiant_hero_column)[:,i]
    dota2_training_df['dire_heroes'+str(i)] = np.array(dire_hero_column)[:,i]

In [None]:
#purge 0 values from hero_id
dota2_training_df = dota2_training_df[dota2_training_df.radiant_heroes0 != 0]


In [None]:
#save data to csv file so we have it on disk
dota2_training_df.to_csv(os.path.join('Dota_data',
                                        'mixed_skill',
                                        str(dota2_training_df['match_id'].min())+
                                        '-'+
                                        str(dota2_training_df['match_id'].max())+
                                        '.csv'))

In [None]:
sns.set_style('ticks')
champ_sum = np.concatenate((dota2_training_df['radiant_heroes0'],
                            dota2_training_df['radiant_heroes1'],
                            dota2_training_df['radiant_heroes2'],
                            dota2_training_df['radiant_heroes3'],
                            dota2_training_df['radiant_heroes4'],
                            dota2_training_df['dire_heroes0'],
                            dota2_training_df['dire_heroes1'],
                            dota2_training_df['dire_heroes2'],
                            dota2_training_df['dire_heroes3'],
                            dota2_training_df['dire_heroes4']),axis=None)

unique, counts = np.unique(champ_sum, return_counts=True)

plt.plot(np.log10(counts),'.')
plt.xlabel('hero(arb)')
plt.ylabel('log10(count)')
plt.show()

# Get many matches using GetMatchHistoryBySequence

In [None]:
files=glob.glob(os.path.join('Dota_data','mixed_skill','*.csv'))
files.sort(key=os.path.getmtime)
print(files)

In [None]:
#open all recent data files then use the oldest one to set the initial
#    match_seq_num to use to get matches by GetMatchHistoryBySequenceNum
#TODO: write option to start from a recent match
files=glob.glob(os.path.join('Dota_data','mixed_skill,'*.csv'))
files.sort(key=os.path.getmtime)
start_match = pd.read_csv(files[-1], usecols=[1])['match_id'].iloc[-1]
start_match_seq_num = api.call('IDOTA2Match_'+dota2_id+'.GetMatchDetails',
                        match_id=start_match)['result']['match_seq_num']

num_requested=1e7
match_dict = api.call('IDOTA2Match_'+dota2_id+'.GetMatchHistoryBySequenceNum',
                      start_at_match_seq_num=start_match_seq_num,
                    matches_requested=100)
match_list = match_dict['result']['matches']

consec_err=0
while num_requested>len(match_list):
    time.sleep(1)
    try:
        match_dict=(api.call('IDOTA2Match_'+dota2_id+'.GetMatchHistoryBySequenceNum',
                start_at_match_seq_num=match_list[-1]['match_seq_num'],
                    matches_requested=100))
        match_list.extend(match_dict['result']['matches'])
        consec_err=0
    except requests.exceptions.HTTPError as err:
        print(err)
        if consec_err>2:
            print('triple error')
            break
        else:
            print(len(match_list))
            consec_err+=1
            time.sleep(30)

In [None]:
consec_err=0
num_requested=2e5
match_list=[match_list[-1]]
while num_requested>len(match_list):
    time.sleep(1)
    try:
        match_dict=(api.call('IDOTA2Match_'+dota2_id+'.GetMatchHistoryBySequenceNum',
                start_at_match_seq_num=match_list[-1]['match_seq_num'],
                    matches_requested=100))
        match_list.extend(match_dict['result']['matches'])
        consec_err=0
    except requests.exceptions.HTTPError as err:
        print(err)
        if consec_err>2:
            print('triple error')
            break
        else:
            print(len(match_list))
            consec_err+=1
            time.sleep(30)
match_list=match_list[1:]

In [None]:
print(len(match_list))

In [None]:
radiant_win_column = []; radiant_xp_column = []; dire_xp_column = []; radiant_gold_column = []
dire_gold_column = []; radiant_hero_column = []; dire_hero_column = []; match_id_list = []
match_seq_num_list = []

for match in match_list:
    
    if (match['game_mode']!=1 and match['game_mode']!=2) or match['human_players']!=10:
        continue
    
    match_id_list.append(match['match_id'])
    match_seq_num_list.append(match['match_seq_num'])
    radiant_win_column.append(match['radiant_win'])
    
    dire_xp = 0; radiant_xp = 0    
    dire_gold = 0; radiant_gold = 0
    radiant_hero_row = []; dire_hero_row = []
    #get heroes, xp, and gold for features, target, target
    for player in match['players']:
        #First bit of 8-bit integer player_slot is 1 if the player was on dire
        #so if player was on dire then player_slot > 128
        if player['player_slot']/128<1:
            radiant_hero_row.append(player['hero_id'])
            radiant_xp+=player['xp_per_min']*match['duration']/60
            for item_index in range(5):
                if player['item_'+str(item_index)]!=0:
                    radiant_gold+=int(item_df.loc[item_df['id']==player['item_'+str(item_index)]]['cost'])
        else:
            dire_hero_row.append(player['hero_id'])
            dire_xp+=player['xp_per_min']*match['duration']/60
            for item_index in range(5):
                if player['item_'+str(item_index)]!=0:
                    dire_gold+=int(item_df.loc[item_df['id']==player['item_'+str(item_index)]]['cost'])

    dire_xp_column.append(dire_xp)
    radiant_xp_column.append(radiant_xp)
    radiant_gold_column.append(radiant_gold)
    dire_gold_column.append(dire_gold)
    radiant_hero_column.append(radiant_hero_row)
    dire_hero_column.append(dire_hero_row)

In [None]:
print(len(match_id_list))

In [None]:
dota2_training_df = pd.DataFrame()

#Add radiant_win column to DataFrame
dota2_training_df['radiant_win'] = 1*np.array(radiant_win_column)

#add xp and gold columns to DataFrame
dota2_training_df['radiant_xp'] = np.array(radiant_xp_column)
dota2_training_df['dire_xp'] = np.array(dire_xp_column)
dota2_training_df['radiant_gold'] = np.array(radiant_gold_column)
dota2_training_df['dire_gold'] = np.array(dire_gold_column)

#add match_id and match_seq_num
dota2_training_df['match_id'] = np.array(match_id_list)
#dota2_training_df['match_seq_num'] = np.array(match_seq_num_list)


#Add dire and radiant heroes to DataFrame under their own columns
for i in range(len(np.array(radiant_hero_column)[0])):
    print(i)
    dota2_training_df['radiant_heroes'+str(i)] = np.array(radiant_hero_column)[:,i]
    dota2_training_df['dire_heroes'+str(i)] = np.array(dire_hero_column)[:,i]

In [None]:
#purge 0 values from hero_id
dota2_training_df = dota2_training_df[dota2_training_df.radiant_heroes0 != 0]

In [None]:
#save data to csv file so we have it on disk
dota2_training_df.to_csv(os.path.join('Dota_data',
                                        'mixed_skill',
                                        str(dota2_training_df['match_id'].min())+
                                        '-'+
                                        str(dota2_training_df['match_id'].max())+
                                        '.csv'))

# Build TFRecords file
Puts all training data in the training_data folder into one TFRecords file
Do this to nicely handle the features that are numpy arrays (radiant_heroes and dire_heroes)

In [3]:
dota2_df = None
files=glob.glob(os.path.join('Dota_data','mixed_skill','*.csv'))
files.sort(key=os.path.getmtime)
print(files[0])
for filename in files[-1:]:
    if type(dota2_df)==None:
        dota2_df = pd.read_csv(filename, usecols=range(1,26))
    else:
        dota2_df = pd.concat([dota2_df, pd.read_csv(filename, usecols=range(1,26))],ignore_index=True)

dota2_df = dota2_df.reindex(np.random.permutation(dota2_df.index))
dota2_df.describe()

Dota_data/mixed_skill/4725403110-4781085451.csv


Unnamed: 0,radiant_win,radiant_xp,dire_xp,radiant_gold,dire_gold,radiant_kills,dire_kills,radiant_healing,dire_healing,radiant_towerdamage,...,radiant_heroes0,dire_heroes0,radiant_heroes1,dire_heroes1,radiant_heroes2,dire_heroes2,radiant_heroes3,dire_heroes3,radiant_heroes4,dire_heroes4
count,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0,...,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0,287416.0
mean,0.547402,108936.501296,108245.521488,71398.804068,70517.470061,30.67037,29.847507,0.002272,3692.235954,11538.02385,...,52.834762,52.900604,52.996876,53.116295,52.971891,53.098888,52.903033,53.014484,52.786247,52.950518
std,0.497749,50081.94058,52053.025385,27735.982901,29633.745523,13.389364,14.39574,0.081167,5060.36236,8374.040901,...,35.699988,35.689077,35.789319,35.749345,35.741674,35.747986,35.753247,35.744772,35.769582,35.694289
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,72688.0875,69510.475,52915.0,49535.0,21.0,19.0,0.0,325.0,2894.0,...,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
50%,1.0,105657.85,105186.833333,69960.0,69575.0,31.0,30.0,0.0,1797.0,12058.0,...,48.0,47.0,48.0,48.0,48.0,48.0,48.0,48.0,47.0,48.0
75%,1.0,141294.5125,142534.1,88110.0,89235.0,40.0,40.0,0.0,5111.0,18838.0,...,83.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0
max,1.0,542318.266667,507291.733333,291645.0,266075.0,112.0,164.0,18.0,88485.0,37096.0,...,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0


In [4]:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

In [5]:
def convert_to_tfrecords(filename, examples, targets, target_name):
    # open the TFRecords file
    writer = tf.python_io.TFRecordWriter(filename)
    
    radiant_hero_array = np.array(examples['radiant_heroes'])
    dire_hero_array = np.array(examples['dire_heroes'])
    target_array = np.array(targets[target_name])
    
    for i in range(len(radiant_hero_array[:])):
        # print how many games are saved every 5000 games
        if not i % 5000:
            print('Train data: %d/%d' % (i, len(examples)))
            sys.stdout.flush()
            
        # Load the arrays

        radiant_heroes = radiant_hero_array[:][i].tobytes()
        dire_heroes = dire_hero_array[:][i].tobytes()
        target = target_array[i]
        
        # Create a feature
        if target_name=='radiant_win':
            feature = {'radiant_heroes': _bytes_feature(tf.compat.as_bytes(radiant_heroes)),
                   'dire_heroes': _bytes_feature(tf.compat.as_bytes(dire_heroes)),
                   'targets': _int64_feature(target)}
        else:
            feature = {'radiant_heroes': _bytes_feature(tf.compat.as_bytes(radiant_heroes)),
                   'dire_heroes': _bytes_feature(tf.compat.as_bytes(dire_heroes)),
                   'targets': _float_feature(target)}
        # Create an example protocol buffer
        example = tf.train.Example(features=tf.train.Features(feature=feature))
    
        # Serialize to string and write on the file
        writer.write(example.SerializeToString())
    
    writer.close()
    sys.stdout.flush()

In [6]:
#target_name to name tfrecords file and the target_df column to call
#for now don't keep all possible targets in target_df
#target_df selection is made with the preprocess_target function call
def convert_to_tfrecords_dual(filename, examples, target_df):
    # open the TFRecords file
    writer = tf.python_io.TFRecordWriter(filename)
    
    radiant_hero_array = np.array(examples['radiant_heroes'])
    dire_hero_array = np.array(examples['dire_heroes'])
    win_array = np.array(target_df['radiant_win'])
    gold_diff_array = np.array(target_df['gold_diff'])
    
    for i in range(len(radiant_hero_array[:])):
        # print how many games are saved every 10000 games
        if not i % 10000:
            print('Train data: %d/%d' % (i, len(examples)))
            sys.stdout.flush()
            
        # Load the hero_array
        radiant_heroes = radiant_hero_array[:][i].tostring()
        dire_heroes = dire_hero_array[:][i].tostring()
        radiant_win = win_array[i]
        gold_diff = gold_diff_array[i]
        
        # Create a feature
        feature = {'radiant_heroes': _bytes_feature(tf.compat.as_bytes(radiant_heroes)),
                   'dire_heroes': _bytes_feature(tf.compat.as_bytes(dire_heroes)),
                   'radiant_win': _int64_feature(radiant_win),
                   'gold_diff': _float_feature(gold_diff)}

        # Create an example protocol buffer
        example = tf.train.Example(features=tf.train.Features(feature=feature))
    
        # Serialize to string and write on the file
        writer.write(example.SerializeToString())
    
    writer.close()
    sys.stdout.flush()

In [7]:
def preprocess_features(dota2_df):
    """Take dota2_df and create a dataframe containing only the features for our model
            Args: 
                dota2_df: Dataframe containing dota2 training and test data
            returns: 
                processed_df: pandas DataFrame containing only feature columns
    """
    
    #Use arrays of heroes for each team as features rather than each hero
    #individually to help model fit?
    #This is an attempt to make the model more similar to the movie review
    #text analysis example in the Google ML Crash Course
    processed_df = pd.DataFrame()
    
    processed_df['radiant_heroes'] = list(np.array(dota2_df.loc[:,['radiant_heroes0',
                                                                'radiant_heroes1',
                                                                'radiant_heroes2',
                                                                'radiant_heroes3',
                                                                'radiant_heroes4']]).astype(int))
    processed_df['dire_heroes'] = list(np.array(dota2_df.loc[:,['dire_heroes0',
                                                                'dire_heroes1',
                                                                'dire_heroes2',
                                                                'dire_heroes3',
                                                                'dire_heroes4']]).astype(int))
     
    return processed_df
    
def preprocess_targets_win(dota2_df):
    """Take dota2_df and create a dataframe containing only the targets for our model.
      (radiant_win here)
            Args: 
                dota2_df: Dataframe containing lol training and test data
            returns: 
                target_df: pandas DataFrame containing only the target column
    """
    target_df = pd.DataFrame()
    target_df['radiant_win'] = dota2_df['radiant_win']
    
    return target_df

def preprocess_targets_gold(dota2_df):
    """Take dota2_df and create a dataframe containing only the targets for our model 
        (normalized gold difference in this case)
            Args: 
                dota2_df: Dataframe containing lol training and test data
            returns: 
                target_df: pandas DataFrame containing only the target column (gold_diff)
    """
    target_df = pd.DataFrame()
    target_df['gold_diff'] = (dota2_df['radiant_gold']-dota2_df['dire_gold'])/(dota2_df['radiant_gold']+dota2_df['dire_gold'])
    target_df['gold_diff'] = (target_df['gold_diff']-target_df['gold_diff'].mean())/target_df['gold_diff'].std()
    return target_df

def preprocess_targets_dual_gold_win(dota2_df):
    """Take dota2_df and create a dataframe containing only the targets for our model 
        (normalized gold difference in this case)
            Args: 
                dota2_df: Dataframe containing dota training and test data
            returns: 
                target_df: pandas DataFrame containing only the target column (gold_diff)
    """
    target_df = pd.DataFrame()
    target_df['radiant_win'] = dota2_df['radiant_win']
    target_df['gold_diff'] = (dota2_df['radiant_gold']-dota2_df['dire_gold'])\
                                /(dota2_df['radiant_gold']+dota2_df['dire_gold'])
    target_df['gold_diff'] = (target_df['gold_diff']-target_df['gold_diff'].mean())\
                                /target_df['gold_diff'].std()
    return target_df

def preprocess_targets_xp(dota2_df):
    """Take dota2_df and create a dataframe containing only the targets for our model 
        (normalized xp difference in this case)
            Args: 
                dota2_df: Dataframe containing lol training and test data
            returns: 
                target_df: pandas DataFrame containing only the target column (xp_diff)
    """
    target_df = pd.DataFrame()
    target_df['xp_diff'] = (dota2_df['radiant_xp']-dota2_df['dire_xp'])/(dota2_df['radiant_xp']+dota2_df['dire_xp'])
    target_df['xp_diff'] = (target_df['xp_diff']-target_df['xp_diff'].mean())/target_df['xp_diff'].std()
    return target_df

### Visualize data for each set

In [11]:
plotting_df = dota2_df

display.display(plotting_df.iloc[:train_len,:].describe())
display.display(plotting_df.iloc[train_len:train_len+test_len,:].describe())
display.display(plotting_df.iloc[train_len+test_len:,:].describe())


Unnamed: 0,radiant_win,radiant_xp,dire_xp,radiant_gold,dire_gold,radiant_kills,dire_kills,radiant_healing,dire_healing,radiant_towerdamage,...,radiant_heroes0,dire_heroes0,radiant_heroes1,dire_heroes1,radiant_heroes2,dire_heroes2,radiant_heroes3,dire_heroes3,radiant_heroes4,dire_heroes4
count,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0,...,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0,201191.0
mean,0.548106,108938.693336,108220.866103,71407.820057,70484.97731,30.665139,29.82454,0.002306,3693.343276,11546.788549,...,52.831578,52.858572,52.974099,53.049644,52.977196,53.122098,52.928362,53.033625,52.910742,52.951424
std,0.497682,50049.676662,52017.848951,27704.304996,29577.594231,13.388758,14.38544,0.083743,5064.06066,8373.275904,...,35.72965,35.713638,35.778186,35.723533,35.744579,35.762173,35.749777,35.788152,35.800501,35.708681
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,72752.266667,69555.933333,52930.0,49525.0,21.0,19.0,0.0,325.0,2899.0,...,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
50%,1.0,105712.666667,105173.2,70000.0,69555.0,31.0,30.0,0.0,1796.0,12076.0,...,47.0,47.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
75%,1.0,141247.108333,142435.0,88115.0,89145.0,40.0,40.0,0.0,5117.0,18841.0,...,83.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0
max,1.0,542318.266667,507291.733333,291645.0,264420.0,112.0,151.0,18.0,88485.0,36224.0,...,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0


Unnamed: 0,radiant_win,radiant_xp,dire_xp,radiant_gold,dire_gold,radiant_kills,dire_kills,radiant_healing,dire_healing,radiant_towerdamage,...,radiant_heroes0,dire_heroes0,radiant_heroes1,dire_heroes1,radiant_heroes2,dire_heroes2,radiant_heroes3,dire_heroes3,radiant_heroes4,dire_heroes4
count,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0,...,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0,28742.0
mean,0.545578,109193.86439,108620.728872,71502.776599,70737.315775,30.622921,29.910723,0.002227,3674.385429,11478.784079,...,53.071603,52.987684,53.052154,52.980447,53.102046,53.022163,52.682486,52.982012,52.500696,52.827848
std,0.497927,50150.729069,52178.180105,27834.367686,29762.964883,13.331864,14.402898,0.058944,5064.220125,8355.858713,...,35.772876,35.597117,35.83581,35.842537,35.555327,35.898447,35.720508,35.621217,35.735363,35.614321
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,72599.116667,69644.7125,52976.25,49886.25,21.0,19.0,0.0,305.0,2895.25,...,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
50%,1.0,105928.35,105515.566667,69907.5,69690.0,31.0,30.0,0.0,1772.0,11930.5,...,48.0,48.0,48.0,48.0,48.0,48.0,47.0,48.0,47.0,48.0
75%,1.0,141897.2375,143149.9125,88301.25,89540.0,40.0,40.0,0.0,5020.75,18713.75,...,84.0,83.0,84.0,84.0,84.0,84.0,84.0,84.0,83.0,83.0
max,1.0,375161.5,422954.666667,221635.0,227665.0,87.0,102.0,3.0,54582.0,37096.0,...,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0


Unnamed: 0,radiant_win,radiant_xp,dire_xp,radiant_gold,dire_gold,radiant_kills,dire_kills,radiant_healing,dire_healing,radiant_towerdamage,...,radiant_heroes0,dire_heroes0,radiant_heroes1,dire_heroes1,radiant_heroes2,dire_heroes2,radiant_heroes3,dire_heroes3,radiant_heroes4,dire_heroes4
count,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0,...,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0,57483.0
mean,0.545848,108800.145349,108144.208595,71315.26086,70521.270202,30.712402,29.896282,0.002175,3697.285737,11536.96773,...,52.727485,53.004175,53.048954,53.417497,52.888245,53.056017,52.924656,52.963728,52.493294,53.008681
std,0.497898,50160.70462,52113.525747,27797.69972,29765.095305,13.420259,14.428193,0.081598,5045.531385,8385.816041,...,35.559435,35.649314,35.805551,35.79198,35.824742,35.623243,35.781986,35.654964,35.676392,35.68431
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,72519.125,69305.758333,52810.0,49425.0,21.0,19.0,0.0,332.0,2878.0,...,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
50%,1.0,105362.25,105085.75,69885.0,69570.0,31.0,30.0,0.0,1814.0,12050.0,...,47.0,48.0,48.0,49.0,47.0,48.0,48.0,48.0,47.0,48.0
75%,1.0,141084.933333,142520.083333,88015.0,89325.0,40.0,40.0,0.0,5132.0,18892.0,...,83.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,83.0,84.0
max,1.0,418968.333333,422298.933333,234345.0,266075.0,110.0,164.0,9.0,78674.0,34149.0,...,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0,129.0


## Make tfrecords files

In [9]:
training_frac = 0.7
train_len = int(len(dota2_df) * training_frac)
test_len = int((len(dota2_df) - train_len)/1.5)
validation_len = len(dota2_df) - train_len - test_len


target_name='gold_diff'
train_features = preprocess_features(dota2_df.iloc[:train_len,:])
test_features = preprocess_features(dota2_df.iloc[train_len:train_len+test_len,:])
validation_features = preprocess_features(dota2_df.iloc[train_len+test_len:,:])

if target_name=='radiant_win':
    train_targets = preprocess_targets_win(dota2_df.iloc[:train_len,:])
    test_targets = preprocess_targets_win(dota2_df.iloc[train_len:train_len+test_len,:])
    validation_targets = preprocess_targets_win(dota2_df.iloc[train_len+test_len:,:])
elif target_name=='gold_diff':
    train_targets = preprocess_targets_gold(dota2_df.iloc[:train_len,:])
    test_targets = preprocess_targets_gold(dota2_df.iloc[train_len:train_len+test_len,:])
    validation_targets = preprocess_targets_gold(dota2_df.iloc[train_len+test_len:,:])
elif target_name=='xp_diff':
    train_targets = preprocess_targets_xp(dota2_df.iloc[:train_len,:])
    test_targets = preprocess_targets_xp(dota2_df.iloc[train_len:train_len+test_len,:])
    validation_targets = preprocess_targets_xp(dota2_df.iloc[train_len+test_len:,:])
    
convert_to_tfrecords(os.path.join('Dota_data','mixed_skill','dota2_training_data',
                     target_name+'dota2_training_data.tfrecords'),
                     train_features,
                     train_targets,
                     target_name=target_name)

convert_to_tfrecords(os.path.join('Dota_data','mixed_skill','dota2_test_data',
                     target_name+'dota2_test_data.tfrecords'),
                     test_features,
                     test_targets,
                     target_name=target_name)

convert_to_tfrecords(os.path.join('Dota_data','mixed_skill','dota2_validation_data',
                     target_name+'dota2_validation_data.tfrecords'),
                     validation_features,
                     validation_targets,
                     target_name=target_name)


Train data: 0/201191
Train data: 5000/201191
Train data: 10000/201191
Train data: 15000/201191
Train data: 20000/201191
Train data: 25000/201191
Train data: 30000/201191
Train data: 35000/201191
Train data: 40000/201191
Train data: 45000/201191
Train data: 50000/201191
Train data: 55000/201191
Train data: 60000/201191
Train data: 65000/201191
Train data: 70000/201191
Train data: 75000/201191
Train data: 80000/201191
Train data: 85000/201191
Train data: 90000/201191
Train data: 95000/201191
Train data: 100000/201191
Train data: 105000/201191
Train data: 110000/201191
Train data: 115000/201191
Train data: 120000/201191
Train data: 125000/201191
Train data: 130000/201191
Train data: 135000/201191
Train data: 140000/201191
Train data: 145000/201191
Train data: 150000/201191
Train data: 155000/201191
Train data: 160000/201191
Train data: 165000/201191
Train data: 170000/201191
Train data: 175000/201191
Train data: 180000/201191
Train data: 185000/201191
Train data: 190000/201191
Train data:

### Make dual gold_win tfrecord files

In [10]:
tier = 'mixed_skill'
training_frac = 0.7
train_len = int(len(dota2_df) * training_frac)
validation_len = int((len(dota2_df) - train_len)/1.5)
test_len = len(dota2_df) - train_len - validation_len

target_name='dual_gold_win'
train_features = preprocess_features(dota2_df.iloc[:train_len,:])
validation_features = preprocess_features(dota2_df.iloc[train_len:train_len+validation_len,:])
test_features = preprocess_features(dota2_df.iloc[train_len+validation_len:,:])

train_targets = preprocess_targets_dual_gold_win(dota2_df.iloc[:train_len,:])
validation_targets = preprocess_targets_dual_gold_win(\
                                        dota2_df.iloc[train_len:train_len+validation_len,:])
test_targets = preprocess_targets_dual_gold_win(dota2_df.iloc[train_len+validation_len:,:])

convert_to_tfrecords_dual(os.path.join('dota_data',tier,'dota2_training_data', 
                     target_name+'_dota2_training_data.tfrecords'),
                     train_features,
                     train_targets)
convert_to_tfrecords_dual(os.path.join('dota_data',tier,'dota2_test_data', 
                     target_name+'_dota2_test_data.tfrecords'),
                     test_features,
                     test_targets)

convert_to_tfrecords_dual(os.path.join('dota_data',tier,'dota2_validation_data',
                     target_name+'_dota2_validation_data.tfrecords'),
                     validation_features,
                     validation_targets)

Train data: 0/201191
Train data: 10000/201191
Train data: 20000/201191
Train data: 30000/201191
Train data: 40000/201191
Train data: 50000/201191
Train data: 60000/201191
Train data: 70000/201191
Train data: 80000/201191
Train data: 90000/201191
Train data: 100000/201191
Train data: 110000/201191
Train data: 120000/201191
Train data: 130000/201191
Train data: 140000/201191
Train data: 150000/201191
Train data: 160000/201191
Train data: 170000/201191
Train data: 180000/201191
Train data: 190000/201191
Train data: 200000/201191
Train data: 0/28742
Train data: 10000/28742
Train data: 20000/28742
Train data: 0/57483
Train data: 10000/57483
Train data: 20000/57483
Train data: 30000/57483
Train data: 40000/57483
Train data: 50000/57483
