## Functions and parameters

In [None]:
import numpy as np

def first_x_sec(seconds, filepath, timecol):
    ''' Read a comma separated file and keep the rows from the first seconds of the game.

        The first line of the file is treated as a header and is dropped.
        Every remaining line is split on ',' and kept as a list of strings;
        blank lines are skipped and the line terminator is removed so that
        the last column does not carry a trailing newline.

        param: seconds, int or None, keep only rows whose time is <= seconds.
               None keeps every row (timecol is then ignored)
        param: filepath, string, the filepath of the csv datafile
        param: timecol, int, the index of the column that contains time
        return: numpy array of strings, one row per kept line
    '''
    # The context manager closes the file even if reading fails.
    with open(filepath) as f:
        rows = [line.rstrip('\r\n').split(',') for line in f if line.strip()]
    rows = rows[1:]  # drop the header line
    if seconds is None:
        return np.array(rows)
    return np.array([row for row in rows if int(row[timecol]) <= seconds])

def read_columns(filepath, start_col, stop_col):
    ''' Read a slice of columns from a comma separated file as integers.

        The first line of the file is treated as a header and is skipped.

        param: filepath, string, the filepath of the csv datafile
        param: start_col, int, index of the first column to keep
        param: stop_col, int, index one past the last column to keep
        return: 2 dimensional numpy integer array, one row per data line
    '''
    # The context manager closes the file even if reading fails.
    with open(filepath) as f:
        lines = f.readlines()
    rows = [line.strip().split(',')[start_col:stop_col] for line in lines[1:]]
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    return np.array(rows).astype(int)

def separate(matrix, col, index_function = lambda x: x):
    ''' Split the rows of a 2 dimensional numpy array into a python list of
        2 dimensional numpy arrays, grouped on the values in the given column.

        index_function is applied to every value of the column and must
        return an integer bucket index. Entry k of the returned list holds
        the rows whose bucket index equals min_index + k, so the list covers
        every index from the smallest to the largest one found. Buckets
        without rows are empty arrays of shape (0, number of columns).

        param: matrix, 2 dimensional numpy array
        param: col, int, the column whose values decide the bucket
        param: index_function, callable mapping a column value to an int
        return: list of numpy arrays (copies of the matching rows)
    '''
    matrix = np.asarray(matrix)
    if matrix.shape[0] == 0:
        return []
    # Evaluate the index function once per row instead of once per bucket.
    indices = np.array([index_function(value) for value in matrix[:, col]])
    min_index = indices.min()
    max_index = indices.max()
    list_length = int(max_index - min_index) + 1
    # Offset by min_index so the buckets line up when the smallest index is
    # not 0. Boolean indexing returns a copy and keeps the column dimension.
    return [matrix[indices == min_index + offset] for offset in range(list_length)]

def save_array(array, filepath):
    ''' Print the shape of the array and save it to filepath in numpy .npy format.

        param: array, numpy array to save
        param: filepath, string, the file to write (created or overwritten)
    '''
    print(array.shape)
    # The context manager closes the file even if np.save raises.
    with open(filepath, 'wb') as outfile:
        np.save(outfile, array)


# Only rows whose time is <= time_scope are kept (passed to first_x_sec below).
time_scope = 300
# Width of one time step, used to bucket rows on their time value.
# This is determined by the data rather than by the user here.
# Check how your data is separated before changing it.
time_step = 60



## Match file contains more than just the label
# column 10 contains the True/False value of Radiant win

#purchases = first_x_sec(time_scope, 'dataset/purchase_log.csv', 1)





## Read player stats

In [None]:
# First 5 min: 5*60=300. Column 2 (index 1 with 0 indexing) is the time.
player_time = first_x_sec(time_scope, 'dataset/player_time.csv', 1)
# `np.float` was only an alias of the builtin `float` and was removed in
# NumPy 1.24; the builtin gives the same dtype on every NumPy version.
player_time = player_time.astype(float)
print(player_time.shape)




In [None]:
# Find which matches are too short.
# For every match id (column 0) remember the time (column 1) of the last row
# seen for that match; ids that never occur keep a length of 0.
match_lengths = np.zeros((50000,1))
for row in player_time:
    match_lengths[int(row[0])] = row[1]
# A match is too short when its recorded length is below time_scope.
too_short_matches = [
    match_id
    for match_id, length in enumerate(match_lengths)
    if length < time_scope
]
print(too_short_matches)
match_lengths = None

## Remove too short matches

In [None]:
# Drop every row whose match id (column 0) belongs to a too short match.
# np.isin builds the mask in one vectorized pass instead of scanning the
# python list once per row.
player_time = player_time[~np.isin(player_time[:, 0], too_short_matches)]
print(player_time.shape)

## Label data

In [None]:
from keras.utils import to_categorical
# Read every row of the match file; column index 9 holds the True/False win flag.
labels = first_x_sec(None, 'dataset/match.csv', None)
labels = np.array([int(match[9] == 'True') for match in labels])
print(labels)
# Pair each label with its row position and drop the positions that belong to
# too short matches. A set gives constant time membership tests instead of a
# list scan per label.
excluded_matches = set(too_short_matches)
labels = np.array([(position, label) for position, label in enumerate(labels)
                   if position not in excluded_matches])
print(labels.shape)
labels = labels[:,1]
# The labels are binary, so fix the width to 2 classes; otherwise the one-hot
# width would shrink to 1 if only the value 0 were present.
labels = to_categorical(labels, num_classes=2)
print(labels.shape)
print(labels)

In [None]:
# Write the one-hot labels to disk, then drop the reference to the array.
save_array(labels, '/tmp/game_labels.npy')
labels = None # Clear memory

## Separate data into time steps

In [None]:
# First we need to separate the data into time steps so that we can normalize over the time steps.
# Rows are bucketed on column 1 (the time) by int(time / time_step).
time_step_separated = separate(player_time, 1, index_function = lambda x: int(x/time_step))
player_time = None  # drop the reference; the separated copies are used from here on



## Normalize data in each time step over all matches

In [None]:
# Scale every feature column (index 2 and up) of each time step by the
# column maximum over all rows of that time step.
for step in time_step_separated:
    max_value = np.max(step[:,2:], axis=0)
    # A column whose maximum is 0 would give 0/0 = NaN; divide by 1 instead
    # so the column is left unchanged.
    max_value = np.where(max_value == 0, 1, max_value)
    step[:,2:] = step[:,2:]/max_value
print(time_step_separated[1].shape)
print(time_step_separated[1])

In [None]:
# Spot check that the data is normalized
print(time_step_separated[3][3]) # time step 3, row 3 (presumably match 3 - confirm row order)

## Change separation into matches

In [None]:
# Stack the per time step arrays into one array; the first axis is the time step.
time_step_separated = np.array(time_step_separated)
print(time_step_separated.shape)
# Exchange the first two axes so the data is separated on matches instead.
matches = time_step_separated.swapaxes(0, 1)
time_step_separated = None

In [None]:
# Inspect the swapped layout: the overall shape, then the entry at
# first index 0, second index 1.
print(matches.shape)
print(matches[0][1])

In [None]:
# Keep only the columns from index 2 on along the last axis. Columns 0 and 1
# are the ones used earlier in this file as match id and time.
matches = matches[:,:,2:]
print(matches.shape)

## Save games without hero data

In [None]:
# NOTE(review): the data with hero features is later saved to this same path,
# which overwrites this file - confirm that is intended.
save_array(matches, '/tmp/game_data.npy')

## Load in hero_data

In [None]:
# Same import path as the label cell; `keras.utils.np_utils` is not available
# in recent Keras releases.
from keras.utils import to_categorical
# Read columns 0-2 of the players file as integers.
hero_data = read_columns('dataset/players.csv', 0, 3)
# Drop the rows whose match id (column 0) belongs to a too short match.
# np.isin builds the mask in one vectorized pass instead of scanning the
# python list once per row.
hero_data = hero_data[~np.isin(hero_data[:, 0], too_short_matches)]
# Column 2 is one-hot encoded (presumably the hero id - confirm against the csv header).
hero_data = hero_data[:,2]
hero_rep = to_categorical(hero_data)
print(hero_rep.shape)
hero_data = None

## Create team hero representations

In [None]:
print(hero_rep[10].flatten().reshape((-1, 1)).shape)
print(hero_rep.shape)
# One row per group of 5 consecutive player rows; leftover rows are ignored.
num_teams = hero_rep.shape[0] // 5
num_heroes = hero_rep.shape[1]
team_hero_rep = np.zeros(shape=(num_teams, num_heroes))
print(team_hero_rep.shape)

# Sum the one-hot vectors of each group of 5 players in one vectorized step.
grouped = hero_rep[:num_teams * 5].reshape((num_teams, 5, num_heroes))
team_hero_rep += grouped.sum(axis=1)
print(team_hero_rep.shape)
print(team_hero_rep[1])
hero_rep = None

In [None]:
## Reshape so that the first dimension is the number of matches and the second is the 2 team vectors concatenated.
# Assumes consecutive pairs of team rows belong to the same match - TODO confirm row order.
team_hero_rep = team_hero_rep.reshape((matches.shape[0],team_hero_rep.shape[1]*2))
print(team_hero_rep.shape)

## Add the team hero data to the match vectors

In [None]:
print(matches.shape)
num_steps = matches.shape[1]
## Insert a time step axis and repeat every match vector once per time step,
## giving the shape (matches, time steps, hero features) directly.
team_hero_rep = np.repeat(team_hero_rep[:, np.newaxis, :], num_steps, axis=1)
print(team_hero_rep.shape)
## Append the hero features after the existing features of every time step
matches = np.concatenate((matches, team_hero_rep), axis=2)
print(matches.shape)

## Save games with hero data

In [None]:
# Write the match data including the concatenated hero features to disk.
# NOTE(review): this is the same path used for the earlier save without hero
# data, so that file is overwritten - confirm that is intended.
save_array(matches, '/tmp/game_data.npy')

In [None]:
#import math
#print(purchases.shape)
#purchases = purchases.astype(np.int)

    
#for log in purchases:
#    log[1] = math.ceil(max(0, log[1]) / 60) * 60
 



In [None]:
#in any game, any player for one timestep, what is maximum number of items?
#matches = [np.empty(shape=(4,0), dtype=np.int)]*50000
#for x in purchases:
#    matches[x[3]] = np.concatenate((matches[x[3]], x.reshape(4,1)), axis=1)

# separate on time
#print(matches[1].T)
#for i, match in enumerate(matches):
#    separated = [np.empty(shape=(4,0), dtype=np.int)]*6 # 6 time steps [0 60 120 180 240 300]
#    for x in match.T:
#        time = int(x[1]/60)
#        separated[time] = np.concatenate((separated[time], x.reshape(4,1)), axis=1)
#    matches[i] = separated

#Separate on player
#for match in matches:
#    for i, time_step in enumerate(match):
#        #problem is that this vector is the same for all players. Need 10 copies
#        separated = [np.zeros(shape=(218,1), dtype=np.int) for j in range(10)]*10 # 10 players and 218 is maximum item id in dataset
        
#        for x in time_step.T:
#            player = x[2] if x[2]<5 else x[2] - 123 #make players numbered 0 to 9
#            separated[player][x[0]-1] += 1
#        match[i] = separated
 

In [None]:
#print (matches[15][0][2].shape) # first match, first timestep 0 and first player purchase log