## Functions and parameters

In [247]:
import numpy as np

def first_x_sec(seconds, filepath, timecol):
    ''' Load the data rows of a csv file, optionally keeping only the early rows.

        The first line (header) is always skipped, blank lines are ignored and
        every field is returned as a string without its line terminator.

        param: seconds, int or None, keep rows whose time is <= seconds; None keeps every row
        param: filepath, string, the filepath of the csv datafile
        param: timecol, int, the index of the column that contains time (unused when seconds is None)
        return: numpy array of strings, one row per kept csv line
    '''
    # The context manager closes the file even when parsing raises.
    with open(filepath) as f:
        next(f, None)  # skip the header line
        # Strip only the line terminator so the last field does not end in '\n'.
        rows = [line.rstrip('\r\n').split(',') for line in f if line.strip()]
    if seconds is None:
        return np.array(rows)
    return np.array([row for row in rows if int(row[timecol]) <= seconds])

def read_columns(filepath, start_col, stop_col):
    ''' Read a slice of columns from a csv file as integers.

        The first line (header) is skipped and blank lines are ignored.

        param: filepath, string, the filepath of the csv datafile
        param: start_col, int, index of the first column to keep
        param: stop_col, int, index one past the last column to keep
        return: 2 dimensional numpy int array, one row per csv data line
    '''
    # The context manager closes the file even when parsing raises.
    with open(filepath) as f:
        next(f, None)  # skip the header line
        rows = [line.strip().split(',')[start_col:stop_col] for line in f if line.strip()]
    # np.int was only an alias of the builtin int and was removed in NumPy 1.24.
    return np.array(rows).astype(int)

def separate(matrix, col, index_function = lambda x: x):
    ''' Split a 2 dimensional numpy array into a python list of 2 dimensional arrays.

        index_function is applied to the value in column `col` of every row and
        rows are grouped by the resulting index. Entry i of the returned list holds
        the rows whose index equals min_index + i, in their original order, so the
        list always spans the full range from the smallest to the largest index.
        A bin without rows is an array of shape (0, matrix.shape[1]).

        param: matrix, 2 dimensional numpy array
        param: col, int, the column whose values decide the bin of each row
        param: index_function, callable mapping a column value to an integer bin index
        return: list of 2 dimensional numpy arrays; empty list for an empty matrix
    '''
    if len(matrix) == 0:
        return []
    # Evaluate index_function once per row instead of once per row and bin.
    indices = np.array([index_function(value) for value in matrix[:, col]])
    min_index = indices.min()
    list_length = int(indices.max() - min_index) + 1
    # Offset by min_index so data whose smallest index is not 0 is binned correctly.
    return [matrix[indices == min_index + i] for i in range(list_length)]

def save_array(array, filepath):
    ''' Print the shape of a numpy array and save it in .npy format.

        param: array, numpy array to store
        param: filepath, string, the file is written to exactly this path
    '''
    print(array.shape)
    # Passing an open file object keeps the path exactly as given; the context
    # manager closes the file even if np.save raises.
    with open(filepath, 'wb') as outfile:
        np.save(outfile, array)


time_scope = 300 # Seconds from the start of each match that are kept (300 = first 5 minutes)
time_step = 60 # This is determined by the data rather than by the user. Make sure you know how your data is separated.
max_matches = 50000 # Number of per-match slots allocated when building the per-match structures





## Read player stats

In [201]:
# First 5 min: 5*60=300, col 2(1 in 0 indexing) is time.
player_time = first_x_sec(time_scope, 'dataset/player_time.csv', 1)
# np.float was only an alias of the builtin float and was removed in NumPy 1.24.
player_time = player_time.astype(float)
print(player_time.shape)




(299994, 32)


In [202]:
# Find what matches are too short
# A match is too short when the latest time it logs inside the window is below
# time_scope. Taking the maximum per match does not depend on the row order,
# and the array is sized by max_matches instead of a hard-coded 50000.
match_lengths = np.zeros(max_matches)
np.maximum.at(match_lengths, player_time[:, 0].astype(int), player_time[:, 1])
too_short_matches = np.flatnonzero(match_lengths < time_scope).tolist()
print(too_short_matches)
match_lengths = None

[6213, 13914, 21228, 31438]


## Remove too short matches

In [203]:
# Drop every row whose match id (column 0) belongs to a too short match.
# A boolean mask keeps the array 2 dimensional even if no row survives.
keep_rows = ~np.isin(player_time[:, 0], too_short_matches)
player_time = player_time[keep_rows]
print(player_time.shape)

(299976, 32)


## Label data

In [204]:
from keras.utils import to_categorical
# Read every data row of match.csv (seconds=None disables the time filter).
labels = first_x_sec(None, 'dataset/match.csv', None)
# 1 when column 9 holds the string 'True', otherwise 0.
# NOTE(review): presumably column 9 is the match outcome flag - confirm against the csv header.
labels = np.array([int(match[9] == 'True') for match in labels])
print(labels)
# Pair every label with its row position and drop the positions listed in too_short_matches.
# assumes the row position in match.csv equals the match id - TODO confirm
labels = np.array(list(filter(lambda x: x[0] not in too_short_matches, enumerate(labels))))
print(labels.shape)
# Keep only the label column of the (position, label) pairs.
labels = labels[:,1]
# One hot encode the 0/1 label.
labels =to_categorical(labels)
print(labels.shape)
print(labels)

[1 0 0 ..., 1 1 0]
(49996, 2)
(49996, 2)
[[ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 ..., 
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]]


In [205]:
save_array(labels, '/tmp/game_labels.npy')
labels = None # Clear memory

(49996, 2)


## Separate data into time steps

In [206]:
# First we need to separate the data into time_steps so that we can normalize over the time steps
time_step_separated = separate(player_time, 1, index_function = lambda x: int(x/time_step))
player_time = None



## Normalize data in each time step over all matches

In [207]:
for i in range(len(time_step_separated)):
    # Column-wise maximum over all matches of this time step (columns 0 and 1 are match id and time).
    max_value = np.max(time_step_separated[i][:,2:], axis=0)
    # A column whose maximum is 0 would become NaN/inf when divided by it;
    # dividing by 1 instead leaves such a column unchanged.
    max_value[max_value == 0] = 1
    time_step_separated[i][:,2:] = time_step_separated[i][:,2:]/max_value
print(time_step_separated[1].shape)
print(time_step_separated[1])

(49996, 32)
[[  0.00000000e+00   6.00000000e+01   3.13891021e-01 ...,   5.00816993e-01
    1.25000000e-01   1.12107623e-01]
 [  1.00000000e+00   6.00000000e+01   7.59785111e-02 ...,   2.05065359e-01
    5.00000000e-01   2.40358744e-01]
 [  2.00000000e+00   6.00000000e+01   1.33537989e-01 ...,   8.08823529e-02
    0.00000000e+00   1.34529148e-02]
 ..., 
 [  4.99970000e+04   6.00000000e+01   1.08979279e-01 ...,   2.32026144e-01
    2.50000000e-01   2.06278027e-01]
 [  4.99980000e+04   6.00000000e+01   1.65003837e-01 ...,   8.16993464e-02
    0.00000000e+00   7.53363229e-02]
 [  4.99990000e+04   6.00000000e+01   7.59785111e-02 ...,   1.47875817e-01
    2.50000000e-01   7.35426009e-02]]


In [208]:
# Spot check that the data is normalized: the feature columns (index 2 onwards) should lie in [0, 1]
print(time_step_separated[3][3]) # timestep 3, match 3

[  3.00000000e+00   1.80000000e+02   2.34705656e-01   1.60000000e-01
   3.20872274e-01   1.40422078e-01   3.70370370e-02   4.22816115e-02
   1.89093762e-01   8.33333333e-02   3.71671991e-01   2.74174294e-01
   1.60000000e-01   2.55985267e-01   2.21856725e-01   2.30769231e-01
   6.15107914e-01   1.21813031e-01   7.69230769e-02   2.60069045e-01
   2.02346041e-01   2.40000000e-01   3.96303901e-01   1.12684195e-01
   3.70370370e-02   2.01164875e-01   1.33635334e-01   0.00000000e+00
   1.87090267e-01   4.95019489e-01   5.00000000e-01   4.59293395e-01]


## Change separation into matches

In [209]:
time_step_separated = np.array(time_step_separated)
print (time_step_separated.shape)
matches = np.swapaxes(time_step_separated, 0,1) #swap axes to have the data separated on matches now
time_step_separated = None

(6, 49996, 32)


In [210]:
print(matches.shape)
print(matches[0][1])

(49996, 6, 32)
[  0.00000000e+00   6.00000000e+01   3.13891021e-01   0.00000000e+00
   6.25620655e-02   1.03347889e-01   1.00000000e-01   9.55315871e-02
   1.33439237e-01   0.00000000e+00   1.30344108e-01   1.61812298e-01
   0.00000000e+00   1.62731872e-01   1.50387597e-01   1.25000000e-01
   1.16604478e-01   1.41119221e-01   2.50000000e-01   6.56996587e-02
   1.11200645e-01   1.25000000e-01   5.20134228e-02   2.58426966e-01
   7.50000000e-01   2.92743953e-01   7.48502994e-02   0.00000000e+00
   6.15507594e-02   5.00816993e-01   1.25000000e-01   1.12107623e-01]


In [211]:
matches = matches[:,:,2:]
print(matches.shape)

(49996, 6, 30)


## Save games without hero data

In [212]:
save_array(matches, '/tmp/game_data.npy')

(49996, 6, 30)


## Load in hero_data

In [213]:
# keras.utils.np_utils is not available in current Keras releases; import
# to_categorical from keras.utils, the same path the other cells use.
from keras.utils import to_categorical
# Columns 0-2 of players.csv; column 0 is used as the match id and column 2 as the hero id below.
hero_data = read_columns('dataset/players.csv', 0, 3)
hero_data = np.array(list(filter(lambda log: log[0] not in too_short_matches, hero_data)))
hero_data = hero_data[:,2]
# One hot encode the hero id of every player row.
hero_rep = to_categorical(hero_data)
print(hero_rep.shape)
hero_data = None

(499960, 113)


## Create team hero representations

In [214]:
print(hero_rep[10].flatten().reshape((-1,1)).shape)
print(hero_rep.shape)
# Every 5 consecutive player rows form one team.
n_teams = int(hero_rep.shape[0]/5)
n_heroes = hero_rep.shape[1]
print((n_teams, n_heroes))

# Sum the one hot hero vectors of each team; accumulate in float64 like a np.zeros buffer would.
team_hero_rep = hero_rep[:5*n_teams].reshape((n_teams, 5, n_heroes)).sum(axis=1, dtype=np.float64)
print(team_hero_rep.shape)
print(team_hero_rep[1])
hero_rep = None

(113, 1)
(499960, 113)
(99992, 113)
(99992, 113)
[ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.]


In [215]:
## reshape so that first dimension is number of matches and second is the 2  team vectors concatenated
team_hero_rep = team_hero_rep.reshape((matches.shape[0],team_hero_rep.shape[1]*2))
print(team_hero_rep.shape)

(49996, 226)


## Add the team hero data to the match vectors

In [216]:
print(matches.shape)
## Give every time step of a match the same team hero vector:
## insert a time step axis and repeat along it, (matches, features) -> (matches, time steps, features)
n_steps = matches.shape[1]
team_hero_rep = np.repeat(team_hero_rep[:, np.newaxis, :], n_steps, axis=1)
print(team_hero_rep.shape)
## Append the hero features to the features of every time step
matches = np.concatenate((matches, team_hero_rep), axis=2)
print(matches.shape)

(49996, 6, 30)
(49996, 6, 226)
(49996, 6, 256)


## Save games with hero data

In [217]:
save_array(matches, '/tmp/game_data.npy')

(49996, 6, 256)


## Load Purchases

In [218]:
# Keep purchases made within the first time_scope seconds; column 1 is the time.
purchases = first_x_sec(time_scope, 'dataset/purchase_log.csv', 1)
print(purchases.shape)
# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
purchases = purchases.astype(int)

(4014805, 4)


## Filter too short matches

In [219]:
#purchases = np.array(list(filter(lambda x: x[-1] not in too_short_matches, purchases))) #last col in purchases is match id

## Round timestamp up to the next 60 sec

In [220]:
import math
# Round every purchase time up to the next multiple of time_step. Negative times
# are clamped to 0 first. Integer arithmetic on the whole column gives the same
# result as math.ceil(max(0, t) / time_step) * time_step per row, without a Python loop.
clamped_time = np.maximum(purchases[:, 1], 0)
purchases[:, 1] = (clamped_time + time_step - 1) // time_step * time_step
 

In [221]:
print(purchases.shape)
# I want to have purch = nd_array (50000, 6, 218*10), (matches, time_steps, one_hot_items per player)
categorical = to_categorical(purchases[:,0])
print(categorical.shape)
categorical = categorical.reshape((categorical.shape[0], categorical.shape[1], 1))

(4014805, 4)
(4014805, 219)


In [222]:


# purch[match, time step, team, item, 0] counts how often the team bought the item in that time step.
# Sizes come from max_matches / time_scope / time_step instead of hard-coded 50000 and 60.
n_steps = time_scope // time_step + 1
n_items = categorical.shape[1]
purch = np.zeros((max_matches, n_steps, 2, n_items, 1))
print(len(purch[len(matches)-1][0][1]))
# Column 2 is the player slot: slots below 5 are team 0, all others team 1.
team = (purchases[:, 2] >= 5).astype(int)
step = purchases[:, 1] // time_step
# np.add.at accumulates repeated (match, step, team, item) indices, which a plain
# fancy-indexed += would count only once. Adding 1 at the item id (column 0) is the
# same as adding that row's one hot vector.
np.add.at(purch[..., 0], (purchases[:, 3], step, team, purchases[:, 0]), 1)
print(len(purch[0][0]))
purchases = purch
purch = None

219
2


In [223]:
# purchases now holds one entry per match id (it was filled by indexing with column 3 of the purchase log),
# so the enumerate position is the match id used to drop the too short matches.
purchases = list(filter(lambda x: x[0] not in too_short_matches, enumerate(purchases))) # keeps (match id, entry) pairs

In [224]:
purchases = list(map(lambda x: x[1], purchases))

In [225]:
print(len(purchases[2][4][1]))
# Stack the per-match entries into a single ndarray.
purchases = np.array(purchases)
print(purchases.shape)
print(purchases[1,0,1].T)
# Flatten everything after the time step axis into one feature vector per (match, time step).
# The target width is shape[2]*shape[3] (teams * items); this only matches the element count
# because the trailing axis has length 1, as the printed shape shows.
purchases = purchases.reshape(purchases.shape[0], purchases.shape[1], purchases.shape[3]*purchases.shape[2])
print(purchases.shape)

219
(49996, 6, 2, 219, 1)
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  3.  0.
   0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  2.  2.  0.  0.  2.  0.  5.  1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  3.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   2.  0.  0.]]
(49996, 6, 438)


In [226]:
print(matches.shape)
matches = np.concatenate((matches, purchases), axis=2)
print(matches.shape)

(49996, 6, 256)
(49996, 6, 694)


In [227]:
save_array(matches, '/tmp/game_data.npy')

(49996, 6, 694)


## Load objectives data

In [242]:
objectives = first_x_sec(time_scope, 'dataset/objectives.csv', -2)
print(objectives.shape)
objectives = np.array(list(filter(lambda x: int(x[2])>=0, objectives)))
print(objectives.shape)

(47525, 9)
(47410, 9)


## Create objectives map

In [243]:
from keras.utils import to_categorical
# Distinct objective types found in column 5, sorted by np.unique
obj_cats = np.unique(objectives[:,5])
print(obj_cats.shape)
print(obj_cats)
# Map every objective type to its position in the sorted list
obj_cats = {name: position for position, name in enumerate(obj_cats)}
print(obj_cats)

(5,)
['CHAT_MESSAGE_AEGIS' 'CHAT_MESSAGE_FIRSTBLOOD' 'CHAT_MESSAGE_ROSHAN_KILL'
 'CHAT_MESSAGE_TOWER_DENY' 'CHAT_MESSAGE_TOWER_KILL']
{'CHAT_MESSAGE_FIRSTBLOOD': 1, 'CHAT_MESSAGE_TOWER_KILL': 4, 'CHAT_MESSAGE_ROSHAN_KILL': 2, 'CHAT_MESSAGE_AEGIS': 0, 'CHAT_MESSAGE_TOWER_DENY': 3}


## Separate objectives into matches and time steps, rep as one hot for each team concatenated

In [248]:
# match_objectives[match][time step] collects one row per objective event; each row is a
# one hot vector over (team 0 categories, team 1 categories).
n_steps = time_scope // time_step + 1
n_cats = len(obj_cats)
match_objectives = [[np.empty((0, n_cats*2)) for _ in range(n_steps)] for _ in range(max_matches)]
for obj in objectives:
    team = 0 if int(obj[2]) < 5 else 1
    # Clamp negative times to step 0: a negative step index would silently
    # wrap around and land in the last time step.
    time = max(0, int(obj[-2])) // time_step
    obj_arr = np.zeros((1, n_cats*2))
    obj_arr[0, obj_cats[obj[5]] + team*n_cats] = 1
    match_objectives[int(obj[0])][time] = np.concatenate((match_objectives[int(obj[0])][time], obj_arr), axis=0)
print(match_objectives[0][0].shape)


(1, 10)


## Filter away too short matches

In [250]:
match_objectives = list(map(lambda x: x[1], filter(lambda obj: obj[0] not in too_short_matches, enumerate(match_objectives))))
print(match_objectives[0][0])
print(match_objectives[0][0].shape)
print(len(match_objectives))


[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]]
(1, 10)


## Sum objectives in one time step

In [252]:
print(match_objectives[1][1])
# Collapse the event rows of every (match, time step) into a single count vector;
# a time step without events becomes an all-zero vector of the same width.
max_shape = 0
for i, match in enumerate(match_objectives):
    for j, step in enumerate(match):
        no_events = len(step) == 0
        summed = np.zeros((2*len(obj_cats),)) if no_events else np.sum(step, axis=0)
        match_objectives[i][j] = summed
        assert summed.shape == (10,), (step, 'was 0 shape' if no_events else 'was >0 shape', i, j)
        max_shape = max(summed.shape[0], max_shape)
print(max_shape)

[]
10


## Check shapes and concatenate with matches

In [253]:
print(match_objectives[200][0].shape)
match_objectives = np.array(match_objectives)
print(match_objectives.shape)

(10,)
(49996, 6, 10)


In [254]:
print(matches.shape)
matches = np.concatenate((matches, match_objectives), axis = 2)

(49996, 6, 694)


## Save data with objectives as well

In [255]:
save_array(matches, '/tmp/game_data.npy')

(49996, 6, 704)
