In [45]:
import numpy as np

def first_x_sec(seconds, filepath, timecol):
    ''' Load the data rows of a csv file, optionally keeping only early rows.

        The first line of the file is treated as a header and dropped. Every
        other line is split on commas (no csv quoting rules are applied).

        param: seconds, int or None, keep only the rows whose time value is
               <= seconds; None keeps every row
        param: filepath, string, the filepath of the csv datafile
        param: timecol, int, the index of the column that contains time
               (not used when seconds is None)
        return: numpy array of strings, one row per kept line
    '''
    # The context manager closes the file; the handle used to be left open.
    with open(filepath) as f:
        next(f, None)  # drop the header line (no-op on an empty file)
        # Strip the line terminator so the last field does not end in a
        # newline, and skip blank lines such as a trailing empty line.
        rows = [line.rstrip('\r\n').split(',') for line in f if line.strip()]
    if seconds is None:
        return np.array(rows)
    return np.array([row for row in rows if int(row[timecol]) <= seconds])

# Keep only the first 5 minutes of every game: 5 * 60 = 300 seconds.
# `timecol` is the index of the time column in each file; the header line
# of every file is dropped inside first_x_sec.
# Disabled loads, kept for reference:
#ability_upgrade = first_x_sec(300, 'dataset/ability_upgrades.csv', 2)
#objectives = first_x_sec(300, 'dataset/objectives.csv', 7)
player_time = first_x_sec(seconds=300, filepath='dataset/player_time.csv', timecol=1)


## The match file contains more than just the label:
# index 9 (the 10th column) holds the True/False value of "Radiant win".
labels = first_x_sec(seconds=None, filepath='dataset/match.csv', timecol=None)
purchases = first_x_sec(seconds=300, filepath='dataset/purchase_log.csv', timecol=1)



def read_columns(filepath, start_col, stop_col):
    ''' Read a slice of columns from a csv file as integers.

        The first line of the file is treated as a header and dropped. Every
        other line is stripped, split on commas and sliced [start_col:stop_col].

        param: filepath, string, the filepath of the csv datafile
        param: start_col, int, index of the first column to keep
        param: stop_col, int, index one past the last column to keep
        return: integer numpy array, one row per data line
    '''
    # The context manager closes the file; the handle used to be left open.
    with open(filepath) as f:
        next(f, None)  # drop the header line (no-op on an empty file)
        # Blank lines (e.g. a trailing empty line) are skipped; they would
        # otherwise produce an empty row and break the integer conversion.
        rows = [line.strip().split(',')[start_col:stop_col] for line in f if line.strip()]
    # np.int was only an alias of the builtin int and was removed in NumPy 1.24.
    return np.array(rows).astype(int)
    
# Keep only column index 2 of players.csv, one row per player entry
# (presumably the hero id -- confirm against the csv header).
hero_data = read_columns(filepath='dataset/players.csv', start_col=2, stop_col=3)


In [46]:
# Turn every match row into a 0/1 label: 1 when field 9 is the string 'True'.
# NOTE: this rebinds `labels`, so the cell cannot be re-run without reloading.
radiant_win_flags = []
for match in labels:
    radiant_win_flags.append(int(match[9] == 'True'))
labels = np.array(radiant_win_flags)


In [47]:
import math
print(purchases.shape)
# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
purchases = purchases.astype(int)

# Round every purchase time (column 1) UP to the next multiple of 60 seconds,
# after clamping negative times to 0.
# The previous `math.ceil(t / 60) * 60` only rounded up under true division:
# with Python 2 integer division the quotient was already floored, so ceil was
# a no-op and times were rounded down. Pure integer arithmetic behaves the same
# on every Python version, and the vectorised form avoids a Python-level loop
# over millions of rows.
if purchases.size:
    purchase_times = purchases[:, 1]  # view into `purchases`, edited in place
    purchase_times[purchase_times < 0] = 0
    purchase_times[:] = (purchase_times + 59) // 60 * 60
 



(4014805, 4)


In [48]:
# Count the purchases per match, per time step and per player.
# Result: matches[match][step][player] is a (218, 1) integer column vector in
# which row (item_id - 1) holds how many times that item was bought by that
# player in that time step.
#   - 50000 matches
#   - 6 time steps [0 60 120 180 240 300]
#   - 10 players, and 218 is the maximum item id in the dataset
# Every (match, step, player) cell gets its OWN array; the earlier
# `[... for j in range(10)]*10` produced 100 entries of which only the first
# ten were used and the rest were aliases.
matches = [[[np.zeros(shape=(218,1), dtype=int) for _player in range(10)]
            for _step in range(6)]
           for _match in range(50000)]

# Single pass over the purchase log; each row is (item id, time, player slot,
# match id). This replaces three passes that re-concatenated arrays per row.
for item_id, purchase_time, player_slot, match_id in purchases:
    # Integer division: a float (true division) cannot be used as a list index.
    step = purchase_time // 60
    # make players numbered 0 to 9: slots below 5 are kept, 123 is subtracted
    # from the others (assumes those slots are 128..132 -- confirm).
    player = player_slot if player_slot < 5 else player_slot - 123
    matches[match_id][step][player][item_id - 1] += 1
 

In [49]:
# Function-call form of print works on Python 2 and 3 and matches the other cells.
print(matches[15][0][2].shape)  # match 15, time step 0, player 2 purchase counts

(218, 1)


In [50]:
print('Shape of player time matrix')
print('Contains all attributes of matches, the first minutes')
print(player_time.shape)
#print(player_time[1:10])
# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
player_time = player_time.astype(int)


## Separate data per match. Each index in data is one match. One match is a
## 2212*n matrix: 32 player_time attributes followed by 10 players * 218 item
## counts. n is the number of logs. We have data every 60 seconds, so for
## 5 min (times 0..300) n is at most 6.
data = [np.empty(shape=(2212,0), dtype=int)]*50000
for x in player_time:
    match_id = x[0]
    # Integer division: a float (true division) cannot be used as a list index.
    timestep = x[1] // 60
    # First ten entries = one (218, 1) purchase-count vector per player.
    purchase_counts = list(matches[match_id][timestep][:10])
    # Stack the 32 attributes and the ten purchase vectors into one column.
    features = np.concatenate([x.reshape(32, 1)] + purchase_counts, axis=0)
    data[match_id] = np.concatenate((data[match_id], features), axis=1)
#Print the first match log 


Shape of player time matrix
Contains all attributes of matches, the first minutes
(299994, 32)


In [51]:
print(hero_data.shape)
print(data[0].shape)
# Append ten extra rows to every match: the hero_data entries 10*m .. 10*m+9,
# repeated across all of the match's columns.
# NOTE: this grows data[m] in place, so re-running the cell appends the rows again.
for match_index in range(len(data)):
    n_frames = data[match_index].shape[1]
    match_heroes = hero_data[10*match_index:10*(match_index+1)]
    hero_rows = np.repeat(match_heroes, n_frames, axis=1)
    data[match_index] = np.concatenate((data[match_index], hero_rows), axis=0)


(500000, 1)
(2212, 6)


In [52]:
#Pad games shorter than timelimit. (5 minutes)
# Every match must end up with 300/60 + 1 = 6 columns (time steps 0..300);
# missing columns are appended as zeros at the end.
print(data[0].shape)
n_frames = 300 // 60 + 1
for i in range(len(data)):
    missing = n_frames - data[i].shape[1]
    if missing > 0:
        # One concatenate per match instead of one per missing column.
        # np.int was only an alias of the builtin int (removed in NumPy 1.24).
        padding = np.zeros(shape=(data[0].shape[0], missing), dtype=int)
        data[i] = np.concatenate((data[i], padding), axis=1)
    

(2222, 6)


In [53]:
from keras.utils.np_utils import to_categorical
games = np.array(data)
labels_binary = to_categorical(labels)

# Repeat each game's one-hot label once per frame (column) of that game, so
# that there is one label per frame whatever the number of frames.
labels_binary_length_corrected = np.array([
    labels_binary[index]
    for index, game in enumerate(games)
    for _frame in game.T
])
print(labels_binary_length_corrected.shape)

(300000, 2)


In [55]:
# Function-call form of print works on Python 2 and 3 and matches the other cells.
print(games.shape)
print(labels_binary_length_corrected.shape)
# Regroup the per-frame labels as (game, frame, 2). The game count comes from
# `games` and the frame count is inferred, instead of hard-coding (50000, 6, 2).
# NOTE: this rebinds `labels` (previously the per-match 0/1 vector).
labels = labels_binary_length_corrected.reshape((len(games), -1, 2))

(50000, 2222, 6)
(300000, 2)


In [56]:
# games2[g] is games[g] transposed: (game, feature, frame) -> (game, frame, feature).
# The float buffer keeps the dtype np.zeros produced before; the shape is taken
# from `games` instead of hard-coding (50000, 6, 2222), and one vectorised
# assignment replaces the per-game Python loop.
games2 = np.zeros(shape=(games.shape[0], games.shape[2], games.shape[1]))
games2[...] = np.swapaxes(games, 1, 2)
# Function-call form of print works on Python 2 and 3 and matches the other cells.
print(games.shape)

(50000, 2222, 6)


In [57]:
# Function-call form of print works on Python 2 and 3 and matches the other cells.
print(games2.shape)
# Disabled: flatten to one row per frame.
#games2 = games2.reshape((300000, 2222))
#print games2.shape

(50000, 6, 2222)


In [58]:
# np.save writes binary data, so the file must be opened in binary mode
# ('w' fails on Python 3 and can corrupt the data on Windows). The context
# manager closes the file even if the save raises.
with open('/tmp/game_data.npy', 'wb') as outfile:
    np.save(outfile, games2)

In [60]:
# np.save writes binary data, so the file must be opened in binary mode
# ('w' fails on Python 3 and can corrupt the data on Windows). The context
# manager closes the file even if the save raises.
with open('/tmp/game_labels.npy', 'wb') as outfile:
    np.save(outfile, labels)