In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib as plt

PATH_TO_DATA = './mlcourse-dota2-win-prediction/'

df_train_features = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                             'train_features.csv'), 
                                    index_col='match_id_hash')
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                            'train_targets.csv'), 
                                   index_col='match_id_hash')

df_test_features = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_features.csv'), 
                                   index_col='match_id_hash')

In [3]:
full_df = pd.concat([df_train_features, df_test_features], sort=False)
train_size = df_train_features.shape[0]

In [4]:
for c in ['kills', 'deaths', 'assists', 'denies', 'gold', 'lh', 'xp', 'health', 'max_health', 'max_mana', 'level', 'x', 'y', 'stuns', 'creeps_stacked', 'camps_stacked', 'rune_pickups',
          'teamfight_participation', 'towers_killed', 'roshans_killed', 'obs_placed', 'sen_placed']:
    r_columns = [f'r{i}_{c}' for i in range(1, 6)]
    d_columns = [f'd{i}_{c}' for i in range(1, 6)]
    
    full_df['r_total_' + c] = full_df[r_columns].sum(1)
    full_df['d_total_' + c] = full_df[d_columns].sum(1)
    full_df['total_' + c + '_ratio'] = full_df['r_total_' + c] / full_df['d_total_' + c]
    
    full_df['r_std_' + c] = full_df[r_columns].std(1)
    full_df['d_std_' + c] = full_df[d_columns].std(1)
    full_df['std_' + c + '_ratio'] = full_df['r_std_' + c] / full_df['d_std_' + c]
    
    full_df['r_mean_' + c] = full_df[r_columns].mean(1)
    full_df['d_mean_' + c] = full_df[d_columns].mean(1)
    
    full_df['r_max_' + c] = full_df[r_columns].max(1)
    full_df['d_max_' + c] = full_df[d_columns].max(1)
    full_df['max_diff_' + c] = full_df['r_max_' + c] - full_df['d_max_' + c]
    
    full_df['r_min_' + c] = full_df[r_columns].min(1)
    full_df['d_min_' + c] = full_df[d_columns].min(1)   
    full_df['min_diff' + c] = full_df['r_min_' + c] - full_df['d_min_' + c]
    
    full_df.drop(r_columns+d_columns, axis=1, inplace=True) 

In [6]:
y_train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_targets.csv'), index_col='match_id_hash')['radiant_win']
y_train = y_train.map({True: 1, False: 0})

In [7]:
full_df.to_pickle('./pkl/base_features.pkl')

In [11]:
columns=df_train_features.columns

## Hero ids

In [12]:
hero_columns = [c for c in columns if '_hero_' in c]
full_df_hero_id = full_df[hero_columns]
full_df_hero_id

Unnamed: 0_level_0,r1_hero_id,r2_hero_id,r3_hero_id,r4_hero_id,r5_hero_id,d1_hero_id,d2_hero_id,d3_hero_id,d4_hero_id,d5_hero_id
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a400b8f29dece5f4d266f49f1ae2e98a,11,78,14,59,77,12,21,60,84,34
b9c57c450ce74a2af79c9ce96fac144d,15,96,27,63,89,58,14,1,56,92
6db558535151ea18ca70a6892197db41,101,51,44,49,53,18,67,47,40,17
46a0ddce8f7ed2a8d9bd5edcbb925682,14,99,101,26,41,18,98,8,69,86
b1b35ff97723d9b7ade1c9c3cf48f770,42,69,27,104,65,23,22,35,72,1
...,...,...,...,...,...,...,...,...,...,...
9376a283b50779433de829c79529fe2c,93,86,35,97,26,23,8,101,85,12
bce2bace8b61980d282c9f6a9c69ef9c,40,11,32,119,12,87,74,54,85,18
dc00c2964363b0344a4891bdde235a44,21,49,50,1,30,114,10,27,23,16
d75db83f7857720f851a302b00ee6149,90,108,39,32,8,96,109,62,14,56


In [13]:
full_df_hero_id = full_df_hero_id.astype(str)
train_df_hero_id = full_df_hero_id.iloc[:train_size, :]
test_df_hero_id = full_df_hero_id.iloc[train_size:, :]

#### Make dummies

In [14]:
for team in 'r', 'd':
    players = [f'{team}{i}' for i in range(1, 6)]
    hero_columns = [f'{player}_hero_id' for player in players]
    d = pd.get_dummies(full_df_hero_id[hero_columns[0]])
    for c in hero_columns[1:]:
        d += pd.get_dummies(full_df_hero_id[c])
    full_df_hero_id = pd.concat([full_df_hero_id, d.add_prefix(f'{team}_hero_')], axis=1)
    full_df_hero_id.drop(columns=hero_columns, inplace=True)

In [15]:
train_df_hero_id = full_df_hero_id.iloc[:train_size, :]
test_df_hero_id = full_df_hero_id.iloc[train_size:, :]

In [16]:
rad_cols=[col for col in train_df_hero_id.columns if col.startswith('r')]
dire_cols=[col for col in train_df_hero_id.columns if col.startswith('d')]

In [17]:
bad_heroes_rad=train_df_hero_id[rad_cols].corrwith(y_train).sort_values(ascending=True).head(12).index
good_heroes_rad=train_df_hero_id[rad_cols].corrwith(y_train).sort_values(ascending=False).head(12).index

In [18]:
bad_heroes_rad=[int(x.split('_')[-1]) for x in bad_heroes_rad.tolist()]
good_heroes_rad=[int(x.split('_')[-1]) for x in good_heroes_rad.tolist()]

In [19]:
bad_heroes_dire=train_df_hero_id[dire_cols].corrwith(y_train).sort_values(ascending=False).head(12).index
good_heroes_dire=train_df_hero_id[dire_cols].corrwith(y_train).sort_values(ascending=True).head(12).index

In [20]:
bad_heroes_dire=[int(x.split('_')[-1]) for x in bad_heroes_dire.tolist()]
good_heroes_dire=[int(x.split('_')[-1]) for x in good_heroes_dire.tolist()]

In [21]:
bad_heroes=set(bad_heroes_rad) & set(bad_heroes_dire)
good_heroes=set(good_heroes_rad) & set(good_heroes_dire)

In [22]:
print(bad_heroes)
print(good_heroes)

{73, 106, 46, 19, 91}
{32, 96, 67, 102, 42, 108, 20, 22, 56, 92}


In [23]:
full_df_hero_id

Unnamed: 0_level_0,r_hero_1,r_hero_10,r_hero_100,r_hero_101,r_hero_102,r_hero_103,r_hero_104,r_hero_105,r_hero_106,r_hero_107,...,d_hero_90,d_hero_91,d_hero_92,d_hero_93,d_hero_94,d_hero_95,d_hero_96,d_hero_97,d_hero_98,d_hero_99
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
b9c57c450ce74a2af79c9ce96fac144d,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6db558535151ea18ca70a6892197db41,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46a0ddce8f7ed2a8d9bd5edcbb925682,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
b1b35ff97723d9b7ade1c9c3cf48f770,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9376a283b50779433de829c79529fe2c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bce2bace8b61980d282c9f6a9c69ef9c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dc00c2964363b0344a4891bdde235a44,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
d75db83f7857720f851a302b00ee6149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [24]:
full_df_hero_id.to_pickle('./pkl/hero_id_ohe.pkl')

## Add count of good & bad heroes

In [25]:
hero_columns_rad=[f'r{i}_hero_id' for i in range(1, 6)]
hero_columns_dire=[f'd{i}_hero_id' for i in range(1, 6)]
full_df['r_team'] = full_df[hero_columns_rad].values.tolist()
full_df['d_team'] = full_df[hero_columns_dire].values.tolist()

In [26]:
full_df['r_team']=full_df['r_team'].apply(lambda x:set(x))

In [27]:
full_df['d_team']=full_df['d_team'].apply(lambda x:set(x))

In [28]:
full_df['r_team'].apply(lambda x: len(x & good_heroes))

match_id_hash
a400b8f29dece5f4d266f49f1ae2e98a    0
b9c57c450ce74a2af79c9ce96fac144d    1
6db558535151ea18ca70a6892197db41    0
46a0ddce8f7ed2a8d9bd5edcbb925682    0
b1b35ff97723d9b7ade1c9c3cf48f770    1
                                   ..
9376a283b50779433de829c79529fe2c    0
bce2bace8b61980d282c9f6a9c69ef9c    1
dc00c2964363b0344a4891bdde235a44    0
d75db83f7857720f851a302b00ee6149    2
1bbaf6d71e197ebf96c6afa3c921000a    1
Name: r_team, Length: 49675, dtype: int64

## Подсчет хороших/плохие героев

In [29]:
full_df['r_good_heroes_cnt']=full_df['r_team'].apply(lambda x: len(x & good_heroes))
full_df['d_good_heroes_cnt']=full_df['d_team'].apply(lambda x: len(x & good_heroes))
full_df['r_bad_heroes_cnt']=full_df['r_team'].apply(lambda x: len(x & bad_heroes))
full_df['d_bad_heroes_cnt']=full_df['d_team'].apply(lambda x: len(x & bad_heroes))

In [30]:
full_df['good_heroes_diff']=full_df['r_good_heroes_cnt']-full_df['d_good_heroes_cnt']
full_df['bad_heroes_diff']=full_df['r_bad_heroes_cnt']-full_df['d_bad_heroes_cnt']

In [31]:
full_df[['r_good_heroes_cnt','d_good_heroes_cnt',
        'r_bad_heroes_cnt','d_bad_heroes_cnt',
        'good_heroes_diff','bad_heroes_diff']].to_pickle('./pkl/good_bad_cnt.pkl')

### Bad heroes

In [32]:
!grep -oEm1 '"hero_id":19,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":46,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":91,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":73,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":106,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl

"hero_id":19,"hero_name":"npc_dota_hero_tiny"
"hero_id":46,"hero_name":"npc_dota_hero_templar_assassin"
"hero_id":91,"hero_name":"npc_dota_hero_wisp"
"hero_id":73,"hero_name":"npc_dota_hero_alchemist"
"hero_id":106,"hero_name":"npc_dota_hero_ember_spirit"


### Good heroes

In [33]:
!grep -oEm1 '"hero_id":96,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":32,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":67,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":92,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":56,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":108,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":42,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":102,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":22,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl
!grep -oEm1 '"hero_id":20,"hero_name":"[^"]+?"' $PATH_TO_DATA/train_matches.jsonl

"hero_id":96,"hero_name":"npc_dota_hero_centaur"
"hero_id":32,"hero_name":"npc_dota_hero_riki"
"hero_id":67,"hero_name":"npc_dota_hero_spectre"
"hero_id":92,"hero_name":"npc_dota_hero_visage"
"hero_id":56,"hero_name":"npc_dota_hero_clinkz"
"hero_id":108,"hero_name":"npc_dota_hero_abyssal_underlord"
"hero_id":42,"hero_name":"npc_dota_hero_skeleton_king"
"hero_id":102,"hero_name":"npc_dota_hero_abaddon"
"hero_id":22,"hero_name":"npc_dota_hero_zuus"
"hero_id":20,"hero_name":"npc_dota_hero_vengefulspirit"


## Json analysis

In [98]:
def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)

In [99]:
import collections

def extract_features_csv(match):
    row = [
        ('match_id_hash', match['match_id_hash']),
    ]
        
    for slot, player in enumerate(match['players']):
        if slot < 5:
            player_name = 'r%d' % (slot + 1)
        else:
            player_name = 'd%d' % (slot - 4)
        row.append((f'{player_name}_ability_level', len(player['ability_upgrades'])))
        row.append((f'{player_name}_max_hero_hit', player['max_hero_hit']['value']))
        row.append((f'{player_name}_purchase_count', len(player['purchase_log'])))
        row.append((f'{player_name}_count_ability_use', sum(player['ability_uses'].values())))
        row.append((f'{player_name}_damage_dealt', sum(player['damage'].values())))
        row.append((f'{player_name}_damage_received', sum(player['damage_taken'].values())))
        row.append( (f'{player_name}_items', list(map(lambda x: x['id'][5:], player['hero_inventory'])) ) )
        
    return collections.OrderedDict(row)
    
def extract_targets_csv(match, targets):
    return collections.OrderedDict([('match_id_hash', match['match_id_hash'])] + [
        (field, targets[field])
        for field in ['game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team']
    ])

In [100]:
from tqdm import tqdm_notebook
import json

In [120]:
%%time
new_train_features = []

for match in read_matches(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')):
    match_id_hash = match['match_id_hash']
    features = extract_features_csv(match)
    
    new_train_features.append(features)

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))


Wall time: 2min 13s


In [121]:
new_train_features = pd.DataFrame.from_records(new_train_features).set_index('match_id_hash')

In [122]:
new_test_features = []
for match in read_matches(os.path.join(PATH_TO_DATA, 'test_matches.jsonl')):
    match_id_hash = match['match_id_hash']
    features = extract_features_csv(match)
    
    new_test_features.append(features)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [123]:
new_test_features = pd.DataFrame.from_records(new_test_features).set_index('match_id_hash')

In [124]:
new_train_features.shape, new_test_features.shape

((39675, 70), (10000, 70))

In [125]:
def add_items_dummies(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        players = [f'{team}{i}' for i in range(1, 6)]
        item_columns = [f'{player}_items' for player in players]

        d = pd.get_dummies(full_df[item_columns[0]].apply(pd.Series).stack()).sum(level=0, axis=0)
        dindexes = d.index.values

        for c in item_columns[1:]:
            d = d.add(pd.get_dummies(full_df[c].apply(pd.Series).stack()).sum(level=0, axis=0), fill_value=0)
            d = d.ix[dindexes]

        full_df = pd.concat([full_df, d.add_prefix(f'{team}_item_')], axis=1, sort=False)
        full_df.drop(columns=item_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [126]:
def drop_consumble_items(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        consumble_columns = ['tango', 'tpscroll', 
                             'bottle', 'flask',
                            'enchanted_mango', 'clarity',
                            'faerie_fire', 'ward_observer',
                            'ward_sentry']
        
        starts_with = f'{team}_item_'
        consumble_columns = [starts_with + column for column in consumble_columns]
        full_df.drop(columns=consumble_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [127]:
%%time

new_train_features, new_test_features = add_items_dummies(new_train_features, new_test_features )
new_train_features, new_test_features = drop_consumble_items(new_train_features, new_test_features )

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


Wall time: 3min 8s


In [128]:
item_columns=[x for x in new_train_features.columns if 'item' in x]

In [129]:
new_full_features = pd.concat([new_train_features, new_test_features], sort=False)

In [130]:
item_full_features=new_full_features[item_columns]

In [131]:
item_full_features.to_pickle('./pkl/item_features.pkl')

In [132]:
new_train_features.drop(item_columns,inplace=True,axis=1)

In [133]:
new_test_features.drop(item_columns,inplace=True,axis=1)

In [134]:
new_full_features.drop(item_columns,inplace=True,axis=1)

In [135]:
for c in ['ability_level', 'max_hero_hit', 'purchase_count', 'count_ability_use', 'damage_dealt', 'damage_received']:
    r_columns = [f'r{i}_{c}' for i in range(1, 6)]
    d_columns = [f'd{i}_{c}' for i in range(1, 6)]
    
    new_full_features['r_total_' + c] = new_full_features[r_columns].sum(1)
    new_full_features['d_total_' + c] = new_full_features[d_columns].sum(1)
    new_full_features['total_' + c + '_ratio'] = new_full_features['r_total_' + c] / new_full_features['d_total_' + c]
    
    new_full_features['r_std_' + c] = new_full_features[r_columns].std(1)
    new_full_features['d_std_' + c] = new_full_features[d_columns].std(1)
    new_full_features['std_' + c + '_ratio'] = new_full_features['r_std_' + c] / new_full_features['d_std_' + c]
    
    new_full_features['r_mean_' + c] = new_full_features[r_columns].mean(1)
    new_full_features['d_mean_' + c] = new_full_features[d_columns].mean(1)

In [136]:
new_full_features.to_pickle('./pkl/new_features.pkl')

In [151]:
new_train_features=new_full_features[:train_size]
new_test_features=new_full_features[train_size:]

## Match objectives

In [18]:
import json

with open(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')) as fin:
    # read the 3-th line
    for i in range(1000):
        line = fin.readline()
    
        match = json.loads(line)
        if match['game_time']>4000:
            break
        
match['game_time']

4538

In [32]:
match['players'][5]

{'player_slot': 128,
 'hero_id': 12,
 'hero_name': 'npc_dota_hero_phantom_lancer',
 'account_id_hash': '765b44263a1525035aa9bbfa14b4f523',
 'ability_upgrades': [{'ability': 5065, 'time': 239, 'level': 1},
  {'ability': 5068, 'time': 402, 'level': 2},
  {'ability': 5065, 'time': 503, 'level': 3},
  {'ability': 5066, 'time': 616, 'level': 4},
  {'ability': 5065, 'time': 835, 'level': 5},
  {'ability': 5067, 'time': 966, 'level': 6},
  {'ability': 5065, 'time': 990, 'level': 7},
  {'ability': 5066, 'time': 1053, 'level': 8},
  {'ability': 5066, 'time': 1129, 'level': 9},
  {'ability': 5906, 'time': 1167, 'level': 10},
  {'ability': 5066, 'time': 1319, 'level': 11},
  {'ability': 5067, 'time': 1397, 'level': 12},
  {'ability': 5068, 'time': 1543, 'level': 13},
  {'ability': 5068, 'time': 1710, 'level': 14},
  {'ability': 5903, 'time': 1873, 'level': 15},
  {'ability': 5068, 'time': 1880, 'level': 16},
  {'ability': 5067, 'time': 2075, 'level': 17},
  {'ability': 6849, 'time': 2459, 'level'

In [196]:
match.keys()

dict_keys(['game_time', 'match_id_hash', 'teamfights', 'objectives', 'chat', 'game_mode', 'lobby_type', 'players', 'targets'])

In [198]:
match['targets']

{'game_time': 4538,
 'duration': 4592,
 'time_remaining': 54,
 'radiant_win': True,
 'next_roshan_team': None}

### Features from objectives

In [141]:
def add_new_features(df_features, matches_file):
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']

        radiant_tower_kills = 0
        dire_tower_kills = 0
        radiant_roshan_kills = 0
        dire_roshan_kills = 0
        radiant_aegis = 0
        dire_aegis = 0
        radiant_tower_deny = 0
        dire_tower_deny = 0
        for objective in match['objectives']:
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL':
                if objective['team'] == 2:
                    radiant_tower_kills += 1
                if objective['team'] == 3:
                    dire_tower_kills += 1
            if objective['type'] == 'CHAT_MESSAGE_ROSHAN_KILL':
                if objective['team'] == 2:
                    radiant_roshan_kills += 1
                if objective['team'] == 3:
                    dire_roshan_kills += 1
            if objective['type'] == 'CHAT_MESSAGE_AEGIS':
                if objective['player_slot'] < 100:
                    radiant_aegis += 1
                if objective['player_slot'] > 100:
                    dire_aegis += 1
            if objective['type'] == 'CHAT_MESSAGE_TOWER_DENY':
                if objective['player_slot'] < 100:
                    radiant_tower_deny += 1
                if objective['player_slot'] > 100:
                    dire_tower_deny += 1
            
        # Write new features
        df_features.loc[match_id_hash, 'radiant_tower_kills'] = radiant_tower_kills
        df_features.loc[match_id_hash, 'dire_tower_kills'] = dire_tower_kills
        df_features.loc[match_id_hash, 'diff_tower_kills'] = radiant_tower_kills - dire_tower_kills
        df_features.loc[match_id_hash, 'radiant_roshan_kills'] = radiant_roshan_kills
        df_features.loc[match_id_hash, 'dire_roshan_kills'] = dire_roshan_kills        
        df_features.loc[match_id_hash, 'radiant_aegis'] = radiant_aegis
        df_features.loc[match_id_hash, 'dire_aegis'] = dire_aegis      
        df_features.loc[match_id_hash, 'radiant_tower_deny'] = radiant_tower_deny
        df_features.loc[match_id_hash, 'dire_tower_deny'] = dire_tower_deny

In [152]:
add_new_features(new_train_features, os.path.join(PATH_TO_DATA, 'train_matches.jsonl'))

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s





In [170]:
add_new_features(new_test_features, os.path.join(PATH_TO_DATA, 'test_matches.jsonl'))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [171]:
new_test_features.shape,new_train_features.shape

((10000, 117), (39675, 117))

In [172]:
new_full_features = pd.concat([new_train_features, new_test_features], sort=False)

In [173]:
new_full_features.to_pickle('./pkl/new_features.pkl')

## Write to the file

In [8]:
import pickle
import pandas as pd
import os

In [33]:
pickles = os.listdir('./pkl')

In [34]:
df_all = pd.read_pickle('./pkl/base_features.pkl')
df_all.shape

(49675, 333)

In [35]:
pickles.remove('base_features.pkl')

In [36]:
for pkl in pickles:
    df = pd.read_pickle('./pkl/'+pkl)
    print(pkl + ' ' + str(df.shape))
    df_all = pd.concat([df_all, df], sort=False,axis=1)

bot_feature.pkl (49675, 18)
good_bad_cnt.pkl (49675, 6)
hero_id_ohe.pkl (49675, 230)
item_features.pkl (49675, 384)
new_features.pkl (49675, 117)


In [37]:
df_all.shape

(49675, 1088)

In [38]:
train_all=df_all[:train_size]

In [39]:
test_all=df_all[train_size:]

In [40]:
train_all.to_csv('./train.csv')
test_all.to_csv('./test.csv')