In [98]:
import numpy as np
import pandas as pd
import easydatascience as eds
import os
        
# Column layout applied to every raw CSV: 11 match-level columns followed by
# 8 columns for each of the 10 players.
match_cols = ['MatchID', 'GameDuration', 'Blue_Won', 'Blue_KillsTower',
              'Blue_KillsInhib', 'Blue_KillsBaron', 'Blue_KillsDragon',
              'Purp_KillsTower', 'Purp_KillsInhib', 'Purp_KillsBaron',
              'Purp_KillsDragon']

base_player_cols = ['ChampionID', 'Kills', 'Deaths', 'Assists', 'Gold', 'CS', 'Role',
                    'Lane']

# Generating column names for 10 players: player1_ChampionID ... player10_Lane
player_cols = ['player' + str(i) + '_' + col
               for i in range(1, 11)
               for col in base_player_cols]

column_names = match_cols + player_cols

# Reading CSVs
# The notebook exports its result as 'data.csv' into this same directory, so
# that file is skipped here; otherwise a second "Run All" would ingest the
# processed output as if it were raw input.
EXPORT_FILE = 'data.csv'

# sorted(): os.listdir() returns names in arbitrary order, which would make
# the row order of the concatenated frame differ between machines/runs.
csvs = sorted(name for name in os.listdir()
              if name.endswith('.csv') and name != EXPORT_FILE)

if not csvs:
    raise FileNotFoundError('No raw .csv files found in ' + os.getcwd())

# names= is passed explicitly, so the first line of each file is read as data.
datas = [pd.read_csv(path, names=column_names) for path in csvs]

data = pd.concat(datas, ignore_index=True)

# Summarise once and reuse the result (assumes eds.look has no side effects).
# The full option key is required: the bare 'max_rows' pattern matches more
# than one pandas option on recent versions and raises OptionError.
summary = eds.look(data)
pd.set_option('display.max_rows', summary.shape[0])
summary

Unnamed: 0,Types,Counts,Distincts,Nulls,Missing ratio (%),Uniques,Skewness
MatchID,object,25582,25470,0,0.0,"[[2554403479, 2560068908, 2559895973, 25590346...",-0.078463
GameDuration,object,25582,1432,0,0.0,"[[1573, 2214, 1964, 1681, 2403, 1553, 2330, 16...",0.946268
Blue_Won,object,25582,2,0,0.0,"[[Fail, Win]]",
Blue_KillsTower,object,25582,12,0,0.0,"[[1, 6, 3, 0, 9, 7, 2, 5, 10, 11, 8, 4]]",-0.113294
Blue_KillsInhib,object,25582,10,0,0.0,"[[0, 1, 3, 2, 4, 6, 7, 5, 9, 8]]",1.232648
Blue_KillsBaron,object,25582,5,0,0.0,"[[0, 1, 2, 3, 4]]",0.994564
Blue_KillsDragon,object,25582,8,0,0.0,"[[1, 3, 2, 0, 4, 5, 6, 7]]",0.047424
Purp_KillsTower,object,25582,12,0,0.0,"[[8, 7, 6, 11, 4, 2, 5, 10, 9, 1, 3, 0]]",-0.141165
Purp_KillsInhib,object,25582,10,0,0.0,"[[1, 3, 0, 2, 4, 5, 6, 7, 8, 10]]",1.193154
Purp_KillsBaron,object,25582,5,0,0.0,"[[1, 0, 2, 3, 4]]",0.900077


In [99]:
# MatchID is removed before any feature work
data = data.drop(columns='MatchID')

# Outcome string to 0/1 flag: 'Fail' -> 0, anything else -> 1
data['Blue_Won'] = np.where(data['Blue_Won'] != 'Fail', 1, 0)

# Merging "Role" and "Lane" columns into a single "<role> <lane>" label
role_columns = [name for name in data.columns if name.endswith('Role')]
lane_columns = [name for name in data.columns if name.endswith('Lane')]
role_lane_pairs = list(zip(role_columns, lane_columns))

for role_column, lane_column in role_lane_pairs:
    data[role_column] = data[role_column] + ' ' + data[lane_column]

data = data.drop(columns=[lane_column for _, lane_column in role_lane_pairs])

# Dropping rows with wrongly labeled roles
to_drop = ['DUO TOP', 'DUO MIDDLE', 'DUO_SUPPORT MIDDLE', 'DUO_SUPPORT TOP',
           'DUO_CARRY MIDDLE', 'SOLO BOTTOM', 'DUO BOTTOM']

# A row is discarded as soon as any of its role columns holds a bad label
has_bad_label = data[role_columns].isin(to_drop).any(axis=1)

data = data.loc[~has_bad_label].reset_index(drop=True)

&emsp;We are going to be extra careful about how we handle roles this time. Last time I did not want to engineer lane-to-lane comparison features, but that now seems more likely, given the tight feature space provided by the API we get our test data from.

In [100]:
# Renaming roles: raw "<role> <lane>" label -> short position name
roles = [['NONE JUNGLE', 'Jungle'], ['SOLO TOP', 'Top'], ['SOLO MIDDLE', 'Mid'],
         ['DUO_CARRY BOTTOM', 'Adc'], ['DUO_SUPPORT BOTTOM', 'Support']]

position_names = {raw_label: short_name for raw_label, short_name in roles}

# Labels that are not listed in the mapping are left untouched
for role_column in data.columns:
    if role_column.endswith('Role'):
        data[role_column] = data[role_column].replace(position_names)

### Aligning positions with columns

In [101]:
# Checking which games have valid team compositions
# Team1: Adc, Jungle, etc... Team2: -||-
valid_roles = ['Adc', 'Jungle', 'Mid', 'Support', 'Top']


def _has_valid_composition(team_roles, expected_roles):
    """Flag the rows whose players cover every expected role exactly once.

    Parameters
    ----------
    team_roles : DataFrame
        One role column per player of a single team.
    expected_roles : list of str
        Roles a team must contain; there must be as many as role columns.

    Returns
    -------
    Series of bool, indexed like ``team_roles``.

    With as many columns as expected roles, "each role appears exactly once"
    is the same condition as "the sorted row equals the sorted role list",
    but it is vectorised and a missing role label simply makes the row
    invalid instead of failing while sorting mixed str/NaN values.
    """
    checks = [team_roles.eq(role).sum(axis=1).eq(1) for role in expected_roles]
    return pd.concat(checks, axis=1).all(axis=1)


team1_roles = data[['player' + str(i) + '_Role' for i in range(1, 6)]]
team2_roles = data[['player' + str(i) + '_Role' for i in range(6, 11)]]

# A game is kept only when both teams have a valid composition
ok_games = (_has_valid_composition(team1_roles, valid_roles)
            & _has_valid_composition(team2_roles, valid_roles))

data = data.loc[ok_games].reset_index(drop=True)

In [102]:
# Sorting dataframes so I can take the difference of players:
# every player's stats are moved into the column group of the role they played,
# so that e.g. Blue_Mid_Gold and Purp_Mid_Gold describe the two mid laners.

# Stat names carried per player (every playerN_* column except the role label)
player_stats = [name.split('_', 1)[1] for name in data.columns
                if name.startswith('player1_') and name != 'player1_Role']


def _align_team_by_role(frame, player_ids, team_roles, prefix):
    """Build a float64 frame with one ``<prefix><role>_<stat>`` column per role/stat.

    Parameters
    ----------
    frame : DataFrame
        Holds ``player<N>_Role`` and ``player<N>_<stat>`` columns.
    player_ids : iterable of int
        Player numbers that belong to the team.
    team_roles : list of str
        Roles to create column groups for, in output order.
    prefix : str
        Team prefix put in front of every output column name.

    Returns
    -------
    DataFrame indexed like ``frame``. For each row and role, the values come
    from the player whose role column equals that role; NaN if nobody does.
    """
    player_ids = list(player_ids)

    # Convert once up front; the stats are numeric but may arrive as object dtype
    stats_as_float = {
        (pid, stat): frame['player' + str(pid) + '_' + stat].astype('float64').to_numpy()
        for pid in player_ids
        for stat in player_stats
    }

    aligned = {}
    for role in team_roles:
        plays_role = [
            frame['player' + str(pid) + '_Role'].eq(role).to_numpy(dtype=bool, na_value=False)
            for pid in player_ids
        ]
        for stat in player_stats:
            candidates = [stats_as_float[(pid, stat)] for pid in player_ids]
            aligned[prefix + role + '_' + stat] = np.select(plays_role, candidates,
                                                            default=np.nan)

    return pd.DataFrame(aligned, index=frame.index)


# Players 1-5 form the Blue team, players 6-10 the Purple team
teams = [_align_team_by_role(data, range(1, 6), valid_roles, 'Blue_'),
         _align_team_by_role(data, range(6, 11), valid_roles, 'Purp_')]

In [103]:
# Merging data
# Aggregate teams' features: per team, sum Kills / Assists / Gold over the
# five role-aligned player column groups.
team_features = pd.DataFrame({
    team + '_' + stat: team_frame[[name for name in team_frame.columns
                                   if name.endswith(stat)]].sum(axis=1)
    for team, team_frame in zip(['Blue', 'Purp'], teams)
    for stat in ['Kills', 'Assists', 'Gold']
})

# Keep the first 10 (match-level) columns and swap the raw per-player columns
# for the team aggregates plus the role-aligned player stats.
data = pd.concat([data.iloc[:, :10], team_features, teams[0], teams[1]], axis=1)

### Trimming

In [114]:
# Exclusive upper bounds; a row survives only if it is below every bound.
# Team-level bounds are applied to both the Blue_ and Purp_ column.
team_trim = [['_KillsInhib', 6], ['_KillsBaron', 3], ['_KillsDragon', 6],
             ['_Kills', 66], ['_Gold', 90000]]

# Player-level bounds apply to every <team>_<role>_<stat> column
player_trim = [['Kills', 20], ['Deaths', 15]]

within_bounds = pd.Series(True, index=data.index)

for side in ['Blue', 'Purp']:
    for suffix, limit in team_trim:
        within_bounds = within_bounds & (data[side + suffix] < limit)

for stat, limit in player_trim:
    for column in data.columns:
        # three underscore-separated parts identify a per-player column
        if column.endswith(stat) and column.count('_') == 2:
            within_bounds = within_bounds & (data[column] < limit)

data = data.loc[within_bounds].reset_index(drop=True)

### Export

In [116]:
# Write the processed frame without the index column.
# NOTE(review): the file lands in the working directory, which is also where
# the loading cell looks for *.csv inputs -- make sure this export is not
# picked up as raw input when the notebook is run again.
data.to_csv('data.csv', index=False)