In [100]:
import os
import warnings

import pandas as pd
from tqdm import tqdm

In [101]:
# Read the merged train / test / validation splits, in that order.
data_train, data_test, data_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../datasets/merged_data/train.csv",
        "../datasets/merged_data/test.csv",
        "../datasets/merged_data/val.csv",
    )
)

In [102]:
'''
Step 1. Drop useless columns:
matchId
gameVersion
Player_id

Player_1_ban
Player_1_ban_turn
Player_1_role
Player_1_team
Player_1_victory
... (the same five columns for Player_2 through Player_10)

Step 2. Adjust misplaced data

'''

# Names of the columns that data_process removes: three match-level
# identifiers plus five per-player columns for each of the ten player slots.
prefix = "Player_"
suffixs = ["_ban", "_ban_turn", "_role", "_team", "_victory"]
drop_columns = ['matchId', 'gameVersion', 'Player_id'] + [
    f"{prefix}{player}{suffix}"
    for player in range(1, 11)
    for suffix in suffixs
]

 

In [103]:
def data_process(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. drop every row that contains at least one missing value,
      2. drop the columns listed in the module-level ``drop_columns``,
      3. cast the ten ``Player_<i>_position`` columns to pandas' ``string``
         dtype.

    Rows are filtered *before* the columns are dropped, so a missing value in
    a column that is about to be removed still discards the row.

    Because nothing is modified in place, calling this twice on the same raw
    frame gives the same result (the previous in-place version raised
    ``KeyError`` on the second call, the columns being already gone).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the ``drop_columns`` columns and the ten
        ``Player_<i>_position`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If a column of ``drop_columns`` or a position column is missing.
    """
    position_columns = [f"{prefix}{slot}_position" for slot in range(1, 11)]
    return (
        df.dropna(axis=0, how='any')
        .drop(columns=drop_columns)
        .astype({column: 'string' for column in position_columns})
    )

In [104]:
# Canonical lane order inside a team: the k-th slot of each team must hold
# the k-th role of this tuple.
ROLE_ORDER = ("TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY")
# Player slot numbers grouped by team.
TEAM_SLOTS = ((1, 2, 3, 4, 5), (6, 7, 8, 9, 10))
# Per-player column suffixes that move together when a player changes slot.
PLAYER_FIELDS = (
    "_pick", "_position", "_time_game", "_gold",
    "_xp", "_dmg_dealt", "_dmg_taken", "_time_ccing",
)


def _reorder_row(row):
    """Move every player's columns of one row to the slot matching its role.

    ``row`` is a mapping from column name to value. Returns a new dict with
    the same keys, or ``None`` when some team does not contain each role of
    ``ROLE_ORDER`` exactly once (such a row has no canonical ordering).
    """
    reordered = dict(row)
    for slots in TEAM_SLOTS:
        source_by_role = {row[f"Player_{slot}_position"]: slot for slot in slots}
        # Five slots mapping onto exactly the five roles means each role
        # appears once; anything else (duplicate or unknown role) is unfixable.
        if set(source_by_role) != set(ROLE_ORDER):
            return None
        for target, wanted_role in zip(slots, ROLE_ORDER):
            source = source_by_role[wanted_role]
            if source == target:
                continue
            # Always read from the untouched ``row`` so that swaps do not
            # overwrite data that is still needed.
            for field in PLAYER_FIELDS:
                reordered[f"Player_{target}{field}"] = row[f"Player_{source}{field}"]
    return reordered


def pos_sort(df):
    '''
    Adjust misplaced data

    Within each team the players are expected in the order TOP, JUNGLE,
    MIDDLE, BOTTOM, UTILITY (slots 1-5 and 6-10). Rows already in that order
    are kept as they are; in the other rows the per-player columns
    (``PLAYER_FIELDS``) are moved so that every slot holds the player with
    the matching role. All other columns are carried over unchanged.

    Values are written back by column *name*, so the result does not depend
    on the order of the columns in ``df``.

    Rows in which a team does not contain each role exactly once cannot be
    reordered; they are left out of the result and a ``UserWarning`` reports
    how many were skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the ``Player_<i><field>`` columns for i in 1..10. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index: the rows that were already in
        order first, followed by the reordered rows.

    Raises
    ------
    KeyError
        If one of the required ``Player_<i><field>`` columns is missing.
    '''
    # Row mask: True where all ten players already sit in their canonical slot.
    in_order = pd.Series(True, index=df.index)
    for slots in TEAM_SLOTS:
        for slot, wanted_role in zip(slots, ROLE_ORDER):
            matches = df[f"Player_{slot}_position"] == wanted_role
            # Nullable dtypes yield <NA> for missing values; count them as "no match".
            in_order &= matches.fillna(False).astype(bool)

    df_correct = df[in_order]
    df_wrong = df[~in_order]

    # Collect the rebuilt rows and create the frame once, instead of
    # appending to a DataFrame row by row.
    fixed_rows = []
    unfixable = 0
    for row in tqdm(df_wrong.to_dict("records")):
        reordered = _reorder_row(row)
        if reordered is None:
            unfixable += 1
        else:
            fixed_rows.append(reordered)

    if unfixable:
        warnings.warn(
            f"{unfixable} row(s) could not be reordered because a team does "
            "not contain each role exactly once; they were skipped.",
            stacklevel=2,
        )

    # Only concatenate non-empty pieces so that an empty frame cannot
    # influence the dtypes of the result.
    if not fixed_rows:
        return df_correct.reset_index(drop=True)
    df_fixed = pd.DataFrame(fixed_rows, columns=df.columns)
    if df_correct.empty:
        return df_fixed
    return pd.concat([df_correct, df_fixed], ignore_index=True)

In [105]:
# Clean each split with data_process, then put its players in canonical role
# order with pos_sort. Splits are processed in the order test, val, train.
df_sorted_test, df_sorted_val, df_sorted_train = (
    pos_sort(data_process(split))
    for split in (data_test, data_val, data_train)
)


1358it [00:15, 84.98it/s]
1398it [00:16, 83.94it/s]


In [108]:
# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists before writing (a no-op when it is already there).
os.makedirs("../datasets/pos_sorted_data", exist_ok=True)

# Write the three role-sorted splits without the DataFrame index.
df_sorted_test.to_csv("../datasets/pos_sorted_data/test.csv", index=False)
df_sorted_val.to_csv("../datasets/pos_sorted_data/val.csv", index=False)
df_sorted_train.to_csv("../datasets/pos_sorted_data/train.csv", index=False)