In [7]:
#import necessary libraries
import numpy as np
import pandas as pd

In [14]:
# The fetched matches are stored in two CSV files; load both and stack them
# into a single dataframe.
df = pd.read_csv('matches.csv')
df2 = pd.read_csv('matches_2.csv')

# ignore_index=True relabels the stacked rows 0..n-1. Without it each file keeps
# its own row labels, so the combined index would contain duplicate labels.
df = pd.concat([df, df2], axis=0, ignore_index=True)

# Drop duplicates based on match_id, then the unneeded column, then any row
# that still contains a NaN.
df = df.drop_duplicates(subset=['match_id'])
df = df.drop(columns=['avg_rank_tier'])
df = df.dropna()

# Combine radiant_team and dire_team into one comma-separated string so both
# sides can be encoded together. The vectorised concatenation produces the same
# strings as joining str(value) row by row, without a Python call per row.
df['team'] = df['radiant_team'].astype(str) + ',' + df['dire_team'].astype(str)

In [15]:
# peek at our dataframe after the processing above
df.head()

Unnamed: 0,match_id,radiant_win,duration,avg_mmr,radiant_team,dire_team,team
0,7045889918,False,2199,2607.0,9705052136,693127408,9705052136693127408
2,7045889915,True,2301,2931.0,44811015086,276231425,44811015086276231425
4,7045889913,True,2093,1942.0,4162311714,10415541122,416231171410415541122
5,7045889912,False,2856,4332.0,6340644115,12168423929,634064411512168423929
6,7045889911,False,3539,3942.0,252355833,11281225386,25235583311281225386


In [20]:
# multi-hot encoding of the heroes picked by each team
def hot_en(data, n_heroes=137):
    """Encode each match's hero picks as 0/1 indicator columns.

    Parameters
    ----------
    data : iterable of sequences (e.g. a pandas Series of lists)
        One sequence of hero ids per match. Ids at positions 0-4 are written
        to the ``team1_hero<id>`` columns, ids at position 5 and beyond to the
        ``team2_hero<id>`` columns. Ids may be ints or anything ``int()``
        accepts. Ids <= 0 are skipped.
    n_heroes : int, default 137
        Largest hero id that gets a column; the result always has
        ``2 * n_heroes`` columns.

    Returns
    -------
    pandas.DataFrame
        int64 frame indexed 0..len(data)-1 with columns
        ``team1_hero1..team1_hero<n_heroes>`` followed by
        ``team2_hero1..team2_hero<n_heroes>``.

    Raises
    ------
    ValueError
        If a hero id is larger than ``n_heroes``. Writing such an id with
        ``.loc`` would silently append a new, mostly-NaN column instead.
    """
    columns = [f'team1_hero{i}' for i in range(1, n_heroes + 1)]
    columns += [f'team2_hero{i}' for i in range(1, n_heroes + 1)]

    rows = list(data)
    encoded = np.zeros((len(rows), 2 * n_heroes), dtype=np.int64)

    # Collect the (row, column) coordinates of every pick first, then set them
    # all at once; per-cell DataFrame.loc writes are far slower.
    hit_rows = []
    hit_cols = []
    for row_idx, row in enumerate(rows):
        for hero_pos, hero_idx in enumerate(row):
            hero_id = int(hero_idx)
            if hero_id <= 0:
                continue
            if hero_id > n_heroes:
                raise ValueError(
                    f'hero id {hero_id} in row {row_idx} is outside 1..{n_heroes}'
                )
            # the first five positions belong to team 1, the rest to team 2
            team_offset = 0 if hero_pos < 5 else n_heroes
            hit_rows.append(row_idx)
            hit_cols.append(team_offset + hero_id - 1)

    # explicit integer dtype keeps the fancy index valid when nothing was hit
    encoded[np.asarray(hit_rows, dtype=np.intp),
            np.asarray(hit_cols, dtype=np.intp)] = 1

    return pd.DataFrame(encoded, index=range(len(rows)), columns=columns)

In [21]:
# Turn every comma-separated 'team' string into a list of integer hero ids, then
# run hot_en on those lists: five hot columns per team, ten per match in total.
hero_id_lists = df['team'].str.split(',').apply(lambda parts: list(map(int, parts)))
df_encoded = hot_en(hero_id_lists)
df_encoded

Unnamed: 0,team1_hero1,team1_hero2,team1_hero3,team1_hero4,team1_hero5,team1_hero6,team1_hero7,team1_hero8,team1_hero9,team1_hero10,...,team2_hero128,team2_hero129,team2_hero130,team2_hero131,team2_hero132,team2_hero133,team2_hero134,team2_hero135,team2_hero136,team2_hero137
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91583,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
91584,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
91585,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91586,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Dota 2 reserves several hero ids for heroes that were never released, so their
# columns contain only zeros. Drop every all-zero column so the features match
# the heroes that actually appear in the matches.
# Note: a released hero that is never picked in this data is dropped as well.
is_all_zero = df_encoded.eq(0).all(axis=0)
zero_cols = is_all_zero[is_all_zero].index
df_encoded = df_encoded.drop(columns=zero_cols)
df_encoded

Unnamed: 0,team1_hero1,team1_hero2,team1_hero3,team1_hero4,team1_hero5,team1_hero6,team1_hero7,team1_hero8,team1_hero9,team1_hero10,...,team2_hero119,team2_hero120,team2_hero121,team2_hero123,team2_hero126,team2_hero128,team2_hero129,team2_hero135,team2_hero136,team2_hero137
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91583,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
91584,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
91585,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91586,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Give both frames the same fresh 0..n-1 index so they line up row by row, join
# them side by side, and drop the raw team columns that the encoding replaces.
df = df.reset_index(drop=True)
df_encoded = df_encoded.reset_index(drop=True)
combined_df = (
    pd.concat([df, df_encoded], axis=1)
    .drop(columns=['dire_team', 'radiant_team', 'team'])
)
combined_df

Unnamed: 0,match_id,radiant_win,duration,avg_mmr,team1_hero1,team1_hero2,team1_hero3,team1_hero4,team1_hero5,team1_hero6,...,team2_hero119,team2_hero120,team2_hero121,team2_hero123,team2_hero126,team2_hero128,team2_hero129,team2_hero135,team2_hero136,team2_hero137
0,7045889918,False,2199,2607.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7045889915,True,2301,2931.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7045889913,True,2093,1942.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,7045889912,False,2856,4332.0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,7045889911,False,3539,3942.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91583,6744845900,False,3297,2763.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91584,6744846112,True,1915,2882.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91585,6744847011,False,2101,2159.0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
91586,6744847112,True,2494,2107.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Write the combined frame to disk; index=False keeps the row index out of the file.
combined_df.to_csv('cleaned_data_final.csv', index=False)