In [None]:
import pandas as pd
import numpy as np

# Load the training features; the path is relative to the notebook's working directory.
dataset_path = "../dataset/mlcourse-dota2-win-prediction/train_features.csv"

df = pd.read_csv(dataset_path)

print("Dataset shape:", df.shape)
print("Columns:", df.columns)

print("Unique game modes:", np.unique(df["game_mode"]))
# The length of the column is the number of rows (games), not the number of
# distinct game modes, so label it accordingly.
print("Total number of games:", len(df))

# Build each per-mode subset once and reuse it for both the count and the
# lobby-type breakdown below.
df_22 = df[df["game_mode"] == 22]
print("Number of games with game mode 22:", len(df_22))
df_2 = df[df["game_mode"] == 2]
print("Number of games with game mode 2:", len(df_2))

# Lobby-type breakdown inside each game-mode subset.
for mode, subset in ((22, df_22), (2, df_2)):
    for lobby in (7, 0):
        print(
            f"Number of games with game mode {mode} and lobby type {lobby}:",
            len(subset[subset["lobby_type"] == lobby]),
        )

# The same (mode 2, lobby 0) count obtained two other ways, as a cross-check:
# a combined boolean mask and DataFrame.query.
print(
    "Number of games with game mode 2 and lobby type 0 (using &):",
    len(df[(df["game_mode"] == 2) & (df["lobby_type"] == 0)]),
)
print(
    "Number of games with game mode 2 and lobby type 0 (using query):",
    len(df.query("game_mode == 2 and lobby_type == 0")),
)

# Columns whose name starts with 'd1' (presumably the stats of one player slot — confirm).
single_hero_labels = [col for col in df.columns if col.startswith('d1')]
print("Single hero labels:", single_hero_labels)

# Columns whose name ends with '_hero_id'.
hero_id_labels = [col for col in df.columns if col.endswith('_hero_id')]
print("Hero ID labels:", hero_id_labels)

# NOTE(review): not used anywhere in this cell; kept in case a later cell reads it.
hero_category = 5 # carry - midlaner - offlaner - roamer - support

# Distinct hero IDs per hero-ID column, keyed by the column's position in
# hero_id_labels. Series.unique() replaces a per-row Python loop, and the
# loop variable that shadowed the builtin `id` is gone; .tolist() keeps the
# set elements as plain Python scalars.
hero_id_set = {
    n: set(df[label].unique().tolist())
    for n, label in enumerate(hero_id_labels)
}

for i in range(len(hero_id_labels)):
    print(f"Number of unique hero IDs in column {i}:", len(hero_id_set[i]))

# Union of the per-column sets: every hero ID seen in any hero-ID column.
hero_id_set_tot = set().union(*hero_id_set.values())

print("Total number of unique hero IDs:", len(hero_id_set_tot))



Dataset shape: (39675, 246)
Columns: Index(['match_id_hash', 'game_time', 'game_mode', 'lobby_type',
       'objectives_len', 'chat_len', 'r1_hero_id', 'r1_kills', 'r1_deaths',
       'r1_assists',
       ...
       'd5_stuns', 'd5_creeps_stacked', 'd5_camps_stacked', 'd5_rune_pickups',
       'd5_firstblood_claimed', 'd5_teamfight_participation',
       'd5_towers_killed', 'd5_roshans_killed', 'd5_obs_placed',
       'd5_sen_placed'],
      dtype='object', length=246)
Unique game modes: [ 2  3  4  5 12 16 22 23]
Total number of game modes: 39675
Number of games with game mode 22: 31762
Number of games with game mode 2: 408
Number of games with game mode 22 and lobby type 7: 25795
Number of games with game mode 22 and lobby type 0: 5967
Number of games with game mode 2 and lobby type 7: 379
Number of games with game mode 2 and lobby type 0: 29
Number of games with game mode 2 and lobby type 0 (using &): 29
Number of games with game mode 2 and lobby type 0 (using query): 29
Single hero 

In [5]:
# Summary statistics for the columns listed in single_hero_labels.
df.loc[:, single_hero_labels].describe()

Unnamed: 0,d1_hero_id,d1_kills,d1_deaths,d1_assists,d1_denies,d1_gold,d1_lh,d1_xp,d1_health,d1_max_health,...,d1_stuns,d1_creeps_stacked,d1_camps_stacked,d1_rune_pickups,d1_firstblood_claimed,d1_teamfight_participation,d1_towers_killed,d1_roshans_killed,d1_obs_placed,d1_sen_placed
count,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0,...,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0,39675.0
mean,51.710347,3.168544,3.246881,4.700265,6.479219,7201.097064,65.435261,8352.204184,952.81903,1323.007007,...,11.635409,1.009175,0.332754,4.635766,0.089754,0.41846,0.318563,0.027498,1.197908,0.739761
std,34.344258,3.7738,3.238487,5.29888,8.416625,6562.240831,77.329993,7550.811402,690.678362,634.472253,...,20.177958,3.587444,0.956204,4.608758,0.285833,0.268381,0.750841,0.182473,2.507568,2.33081
min,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,320.0,...,-10.727492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21.0,0.0,1.0,1.0,1.0,2187.0,11.0,2291.5,531.0,840.0,...,0.0,0.0,0.0,1.0,0.0,0.241379,0.0,0.0,0.0,0.0
50%,46.0,2.0,2.0,3.0,3.0,5346.0,38.0,6015.0,848.0,1160.0,...,1.333057,0.0,0.0,3.0,0.0,0.444444,0.0,0.0,0.0,0.0
75%,81.0,5.0,5.0,7.0,9.0,10348.5,92.0,12525.0,1315.0,1655.0,...,15.859372,0.0,0.0,7.0,0.0,0.6,0.0,0.0,1.0,0.0
max,120.0,36.0,30.0,40.0,76.0,61189.0,930.0,30985.0,7510.0,7540.0,...,272.92606,138.0,26.0,50.0,1.0,2.0,7.0,4.0,23.0,43.0


In [9]:
# Load the training targets; the path is relative to the notebook's working directory.
target_path = "../dataset/mlcourse-dota2-win-prediction/train_targets.csv"

# Read the CSV and cast radiant_win to int in a single chained expression
# (presumably the column is boolean in the file — confirm).
target = pd.read_csv(target_path).assign(
    radiant_win=lambda frame: frame['radiant_win'].astype(int)
)

target.describe()

Unnamed: 0,game_time,radiant_win,duration,time_remaining
count,39675.0,39675.0,39675.0,39675.0
mean,1146.082798,0.524915,2328.413711,1182.330914
std,767.206621,0.499385,671.803393,770.779229
min,0.0,0.0,901.0,31.0
25%,521.0,0.0,1872.5,554.0
50%,1044.0,1.0,2270.0,1087.0
75%,1656.0,1.0,2746.0,1694.0
max,4933.0,1.0,5638.0,5108.0
