In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dota-2-matches/players.csv
/kaggle/input/dota-2-matches/ability_upgrades.csv
/kaggle/input/dota-2-matches/test_player.csv
/kaggle/input/dota-2-matches/match_outcomes.csv
/kaggle/input/dota-2-matches/teamfights_players.csv
/kaggle/input/dota-2-matches/chat.csv
/kaggle/input/dota-2-matches/purchase_log.csv
/kaggle/input/dota-2-matches/test_labels.csv
/kaggle/input/dota-2-matches/ability_ids.csv
/kaggle/input/dota-2-matches/item_ids.csv
/kaggle/input/dota-2-matches/player_time.csv
/kaggle/input/dota-2-matches/patch_dates.csv
/kaggle/input/dota-2-matches/yasp_sample.json
/kaggle/input/dota-2-matches/match.csv
/kaggle/input/dota-2-matches/player_ratings.csv
/kaggle/input/dota-2-matches/hero_names.csv
/kaggle/input/dota-2-matches/cluster_regions.csv
/kaggle/input/dota-2-matches/teamfights.csv
/kaggle/input/dota-2-matches/objectives.csv


The first question I want to answer is: can we predict the match outcome from nothing but the heroes picked by each team?

To start, we read in the data and combine the CSVs so that each row holds the information about a single match.

In [2]:
#Each match has one entry, in this case we are focused on match_id, and radiant_win only as our target.
#usecols makes pandas parse only the listed columns instead of loading the whole CSV and throwing most of it away.
#The trailing [columnsWeWant] only pins the column order to the order written here (usecols keeps the file's order).
columnsWeWant = ['match_id','radiant_win']
match = pd.read_csv('/kaggle/input/dota-2-matches/match.csv', usecols=columnsWeWant)[columnsWeWant]

#From the players file we only need hero_id, match_id and player_slot. Note that: Player_slot: 0-4 are Radiant, 128-132 are Dire
columnsWeWant = ['match_id','hero_id','player_slot']
players = pd.read_csv('/kaggle/input/dota-2-matches/players.csv', usecols=columnsWeWant)[columnsWeWant]

#Change from players_slot to boolean for radiant team or not.
#assign() returns a new frame, so we never write a column into a subset of another frame
#(that pattern can trigger SettingWithCopyWarning on pandas versions without copy-on-write).
players = players.assign(radiant=players['player_slot'] < 5)

#Groups by match id and radiant boolean, and for each will combine hero_id into a list
combinedHeroID = players.groupby(['match_id','radiant'])['hero_id'].apply(lambda heroes: heroes.tolist())

#We unstack so that we can get the team lists for each match_id into columns instead of a row for each match_id
combinedHeroID = combinedHeroID.unstack()

#We rename the columns to make more sense. Renaming by label (False = dire, True = radiant) does not depend on
#the order in which unstack happens to emit the columns; rename_axis drops the leftover 'radiant' columns name.
combinedHeroID = combinedHeroID.rename(columns={False: 'dire_heros', True: 'radiant_heros'}).rename_axis(columns=None)

#Here we merge the two DataFrames together on the match_id columns, and make the match_id the index
merged = combinedHeroID.merge(match, on='match_id')
merged = merged.set_index('match_id')

merged

Unnamed: 0_level_0,dire_heros,radiant_heros,radiant_win
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"[106, 102, 46, 7, 73]","[86, 51, 83, 11, 67]",True
1,"[73, 22, 5, 67, 106]","[7, 82, 71, 39, 21]",False
2,"[38, 7, 10, 12, 85]","[51, 109, 9, 41, 27]",False
3,"[78, 19, 31, 40, 47]","[50, 44, 32, 26, 39]",False
4,"[101, 100, 22, 67, 21]","[8, 39, 55, 87, 69]",True
...,...,...,...
49995,"[32, 7, 109, 35, 112]","[73, 86, 21, 20, 14]",True
49996,"[36, 1, 112, 60, 71]","[93, 74, 100, 32, 85]",True
49997,"[28, 102, 21, 9, 23]","[100, 68, 75, 39, 44]",True
49998,"[46, 7, 29, 44, 3]","[56, 50, 2, 72, 30]",True


In [3]:
#Each team's hero list is spread over five columns so that every hero slot becomes its own model feature
hero_slots = {
    "radiant_heros": ["r1", "r2", "r3", "r4", "r5"],
    "dire_heros": ["d1", "d2", "d3", "d4", "d5"],
}
for team_column, slot_columns in hero_slots.items():
    merged[slot_columns] = merged[team_column].tolist()

#The list columns are redundant once they have been expanded
merged = merged.drop(columns=list(hero_slots))
merged

Unnamed: 0_level_0,radiant_win,r1,r2,r3,r4,r5,d1,d2,d3,d4,d5
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,True,86,51,83,11,67,106,102,46,7,73
1,False,7,82,71,39,21,73,22,5,67,106
2,False,51,109,9,41,27,38,7,10,12,85
3,False,50,44,32,26,39,78,19,31,40,47
4,True,8,39,55,87,69,101,100,22,67,21
...,...,...,...,...,...,...,...,...,...,...,...
49995,True,73,86,21,20,14,32,7,109,35,112
49996,True,93,74,100,32,85,36,1,112,60,71
49997,True,100,68,75,39,44,28,102,21,9,23
49998,True,56,50,2,72,30,46,7,29,44,3


In [4]:
#Baseline Machine Learning Model: a Random Forest that predicts radiant_win from the ten hero-slot columns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

#Features: the five radiant hero slots followed by the five dire hero slots
data_features = ["r1", "r2", "r3", "r4", "r5","d1", "d2", "d3", "d4", "d5"]
X = merged[data_features]

#Prediction target
y = merged["radiant_win"]

#Hold part of the matches back for validation; the fixed seed keeps the split reproducible
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

#Build the classifier with a fixed seed and fit it on the training part only
forest_model = RandomForestClassifier(random_state=1)
forest_model.fit(train_X, train_y)

#Score the model on the matches it has not seen during training
preds = forest_model.predict(val_X)
accuracy = metrics.accuracy_score(val_y, preds)
print("Accuracy of the model is: ", accuracy)

Accuracy of the model is:  0.51392


In [5]:
#Sanity check on one real game taken from OPENDOTA: https://www.opendota.com/  The game has to come from the same patches as the training data
#https://www.opendota.com/matches/17955123, In this game Radiant are the winners
radiant_picks = [31, 32, 29, 39, 52]
dire_picks = [9, 79, 28, 21, 33]

#One row, laid out in the same column order the model was trained on
random_game = pd.DataFrame([radiant_picks + dire_picks], columns=data_features)
print(random_game)

prediction = forest_model.predict(random_game)
print(prediction)

   r1  r2  r3  r4  r5  d1  d2  d3  d4  d5
0  31  32  29  39  52   9  79  28  21  33
[ True]


To test our Random Forest model we looked up a random match from the same patch. The model predicts that Radiant will win, and in this match Radiant did win.

However, the accuracy score shows that the model is correct only 51.392% of the time on the validation data, which is barely better than a coin flip.

There are a few ways we can improve our model. Let's try some of them below!

In [6]:
#One of the options we have to make our model better is to make use of more variables. We can use player_ratings, purchase log, ability_upgrade, tower and barrack statuses. Gpm, Xpm, Kills and Deaths.

#Each match has one entry, in this case we are focused on match_id, and radiant_win only as our target.
#usecols makes pandas parse only the listed columns instead of loading the whole CSV and throwing most of it away.
#The trailing [columnsWeWant] only pins the column order to the order written here (usecols keeps the file's order).
columnsWeWant = ['match_id','radiant_win']
match = pd.read_csv('/kaggle/input/dota-2-matches/match.csv', usecols=columnsWeWant)[columnsWeWant]

#Besides hero_id, match_id and player_slot we now also keep the per-player stats gold_per_min, xp_per_min, kills and deaths.
#Note that: Player_slot: 0-4 are Radiant, 128-132 are Dire
columnsWeWant = ['match_id','hero_id','player_slot','gold_per_min','xp_per_min','kills','deaths']
players = pd.read_csv('/kaggle/input/dota-2-matches/players.csv', usecols=columnsWeWant)[columnsWeWant]
players

Unnamed: 0,match_id,hero_id,player_slot,gold_per_min,xp_per_min,kills,deaths
0,0,86,0,347,362,9,3
1,0,51,1,494,659,13,3
2,0,83,2,350,385,0,4
3,0,11,3,599,605,8,4
4,0,67,4,613,762,20,3
...,...,...,...,...,...,...,...
499995,49999,100,128,468,626,16,9
499996,49999,9,129,507,607,12,6
499997,49999,90,130,371,404,5,3
499998,49999,73,131,780,703,8,6
