# Baseline Model

Before we make our real model we'd first like to create a baseline model. This model should be very simple and provide us with a baseline. If our real model does worth than this we'll know something is very wrong.

In [1]:
import os
import json
import time
import datetime
import requests
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("training_set_small.csv")
df

Unnamed: 0,hero0,hero1,hero2,hero3,hero4,hero5,hero6,hero7,hero8,hero9,radiant_win
0,62,14,106,28,11,10,85,112,2,48,True
1,27,129,48,9,36,8,62,1,126,120,True
2,14,105,1,88,22,89,87,129,39,59,False
3,30,22,35,26,96,58,120,7,21,11,True
4,52,54,123,81,25,107,26,8,96,6,False
...,...,...,...,...,...,...,...,...,...,...,...
10090,114,27,74,17,13,34,18,44,5,101,True
10091,69,128,31,35,1,93,106,2,26,87,True
10092,54,40,26,96,14,31,86,2,63,46,True
10093,44,22,54,13,84,119,8,26,49,6,True


In [3]:
# Create train/valid set
train = df.iloc[:9000]
valid = df.iloc[9000:]

len(train), len(valid)

(9000, 1095)

In [4]:
min_hero = 1
max_hero = 135

num_games = [0 for _ in range(max_hero + 1)]
num_wins = [0 for _ in range(max_hero + 1)]

In [5]:
# Keep track of the numbers of games played for each hero and the number of times that hero won.
for i, row in train.iterrows():
    num_games[row['hero0']] += 1
    num_games[row['hero1']] += 1
    num_games[row['hero2']] += 1
    num_games[row['hero3']] += 1
    num_games[row['hero4']] += 1
    num_games[row['hero5']] += 1
    num_games[row['hero6']] += 1
    num_games[row['hero7']] += 1
    num_games[row['hero8']] += 1
    num_games[row['hero9']] += 1
    
    if row['radiant_win'] == True:
        num_wins[row['hero0']] += 1
        num_wins[row['hero1']] += 1
        num_wins[row['hero2']] += 1
        num_wins[row['hero3']] += 1
        num_wins[row['hero4']] += 1
    else:
        num_wins[row['hero5']] += 1
        num_wins[row['hero6']] += 1
        num_wins[row['hero7']] += 1
        num_wins[row['hero8']] += 1
        num_wins[row['hero9']] += 1

In [6]:
# Now create a lookup for each hero id that contains:
# - number of wins
# - number of games
# - win rate
lookup = pd.DataFrame(list(zip(num_wins, num_games)), 
                      columns=["num_wins", "num_games"], 
                      index=np.arange(max_hero + 1))
eps = 1e-6 # Avoid divide by zero.
lookup['win_rate'] = lookup['num_wins'] / (lookup['num_games'] + eps)

# For any heros we've never seen before, we'll assume a 50% win rate.
lookup[lookup['win_rate'] == 0.] = 0.5
lookup

Unnamed: 0,num_wins,num_games,win_rate
0,0.5,0.5,0.500000
1,513.0,1038.0,0.494220
2,756.0,1392.0,0.543103
3,84.0,208.0,0.403846
4,278.0,523.0,0.531549
...,...,...,...
131,0.5,0.5,0.500000
132,0.5,0.5,0.500000
133,0.5,0.5,0.500000
134,0.5,0.5,0.500000


Now that we have this lookup for each hero, we can generate predictions for new games (`valid`) based on which team has the highest aggregate win-rate.

In [7]:
valid

Unnamed: 0,hero0,hero1,hero2,hero3,hero4,hero5,hero6,hero7,hero8,hero9,radiant_win
9000,67,88,40,38,68,29,93,5,13,86,True
9001,64,114,19,90,10,97,71,106,93,30,False
9002,27,44,111,35,109,48,15,86,65,112,False
9003,85,21,35,19,98,93,15,27,62,59,True
9004,11,99,21,61,41,126,5,46,104,3,False
...,...,...,...,...,...,...,...,...,...,...,...
10090,114,27,74,17,13,34,18,44,5,101,True
10091,69,128,31,35,1,93,106,2,26,87,True
10092,54,40,26,96,14,31,86,2,63,46,True
10093,44,22,54,13,84,119,8,26,49,6,True


In [8]:
preds = []

for i, row in valid.iterrows():
    
    p_rad = ((
    lookup.iloc[row['hero0']]['win_rate'] + 
    lookup.iloc[row['hero1']]['win_rate'] + 
    lookup.iloc[row['hero2']]['win_rate'] + 
    lookup.iloc[row['hero3']]['win_rate'] + 
    lookup.iloc[row['hero4']]['win_rate']) / 5)
    
    p_dire = ((
    lookup.iloc[row['hero5']]['win_rate'] + 
    lookup.iloc[row['hero6']]['win_rate'] + 
    lookup.iloc[row['hero7']]['win_rate'] + 
    lookup.iloc[row['hero8']]['win_rate'] + 
    lookup.iloc[row['hero9']]['win_rate']) / 5)
    
    preds.append(p_rad >= p_dire)

In [9]:
valid['pred'] = preds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
valid['correct'] = valid['pred'] == valid['radiant_win']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
valid['correct'].value_counts()

True     576
False    519
Name: correct, dtype: int64

In [19]:
counts = valid['correct'].value_counts()
counts[True] / len(valid)

0.5260273972602739

We do slightly better than random!