In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split

In [2]:
# Load the match table; the first CSV column becomes the row index.
dataset = pd.read_csv('dota_games.csv', index_col=0)

In [3]:
# Count how many columns there are of each dtype.
dataset.dtypes.value_counts()

float64    221
object       2
int64        1
dtype: int64

### Duplicate Radiant/Dire games

In [4]:
def radient_win_pc(extract):
    """Return the fraction of decided games in ``extract`` that Radiant won.

    Parameters
    ----------
    extract : pandas.DataFrame
        Games to summarise. Must contain a ``'winner'`` column; only the
        labels ``'Radiant Victory'`` and ``'Dire Victory'`` are counted.

    Returns
    -------
    float
        Radiant wins / (Radiant wins + Dire wins). NaN when ``extract``
        contains neither label (e.g. an empty selection).
    """
    counts = extract['winner'].value_counts()
    # value_counts() leaves out labels that never occur, so plain indexing
    # would raise KeyError on a subset where one side never won.
    radiant_wins = counts.get('Radiant Victory', 0)
    dire_wins = counts.get('Dire Victory', 0)
    decided = radiant_wins + dire_wins
    if decided == 0:
        # No decided games: the ratio is undefined.
        return float('nan')
    return (1.0 * radiant_wins) / decided

In [5]:
# Class balance of the target column.
dataset['winner'].value_counts()

Radiant Victory    3788
Dire Victory       2959
dtype: int64

In [6]:
# Share of all games in the table that Radiant won.
radient_win_pc(dataset)

0.561434711723729

In [7]:
# Work on a copy so that the original table is left untouched while the sides are swapped.
data_temp = dataset.copy()
def remap_col_name(col):
    """Swap the team prefix of a column name.

    A name starting with ``'radient-'`` gets that prefix replaced by
    ``'dire-'`` and vice versa; any other name is returned unchanged.

    Only the leading prefix is rewritten. Later occurrences of either token
    inside the name are left alone, so applying the function twice always
    gives back the original name.

    Parameters
    ----------
    col : str
        Column name.

    Returns
    -------
    str
        The column name with its team prefix swapped.
    """
    radiant_prefix = 'radient-'
    dire_prefix = 'dire-'
    if col.startswith(radiant_prefix):
        return dire_prefix + col[len(radiant_prefix):]
    if col.startswith(dire_prefix):
        return radiant_prefix + col[len(dire_prefix):]
    return col
        
# Swap the team prefix on every column of the copy.
data_temp.columns = list(map(remap_col_name, data_temp.columns))
# Flip the outcome labels. Both masks are computed from the untouched `dataset`,
# so rows flipped by the first assignment are not flipped back by the second.
data_temp.loc[dataset['winner'].eq('Radiant Victory'), 'winner'] = 'Dire Victory'
data_temp.loc[dataset['winner'].eq('Dire Victory'), 'winner'] = 'Radiant Victory'
# Augmentation is currently disabled; enabling the line below would append the mirrored games.
# dataset = pd.concat([dataset, data_temp])

In [8]:
# Shuffle the rows with a fixed seed so that Restart & Run All reproduces the
# same row order (and therefore the same downstream splits and scores).
dataset = dataset.take(np.random.RandomState(0).permutation(len(dataset)))

In [9]:
# Target: the match outcome label.
y = dataset['winner']
# Features: every remaining column except the target and three columns excluded from modelling.
X = dataset.drop(labels=['winner', 'duration', 'match_id', 'game_mode'], axis=1)

## MultinomialNB

In [10]:
# Single-step pipeline: multinomial naive Bayes on the feature matrix as-is.
pipe = Pipeline(steps=[('classifier', MultinomialNB())])

# Mean of the 10 cross-validation fold scores.
scores = cross_val_score(pipe, X, y, cv=10)
scores.mean()

0.68074741423669338

## MultinomialNB + tf-idf

In [11]:
# Same naive Bayes classifier, but with a tf-idf re-weighting step in front of it.
pipe = Pipeline(steps=[
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

# Mean of the 10 cross-validation fold scores.
scores = cross_val_score(pipe, X, y, cv=10)
scores.mean()

0.66444671858687676

## Logistic Regression

In [12]:
# Single-step pipeline: L1-penalised logistic regression.
pipe = Pipeline(steps=[('classifier', LogisticRegression(penalty='l1'))])

# Mean of the 10 cross-validation fold scores.
scores = cross_val_score(pipe, X, y, cv=10)
scores.mean()

0.68268016324312375

## Logistic Regression + tf-idf

In [13]:
# L1-penalised logistic regression preceded by a tf-idf re-weighting step.
pipe = Pipeline(steps=[
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression(penalty='l1')),
])

# Mean of the 10 cross-validation fold scores.
scores = cross_val_score(pipe, X, y, cv=10)
scores.mean()

0.68327209576938641

In [14]:
# Fit a default logistic regression on the full data set (no hold-out) to inspect its weights.
est = LogisticRegression().fit(X, y)

# Feature columns whose coefficient magnitude is greater than 1.
X.columns[(np.abs(est.coef_) > 1).ravel()]

Index([u'dire-alchemist', u'dire-oracle', u'dire-undying'], dtype='object')

In [15]:
# Radiant win share restricted to games where the 'radient-alchemist' column equals 1.
radient_win_pc(dataset[dataset['radient-alchemist'] == 1])

0.36622807017543857

# Neural Net 

In [25]:
import neurolab as nl

# Feed-forward net with one [min, max] input range per feature column, a
# 50-unit hidden layer and a single output unit. The input size is taken from
# X instead of being hard-coded, so it follows any change to the feature set.
# NOTE(review): the [0, 1] range assumes every feature lies in [0, 1] — confirm.
net = nl.net.newff([[0, 1] for i in range(X.shape[1])], [50, 1])

# Encode the string labels as a float column vector (1.0 = Radiant win,
# 0.0 = anything else). A new variable is used so `y` is not overwritten and
# this cell does not depend on having been run in a particular order.
y_binary = (y == 'Radiant Victory').values.astype(float).reshape(-1, 1)

# Hold out 33% of the games; the seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.33, random_state=0)

inp = x_train
tar = y_train

# Train with resilient backpropagation, reporting the error every 10 epochs.
net.trainf = nl.train.train_rprop
errors = net.train(inp, tar, show=10, epochs=100)

Epoch: 10; Error: 517.790049846;
Epoch: 20; Error: 386.086442785;
Epoch: 30; Error: 280.185580333;
Epoch: 40; Error: 206.560989262;
Epoch: 50; Error: 157.4634889;
Epoch: 60; Error: 127.825080276;
Epoch: 70; Error: 103.2622968;
Epoch: 80; Error: 83.8758258071;
Epoch: 90; Error: 68.3196535895;
Epoch: 100; Error: 60.5778123324;
The maximum number of train epochs is reached


In [20]:
# Show the value returned by net.train(...) in the training cell.
errors

[671.32352191382915,
 998.47962088285522,
 708.08753693357426,
 4775.2138788289631,
 805.42769125581003,
 931.92992964994892,
 655.05491737839111,
 1126.218003301009,
 532.26609802086762,
 672.82686231055618,
 539.31440223005416,
 546.68388628338812,
 500.52704694185303,
 486.86236030083717,
 472.8416688165421,
 464.20793102110258,
 450.21297803398409,
 440.15380076229707,
 430.00388190887998,
 419.90501578800581,
 410.08609064815744,
 400.60377554076393,
 391.33630030352458,
 382.20994295524582,
 374.3501220959796,
 366.48006625362746,
 356.50234544018468,
 345.4269481044098,
 333.02482681996571,
 319.27638984689747,
 304.93418626665084,
 293.87369441540272,
 285.10284068665749,
 274.749817849343,
 265.2304833584144,
 255.89121433245663,
 246.7618493804992,
 237.93475131908113,
 229.54107836374538,
 222.97569788185871,
 217.35226066038732,
 211.7644000326581,
 206.15224428044456,
 200.65728491446734,
 195.07320608936203,
 189.62189233822539,
 184.1223839685488,
 179.42596778642493,
 1

In [26]:
# Author's note, presumably from an earlier run (TODO confirm): 586.94683040374218 error = 0.74 accuracy

from sklearn.metrics import accuracy_score
# Run the trained network on the held-out inputs.
y_pred = net.sim(x_test)
# Binarise the outputs in place at a 0.5 threshold: below -> 0, at or above -> 1.
y_pred[y_pred < 0.5] = 0
y_pred[y_pred >= 0.5] = 1

# Score the thresholded predictions against the held-out targets.
accuracy_score(y_test, y_pred)

0.64571171980242481