In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the games table from a CSV in the working directory; the first CSV
# column becomes the DataFrame index.
dataset = pd.read_csv('dota_games.csv', index_col=0)

In [3]:
# Count how many columns there are of each dtype.
dataset.dtypes.value_counts()

float64    221
object       2
int64        1
dtype: int64

### Duplicate Radiant/Dire games

In [4]:
def radient_win_pc(extract):
    """Return the fraction of games in ``extract`` that Radiant won.

    Parameters
    ----------
    extract : pandas.DataFrame
        Any subset of the games table; it must have a ``'winner'`` column
        whose values are ``'Radiant Victory'`` or ``'Dire Victory'``.

    Returns
    -------
    float
        Radiant wins divided by (Radiant wins + Dire wins). Rows with any
        other ``winner`` value are ignored. ``nan`` is returned when the
        subset contains no decided games at all.
    """
    counts = extract['winner'].value_counts()
    # A filtered subset may contain only one outcome (or none); a missing
    # label counts as zero games instead of raising KeyError.
    radiant_wins = counts.get('Radiant Victory', 0)
    dire_wins = counts.get('Dire Victory', 0)
    total = radiant_wins + dire_wins
    if total == 0:
        return float('nan')
    return (1.0 * radiant_wins) / total

In [5]:
# Number of games per outcome label.
dataset['winner'].value_counts()

Radiant Victory    3788
Dire Victory       2959
dtype: int64

In [6]:
# Radiant win fraction over the whole table.
radient_win_pc(dataset)

0.561434711723729

In [7]:
# Work on a copy so that the edits made to data_temp leave `dataset` untouched.
data_temp = dataset.copy()
def remap_col_name(col):
    """Swap the team prefix of a column name.

    A name starting with ``'radient-'`` gets the prefix ``'dire-'`` instead
    and vice versa; any other name is returned unchanged. The spelling
    ``'radient-'`` is the one used by the dataset's column names.

    Only the leading prefix is rewritten, so later occurrences of either
    token inside the name are preserved and applying the function twice
    always returns the original name.

    Parameters
    ----------
    col : str
        Column name.

    Returns
    -------
    str
        The column name with its team prefix swapped.
    """
    radiant_prefix = 'radient-'
    dire_prefix = 'dire-'
    if col.startswith(radiant_prefix):
        return dire_prefix + col[len(radiant_prefix):]
    if col.startswith(dire_prefix):
        return radiant_prefix + col[len(dire_prefix):]
    return col
        
# Rename every column of the copy through remap_col_name.
data_temp.columns = list(map(remap_col_name, data_temp.columns))

# Flip the outcome label. Both masks are taken from the untouched `dataset`,
# so the first assignment cannot influence the second.
radiant_won = dataset['winner'] == 'Radiant Victory'
dire_won = dataset['winner'] == 'Dire Victory'
data_temp.loc[radiant_won, 'winner'] = 'Dire Victory'
data_temp.loc[dire_won, 'winner'] = 'Radiant Victory'

# Left disabled in the notebook: appending the mirrored copy to the data.
# dataset = pd.concat([dataset, data_temp])

In [8]:
# Shuffle the rows. A dedicated, seeded RandomState keeps the row order (and
# therefore every cross-validation split computed afterwards) identical across
# Restart & Run All, without touching NumPy's global random state.
dataset = dataset.take(np.random.RandomState(42).permutation(len(dataset)))

In [9]:
# Columns excluded from the model inputs: the label itself plus three others.
non_feature_cols = ['winner', 'duration', 'match_id', 'game_mode']

X = dataset.drop(non_feature_cols, axis=1)  # feature matrix
y = dataset['winner']                       # label to predict

## MultinomialNB

In [10]:
# Multinomial naive Bayes on the unmodified features, wrapped in a one-step
# pipeline and evaluated with 10-fold cross-validation.
nb_steps = [('classifier', MultinomialNB())]
pipe = Pipeline(nb_steps)

scores = cross_val_score(pipe, X, y, cv=10)
np.mean(scores)

0.67778951395961906

## MultinomialNB + tf-idf

In [11]:
# Same classifier as the plain MultinomialNB run, but with a tf-idf
# re-weighting step in front of it; evaluated with 10-fold cross-validation.
tfidf_nb_steps = [
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
]
pipe = Pipeline(tfidf_nb_steps)

scores = cross_val_score(pipe, X, y, cv=10)
np.mean(scores)

0.65881863281352715

## Logistic Regression

In [12]:
# L1-regularised logistic regression, evaluated with 10-fold cross-validation.
# The solver is named explicitly: liblinear supports the L1 penalty, whereas
# newer scikit-learn releases default to a solver (lbfgs) that rejects it, so
# relying on the implicit default makes this cell version-dependent.
pipe = Pipeline([
    ('classifier', LogisticRegression(penalty='l1', solver='liblinear'))
])

scores = cross_val_score(pipe, X, y, cv = 10)
scores.mean()

0.68001283781654243

## Logistic Regression + tf-idf

In [13]:
# tf-idf re-weighting followed by L1-regularised logistic regression,
# evaluated with 10-fold cross-validation.
# The solver is named explicitly: liblinear supports the L1 penalty, whereas
# newer scikit-learn releases default to a solver (lbfgs) that rejects it, so
# relying on the implicit default makes this cell version-dependent.
pipe = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression(penalty='l1', solver='liblinear'))
])

scores = cross_val_score(pipe, X, y, cv = 10)
scores.mean()

0.67986402698923365

In [14]:
# Fit a default LogisticRegression on all rows, then list the feature columns
# whose coefficient has absolute value greater than 1.
est = LogisticRegression()
est.fit(X, y)

strong_coef_mask = np.ravel(np.abs(est.coef_) > 1)
X.columns[strong_coef_mask]

Index([u'dire-alchemist', u'dire-oracle', u'dire-undying'], dtype='object')

In [15]:
# Radiant win fraction restricted to rows whose 'radient-alchemist' column is 1.
has_radient_alchemist = dataset['radient-alchemist'] == 1
radient_win_pc(dataset[has_radient_alchemist])

0.36622807017543857

# Neural Net 

In [16]:
# (rows, feature columns) of the feature matrix.
X.shape

(6747, 220)

In [None]:
import neurolab as nl

# Feed-forward network with one hidden layer of 50 units and a single output.
# One [min, max] input range is declared per feature column; the count is
# taken from X so it cannot drift out of sync with the feature matrix.
# NOTE(review): the [0, 1] range assumes every feature lies in that interval - confirm.
n_features = X.shape[1]
net = nl.net.newff([[0, 1] for _ in range(n_features)], [50, 1])

inp = X
# Column vector that is True where Radiant won. Going through `.values`
# reshapes the underlying NumPy array; Series.reshape itself is not available
# in later pandas releases.
tar = (y == 'Radiant Victory').values.reshape(-1, 1)

# Train with neurolab's train_gdm for 20 epochs, reporting every 4th epoch;
# `errors` receives the value returned by net.train.
net.trainf = nl.train.train_gdm
errors = net.train(inp, tar, show=4, epochs=20)

In [34]:
# Display the value returned by net.train.
errors

[8443.2836798030294,
 1404.131975983194,
 2478.5463575824788,
 1087.5618839076233,
 1043.6299569614707,
 942.46116195418551,
 887.33659611239568,
 836.04904279246239,
 779.85273928059246,
 730.01892392098955,
 691.41089630726503,
 670.59483560601541,
 658.06995476956058,
 644.38245650460999,
 633.39036062854757,
 624.10292647093115,
 615.08657819875339,
 605.68933036477199,
 596.97737479982447,
 586.94683040374218]

In [61]:
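# A final training error of 586.94683040374218 corresponds to ~0.74 accuracy.
# This accuracy is measured on the same rows the network was trained on (no
# held-out data), so it is not directly comparable to the cross-validated
# scores of the other models.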

from sklearn.metrics import accuracy_score
# Run the network on X and binarise its outputs at 0.5. Both masks are computed
# before any value is overwritten.
y_pred = net.sim(X)
below_half = y_pred < 0.5
at_least_half = y_pred >= 0.5
y_pred[below_half] = 0
y_pred[at_least_half] = 1

# Share of rows where the binarised output equals the target.
accuracy_score(tar, y_pred)

0.74447902771602192