In [31]:
import numpy as np
import pandas as pd
import time
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn import grid_search
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

In [32]:
def train_classifier(clf, X_train, y_train):
    print "Training {}...".format(clf.__class__.__name__)
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print "Done!\nTraining time (secs): {:.3f}".format(end - start)

In [33]:
def predict_labels(clf, features, target):
    print "Predicting labels using {}...".format(clf.__class__.__name__)
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    print "Done!\nPrediction time (secs): {:.3f}".format(end - start)
    print "Confusion Matrix:\n {}".format(confusion_matrix(target, y_pred))
    return f1_score(target.values, y_pred, pos_label=100)

In [34]:
# Ready csv with game data
game_data = pd.read_csv('gameData_15Minutes.csv',keep_default_na=False)

In [35]:
# Get the Accuracy for the Naive Approach (the team with more gold wins)
def naive_accuracy(X,y):    
    blue = len(X[X['blueGold']>X['redGold']][y==100])
    red  = len(X[X['blueGold']<X['redGold']][y==200])
    games = len(X)

    print 'Naive Accuracy: {:.2f}%'.format(float(blue+red)/games*100)

In [36]:
# Data Set for learning
X_all = game_data[game_data.columns[-28:]]
y_all = game_data['winner']

# Normalize Data 
#X_all = preprocessing.normalize(X_all, norm='l2')

# Separate data into train/test samples
num_all = len(game_data)
num_train = int(num_all*0.75)  # 75% of the data as train set
num_test = num_all - num_train
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_all,y_all,test_size=num_test,random_state=0)
print "Training set: {} samples".format(X_train.shape[0])
print "Test set: {} samples".format(X_test.shape[0])
print X_all.columns

Training set: 64369 samples
Test set: 21457 samples
Index([u'blueGold', u'redGold', u'blueDragons', u'blueRiftHerald',
       u'redDragons', u'redRiftHerald', u'blueWards', u'redWards',
       u'blueDestroyedWards', u'redDestroyedWards', u'blueTopTurrets',
       u'blueMidTurrets', u'blueBotTurrets', u'redTopTurrets',
       u'redMidTurrets', u'redBotTurrets', u'blueInhibitor', u'redInhibitor',
       u'blueKills', u'blueAssists', u'redKills', u'redAssists', u'blueCs',
       u'blueJg', u'blueXp', u'redCs', u'redJg', u'redXp'],
      dtype='object')


In [37]:
# Train Classifier
clf = RandomForestClassifier(n_estimators=1000, max_features=0.5, max_depth=5)
train_classifier(clf, X_train, y_train)  
print clf

# Evaluate Classifier's performance on Train Data
train_f1_score = predict_labels(clf, X_train, y_train)
print "F1 score for training set: {}".format(train_f1_score)

# Evaluate Classifier's performance on Test Data
print "F1 score for test set: {}".format(predict_labels(clf, X_test, y_test))

Training RandomForestClassifier...
Done!
Training time (secs): 102.727
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=0.5, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 4.992
Confusion Matrix:
 [[24292  7711]
 [ 7663 24703]]
F1 score for training set: 0.759623502924
Predicting labels using RandomForestClassifier...
Done!
Prediction time (secs): 1.726
Confusion Matrix:
 [[7934 2704]
 [2615 8204]]
F1 score for test set: 0.748949827725


In [38]:
naive_accuracy(X_test,y_test)

Naive Accuracy: 75.12%


In [42]:
print game_data[game_data['blueGold']>game_data['redGold']][game_data['winner']==200]

          matchId  matchVersion region                      queueType  \
12     2090631173     6.2.0.247     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
50     2103760187     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
52     2104346549     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
63     2104881738     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
67     2105710124     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
71     2105840008     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
108    2109788545     6.4.0.249     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
133    2111230700     6.4.0.257     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
137    2111380022     6.4.0.257     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
145    2111977774     6.4.0.257     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
151    2112041510     6.4.0.257     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
173    2112659213     6.4.0.257     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
182    2113568612     6.4.0.259     NA  TEAM_BUILDE

In [43]:
print game_data[game_data['blueGold']<game_data['redGold']][game_data['winner']==100]

          matchId  matchVersion region                      queueType  \
1      2082080642     6.2.0.238     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
4      2085919125     6.2.0.238     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
5      2085984147     6.2.0.238     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
13     2090933974     6.2.0.247     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
29     2098790673     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
34     2101437978     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
35     2101677342     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
39     2102180425     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
42     2102273654     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
57     2104518286     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
64     2104886590     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
68     2105769741     6.3.0.240     NA  TEAM_BUILDER_DRAFT_RANKED_5x5   
70     2105795120     6.3.0.240     NA  TEAM_BUILDE

In [6]:
parameters = {'n_estimators': (100, 500, 1000),
              'max_features': (0.3, 0.4, 0.5, 0.6),
              'max_depth': (5, 6, 7, 8, 9, 10)}
f1 = make_scorer(f1_score, pos_label=100)
clf = RandomForestClassifier()

reg = grid_search.GridSearchCV(clf, parameters, scoring=f1)
reg.fit(X_train,y_train)

print "Best Estimator:"
print reg.best_estimator_
print "F1 score for test sample:"
predict_labels(reg, X_test, y_test)

Best Estimator:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=0.3, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
F1 score for test sample:
Predicting labels using GridSearchCV...
Done!
Prediction time (secs): 1.235
Confusion Matrix:
 [[6224 2181]
 [2083 6439]]


0.74485399712781231