In [17]:
import pandas as pd
import xgboost as xgb
import numpy as np
from numpy import logspace, zeros, hstack
from datetime import datetime
from sklearn.preprocessing import scale
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression

In [2]:
# Columns listed here are removed from the training frame together with
# 'start_time' before any modelling.
result_features = [
    'duration',
    'tower_status_radiant',
    'tower_status_dire',
    'barracks_status_radiant',
    'barracks_status_dire',
]
# Rows are indexed by match_id; the drop is done in one chained call.
features = (
    pd.read_csv('data/features.csv', index_col='match_id')
    .drop(['start_time'] + result_features, axis=1)
)

In [3]:
# A column has gaps when its non-null count is smaller than the row count.
feature_counts = features.count()
na_features = feature_counts[feature_counts != features.shape[0]].index.values
# Assign the filled columns back to the frame. Calling fillna(inplace=True) on
# a column selection is chained assignment: it is deprecated and can leave the
# parent frame untouched (always the case under pandas Copy-on-Write).
if len(na_features):
    features[list(na_features)] = features[list(na_features)].fillna(0)
# List the columns that had missing values.
for name in na_features:
    print(name)

first_blood_time
first_blood_team
first_blood_player1
first_blood_player2
radiant_bottle_time
radiant_courier_time
radiant_flying_courier_time
radiant_first_ward_time
dire_bottle_time
dire_courier_time
dire_flying_courier_time
dire_first_ward_time


In [4]:
# pop() returns the 'radiant_win' column and removes it from `features` in
# place, so X (the same object as `features`) no longer contains the target.
Y = features.pop('radiant_win')
X = features

In [5]:
# 5-fold shuffled split over all rows. The fixed random_state makes the
# shuffle, and therefore every cross-validated score below, repeatable
# across kernel restarts.
folds = KFold(features.shape[0], n_folds=5, shuffle=True, random_state=42)

In [6]:
# Ensemble sizes to evaluate: 10, 20, ..., 100 trees.
tree_counts = list(range(10, 101, 10))

def task_1(X, Y, counts, cv=None):
    """Cross-validate gradient boosting for several ensemble sizes.

    For each value in `counts`, an XGBClassifier with that many estimators is
    evaluated with cross_val_predict. The tree count, the elapsed wall-clock
    time and the ROC AUC of the out-of-fold probabilities are printed.

    Parameters
    ----------
    X : feature matrix passed to cross_val_predict.
    Y : target vector passed to cross_val_predict and roc_auc_score.
    counts : iterable of int
        Values of `n_estimators` to try.
    cv : optional
        Cross-validation splitter. When omitted, the notebook-level `folds`
        object is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    None; results are printed.
    """
    if cv is None:
        cv = folds

    class xgb_wrapper(xgb.XGBClassifier):
        # cross_val_predict collects whatever predict() returns, so predict
        # is redirected to predict_proba to obtain class probabilities.
        def predict(self, X):
            return xgb.XGBClassifier.predict_proba(self, X)

    for count in counts:
        start_time = datetime.now()
        probas = cross_val_predict(xgb_wrapper(n_estimators=count), X, Y, cv)
        # Column 1 holds the probability of the second class.
        score = roc_auc_score(Y, probas[:, 1])
        elapsed = datetime.now() - start_time
        print('trees: {}, time: {}, score: {}'.format(count, elapsed, score))

In [None]:
# Run the boosting sweep over every ensemble size in tree_counts.
task_1(X, Y, tree_counts)

In [134]:
# Repeat the sweep with larger ensembles of 150 and 200 trees.
task_1(X, Y, [150, 200])

trees: 150, time: 0:00:58.905639, score: 0.7106615238047015
trees: 200, time: 0:01:14.201833, score: 0.7137183105431796


In [7]:
# Hero-pick column names: r1_hero..r5_hero followed by d1_hero..d5_hero.
hero_features = ['%s%d_hero' % (side, slot) for side in 'rd' for slot in range(1, 6)]

In [144]:
def uniq_heroes(X):
    """Return the distinct values found in the hero columns of `X`.

    The columns scanned are those named in the notebook-level
    `hero_features` list. The result is a list with no duplicates.
    """
    seen = set()
    for column in hero_features:
        # update() walks the column and adds each value to the set.
        seen.update(X[column])
    return list(seen)
# Number of distinct hero ids across the hero columns of X.
len(uniq_heroes(X))

108

In [11]:
def task_2(X, Y, folds):
    """Sweep the L2 regularization strength of a logistic regression.

    `X` is standardized with `scale`. Then, for five values of C spaced
    logarithmically between 1e-3 and 1, out-of-fold probabilities are
    obtained with cross_val_predict on `folds` and scored with ROC AUC.
    One line is printed per C, followed by a summary line for the C with
    the highest score (the earliest one wins ties).

    Returns None; all results are printed.
    """
    class lr_wrapper(LogisticRegression):
        # cross_val_predict collects predict() output, so predict is
        # redirected to predict_proba to obtain class probabilities.
        def predict(self, X):
            return LogisticRegression.predict_proba(self, X)

    X_scaled = scale(X)

    top_c, top_score, top_time = 0, -1, 0
    for c in logspace(-3, 0, num=5):
        started = datetime.now()
        model = lr_wrapper(penalty='l2', C=c)
        probas = cross_val_predict(model, X_scaled, Y, folds)
        score = roc_auc_score(Y, probas[:, 1])
        elapsed = datetime.now() - started

        print('C: {}, time: {}, score: {}'.format(c, elapsed, score))

        # Strict comparison: a later C replaces the best only if it scores higher.
        if score > top_score:
            top_c, top_score, top_time = c, score, elapsed

    print("best C:{}, time: {}, score: {}".format(top_c, top_time, top_score))

In [37]:
# Logistic-regression sweep on the full frame X, which still contains
# lobby_type and the hero id columns as plain numbers.
task_2(X, Y, folds)

C: 0.001, time: 0:00:06.956681, score: 0.7160242837290827
C: 0.005623413251903491, time: 0:00:09.769686, score: 0.7162321338171613
C: 0.03162277660168379, time: 0:00:11.032106, score: 0.7162128755551335
C: 0.1778279410038923, time: 0:00:10.941017, score: 0.716205439687345
C: 1.0, time: 0:00:10.848965, score: 0.716203595761612
best C:0.005623413251903491, time: 0:00:09.769686, score: 0.7162321338171613


In [38]:
# Two derived frames; `features` itself is left unchanged:
#   featires_no_lobby    - everything except 'lobby_type'
#   features_drop_heroes - additionally without the hero id columns
featires_no_lobby = features.drop(['lobby_type'], axis=1)
features_drop_heroes = features.drop(['lobby_type'] + hero_features, axis=1)

In [47]:
# Logistic regression without the categorical columns (lobby_type and hero ids).
task_2(features_drop_heroes, Y, folds)

C: 0.001, time: 0:00:06.463927, score: 0.7160353091508259
C: 0.005623413251903491, time: 0:00:09.079025, score: 0.7162392214066978
C: 0.03162277660168379, time: 0:00:09.818752, score: 0.71622008262699
C: 0.1778279410038923, time: 0:00:09.921642, score: 0.7162111134027613
C: 1.0, time: 0:00:09.931438, score: 0.7162096063154836
best C:0.005623413251903491, time: 0:00:09.079025, score: 0.7162392214066978


In [118]:
def bag_of_words(features, heroes):
    """Encode the hero picks of every match as a signed indicator matrix.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per match, with columns r1_hero..r5_hero and
        d1_hero..d5_hero holding hero ids. Ids are used as 1-based column
        positions, so they must lie in 1..max(heroes).
    heroes : iterable of int
        Hero ids; only the largest one is used, as the matrix width.

    Returns
    -------
    numpy.ndarray of shape (number of rows in `features`, max(heroes))
        Entry [i, h-1] is 1 when hero h appears in an r*_hero column of row i,
        -1 when it appears in a d*_hero column, and 0 otherwise.

    Notes
    -----
    Columns are read positionally instead of through `.ix`, which was
    removed in pandas 1.0. The result does not depend on the index labels,
    and each player slot is filled for all matches at once.
    """
    n_matches = features.shape[0]
    X_pick = np.zeros((n_matches, max(heroes)))
    rows = np.arange(n_matches)

    for p in range(5):
        radiant_ids = features['r%d_hero' % (p + 1)].values.astype(int)
        dire_ids = features['d%d_hero' % (p + 1)].values.astype(int)
        # Hero ids are 1-based, matrix columns are 0-based.
        X_pick[rows, radiant_ids - 1] = 1
        X_pick[rows, dire_ids - 1] = -1

    return X_pick

# uniq_heroes needs the frame to scan; calling it with no argument raises
# TypeError. The same frame is used for the vocabulary and for the encoding.
heroes = uniq_heroes(featires_no_lobby)
heroes_bag = bag_of_words(featires_no_lobby, heroes)

In [131]:
# Standardized numeric columns followed by the hero bag-of-words columns.
new_X = np.concatenate((scale(features_drop_heroes), heroes_bag), axis=1)

In [133]:
# Logistic regression with the hero columns replaced by their bag-of-words encoding.
task_2(new_X, Y, folds)

C: 0.001, time: 0:00:12.467649, score: 0.7514523647988232
C: 0.005623413251903491, time: 0:00:16.668586, score: 0.7518053681104867
C: 0.03162277660168379, time: 0:00:18.454529, score: 0.7517761890026317
C: 0.1778279410038923, time: 0:00:19.404638, score: 0.7517616053804506
C: 1.0, time: 0:00:19.395077, score: 0.7517590424423165
best C:0.005623413251903491, time: 0:00:16.668586, score: 0.7518053681104867


In [135]:
# Gradient boosting (150 and 200 trees) on the same bag-of-words feature matrix.
task_1(new_X, Y, [150, 200])

trees: 150, time: 0:01:19.784864, score: 0.7224056014084447
trees: 200, time: 0:01:47.696623, score: 0.7279447229570734


In [136]:
# Load the test matches, indexed by match_id, and drop 'start_time' as was
# done for the training frame.
features_test = pd.read_csv('data/features_test.csv', index_col='match_id')
features_test.drop('start_time', axis=1, inplace=True)


# A column has gaps when its non-null count is smaller than the row count.
feature_test_counts = features_test.count()
na_features_test = feature_test_counts[feature_test_counts != features_test.shape[0]].index.values
# Assign the filled columns back to the frame. fillna(inplace=True) on a
# column selection is chained assignment and may not modify features_test.
if len(na_features_test):
    features_test[list(na_features_test)] = features_test[list(na_features_test)].fillna(0)

In [146]:
unique_heroes_test = uniq_heroes(features_test)
# The matrix width must match the one used for training, so it is taken from
# the training hero ids (X still holds the hero columns), not from whichever
# ids occur in the test file. A test id above the training maximum raises
# IndexError inside bag_of_words instead of silently shifting columns.
heroes_bag_test = bag_of_words(features_test, uniq_heroes(X))

features_test.drop('lobby_type', axis=1, inplace=True)
features_test.drop(hero_features, axis=1, inplace=True)

# NOTE(review): scale() standardizes with the test set's own mean and std,
# while the model is fitted on data standardized with training statistics.
# Confirm whether a scaler fitted on the training frame should be reused.
X_scaled_test = hstack((scale(features_test), heroes_bag_test))


In [157]:
# Final model: L2 logistic regression with the C that scored best in the
# cross-validated sweep on the bag-of-words matrix.
clf = LogisticRegression(penalty='l2', C=0.005623413251903491)
clf.fit(new_X, Y)

Y_pred = clf.predict(X_scaled_test)

# predict_proba returns one column per class, so the whole (n, 2) array cannot
# be stored in a single DataFrame column. Column 1 is kept, the same column
# the cross-validation code scores with roc_auc_score.
# Assumes radiant_win is coded 0/1, so column 1 is P(radiant_win == 1).
radiant_win_proba = clf.predict_proba(X_scaled_test)[:, 1]
df = pd.DataFrame({'radiant_win': radiant_win_proba}, index=features_test.index)
df.to_csv('predictions.csv')