In [1]:
import numpy as np
import pandas as pd
import sklearn
import datetime
import re

from pandas import DataFrame, Series
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the training feature table from the CSV file in the working directory.
data = pd.read_csv('features.csv')

In [3]:
# Count missing values: print "<column> : <count>" for every column that has any.
na_counts = data.isnull().sum()
for col in na_counts.index[na_counts.values != 0]:
    na_count = na_counts[col]
    print('{0} : {1}'.format(col, na_count))

first_blood_time : 19553
first_blood_team : 19553
first_blood_player1 : 19553
first_blood_player2 : 43987
radiant_bottle_time : 15691
radiant_courier_time : 692
radiant_flying_courier_time : 27479
radiant_first_ward_time : 1836
dire_bottle_time : 16143
dire_courier_time : 676
dire_flying_courier_time : 26098
dire_first_ward_time : 1826


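Пропуск в `first_blood_*` означает, что первое убийство не произошло в первые 5 минут матча.
Пропуск в `radiant_courier_time` означает, что курьера не купили в первые 5 минут.
В целом пропуск означает, что соответствующее событие не произошло за первые 5 минут игры: оно случилось позже или не случилось вовсе.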

In [4]:
def dropPredicted(frame):
    """Return a copy of ``frame`` without the match-outcome columns.

    The removed columns are ``radiant_win``, ``duration`` and the tower /
    barracks status columns of both teams. ``frame`` itself is not modified.

    Raises:
        KeyError: if any of the outcome columns is absent from ``frame``.
    """
    # `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
    return frame.drop(
        ['radiant_win', 'duration',
         'tower_status_radiant', 'tower_status_dire',
         'barracks_status_radiant', 'barracks_status_dire'],
        axis=1)

def dropCategories(frame):
    """Return a copy of ``frame`` without the categorical / identifier columns.

    Removed are every column whose name matches ``r._hero`` or ``d._hero``
    (the per-slot hero ids), plus ``first_blood_player1``,
    ``first_blood_player2``, ``lobby_type``, ``start_time`` and ``match_id``.
    ``frame`` itself is not modified.

    Raises:
        KeyError: if one of the five explicitly named columns is absent.
    """
    # One pass with a character class matches the same names as the two
    # separate "r._hero" / "d._hero" filters.
    hero_columns = list(frame.filter(regex="[rd]._hero", axis=1).columns)
    categories = hero_columns + ['first_blood_player1', 'first_blood_player2',
                                 'lobby_type', 'start_time', 'match_id']
    # `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
    return frame.drop(categories, axis=1)

def scaleData(frame):
    """Standardise ``frame`` with a StandardScaler fitted on ``frame`` itself.

    The scaler is created here and discarded afterwards. The result is a new
    DataFrame built from the transformed values without passing any index or
    column labels.
    """
    standardized = StandardScaler().fit_transform(frame)
    return DataFrame(standardized)

In [5]:
# Replace every missing value with 0. This rebinds `data`, so the raw frame
# loaded from the CSV is no longer available after this cell.
data = data.fillna(0)

# Feature matrix: every column except the match outcome; target: radiant_win.
X = dropPredicted(data)
y = data['radiant_win']

In [6]:
def trainAndGetScore(clf, folds, random_state=None):
    """Cross-validate ``clf`` on the module-level ``X`` / ``y`` and print ROC-AUC.

    For each shuffled fold the classifier is refitted on the training part and
    scored on the held-out part with ``roc_auc_score`` of the positive-class
    probability. Every fold score is printed, followed by the average score
    and the elapsed wall-clock time. Nothing is returned.

    Args:
        clf: estimator exposing ``fit`` and ``predict_proba``; it is refitted
            in place on every fold.
        folds: number of cross-validation folds.
        random_state: optional seed for the fold shuffle; ``None`` (default)
            keeps the original unseeded behaviour.
    """
    start_time = datetime.datetime.now()
    # Size the splitter from X, the object that is actually indexed below,
    # rather than from the unrelated global `data`.
    # NOTE: this is the legacy sklearn.cross_validation KFold signature, to
    # match the import at the top of the notebook.
    kf = KFold(len(X), n_folds=folds, shuffle=True, random_state=random_state)

    scores = []
    for train_index, test_index in kf:
        clf.fit(X.iloc[train_index], y.iloc[train_index])
        # Column 1 of predict_proba is the probability of the positive class.
        positive_proba = clf.predict_proba(X.iloc[test_index])[:, 1]
        score = roc_auc_score(y.iloc[test_index], positive_proba)
        scores.append(score)
        print(score)

    elapsed = datetime.datetime.now() - start_time
    print('\r\nAvr score: {0}\r\nTime elapsed: {1}'.format(sum(scores)/folds, elapsed))

In [7]:
# Gradient boosting with 10 trees, scored with 5 folds on the current X.
clf = GradientBoostingClassifier(n_estimators=10)
trainAndGetScore(clf,5)

0.664601927078
0.664968482204
0.663944197316
0.663266654288
0.667141342953

Avr score: 0.664784520768
Time elapsed: 0:00:37.893669


In [8]:
# Same experiment with 20 trees.
clf = GradientBoostingClassifier(n_estimators=20)
trainAndGetScore(clf,5)

0.681693146706
0.676977600096
0.685618849696
0.68582125039
0.678089221491

Avr score: 0.681640013676
Time elapsed: 0:01:21.334655


In [9]:
# Same experiment with 30 trees.
clf = GradientBoostingClassifier(n_estimators=30)
trainAndGetScore(clf,5)

0.692032006646
0.686187165505
0.689778979586
0.68674771604
0.692981910859

Avr score: 0.689545555727
Time elapsed: 0:01:51.558093


In [10]:
# 30 trees again, but with the categorical columns removed from X and the
# tree depth limited to 2.
X = dropCategories(dropPredicted(data))
clf = GradientBoostingClassifier(n_estimators=30, max_depth=2)
trainAndGetScore(clf,5)

0.677862361304
0.679055494993
0.680409419271
0.680619624916
0.691516009825

Avr score: 0.681892582062
Time elapsed: 0:00:52.632833


# Логистическая регрессия

In [None]:
# Logistic regression with default settings on every non-outcome column.
# NOTE(review): X is not passed through scaleData here and still contains the
# categorical id columns - likely why the scores are close to 0.5; confirm.
X = dropPredicted(data)
clf = LogisticRegression()
trainAndGetScore(clf,5)

0.519705373646
0.512417339814
0.514799590751
0.512214929992
0.508207559684

Avr score: 0.513468958777
Time elapsed: 0:00:03.307769


In [None]:
# Repeat with the categorical columns removed. `clf` is not recreated here;
# in notebook order it is the LogisticRegression from the previous cell.
X = dropCategories(dropPredicted(data))
trainAndGetScore(clf,5)

In [None]:
# Categorical features: bag-of-heroes encoding.
# Column (hero id - 1) is set to +1 when the hero sits in one of the r1..r5
# slots and to -1 when it sits in one of the d1..d5 slots; 0 otherwise.
X_pick = np.zeros((data.shape[0], 112))
row_pos = np.arange(data.shape[0])

# Whole-column assignment replaces the per-cell `data.ix[...]` lookups
# (`DataFrame.ix` no longer exists in current pandas). The r-then-d order per
# slot is the same as in the row-by-row loop.
for p in range(5):
    X_pick[row_pos, data['r%d_hero' % (p+1)].astype(int).values - 1] = 1
    X_pick[row_pos, data['d%d_hero' % (p+1)].astype(int).values - 1] = -1

# Build X from `data` instead of extending the current X, so re-running this
# cell does not append the pick columns a second time.
X = pd.concat([dropCategories(dropPredicted(data)), DataFrame(X_pick)], axis=1)

In [None]:
# Score the current `clf` again on X, which now includes the hero-pick columns.
trainAndGetScore(clf,5)

# Конечная модель

In [None]:
# Derive additional features.

# Average value of each statistic for every hero.
stat = ['gold','xp','lh','level','kills','deaths']
heroPref = ['d1', 'd2', 'd3', 'd4', 'd5','r1', 'r2', 'r3', 'r4', 'r5']

# Largest hero id covered by the table. The pick encodings in this notebook
# use 112 columns indexed by (hero id - 1), so ids run up to 112.
MAX_HERO_ID = 112


def getHeroStat(i, stat, prefixes=tuple(heroPref)):
    """Collect one statistic over every appearance of hero ``i`` in ``data``.

    Args:
        i: hero id to look for in the ``<prefix>_hero`` columns.
        stat: statistic name; values are read from ``<prefix>_<stat>``.
        prefixes: player-slot prefixes to scan. Bound at definition time so a
            later rebinding of the global ``heroPref`` cannot change the result.

    Returns:
        list with one value per (match, slot) in which hero ``i`` appears.
    """
    heroStat = list()
    for prefix in prefixes:
        picked = data['{}_hero'.format(prefix)] == i
        heroStat.extend(data.loc[picked, '{0}_{1}'.format(prefix, stat)])
    return heroStat


def _meanOrNaN(values):
    """Average of ``values``, or NaN (without a warning) when there are none."""
    return np.average(values) if len(values) else np.nan


# Row label == hero id, which is how the table is looked up later. The range
# includes MAX_HERO_ID itself, and each column is computed exactly once.
herosStats = DataFrame(
    dict((s, [_meanOrNaN(getHeroStat(heroId, s)) for heroId in range(MAX_HERO_ID + 1)])
         for s in stat),
    columns=stat)

In [None]:
# Deviation of each player's xp / gold from the average stored in herosStats
# for the hero that player picked.
heroColumns = ['d1_', 'd2_', 'd3_', 'd4_', 'd5_','r1_', 'r2_', 'r3_', 'r4_', 'r5_']

gold_avr = np.zeros((len(data), 10))
xp_avr = np.zeros((len(data), 10))

# One vectorised pass per player slot. Rows are filled by position, so the
# result does not depend on `data` having a 0..n-1 index, and herosStats is
# looked up by hero-id label exactly as in a per-row lookup.
for i, prefix in enumerate(heroColumns):
    hero_ids = data[prefix + 'hero'].astype(int).values
    xp_avr[:, i] = data[prefix + 'xp'].values - herosStats['xp'].loc[hero_ids].values
    gold_avr[:, i] = data[prefix + 'gold'].values - herosStats['gold'].loc[hero_ids].values

# Columns 0-9: xp deviations; next 10 columns: gold deviations (slot order as
# in heroColumns).
feature_hero_stat = pd.concat([DataFrame(xp_avr), DataFrame(gold_avr)], axis=1)

In [None]:
# Categorical features.
# The bag-of-heroes encoding is combined with a "quality" score of each hero.
# The quality coefficients were derived from feature_importances_ of gradient
# boosting when classifying on hero characteristics only.

X_pick = np.zeros((data.shape[0], 112))
X_lobbies = np.zeros((data.shape[0], 10))

# teamS and heroFeatureS are not referenced in this cell.
teamS = ['r','d']
heroColumns = ['d1', 'd2', 'd3', 'd4', 'd5','r1', 'r2', 'r3', 'r4', 'r5']
heroFeatureS = ['level','gold','xp','lh','kills','deaths']

# Normalisers for the raw per-player values.
# NOTE(review): presumably typical per-player magnitudes - confirm provenance.
# GOLD_NORM is written as a float so the division is never an integer division.
GOLD_NORM = 1106.0
XP_NORM = 1190.5
LH_NORM = 10.4
ITEMS_NORM = 8.3

row_pos = np.arange(data.shape[0])

# One-hot encode lobby_type (whole-column form of the per-row `.ix` lookup;
# `DataFrame.ix` no longer exists in current pandas).
X_lobbies[row_pos, data['lobby_type'].astype(int).values] = 1

# `slot` is a fresh name: reusing `heroPref` as the loop variable would
# overwrite the global prefix list that getHeroStat reads.
for p, slot in enumerate(heroColumns):
    team = 1 if p>4 else -1  # heroColumns lists d1..d5 first, then r1..r5
    hero_idx = data['{}_hero'.format(slot)].astype(int).values - 1
    gold = data['{}_gold'.format(slot)].values / GOLD_NORM
    xp = data['{}_xp'.format(slot)].values / XP_NORM
    lh = data['{}_lh'.format(slot)].values / LH_NORM
    items = data['{}_items'.format(slot)].values / ITEMS_NORM

    X_pick[row_pos, hero_idx] = (xp*0.4 + gold*0.355 + lh*0.1 + items*0.072 + 0.072) * team

feature_category =  pd.concat([DataFrame(X_pick), DataFrame(X_lobbies)], axis=1)

In [None]:
y = data['radiant_win']

# Assemble all feature groups into one matrix, then train the model.
base_scaled = scaleData(dropCategories(dropPredicted(data)))
hero_stat_scaled = scaleData(feature_hero_stat)
X = pd.concat([base_scaled, hero_stat_scaled, feature_category], axis=1)

trainAndGetScore(clf, 5)  # avr 0.754
clf.fit(X, y)  # fit on the whole training set

# Оценка тестовых данных

In [None]:
# Fit the scalers on the TRAINING features before `data` is replaced by the
# test set below. The model was fitted on features standardised with training
# statistics, so the test set must be transformed with those same statistics
# rather than with a scaler fitted on the test data itself.
# This cell therefore has to run once, right after the training cells:
# re-running it fails in dropPredicted because `data` is then the test frame.
train_base = dropCategories(dropPredicted(data))
base_scaler = StandardScaler().fit(train_base)
hero_stat_scaler = StandardScaler().fit(feature_hero_stat)

data = pd.read_csv('features_test.csv')
data = data.fillna(0)

X_pick = np.zeros((data.shape[0], 112))
X_lobbies = np.zeros((data.shape[0], 10))

# teamS and heroFeatureS are not referenced in this cell.
teamS = ['r','d']
heroColumns = ['d1', 'd2', 'd3', 'd4', 'd5','r1', 'r2', 'r3', 'r4', 'r5']
heroFeatureS = ['level','gold','xp','lh','kills','deaths']

row_pos = np.arange(data.shape[0])

# One-hot encode lobby_type (whole-column form of the per-row `.ix` lookup;
# `DataFrame.ix` no longer exists in current pandas).
X_lobbies[row_pos, data['lobby_type'].astype(int).values] = 1

# `slot` is a fresh name so the global `heroPref` list is not overwritten.
for p, slot in enumerate(heroColumns):
    team = 1 if p>4 else -1  # heroColumns lists d1..d5 first, then r1..r5
    hero_idx = data['{}_hero'.format(slot)].astype(int).values - 1
    # 1106.0 is a float so the division is never an integer division.
    gold = data['{}_gold'.format(slot)].values / 1106.0
    xp = data['{}_xp'.format(slot)].values / 1190.5
    lh = data['{}_lh'.format(slot)].values / 10.4
    items = data['{}_items'.format(slot)].values / 8.3

    X_pick[row_pos, hero_idx] = (xp*0.4 + gold*0.355 + lh*0.1 + items*0.072 + 0.072) * team

feature_category =  pd.concat([DataFrame(X_pick), DataFrame(X_lobbies)], axis=1)

# Deviation of each player's xp / gold from the per-hero average in herosStats.
heroColumns = ['d1_', 'd2_', 'd3_', 'd4_', 'd5_','r1_', 'r2_', 'r3_', 'r4_', 'r5_']

gold_avr = np.zeros((len(data), 10))
xp_avr = np.zeros((len(data), 10))

# Vectorised per slot; rows are filled by position and herosStats is looked up
# by hero-id label.
for i, prefix in enumerate(heroColumns):
    hero_ids = data[prefix + 'hero'].astype(int).values
    xp_avr[:, i] = data[prefix + 'xp'].values - herosStats['xp'].loc[hero_ids].values
    gold_avr[:, i] = data[prefix + 'gold'].values - herosStats['gold'].loc[hero_ids].values

feature_hero_stat = pd.concat([DataFrame(xp_avr), DataFrame(gold_avr)], axis=1)

# The test file is fed to dropCategories directly (no dropPredicted), so its
# remaining columns must line up with the training columns.
test_base = dropCategories(data)
assert list(test_base.columns) == list(train_base.columns), \
    'test features do not match the training feature columns'

X = pd.concat([DataFrame(base_scaler.transform(test_base)),
               DataFrame(hero_stat_scaler.transform(feature_hero_stat)),
               feature_category], axis=1)

proba = clf.predict_proba(X)[:,1]
print(proba.max())
print(proba.min())