In [1]:
import pandas as pd
import time
import datetime
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('features.csv', index_col = 'match_id')
test = pd.read_csv('features_test.csv',index_col='match_id')

In [3]:
#удаление признаков, связанных с итогами матча
to_drop = []
for i in data.columns:
    if i not in test.columns:
        to_drop.append(i)
        
X=data.drop(to_drop,axis = 1)

In [4]:
#пропуски
kol = 0
for i in X.columns:
    if X[i].count() != X.shape[0]:
        print(i)
        kol +=1
print(kol,' признаков из ',X.shape[1],'имеют пропуски. (',kol/X.shape[1]*100,'%)')

first_blood_time
first_blood_team
first_blood_player1
first_blood_player2
radiant_bottle_time
radiant_courier_time
radiant_flying_courier_time
radiant_first_ward_time
dire_bottle_time
dire_courier_time
dire_flying_courier_time
dire_first_ward_time
12  признаков из  102 имеют пропуски. ( 11.76470588235294 %)


In [5]:
#Все эти признаки могут иметь пропуски, так как события, им
#соответствующие, могли не произойти в первые 5 минут матча.

In [6]:
#заполнение пропусков нулями
X.fillna(value=0,inplace=True)

In [7]:
#столбец целевой переменной
y = data['radiant_win']

In [8]:
cv = KFold(n_splits=5,shuffle=True)

X_1 = np.array(X)
y_1 = np.array(y)
np.delete(X_1,0, 0)
np.delete(y_1,0,0)

array([1, 0, 0, ..., 0, 0, 1], dtype=int64)

In [9]:
nachalo = datetime.datetime.now()

kolvo = [5,10,20,30,35,40]
learning_rate = [0.1,0.2,0.3,0.5,1]

for lr in learning_rate:
    print('Learning rate = ',lr)
    for n in kolvo:
        print('\tNumber of estimators: ',n)
        start_time = datetime.datetime.now()
        score = 0
        clf = GradientBoostingClassifier(n_estimators = n,learning_rate=lr)
        for train_id,test_id in cv.split(X_1):
            X_train,X_test = X_1[train_id], X_1[test_id]
            y_train,y_test = y_1[train_id], y_1[test_id]
            clf.fit(X_train,y_train)
            pred = clf.predict_proba(X_test)[:,1]
            score += roc_auc_score(y_test,pred)
        score/=5
        print('\tMean roc auc score: ',score)
        print('\tTime elapsed: ',datetime.datetime.now()-start_time,'\n')
    print('\n-----------------------------------------------\n')

print('Duration:',datetime.datetime.now()-nachalo)

Learning rate =  0.1
	Number of estimators:  5
	Mean roc auc score:  0.636128094942
	Time elapsed:  0:00:15.073819 

	Number of estimators:  10
	Mean roc auc score:  0.664564552611
	Time elapsed:  0:00:32.791749 

	Number of estimators:  20
	Mean roc auc score:  0.682313970098
	Time elapsed:  0:00:56.122964 

	Number of estimators:  30
	Mean roc auc score:  0.689773451365
	Time elapsed:  0:01:12.651712 

	Number of estimators:  35
	Mean roc auc score:  0.691756172706
	Time elapsed:  0:01:24.136885 

	Number of estimators:  40
	Mean roc auc score:  0.693912003844
	Time elapsed:  0:01:37.220189 


-----------------------------------------------

Learning rate =  0.2
	Number of estimators:  5
	Mean roc auc score:  0.655927524535
	Time elapsed:  0:00:14.992671 

	Number of estimators:  10
	Mean roc auc score:  0.67641991503
	Time elapsed:  0:00:27.085280 

	Number of estimators:  20
	Mean roc auc score:  0.691606814007
	Time elapsed:  0:00:50.910237 

	Number of estimators:  30
	Mean roc a

In [10]:
#масштабирование
X_1 = StandardScaler(copy = False).fit_transform(X_1)

In [11]:
grid = np.power(10.0, np.arange(-5, 6))

for C in grid:
    print('C = ',C)
    score = 0
    clf = LogisticRegression(C = C)
    for train_id, test_id in cv.split(X_1):
        X_train,X_test = X_1[train_id], X_1[test_id]
        y_train,y_test = y_1[train_id], y_1[test_id]
        clf.fit(X_train,y_train)
        pred = clf.predict_proba(X_test)[:,1]
        score += roc_auc_score(y_test,pred)
    score/=5
    print('Mean auc roc score: ',score)


C =  1e-05
Mean auc roc score:  0.695119475452
C =  0.0001
Mean auc roc score:  0.711273015757
C =  0.001
Mean auc roc score:  0.716205927228
C =  0.01
Mean auc roc score:  0.716350860972
C =  0.1
Mean auc roc score:  0.716323526629
C =  1.0
Mean auc roc score:  0.716354083048
C =  10.0
Mean auc roc score:  0.716412706907
C =  100.0
Mean auc roc score:  0.716483205999
C =  1000.0
Mean auc roc score:  0.716390590332
C =  10000.0
Mean auc roc score:  0.716486399241
C =  100000.0
Mean auc roc score:  0.716448741963


In [12]:
features = ['r1_hero','r2_hero','r3_hero','r4_hero','r5_hero','d1_hero','d2_hero','d3_hero','d4_hero','d5_hero']

In [15]:
#удаление категориальных признаков
X_2 = X.drop(['lobby_type']+features,axis=1)
X_2 = np.array(X_2)
np.delete(X_2,0,0)
X_2=StandardScaler(copy = False).fit_transform(X_2)

In [16]:
grid = np.power(10.0, np.arange(-5, 6))

for C in grid:
    print('C = ',C)
    score = 0
    clf = LogisticRegression(C = C)
    for train_id, test_id in cv.split(X_2):
        X_train,X_test = X_2[train_id], X_2[test_id]
        y_train,y_test = y_1[train_id], y_1[test_id]
        clf.fit(X_train,y_train)
        pred = clf.predict_proba(X_test)[:,1]
        score += roc_auc_score(y_test,pred)
    score/=5
    print('Mean auc roc score: ',score)


C =  1e-05
Mean auc roc score:  0.69494438105
C =  0.0001
Mean auc roc score:  0.711212332405
C =  0.001
Mean auc roc score:  0.716079683196
C =  0.01
Mean auc roc score:  0.716630386392
C =  0.1
Mean auc roc score:  0.716387337233
C =  1.0
Mean auc roc score:  0.716357234881
C =  10.0
Mean auc roc score:  0.716255183161
C =  100.0
Mean auc roc score:  0.716565587698
C =  1000.0
Mean auc roc score:  0.716513573643
C =  10000.0
Mean auc roc score:  0.716465427901
C =  100000.0
Mean auc roc score:  0.716434839021


In [17]:
print(max(data['r1_hero'].unique()))
print(len(data['r1_hero'].unique()))

112
108


In [18]:
def makeBag(data):
    X_pick = np.zeros((data.shape[0],112))

    for i,match_id in enumerate(data.index):
        for p in range(5):
            X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1 
    return X_pick

In [19]:
X_3 = np.hstack((X_2,makeBag(data)))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


In [20]:
grid = np.power(10.0, np.arange(-5, 6))

for C in grid:
    print('C = ',C)
    score = 0
    clf = LogisticRegression(C = C)
    for train_id, test_id in cv.split(X_3):
        X_train,X_test = X_3[train_id], X_3[test_id]
        y_train,y_test = y_1[train_id], y_1[test_id]
        clf.fit(X_train,y_train)
        pred = clf.predict_proba(X_test)[:,1]
        score += roc_auc_score(y_test,pred)
    score/=5
    print('Mean auc roc score: ',score)


C =  1e-05
Mean auc roc score:  0.699224002541
C =  0.0001
Mean auc roc score:  0.724880819244
C =  0.001
Mean auc roc score:  0.746299831888
C =  0.01
Mean auc roc score:  0.751577026586
C =  0.1
Mean auc roc score:  0.751830964069
C =  1.0
Mean auc roc score:  0.751675566088
C =  10.0
Mean auc roc score:  0.752004372175
C =  100.0
Mean auc roc score:  0.751752564226
C =  1000.0
Mean auc roc score:  0.751808719486
C =  10000.0
Mean auc roc score:  0.751816593076
C =  100000.0
Mean auc roc score:  0.751803868875


In [21]:
#для тестовой
test.fillna(0,inplace = True) 
make_test = np.hstack((test.drop(features+['lobby_type'],axis=1),makeBag(test)))
make_test = StandardScaler(copy = False).fit_transform(make_test)

clf = LogisticRegression(C = 10)
clf.fit(X_3,y_1)
pred = clf.predict_proba(make_test)[:,1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


In [23]:
pred[100:110]

array([ 0.59376988,  0.04700281,  0.75457189,  0.3561409 ,  0.79652291,
        0.1728468 ,  0.79804254,  0.34413269,  0.73988984,  0.8972236 ])

In [25]:
f = open('rez.csv','w')

f.write('match_id,radiant_win')
f.write('\n')
for i in range(test.shape[0]):
    f.write(str(test.index[i]))
    f.write(',')
    f.write(str(pred[i]))
    f.write('\n')

f.close()