In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('data/features.csv')

In [3]:
data.head()

Unnamed: 0,match_id,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
0,0,1430198770,7,11,5,2098,1489,20,0,0,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1,1430220345,0,42,4,1188,1033,9,0,1,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,2,1430227081,7,33,4,1319,1270,22,0,0,...,4,3,1,13.0,2130,0,0,1830,0,63
3,3,1430263531,1,29,4,1779,1056,14,0,0,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,4,1430282290,7,13,4,1431,1090,8,1,0,...,3,3,0,-16.0,2449,0,4,1974,3,63


In [4]:
train = data.drop(['duration', 'radiant_win', 'tower_status_radiant',
                   'tower_status_dire', 'barracks_status_radiant',
                   'barracks_status_dire', 'match_id', 'start_time'], axis=1)

In [5]:
train.columns[train.count() != train.shape[0]]

Index(['first_blood_time', 'first_blood_team', 'first_blood_player1',
       'first_blood_player2', 'radiant_bottle_time', 'radiant_courier_time',
       'radiant_flying_courier_time', 'radiant_first_ward_time',
       'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time',
       'dire_first_ward_time'],
      dtype='object')

Пустые клетки из-за событий, которые не успели произойти(например, не было ФБ или не было покупки курьера)

In [6]:
train_filled = train.fillna(0)
y = data['radiant_win']

## Boosting

In [7]:
kfold = KFold(n_splits=5, shuffle=True, random_state=241)
scores = []
for n in [10, 20, 30, 40, 50]:
    gb = GradientBoostingClassifier(n_estimators=n, random_state=241)
    train_score = []
    test_score = []
    for train_idx, test_idx in kfold.split(train_filled, y):
        X_train, y_train = train_filled.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = train_filled.iloc[test_idx], y.iloc[test_idx]
        
        gb.fit(X_train, y_train)
        train_score.append(roc_auc_score(y_train, gb.predict(X_train)))
        test_score.append(roc_auc_score(y_test, gb.predict(X_test)))
        
    scores.append([np.mean(train_score), np.mean(test_score)])
    print('{}: {:>.4f} | {:>.4f}'.format(n, np.mean(train_score), np.mean(test_score)))

10: 0.6126 | 0.6059
20: 0.6331 | 0.6260
30: 0.6409 | 0.6313
40: 0.6454 | 0.6358
50: 0.6490 | 0.6381


In [8]:
boost = GradientBoostingClassifier(n_estimators=30, random_state=241)

In [9]:
%%time
score = cross_val_score(boost, train_filled, y, cv=kfold, n_jobs=-1)

CPU times: user 73.4 ms, sys: 91.4 ms, total: 165 ms
Wall time: 36.5 s


## Log Reg

In [10]:
scaler = StandardScaler()

In [11]:
train_scaled = scaler.fit_transform(train_filled)
scores = []
params = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
for param in params:
    train_score = []
    test_score = []
    for train_idx, test_idx in kfold.split(train_scaled, y):
        logreg = LogisticRegression(C=param, random_state=241)
        X_train, y_train = train_scaled[train_idx], y.iloc[train_idx]
        X_test, y_test = train_scaled[test_idx], y.iloc[test_idx]
        
        logreg.fit(X_train, y_train)
        train_score.append(roc_auc_score(y_train, logreg.predict_proba(X_train)[:, 1]))
        test_score.append(roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1]))
        
    scores.append([np.mean(train_score), np.mean(test_score)])
    print('{}: {:>.4f} | {:>.4f}'.format(param, np.mean(train_score), np.mean(test_score)))

0.0001: 0.7123 | 0.7111
0.001: 0.7177 | 0.7161
0.01: 0.7180 | 0.7163
0.1: 0.7181 | 0.7162
1.0: 0.7181 | 0.7162
10.0: 0.7181 | 0.7162
100.0: 0.7181 | 0.7162
1000.0: 0.7181 | 0.7162


In [12]:
logit = LogisticRegression(C=1e-3, random_state=241)

In [13]:
%%time
score = cross_val_score(logit, train_scaled, y, cv=kfold, n_jobs=-1)

CPU times: user 154 ms, sys: 48.2 ms, total: 202 ms
Wall time: 4.97 s


## Categorical features

In [14]:
to_drop = [i for i in train_filled.columns if '_hero' in i] + ['lobby_type']
train_drop_cat = train_filled.drop(to_drop, axis=1)

In [15]:
data_scaled = scaler.fit_transform(train_drop_cat)
scores = []
params = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
for param in params:
    train_score = []
    test_score = []
    for train_idx, test_idx in kfold.split(data_scaled, y):
        logreg = LogisticRegression(C=param, random_state=241)
        X_train, y_train = data_scaled[train_idx], y.iloc[train_idx]
        X_test, y_test = data_scaled[test_idx], y.iloc[test_idx]
        
        logreg.fit(X_train, y_train)
        train_score.append(roc_auc_score(y_train, logreg.predict_proba(X_train)[:, 1]))
        test_score.append(roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1]))
        
    scores.append([np.mean(train_score), np.mean(test_score)])
    print('{}: {:>.4f} | {:>.4f}'.format(param, np.mean(train_score), np.mean(test_score)))

0.0001: 0.7121 | 0.7111
0.001: 0.7175 | 0.7161
0.01: 0.7179 | 0.7163
0.1: 0.7179 | 0.7163
1.0: 0.7179 | 0.7163
10.0: 0.7179 | 0.7163
100.0: 0.7179 | 0.7163
1000.0: 0.7179 | 0.7163


In [16]:
heroes = np.zeros((train_filled.shape[0], max(train_filled['r4_hero'])), dtype=int)
heroes_col = [i for i in train_filled.columns if '_hero' in i]
heroes_r = train_filled[heroes_col[:5]].to_numpy()
heroes_d = train_filled[heroes_col[5:]].to_numpy()

In [17]:
for i in range(train_filled.shape[0]):
    line = heroes_r[i]
    heroes[i, line - 1] = 1
    line = heroes_d[i]
    heroes[i, line - 1] = -1

In [18]:
np.unique(heroes, return_counts=True)

(array([-1,  0,  1]), array([ 486150, 9917460,  486150]))

In [19]:
data_heroes = np.concatenate([data_scaled, heroes], axis=1)

In [20]:
scores = []
params = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
for param in params:
    train_score = []
    test_score = []
    for train_idx, test_idx in kfold.split(data_heroes, y):
        logreg = LogisticRegression(C=param, random_state=241)
        X_train, y_train = data_heroes[train_idx], y.iloc[train_idx]
        X_test, y_test = data_heroes[test_idx], y.iloc[test_idx]
        
        logreg.fit(X_train, y_train)
        train_score.append(roc_auc_score(y_train, logreg.predict_proba(X_train)[:, 1]))
        test_score.append(roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1]))
        
    scores.append([np.mean(train_score), np.mean(test_score)])
    print('{}: {:>.4f} | {:>.4f}'.format(param, np.mean(train_score), np.mean(test_score)))

0.0001: 0.7261 | 0.7249
0.001: 0.7482 | 0.7462
0.01: 0.7542 | 0.7516
0.1: 0.7546 | 0.7518
1.0: 0.7546 | 0.7518
10.0: 0.7546 | 0.7518
100.0: 0.7546 | 0.7518
1000.0: 0.7546 | 0.7518


### Predict

In [21]:
test = pd.read_csv('data/features_test.csv')

In [22]:
test_clean = test.fillna(0).drop(to_drop + ['match_id', 'start_time'], axis=1)

In [23]:
test_scaled = scaler.transform(test_clean)

In [24]:
heroes = np.zeros((test.shape[0], max(train_filled['r4_hero'])), dtype=int)
heroes_col = [i for i in test.columns if '_hero' in i]
heroes_r = test[heroes_col[:5]].to_numpy()
heroes_d = test[heroes_col[5:]].to_numpy()

for i in range(test.shape[0]):
    line = heroes_r[i]
    heroes[i, line - 1] = 1
    line = heroes_d[i]
    heroes[i, line - 1] = -1
    
np.unique(heroes, return_counts=True)

(array([-1,  0,  1]), array([  85885, 1752054,   85885]))

In [25]:
test_heroes = np.concatenate([test_scaled, heroes], axis=1)

In [26]:
model = LogisticRegression(C=0.1, random_state=241)
model.fit(data_heroes, y)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=241, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
y_pred = model.predict_proba(test_heroes)[:, 1]

In [28]:
pd.DataFrame({'match_id': test['match_id'], 'radiant_win': y_pred}).to_csv('predict.csv', index=False)