In [1]:
import numpy as np
import pandas as pd

import datetime # ,time

#sklearn libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

In [2]:
features = pd.read_csv("data/features.csv",index_col='match_id')

### Удаляем признаки, связанные с итогами матча

In [3]:
features_to_be_deleted = "start_time duration tower_status_radiant tower_status_dire barracks_status_radiant barracks_status_dire".split()

In [4]:
features.drop(axis="columns",labels=features_to_be_deleted,inplace=True)

In [67]:
features.head()

Unnamed: 0_level_0,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_hero,...,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,radiant_win
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,7,11,5,2098,1489,20,0,0,7,67,...,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0,1
1,0,42,4,1188,1033,9,0,1,12,49,...,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0,1
2,7,33,4,1319,1270,22,0,0,12,98,...,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0,0
3,1,29,4,1779,1056,14,0,0,5,30,...,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0,0
4,7,13,4,1431,1090,8,1,0,8,27,...,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0,0


### Проверяем выборку на наличие пропусков

In [7]:
features.count()

lobby_type                  97230
r1_hero                     97230
r1_level                    97230
r1_xp                       97230
r1_gold                     97230
                            ...  
dire_boots_count            97230
dire_ward_observer_count    97230
dire_ward_sentry_count      97230
dire_first_ward_time        95404
radiant_win                 97230
Length: 102, dtype: int64

### признаки, имеющие пропуски

In [15]:
# which features have missing data
feat_with_missing_data = features.columns.values[features.isna().any()]
print(len(feat_with_missing_data), " features have missing data.")
# Which amoung of data is missing
features[feat_with_missing_data].isna().sum()

12  features have missing data.


first_blood_time               19553
first_blood_team               19553
first_blood_player1            19553
first_blood_player2            43987
radiant_bottle_time            15691
radiant_courier_time             692
radiant_flying_courier_time    27479
radiant_first_ward_time         1836
dire_bottle_time               16143
dire_courier_time                676
dire_flying_courier_time       26098
dire_first_ward_time            1826
dtype: int64

Самый большое количество пропусков имеет признак "first_blood_player2", потому что часто событие "Первая кровь" происходит без причастия второго игрока!
Так же признак "radiant_flying_courier_time" имеет достаточно немалое количество пропусков,  из-за того, что во многих матчах предмет "flying_courier" приобретается только после пяти первых игровых минут.

### Заменяем пропуски на нули

In [5]:
features.fillna(0,axis="columns",inplace=True)

### Целевая переменная

In [17]:
# COLUMN WITH TARGET VARIABLE
print("Target column: radiant_win")

Target column: radiant_win


### Генератор разбиений для кросс-валидации

In [6]:
kfold = KFold(n_splits=5,shuffle=True)

### Кросс-валидация

In [57]:
# KFOld for cross Validation 
learning_data = features.sample(20000)
X = learning_data.drop(axis='columns',labels="radiant_win").values
y = np.asarray(learning_data["radiant_win"])

In [31]:
for n_estimators in [10,20,30]:
    clf = GradientBoostingClassifier(n_estimators=n_estimators)
    
    start_time = datetime.datetime.now()
    scores = cross_val_score(clf,X,y,cv=kfold,scoring="roc_auc")
    totaltime = datetime.datetime.now() - start_time
    
    print("n_estimators: ",n_estimators,"  score: ",round(np.mean(scores),3), "  CV time: ", totaltime)

n_estimators:  10   score:  0.658   CV time:  0:00:37.862951
n_estimators:  20   score:  0.679   CV time:  0:01:11.208067
n_estimators:  30   score:  0.686   CV time:  0:01:45.521000


Скорее всего, качество продолжит расти при дальнейшем увеличении значения параметра n_estimators

## Логистическая регрессия
### 1. Оценка качества логистической регрессии

In [7]:
grid = {'C':np.power(10.0,np.arange(-5,6))}

log_clf = LogisticRegression(solver='lbfgs')
gs = GridSearchCV(log_clf,grid,scoring='roc_auc',cv=kfold)

In [35]:
gs.fit(X,y)





GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [36]:
gs.cv_results_

{'mean_fit_time': array([ 4.65908394,  8.67342286, 14.52760501, 20.65219865, 21.52280021,
        20.73300004, 22.23933744, 21.67800546, 21.03180046, 22.66758304,
        22.10878916]),
 'std_fit_time': array([0.3111774 , 0.89665717, 0.62856584, 1.64439094, 1.82963267,
        2.63291381, 3.57977587, 2.08522579, 3.11242547, 1.19894818,
        3.37082551]),
 'mean_score_time': array([0.03556056, 0.01120491, 0.0112072 , 0.01119666, 0.01200466,
        0.01120572, 0.01240292, 0.01119461, 0.01160107, 0.01241431,
        0.01260982]),
 'std_score_time': array([0.0394651 , 0.00039852, 0.00039701, 0.00098745, 0.0010897 ,
        0.00042114, 0.0014978 , 0.00039253, 0.00102137, 0.00135448,
        0.0013443 ]),
 'param_C': masked_array(data=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                    1000.0, 10000.0, 100000.0],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False],
        fill_value='?',
             dt

In [41]:
# Best parameter
gs.best_params_

{'C': 0.0001}

In [44]:
# The score
print("Best score: ",round(gs.best_score_,2))

Best score:  0.72


In [59]:
# LEARNING WITH BEST PARAMETER C
log_clf = LogisticRegression(C=0.0001)

start_time = datetime.datetime.now()
scores = cross_val_score(log_clf,X,y,cv=kfold,scoring="roc_auc")
totaltime = datetime.datetime.now() - start_time

print("Logistic regression score: ",round(np.mean(scores),3), "  CV time: ", totaltime)



Logistic regression score:  0.715   CV time:  0:01:08.962632


Чуть быстрее чем градиентный бустинг, и качество совсем также более приятное по сравнению с градиентным бустингом! Может быть благодаря тому, что логистическая регрессия более подходящая к этим данным.

### 2. Удаление категориальных признаков и кросс валидация

In [8]:
# New features to be deleted
r_hero,d_hero = [],[]
for i in range(1,6):
    r_hero.append(f"r{i}_hero")
    d_hero.append(f"d{i}_hero")

In [60]:
# Deletion of above features from the data 
X_first = learning_data.drop(axis='columns',labels=["radiant_win"]+r_hero+d_hero).values

In [119]:
# WE REPEAT THE CROSS VALIDATION
gs.fit(X_first,y)





GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [120]:
# The best parameter remains the same
gs.best_params_

{'C': 0.01}

In [121]:
# Strangly the score does not increase much
gs.best_score_

0.7134892377325055

Качество не улучшилось, может быть потому что удаленные признаки не такие существенные 

### 3. Cколько различных идентификаторов героев существует в данной игре?

In [9]:
heros = features[r_hero+d_hero]

In [10]:
heros_id = np.unique(heros.values)
#hero_id, hero_counts = np.unique(heros.values,return_counts=True)

In [95]:
heros_id

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 109, 110, 112], dtype=int64)

In [11]:
N_heros = heros_id[-1]
print(f"В данных есть {N_heros} различных индентификаторов героев.")

В данных есть 112 различных индентификаторов героев.


### 4. Преобразование данных

In [12]:
X_pick = np.zeros((features.shape[0],N_heros))

for i, match_id in enumerate(features.index):
    for p in range(5):
        X_pick[i, features.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, features.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [100]:
X_pick

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.]])

In [13]:
X_first = features.drop(axis='columns',labels=["radiant_win"]+r_hero+d_hero).values

In [14]:
# New objet-features matrice
X = np.append(X_first,X_pick,axis=1)

In [15]:
#Target columns
y = np.asarray(features.radiant_win)

In [116]:
X.shape

(97230, 203)

### 5. Кросс-валидация для логистической регрессии на новой выборке

In [122]:
# LET"S FIND THE BEST PARAMETER
gs.fit(X,y)





GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [123]:
gs.best_params_

{'C': 10000.0}

In [124]:
gs.best_score_

0.7135992305523432

In [16]:
log_clf = LogisticRegression(C=10000)

In [17]:
log_clf.fit(X,y)



LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Качество получилось 0.713. Оно по моим результатам особо не улучшилось в связи с тем что добавленные признаки существенный вклад не приносят в качество.

### 6. Предсказание на тестовой выборке
Из изложенного выше, сразу понятно что мы будем использовать логистическую регрессию для предсказания, так как её качество превосходит качества градиентного бустинга!

In [18]:
test = pd.read_csv("data/features_test.csv",index_col="match_id")

In [19]:
test.fillna(0,axis=1,inplace=True)

In [20]:
N_heros = np.unique(test[r_hero+d_hero])[-1]

In [21]:
X_pick1 = test.drop(axis=1,labels=r_hero+d_hero).values

In [22]:
X_pick2 = np.zeros((test.shape[0], N_heros))

for i, match_id in enumerate(test.index):
    for p in range(5):
        X_pick2[i, test.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick2[i, test.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [23]:
X = np.append(X_pick1,X_pick2,axis=1)

In [24]:
predictions = log_clf.predict_proba(X)

In [54]:
radiant_win = np.squeeze(predictions[:,1:])

In [55]:
radiant_win

array([0.8403763 , 0.77381141, 0.19860287, ..., 0.23031625, 0.60760679,
       0.41259217])

### Минимальное и максимальное значения

In [62]:
np.min(radiant_win)

0.00821799638386768

In [63]:
np.max(radiant_win)

0.9965125069745461

###  Final file

In [56]:
result = {'match_id': test.index.values,'radiant_win':radiant_win}

In [58]:
result_predictions = pd.DataFrame(result)

In [61]:
result_predictions.set_index('match_id')

Unnamed: 0_level_0,radiant_win
match_id,Unnamed: 1_level_1
6,0.840376
7,0.773811
10,0.198603
13,0.879852
16,0.265325
...,...
114369,0.706677
114377,0.629391
114378,0.230316
114393,0.607607


In [66]:
result_predictions.to_csv('radiant_win.csv',encoding='utf-8',index=False)