## Этап 1

### 1 загрузка данных

In [26]:
import pandas as pd

features = pd.read_csv('features.csv', index_col='match_id')
features_test = pd.read_csv('features_test.csv', index_col='match_id')

not_result_columns = [el for el in list(features) if el in list(features_test)]

X_train = features[not_result_columns]

X_test = features_test

### 2 проверка на пропуски

In [27]:
total = X_train.count().max()
for index, val in X_train.count().iteritems():
    if val < total:
        print(index, val)

first_blood_time 77677
first_blood_team 77677
first_blood_player1 77677
first_blood_player2 53243
radiant_bottle_time 81539
radiant_courier_time 96538
radiant_flying_courier_time 69751
radiant_first_ward_time 95394
dire_bottle_time 81087
dire_courier_time 96554
dire_flying_courier_time 71132
dire_first_ward_time 95404


Т.к. эти фичи описывают некие события, происходящие во время игровой сессии, то пропуск может означать, что событие так никогда и не произошло.

Так, из описания данных известно, что если событие "первая кровь" не успело произойти за первые 5 минут, то признаки first_blood_time, first_blood_team, first_blood_player1, first_blood_player2 принимают пропущенное значение.

### 3 замена пропусков нулями

In [28]:
X_train.fillna(0, inplace=True)

In [1]:
print(X_train)

NameError: name 'X_train' is not defined

### 4 Целевая перменная - radiant_win

In [29]:
y_train = features['radiant_win']

### 5 Обучение

Обучим модель с бустингом при количестве деревьев (10, 20, 30, 40, 50).

In [35]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, GridSearchCV

grid = {'n_estimators': (10, 20, 30, 40, 50)}
cv = KFold(n_splits=5, shuffle=True)
clf = GradientBoostingClassifier()
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv)
gs.fit(X_train, y_train)

cv_results = pd.DataFrame.from_dict(gs.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.007701,0.45142,0.048804,0.005913,10,{'n_estimators': 10},0.668729,0.657219,0.657763,0.666743,0.666967,0.663484,0.004944,5
1,11.349714,1.002081,0.056573,0.005737,20,{'n_estimators': 20},0.686071,0.675519,0.675339,0.685013,0.684193,0.681227,0.004772,4
2,16.113678,1.99279,0.06678,0.011854,30,{'n_estimators': 30},0.693,0.685071,0.684383,0.691598,0.692115,0.689233,0.003713,3
3,17.589548,0.659056,0.064259,0.008547,40,{'n_estimators': 40},0.69775,0.690096,0.689457,0.696049,0.695887,0.693848,0.003394,2
4,22.85649,1.941535,0.067029,0.003591,50,{'n_estimators': 50},0.700975,0.692248,0.693264,0.698351,0.699641,0.696896,0.003495,1


## Отчет по этапу 1

### 1. Какие признаки имеют пропуски среди своих значений? Что могут означать пропуски в этих признаках (ответьте на этот вопрос для двух любых признаков)?

- first_blood_time
- first_blood_team
- first_blood_player1
- first_blood_player2
- radiant_bottle_time
- radiant_courier_time
- radiant_flying_courier_time
- radiant_first_ward_time
- dire_bottle_time
- dire_courier_time
- dire_flying_courier_time
- dire_first_ward_time

Если событие "первая кровь" не успело произойти за первые 5 минут, то признаки first_blood_time, first_blood_team, принимают пропущенное значение.

### 2. Как называется столбец, содержащий целевую переменную?

radiant_win

### 3. Как долго проводилась кросс-валидация для градиентного бустинга с 30 деревьями? Инструкцию по измерению времени можно найти ниже по тексту. Какое качество при этом получилось? Напомним, что в данном задании мы используем метрику качества AUC-ROC.

- Около 16 сек.
- Качество 0.69

### 4. Имеет ли смысл использовать больше 30 деревьев в градиентном бустинге? Что бы вы предложили делать, чтобы ускорить его обучение при увеличении количества деревьев?

- Имеет, качество растет с увеличением количества деревьев. Но после 40 разница в качестве уже довольно мала.
- Можно ускориться, если ограничить глубину каждого дерева в композиции (max_depth) или количество признаков для каждого деления (max_features). Еще можно оценить вклад признаков и удалить малозначимые, если такие найдутся.

## Этап 2

In [76]:
import pandas as pd

features = pd.read_csv('features.csv', index_col='match_id')
features_test = pd.read_csv('features_test.csv', index_col='match_id')

not_result_columns = [el for el in list(features) if el in list(features_test)]

X_train = features.loc[:, not_result_columns]
X_test = features_test
X_train.fillna(0, inplace=True)
y_train = features['radiant_win']

In [77]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV

grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True)
clf = LogisticRegression(solver='lbfgs')
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv)
gs.fit(X_train, y_train)

cv_results = pd.DataFrame.from_dict(gs.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.4576,0.061124,0.028001,0.006029,1e-05,{'C': 1e-05},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1
1,0.402602,0.016559,0.023003,0.000629,0.0001,{'C': 0.0001},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1
2,0.434803,0.047645,0.027202,0.003969,0.001,{'C': 0.001},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1
3,0.437999,0.020985,0.026405,0.001497,0.01,{'C': 0.01},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1
4,0.468602,0.032808,0.027198,0.002227,0.1,{'C': 0.1},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1
5,0.498199,0.0428,0.026801,0.002927,1.0,{'C': 1.0},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1
6,0.450598,0.030512,0.024802,0.001833,10.0,{'C': 10.0},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1
7,0.405999,0.011006,0.024999,0.001414,100.0,{'C': 100.0},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1
8,0.418399,0.018018,0.023601,0.001626,1000.0,{'C': 1000.0},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1
9,0.4306,0.044368,0.0254,0.002061,10000.0,{'C': 10000.0},0.514974,0.510753,0.510522,0.515802,0.515307,0.513472,0.00233,1


Получилось плохо. Отмасштабируем

In [80]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)

In [81]:
grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True)
clf = LogisticRegression(solver='lbfgs')
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv)
gs.fit(X_train_scaled, y_train)

cv_results = pd.DataFrame.from_dict(gs.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.563735,0.204021,0.015598,0.002423,1e-05,{'C': 1e-05},0.689239,0.699891,0.693866,0.696463,0.696297,0.695151,0.003524,11
1,0.566399,0.024616,0.013002,0.00063,0.0001,{'C': 0.0001},0.705143,0.715562,0.710529,0.712531,0.712477,0.711249,0.003451,10
2,0.888801,0.073701,0.014998,0.001672,0.001,{'C': 0.001},0.710012,0.720235,0.715718,0.717321,0.717535,0.716164,0.003401,9
3,1.205403,0.104928,0.014,0.000635,0.01,{'C': 0.01},0.710177,0.720389,0.715866,0.717542,0.71755,0.716305,0.003392,1
4,1.135801,0.067804,0.013999,0.001413,0.1,{'C': 0.1},0.710156,0.720361,0.715848,0.717541,0.717467,0.716275,0.003386,2
5,1.2438,0.10586,0.013797,0.002137,1.0,{'C': 1.0},0.710153,0.720357,0.715843,0.717537,0.717456,0.716269,0.003386,3
6,1.198203,0.105901,0.014598,0.001859,10.0,{'C': 10.0},0.710152,0.720355,0.715843,0.717535,0.717454,0.716268,0.003385,8
7,1.247804,0.086837,0.013398,0.000799,100.0,{'C': 100.0},0.710152,0.720356,0.715843,0.717536,0.717454,0.716268,0.003385,4
8,1.58255,0.358633,0.019995,0.004689,1000.0,{'C': 1000.0},0.710152,0.720356,0.715843,0.717536,0.717454,0.716268,0.003385,7
9,1.439071,0.132665,0.017996,0.002607,10000.0,{'C': 10000.0},0.710152,0.720356,0.715843,0.717536,0.717454,0.716268,0.003385,5


Удалим категориальные признаки:

In [83]:
categorical = [
    "lobby_type",
    "r1_hero", "r2_hero", "r3_hero", "r4_hero", "r5_hero",
    "d1_hero", "d2_hero", "d3_hero", "d4_hero", "d5_hero"
]

X_train2 = X_train.loc[:, [el for el in list(X_train) if el not in list(categorical)]]

In [84]:
grid2 = {'C': np.power(10.0, np.arange(-5, 6))}
cv2 = KFold(n_splits=5, shuffle=True)
clf2 = LogisticRegression(solver='lbfgs')
gs2 = GridSearchCV(clf2, grid2, scoring='roc_auc', cv=cv2)
gs2.fit(X_train2, y_train)

cv_results2 = pd.DataFrame.from_dict(gs2.cv_results_)
cv_results2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.509408,0.18904,0.023597,0.002158,1e-05,{'C': 1e-05},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1
1,0.386803,0.009946,0.022601,0.001624,0.0001,{'C': 0.0001},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1
2,0.376396,0.015361,0.0228,0.002922,0.001,{'C': 0.001},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1
3,0.398302,0.014901,0.022602,0.001497,0.01,{'C': 0.01},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1
4,0.382601,0.028486,0.021202,0.00098,0.1,{'C': 0.1},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1
5,0.374401,0.0166,0.022401,0.001016,1.0,{'C': 1.0},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1
6,0.440338,0.044863,0.026961,0.009029,10.0,{'C': 10.0},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1
7,0.419248,0.032778,0.022849,0.001992,100.0,{'C': 100.0},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1
8,0.39552,0.031583,0.024713,0.003592,1000.0,{'C': 1000.0},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1
9,0.41671,0.009837,0.023311,0.002244,10000.0,{'C': 10000.0},0.509936,0.517649,0.51416,0.511615,0.513853,0.513443,0.002608,1


И еще раз отмасштабируем:

In [86]:
scaler2 = StandardScaler().fit(X_train2)
X_train2_scaled = scaler2.transform(X_train2)

grid2 = {'C': np.power(10.0, np.arange(-5, 6))}
cv2 = KFold(n_splits=5, shuffle=True)
clf2 = LogisticRegression(solver='lbfgs')
gs2 = GridSearchCV(clf2, grid2, scoring='roc_auc', cv=cv2)
gs2.fit(X_train2_scaled, y_train)

cv_results2 = pd.DataFrame.from_dict(gs2.cv_results_)
cv_results2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.473602,0.087848,0.015796,0.007144,1e-05,{'C': 1e-05},0.697911,0.692663,0.694484,0.695225,0.694687,0.694994,0.001694,11
1,0.568711,0.059321,0.013197,0.001321,0.0001,{'C': 0.0001},0.714246,0.708843,0.711122,0.711645,0.709965,0.711164,0.00182,10
2,0.804398,0.043409,0.013642,0.000534,0.001,{'C': 0.001},0.719215,0.713656,0.716225,0.716832,0.715025,0.71619,0.001862,9
3,1.240328,0.115027,0.015398,0.001202,0.01,{'C': 0.01},0.719247,0.713788,0.716356,0.717151,0.715407,0.71639,0.001815,1
4,1.168602,0.044318,0.013002,0.000632,0.1,{'C': 0.1},0.719194,0.713776,0.716321,0.717148,0.715422,0.716372,0.0018,2
5,1.248142,0.068206,0.0136,0.000488,1.0,{'C': 1.0},0.719183,0.713774,0.716319,0.71715,0.715423,0.71637,0.001798,3
6,1.192234,0.097825,0.013402,0.001357,10.0,{'C': 10.0},0.719182,0.713773,0.716318,0.71715,0.715422,0.716369,0.001798,8
7,1.425218,0.107517,0.015805,0.004004,100.0,{'C': 100.0},0.719182,0.713773,0.716318,0.71715,0.715423,0.716369,0.001798,4
8,1.424798,0.196663,0.016202,0.001718,1000.0,{'C': 1000.0},0.719182,0.713773,0.716318,0.71715,0.715423,0.716369,0.001798,7
9,1.344393,0.105258,0.014598,0.000799,10000.0,{'C': 10000.0},0.719182,0.713773,0.716318,0.71715,0.715423,0.716369,0.001798,6


Стало совсем чуть-чуть лучше, скорее в пределах погрешности.

In [87]:
rcols = ['r%d_hero' % (p+1) for p in range(5)]
dcols = ['d%d_hero' % (p+1) for p in range(5)]

heroes = sorted(pd.unique(X_train[rcols+dcols].values.flatten()))
hcols = ['hero_%d' % h for h in heroes]

X_train3 = X_train2.copy()

for h in heroes:
    X_train3['hero_%d' % h] = np.zeros((X_train.shape[0], 1))

values = [] # сюда пишем "мешок"

ri = 0
for radiant, dire in zip(X_train.loc[:, rcols].values,
                         X_train.loc[:, dcols].values):
    t = time()
    if ri % 10000 == 0:
        print(ri)
    row_values = np.zeros(len(heroes))
    for ci, h in enumerate(heroes):
        if h in radiant:
            row_values[ci] = 1
        elif h in dire:
            row_values[ci] = -1
        else:
            row_values[ci] = 0
    values.append(row_values)
    ri += 1

values = np.array(values)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000


In [88]:
len(heroes)

108

In [89]:
for i, hcol in enumerate(hcols):
    X_train3.loc[:, hcol] = values[:, i]
X_train3

Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,hero_100,hero_101,hero_102,hero_103,hero_104,hero_105,hero_106,hero_109,hero_110,hero_112
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,5,2098,1489,20,0,0,7,3,842,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1430220345,4,1188,1033,9,0,1,12,4,1596,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1430227081,4,1319,1270,22,0,0,12,3,1314,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1430263531,4,1779,1056,14,0,0,5,2,539,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1430282290,4,1431,1090,8,1,0,8,2,629,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114402,1450265551,4,1706,1198,17,0,1,8,2,616,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114403,1450277704,4,1793,1416,17,0,1,5,3,764,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
114404,1450291848,4,1399,540,1,0,0,5,4,1448,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
114405,1450292986,3,1135,766,6,0,2,6,5,1954,...,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0


In [90]:
scaler3 = StandardScaler().fit(X_train3)
X_train3_scaled = scaler3.transform(X_train3)

In [91]:
grid3 = {'C': np.power(10.0, np.arange(-5, 6))}
cv3 = KFold(n_splits=5, shuffle=True)
clf3 = LogisticRegression(solver='lbfgs')
gs3 = GridSearchCV(clf3, grid3, scoring='roc_auc', cv=cv3)
gs3.fit(X_train3_scaled, y_train)

cv_results3 = pd.DataFrame.from_dict(gs3.cv_results_)
cv_results3

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.088976,0.239162,0.0348,0.012887,1e-05,{'C': 1e-05},0.712771,0.713923,0.714686,0.719004,0.713829,0.714843,0.002168,11
1,1.043449,0.091122,0.0256,0.004928,0.0001,{'C': 0.0001},0.738834,0.742829,0.744193,0.746023,0.742119,0.7428,0.002388,10
2,2.098347,0.181418,0.032236,0.005421,0.001,{'C': 0.001},0.747053,0.752131,0.753498,0.753175,0.752186,0.751609,0.00234,9
3,2.632996,0.353776,0.024819,0.00439,0.01,{'C': 0.01},0.747418,0.752594,0.753726,0.753102,0.752796,0.751927,0.002287,1
4,2.655697,0.175943,0.023603,0.002874,0.1,{'C': 0.1},0.747389,0.752582,0.75366,0.753036,0.752795,0.751893,0.00228,2
5,2.416116,0.072702,0.021403,0.002577,1.0,{'C': 1.0},0.747385,0.75258,0.753649,0.75303,0.752793,0.751887,0.00228,3
6,2.603221,0.201827,0.024206,0.00467,10.0,{'C': 10.0},0.747385,0.752579,0.753649,0.753031,0.752792,0.751887,0.002279,7
7,2.993201,0.413032,0.027997,0.006574,100.0,{'C': 100.0},0.747385,0.752579,0.75365,0.75303,0.752792,0.751887,0.00228,8
8,3.46316,0.385785,0.02898,0.006587,1000.0,{'C': 1000.0},0.747385,0.752579,0.75365,0.75303,0.752792,0.751887,0.002279,5
9,2.508899,0.091434,0.0234,0.00242,10000.0,{'C': 10000.0},0.747385,0.752579,0.75365,0.75303,0.752792,0.751887,0.002279,6


## Отчет по этапу 2


### 1. Какое качество получилось у логистической регрессии над всеми исходными признаками? Как оно соотносится с качеством градиентного бустинга? Чем вы можете объяснить эту разницу? Быстрее ли работает логистическая регрессия по сравнению с градиентным бустингом?

- Без масштабирования - 0.513472, сильно хуже чем бустинг. Это логично - деревья и так работают неплохо, а логистическая регрессия так не умеет.
- С масштабированием - 0.717550. Даже лучше деревьев из этапа 1.
- Работает очень сильно быстрее.

### 2.Как влияет на качество логистической регрессии удаление категориальных признаков (укажите новое значение метрики качества)? Чем вы можете объяснить это изменение?

- Чуть-чуть лучше: 0.716305 -> 0.716390.
- После удаление должно было стать получше, потому что для логистической регрессии сырые категориальные данные представляются как числовые и являются шумом.

### 3. Сколько различных идентификаторов героев существует в данной игре?

108.

Хотя при нумерации 1..112 хочется сказать "112".

### 4. Какое получилось качество при добавлении "мешка слов" по героям? Улучшилось ли оно по сравнению с предыдущим вариантом? Чем вы можете это объяснить?

- 0.716390 -> 0.751927. Неплохой прирост.
- Объяснение понятное: набор героев с той и с другой стороны - важные признаки.

### 5.Какое минимальное и максимальное значение прогноза на тестовой выборке получилось у лучшего из алгоритмов?

- Лучший алгоритм показал при кросс-валидации значения 0.747418, 0.752594, 0.753726, 0.753102, 0.752796.
- Лучший результат: 0.753726
- Худший результат: 0.747418