In [1]:
"""
Final test on "introduction to the machine learning".

Each notebook cell below starts with commented identificator,
where numbers mean "answer to the final-statement question No...".
"""

import pandas as pd
import time
import datetime

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
# Create features table "X".

game_data = pd.read_csv('D:/Work/Data_files/working_dir/features.csv', index_col='match_id')
X = game_data.drop(['duration', 'radiant_win', 'tower_status_radiant',
                   'tower_status_dire', 'barracks_status_radiant',
                   'barracks_status_dire'], axis=1)
X.head()

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,0,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,0,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,1,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,0,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,0,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0


In [3]:
game_data.describe()

Unnamed: 0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
count,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0,...,97230.0,97230.0,97230.0,95404.0,97230.0,97230.0,97230.0,97230.0,97230.0,97230.0
mean,1444232000.0,2.630999,51.517104,3.442672,1233.405801,1147.899702,11.231996,0.357009,0.362285,8.271315,...,3.349553,2.448339,0.689119,-6.901922,2332.247886,0.518503,1309.22779,1286.31082,40.599095,41.337036
std,5515393.0,2.835761,32.564211,1.111741,566.588895,464.111662,9.04162,0.663889,0.626704,2.497575,...,1.155609,0.813459,0.710122,40.701397,715.80685,0.49966,853.921365,851.009148,27.871645,27.064873
min,1430199000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-84.0,900.0,0.0,0.0,0.0,0.0,0.0
25%,1440815000.0,1.0,22.0,3.0,767.0,746.0,2.0,0.0,0.0,7.0,...,3.0,2.0,0.0,-31.0,1818.0,0.0,36.0,256.0,3.0,3.0
50%,1446338000.0,1.0,50.0,3.0,1175.0,1113.0,11.0,0.0,0.0,8.0,...,3.0,2.0,1.0,-16.0,2268.0,1.0,1824.0,1798.0,63.0,60.0
75%,1448829000.0,7.0,75.0,4.0,1704.0,1479.0,19.0,1.0,1.0,10.0,...,4.0,3.0,1.0,8.0,2778.0,1.0,1974.0,1974.0,63.0,63.0
max,1450313000.0,7.0,112.0,6.0,3319.0,4332.0,47.0,8.0,5.0,34.0,...,9.0,9.0,13.0,300.0,8452.0,1.0,2047.0,2047.0,63.0,63.0


In [4]:
game_data.shape

(97230, 108)

In [5]:
# 1. Find missing values.

X.count()[X.count()<X.count().max()]

first_blood_time               77677
first_blood_team               77677
first_blood_player1            77677
first_blood_player2            53243
radiant_bottle_time            81539
radiant_courier_time           96538
radiant_flying_courier_time    69751
radiant_first_ward_time        95394
dire_bottle_time               81087
dire_courier_time              96554
dire_flying_courier_time       71132
dire_first_ward_time           95404
dtype: int64

1. Missing values interpretation.

Features 'first_blood_time', 'first_blood_team', 'first_blood_player1'
have the same amount of NA-values (97230-77677=19553).
It can be concluded logically, that they are strongly correlated:
any first death of player gives the same value (time) to all of them.
As it was explained in final-statement: if there were no first-blood
in first 5 minutes of this game, then the features 'first_blood_...'
take NA-value.
The same logic could be extended to all other missing values -
action described by feature could occure behind the border of observed time.

In [23]:
# Fill NA-values with zeros and check absence of Na.

X = X.fillna(0)
len(X.count()[X.count()==X.count().max()])

102

In [24]:
# 2. 'Response' variable.

y = game_data.radiant_win
y.describe()

count    97230.000000
mean         0.518503
std          0.499660
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: radiant_win, dtype: float64

2. Response variable.

For given dataset response variable is "radiant_win",
becaues it directly answers the main question:
"Which team have won the game?"

In [25]:
# Initialize  data-splitter.

kf = KFold(n_splits=5, shuffle=True)
kf.split(X)

<generator object _BaseKFold.split at 0x000001B6C5357B10>

In [26]:
# Cross-validation of gradient boosting.

# End result will be placed here
grad_boost_score = {'estimators_number': [], 'elapsed_time': [],
          'cv_1': [], 'cv_2': [], 'cv_3': [], 'cv_4': [],
          'cv_5': [], 'cv_mean': []}

estimators_number = [10, 20, 30]
for i in estimators_number:
    clf = GradientBoostingClassifier(n_estimators=i, max_depth=3)
    
    start_time = datetime.datetime.now()
    
    # specify scoring with 'roc_auc' from sklearn library
    score = cross_val_score(clf, X, y,
                            scoring='roc_auc',
                            cv=kf, n_jobs=-1,
                            verbose=1)
    
    end_time = datetime.datetime.now()
    elapsed_time = (end_time-start_time).total_seconds()
    
    grad_boost_score['estimators_number'].append(i)
    grad_boost_score['elapsed_time'].append(elapsed_time)
    grad_boost_score['cv_1'].append(score[0])
    grad_boost_score['cv_2'].append(score[1])
    grad_boost_score['cv_3'].append(score[2])
    grad_boost_score['cv_4'].append(score[3])
    grad_boost_score['cv_5'].append(score[4])
    grad_boost_score['cv_mean'].append(score.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   24.8s finished


In [27]:
score_data = pd.DataFrame(grad_boost_score).set_index('estimators_number')
score_data

Unnamed: 0_level_0,elapsed_time,cv_1,cv_2,cv_3,cv_4,cv_5,cv_mean
estimators_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10,9.589308,0.66704,0.666011,0.657092,0.667096,0.665793,0.664607
20,17.249272,0.68183,0.682868,0.686565,0.683346,0.677659,0.682454
30,25.069898,0.686421,0.69148,0.682987,0.693875,0.690357,0.689024


3. Q: How long does it take to provide cross-validation with 30 trees in classifier?
   A: It took near 25 seconds (depends on hardware).
   
   Q: What quality was finally ahieved?
   A: For used configuration AUC-ROC score was 0.689+.
   
4. Q: Does it make sense to use more than 30 trees in gradient boosting classifier?
   A: In my opinion - yes, at least for some tasks this would help to achieve
   better results, since the quality of the classification is still growing
   (although only in the 3rd decimal place). And additionaly - if fill Nan values
   not with zeros but with very large or small values, then with growing number of
   trees in classifier quality increases even more obvious.
   
   Q: What could be proposed for improving time-performance for higher number of trees?
   A: Parallelizing with n_jobs=-1(already done in my code) and decreasing the max_depth
   parameter of trees.

In [28]:
# scaling
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled.shape

(97230, 102)

In [29]:
# Cross-validation of logistic regression
# with categorical features

# End result will be placed here
log_reg_score = {'regularisation_values': [],
                 'elapsed_time': [],
                 'cv_1': [], 'cv_2': [], 'cv_3': [],
                 'cv_4': [], 'cv_5': [], 'cv_mean': []}

regularisation_values = [i/10 for i in range(1, 17, 2)]
for i in regularisation_values:
    clf = LogisticRegression(C = i)
    
    start_time = datetime.datetime.now()
    
    # specify scoring with 'roc_auc' from sklearn library
    score = cross_val_score(clf, X_scaled, y,
                            scoring='roc_auc',
                            cv=kf, n_jobs=-1,
                            verbose=1)
    
    end_time = datetime.datetime.now()
    elapsed_time = (end_time-start_time).total_seconds()
    
    log_reg_score['regularisation_values'].append(i)
    log_reg_score['elapsed_time'].append(elapsed_time)
    log_reg_score['cv_1'].append(score[0])
    log_reg_score['cv_2'].append(score[1])
    log_reg_score['cv_3'].append(score[2])
    log_reg_score['cv_4'].append(score[3])
    log_reg_score['cv_5'].append(score[4])
    log_reg_score['cv_mean'].append(score.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out

In [30]:
score_data = pd.DataFrame(log_reg_score).set_index('regularisation_values')
score_data

Unnamed: 0_level_0,elapsed_time,cv_1,cv_2,cv_3,cv_4,cv_5,cv_mean
regularisation_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.1,2.615091,0.719479,0.716539,0.718242,0.718182,0.708747,0.716238
0.3,2.191625,0.71967,0.719643,0.711602,0.717704,0.714075,0.716539
0.5,2.062263,0.713052,0.714325,0.723801,0.723414,0.706665,0.716251
0.7,1.686012,0.716698,0.71447,0.720863,0.711855,0.718464,0.71647
0.9,1.680144,0.716256,0.718885,0.713064,0.718104,0.715081,0.716278
1.1,1.634618,0.718147,0.715618,0.713713,0.716548,0.717572,0.716319
1.3,1.670802,0.711373,0.716201,0.715047,0.72074,0.717982,0.716269
1.5,1.663038,0.711735,0.714788,0.717338,0.713061,0.724878,0.71636


LR better than GB. Probably due to:
1. changing value to zero instead of large or small outlier (better for LR than to GB) and 
2. scaling (without it LR MUCH worse)
3. quality of GB have not reached its mazimum 

In [31]:
categorical = ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
               'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero',
               'lobby_type']
X_reduced = X.drop(columns = categorical)
X_reduced_scaled = scaler.fit_transform(X_reduced)

In [32]:
# Cross-validation of logistic regression
# without categorical features

# End result will be placed here
log_reg_score = {'regularisation_values': [],
                 'elapsed_time': [],
                 'cv_1': [], 'cv_2': [], 'cv_3': [],
                 'cv_4': [], 'cv_5': [], 'cv_mean': []}

regularisation_values = [i/10 for i in range(1, 17, 2)]
for i in regularisation_values:
    clf = LogisticRegression(C = i)
    
    start_time = datetime.datetime.now()
    
    # specify scoring with 'roc_auc' from sklearn library
    score = cross_val_score(clf, X_reduced_scaled, y,
                            scoring='roc_auc',
                            cv=kf, n_jobs=-1,
                            verbose=1)
    
    end_time = datetime.datetime.now()
    elapsed_time = (end_time-start_time).total_seconds()
    
    log_reg_score['regularisation_values'].append(i)
    log_reg_score['elapsed_time'].append(elapsed_time)
    log_reg_score['cv_1'].append(score[0])
    log_reg_score['cv_2'].append(score[1])
    log_reg_score['cv_3'].append(score[2])
    log_reg_score['cv_4'].append(score[3])
    log_reg_score['cv_5'].append(score[4])
    log_reg_score['cv_mean'].append(score.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out

In [33]:
score_data = pd.DataFrame(log_reg_score).set_index('regularisation_values')
score_data

Unnamed: 0_level_0,elapsed_time,cv_1,cv_2,cv_3,cv_4,cv_5,cv_mean
regularisation_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.1,1.597741,0.712569,0.720271,0.7129,0.717308,0.719544,0.716519
0.3,1.492836,0.714585,0.714851,0.719786,0.718435,0.715135,0.716558
0.5,1.523015,0.717199,0.71981,0.710879,0.715389,0.71854,0.716363
0.7,1.549929,0.713584,0.710384,0.71839,0.721504,0.71713,0.716199
0.9,1.542138,0.71672,0.716505,0.715375,0.712049,0.720806,0.716291
1.1,1.523381,0.71669,0.718521,0.710279,0.717134,0.72012,0.716549
1.3,1.49423,0.713692,0.716508,0.714367,0.718809,0.718901,0.716455
1.5,1.576358,0.7126,0.719648,0.716227,0.711811,0.72099,0.716255


Quality increased, and it is not so obvious approach - REMOVING of important features can improve the model!

In [40]:
hero_types = []
for feature in categorical[0:-1]:
    for hero in X[feature]:
        if hero not in hero_types:
            hero_types.append(hero)
hero_types

[11,
 42,
 33,
 29,
 13,
 8,
 35,
 17,
 15,
 22,
 27,
 68,
 26,
 53,
 20,
 92,
 88,
 104,
 3,
 73,
 72,
 110,
 36,
 58,
 91,
 50,
 71,
 30,
 25,
 39,
 19,
 101,
 94,
 51,
 7,
 75,
 46,
 66,
 67,
 93,
 38,
 65,
 12,
 99,
 44,
 10,
 41,
 34,
 102,
 32,
 95,
 84,
 81,
 16,
 6,
 96,
 43,
 79,
 47,
 1,
 2,
 63,
 97,
 80,
 54,
 83,
 5,
 60,
 77,
 112,
 21,
 69,
 85,
 82,
 87,
 62,
 18,
 74,
 28,
 61,
 14,
 90,
 70,
 78,
 52,
 48,
 49,
 4,
 98,
 59,
 56,
 86,
 37,
 100,
 23,
 57,
 106,
 40,
 9,
 76,
 31,
 64,
 109,
 103,
 55,
 105,
 89,
 45]

In [None]:
# N — количество различных героев в выборке
X_pick = np.zeros((data.shape[0], N))

for i, match_id in enumerate(data.index):
    for p in xrange(5):
        X_pick[i, data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1