In [216]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

from datetime import datetime

In [217]:
# Value written into missing feature cells by the gap-filling step.
# NOTE(review): the name suggests the gaps are event times and that this is an
# "event never happened" sentinel — confirm against the data description.
na_time = 10000
# Number of folds passed to KFold for cross-validation.
kfold_splits = 5
# n_estimators values to evaluate for GradientBoostingClassifier.
ns_estimators = [5]

# files
# CSV with the feature table; it is read with "match_id" as the index column.
features_csv = "path/to/features.csv"

In [218]:
# Load the feature table, using the "match_id" column as the row index.
features_0 = pd.read_csv(features_csv, index_col="match_id")
# `features` is a second name for the same DataFrame object (no copy is made).
features = features_0
# Label column; later used as the target for fitting and for ROC AUC scoring.
y = features["radiant_win"]

In [219]:
# Remove the trailing six columns, i.e. the "future" columns.
X = features.drop(columns=features.columns[-6:])

# Locate the columns with gaps (fewer non-null entries than there are rows)
# and fill those gaps with `na_time`.
counts = X.count()
total = len(X.index)
with_gaps = counts.index[counts < total].tolist()

X[with_gaps] = X[with_gaps].fillna(na_time)

In [220]:
# Convert to plain NumPy arrays so the integer index arrays produced by the
# K-fold splitter can be used for positional row selection.
# np.asarray (instead of `.values`) keeps this cell re-runnable: it returns
# the input unchanged when X / y are already arrays, whereas `.values` would
# raise AttributeError on a second execution.
X = np.asarray(X)
y = np.asarray(y)

In [221]:
# Fixed seed: with shuffle=True and no random_state the folds differ on every
# run and on every call to kf.split, so scores would not be reproducible and
# different n_estimators values would be evaluated on different folds.
random_seed = 42
kf = KFold(n_splits=kfold_splits, shuffle=True, random_state=random_seed)
qualities = {}   # n_estimators -> mean ROC AUC over the folds
times = {}       # n_estimators -> wall-clock time of the whole fold loop
pred_times = {}  # n_estimators -> mean predict_proba time per fold

# for different n_estimators measure time and quality
for n_estimators in ns_estimators:
    quality = []
    pred_time = []
    start_time = datetime.now()
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = GradientBoostingClassifier(
            n_estimators=n_estimators,
            random_state=random_seed,
        )
        clf.fit(X_train, y_train)
        # Time only the probability prediction on the held-out fold.
        pred_start_time = datetime.now()
        # Column 1 holds the probability of the second entry in clf.classes_.
        y_pred = clf.predict_proba(X_test)[:, 1]
        pred_time.append(datetime.now() - pred_start_time)
        quality.append(roc_auc_score(y_test, y_pred))
    # np.mean over timedelta objects yields a timedelta (sum divided by count).
    pred_times[n_estimators] = np.mean(pred_time)
    # Covers fitting, prediction and scoring for all folds.
    times[n_estimators] = datetime.now() - start_time
    qualities[n_estimators] = np.mean(quality)

In [222]:
# Report the mean ROC AUC per n_estimators value.
print("qualities:")
print(qualities)

print("\nlearn_times:")
for elapsed in times.values():
    # total_seconds() is the full duration; `.seconds` is only the seconds
    # component and silently drops whole days.
    print(int(elapsed.total_seconds()), "secs")

print("\npred_times:")
for elapsed in pred_times.values():
    # `.microseconds` is only the sub-second component, so a 1.2 s prediction
    # would have been reported as 200000 microsecs; use the full duration.
    print(round(elapsed.total_seconds() * 1_000_000), "microsecs")


qualities:
{5: 0.6404071763133969}

learn_times:
21 secs

pred_times:
10269 microsecs
