In [None]:
import numpy as np
import pandas as pd
import time
import subprocess
import pylab as pl
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

In [None]:
# Get the Accuracy for the Naive Approach (the team with more gold wins)
def naive_accuracy(X, y):
    """Print the accuracy of guessing the winner from the sign of ``Gold``.

    A game counts as correctly guessed when ``X['Gold']`` is positive and
    ``y`` equals 100, or when ``X['Gold']`` is negative and ``y`` equals 200.
    Rows where ``Gold`` is exactly 0 are never counted as correct.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``'Gold'`` column.
    y : pandas.Series or array-like
        Winner labels, one per row of ``X``.
        NOTE(review): 100/200 presumably stand for the blue/red team -- confirm.

    Returns
    -------
    None
        The result is only printed.
    """
    games = len(X)
    if games == 0:
        # Nothing to score; avoid a ZeroDivisionError on an empty selection.
        print('Naive Accuracy: n/a (no games)')
        return

    gold = X['Gold']
    # Build one combined mask per team instead of chaining two boolean
    # selections, which forced pandas to reindex the second mask.
    blue = int(((gold > 0) & (y == 100)).sum())
    red = int(((gold < 0) & (y == 200)).sum())
    print('Naive Accuracy: {:.2f}%'.format(float(blue + red) / games * 100))

In [None]:
def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the training data and print how long the fit took.

    The estimator is fitted in place (``clf.fit`` is called once); nothing is
    returned. The elapsed time is wall-clock time measured with ``time.time``.
    """
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("Training {}...".format(clf.__class__.__name__))
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print("Done!\nTraining time: {:.3f} secs".format(end - start))

In [None]:
def predict_labels(clf, features, target, text):
    """Predict with ``clf`` and return the accuracy as a percentage.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict``.
    features : array-like
        Samples passed to ``clf.predict``.
    target : array-like
        True labels for ``features``.
    text : bool
        When truthy, print a progress line, the prediction time and the
        confusion matrix.

    Returns
    -------
    float
        ``accuracy_score(target, y_pred) * 100``.
    """
    if text:
        print("Predicting labels using {}...".format(clf.__class__.__name__))
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    if text:
        print("Done!\nPrediction time: {:.3f} secs".format(end - start))
        print("Confusion Matrix:\n {}".format(confusion_matrix(target, y_pred)))
    # Hand ``target`` over unchanged (as confusion_matrix already does):
    # accuracy_score accepts any array-like, so lists and NumPy arrays work
    # as well as pandas Series.
    return accuracy_score(target, y_pred) * 100

In [None]:
def learning_curve(clf, X_train, y_train, X_test, y_test):
    """Plot accuracy as a function of the number of training samples.

    ``clf`` is refitted on the first ``s`` training rows for 50 evenly spaced
    sizes ``s`` ending at ``len(X_train)``. After each fit the accuracy
    returned by ``predict_labels`` is recorded on the rows used for fitting
    and on the full test set, and both series are passed to
    ``learning_curve_graph``.

    Note: the ``*_f1`` arrays hold accuracy percentages (what
    ``predict_labels`` returns), not F1 scores; the names are kept to match
    the plotting helper's parameters.

    The estimator is left fitted on the largest size. Nothing is returned.
    """
    n_samples = len(X_train)
    # Start at 100 samples, or at the full set when fewer are available, so
    # the sizes never exceed the data and never run backwards.
    first_size = min(100, n_samples)
    sizes = np.round(np.linspace(first_size, n_samples, 50)).astype(int)
    train_f1 = np.zeros(len(sizes))
    test_f1 = np.zeros(len(sizes))

    for i, s in enumerate(sizes):
        # Fit on the first s rows only.
        X_subset = X_train[:int(s)]
        y_subset = y_train[:int(s)]
        clf.fit(X_subset, y_subset)

        # Training performance is measured on the rows the model was fitted
        # on; scoring the whole training set would mix in unseen rows.
        train_f1[i] = predict_labels(clf, X_subset, y_subset, False)
        test_f1[i] = predict_labels(clf, X_test, y_test, False)

    # Plot learning curve graph
    learning_curve_graph(sizes, train_f1, test_f1)

In [None]:
def learning_curve_graph(sizes, train_f1, test_f1):
    """Draw the test and train score series against ``sizes`` on a new figure.

    NOTE(review): the labels say "F1 Score", but the callers in this notebook
    fill both series with accuracy percentages from ``predict_labels`` --
    confirm which metric is intended.
    """
    pl.figure()
    # The test series is drawn first, so it takes the first colour and the
    # first legend entry.
    for scores, legend_text in ((test_f1, 'Test F1 Score'),
                                (train_f1, 'Train F1 Score')):
        pl.plot(sizes, scores, lw=2, label=legend_text)
    pl.title('Performance vs Training Size')
    pl.xlabel('Training Size')
    pl.ylabel('F1 Score')
    pl.legend()
    pl.show()

In [None]:
def validation_curve(clf, X_train, y_train, X_test, y_test,
                     param_name='min_samples_split',
                     param_values=(50, 100, 150, 200, 250, 300)):
    """Plot accuracy as a function of one hyperparameter of ``clf``.

    For every value in ``param_values`` the hyperparameter ``param_name`` is
    set on ``clf``, the estimator is refitted on the full training set, and
    the accuracy returned by ``predict_labels`` is recorded for the training
    and the test set. Both series are then plotted.

    Parameters
    ----------
    clf : estimator
        Must expose ``set_params`` and ``fit``. It is modified in place and
        is left fitted with the last value of ``param_values``.
    X_train, y_train, X_test, y_test
        Training and test data, passed through to ``fit`` and
        ``predict_labels``.
    param_name : str, optional
        Name of the hyperparameter to sweep (default ``'min_samples_split'``).
    param_values : sequence, optional
        Values to try (default ``50, 100, ..., 300``).

    Note: the ``*_f1`` arrays hold accuracy percentages, not F1 scores.
    """
    param_values = list(param_values)
    train_f1 = np.zeros(len(param_values))
    test_f1 = np.zeros(len(param_values))

    for i, value in enumerate(param_values):
        # set_params validates the name, so an estimator without this
        # hyperparameter fails loudly instead of being refitted unchanged
        # (plain attribute assignment was a silent no-op in that case).
        clf.set_params(**{param_name: value})
        clf.fit(X_train, y_train)
        # Find the performance on the training and testing set
        train_f1[i] = predict_labels(clf, X_train, y_train, False)
        test_f1[i] = predict_labels(clf, X_test, y_test, False)

    # NOTE(review): this reuses the learning-curve plot, whose x-axis label
    # reads 'Training Size' although the x values are hyperparameter values.
    learning_curve_graph(param_values, train_f1, test_f1)

In [None]:
def validation_curve_graph(n_estimators, train_f1, test_f1, param_label='Max Depth'):
    """Plot the train and test series against the swept parameter values.

    Parameters
    ----------
    n_estimators : sequence
        X-axis values (the hyperparameter values that were tried).
    train_f1, test_f1 : sequence
        Y-axis values for the training and the test set.
    param_label : str, optional
        Name of the swept parameter used in the title and the x-axis label.
        Defaults to ``'Max Depth'``, which reproduces the original labels.

    NOTE(review): the legend and y-axis say "error" while the arguments are
    named ``*_f1`` -- confirm which quantity callers pass.
    """
    pl.figure()
    pl.title('Decision Trees: Performance vs {}'.format(param_label))
    # Plot against the function's own argument; the previous body referenced
    # an undefined name ``max_depth`` and raised NameError.
    pl.plot(n_estimators, test_f1, lw=2, label = 'test error')
    pl.plot(n_estimators, train_f1, lw=2, label = 'training error')
    pl.legend()
    pl.xlabel(param_label)
    pl.ylabel('Error')
    pl.show()

In [None]:
def get_feature_importances(estimator):
    """Return per-feature importances taken from a fitted estimator.

    ``feature_importances_`` is returned as-is when present. Otherwise the
    absolute values of ``coef_`` are used: directly for a 1-D ``coef_``, or
    summed over axis 0 when ``coef_`` has more dimensions.

    Raises
    ------
    ValueError
        If the estimator has neither ``feature_importances_`` nor ``coef_``.
    """
    if hasattr(estimator, "feature_importances_"):
        return estimator.feature_importances_

    if not hasattr(estimator, "coef_"):
        raise ValueError(
            "The underlying estimator %s has no `coef_` or "
            "`feature_importances_` attribute. Either pass a fitted estimator"
            " to SelectFromModel or call fit before calling transform."
            % estimator.__class__.__name__)

    coefficients = estimator.coef_
    if coefficients.ndim == 1:
        return np.abs(coefficients)
    # One row of coefficients per class/output: collapse to one value per feature.
    return np.sum(np.abs(coefficients), axis=0)

In [None]:
# Read the CSV with the game data.
# keep_default_na=False: pandas' default NA marker strings are not parsed as NaN.
game_data = pd.read_csv('gameData_Diffs_10Minutes.csv',keep_default_na=False)
# Single-argument print(...) is valid on both Python 2 and Python 3.
print(game_data.columns)

In [None]:
# Optional filter (disabled): keep only games with |Gold| <= 1000
#game_data = game_data[abs(game_data['Gold'])<=1000]

# Data Set for learning: the last 14 columns are the features, 'winner' is the label
X_all = game_data[game_data.columns[-14:]]
y_all = game_data['winner']

# Standardize every feature; index and column names are carried over so rows
# can still be matched back to game_data.
X_all = pd.DataFrame(preprocessing.scale(X_all), index = X_all.index, columns = X_all.columns)

# Separate data into train/test samples.
# A fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_all, y_all, test_size=0.25, random_state=42)
print("Training set: {} samples".format(X_train.shape[0]))
print("Test set: {} samples".format(X_test.shape[0]))

In [None]:
# The naive rule compares Gold with zero, so it has to see the raw gold
# difference. X_all/X_train/X_test are standardized, where 0 is the column
# mean rather than "equal gold"; the raw rows are looked up through the index
# the scaled frames share with game_data.
gold_raw = game_data[['Gold']]

print('Overall: ')
naive_accuracy(gold_raw, y_all)
print('Train set: ')
naive_accuracy(gold_raw.loc[X_train.index], y_train)
print('Test set: ')
naive_accuracy(gold_raw.loc[X_test.index], y_test)

In [None]:
# Train Classifier
# Alternative models that were tried; uncomment one to swap it in:
#clf = LogisticRegression(solver='sag')
#clf = RandomForestClassifier(n_estimators=1000)
# Fixed random_state so repeated runs train the same model.
clf = AdaBoostClassifier(random_state=42)
#clf = GradientBoostingClassifier()
#clf = SVC()
train_classifier(clf, X_train, y_train)

In [None]:
# Grid Search implementation to check the best configuration for Ada Boost
tuned_parameters = [{'n_estimators': [1,10,50,100], 'learning_rate': [0.1, 0.3, 0.5, 0.7, 1]}]
# Fixed random_state so the search scores are repeatable between runs.
# Note: from here on ``clf`` is the GridSearchCV wrapper, not a bare AdaBoost model.
clf = GridSearchCV(AdaBoostClassifier(random_state=42), param_grid=tuned_parameters)
clf.fit(X_train, y_train)


# Evaluate Classifier's performance on Train Data
# (predict_labels returns a percentage, hence the '%' on both lines)
print("Accuracy for training set: {}%".format(predict_labels(clf, X_train, y_train, True)))

# Evaluate Classifier's performance on Test Data
print("Accuracy for test set: {}%".format(predict_labels(clf, X_test, y_test, True)))

# Best hyper-parameter combination found by the search
print(clf.best_estimator_)

In [None]:
# Evaluate Classifier's performance on Train Data
# (predict_labels returns a percentage, hence the '%' on both lines)
print("Accuracy for training set: {}%".format(predict_labels(clf, X_train, y_train, True)))

# Evaluate Classifier's performance on Test Data
print("Accuracy for test set: {}%".format(predict_labels(clf, X_test, y_test, True)))

In [None]:
# When ``clf`` is a fitted GridSearchCV it has no feature_importances_/coef_
# of its own; read them from the refitted best estimator. A plain estimator
# is used as-is.
fitted_model = getattr(clf, 'best_estimator_', clf)
importance = get_feature_importances(fitted_model)
print('Feature importance')
# Pair every feature name with its importance, shown as a percentage.
for col, value in zip(X_all.columns, importance):
    print(col + ' = ' + "{0:.2f}".format(value * 100))

In [None]:
# Predict the winner of every game and store the predictions in out.csv
# (single 'winner' column plus the default row index).
predicted_winner = clf.predict(X_all)
resultados = pd.DataFrame({'winner': predicted_winner})
resultados.to_csv('out.csv')