In [61]:
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [1]:
# Directory (relative path) from which the training CSV is read.
path_data = "../test/sklearn/training/"

In [5]:
# Build the file location first, then parse it; the first row of the file
# supplies the column names.
csv_file = os.path.join(path_data, "2014.csv")
data = pd.read_csv(csv_file, sep=",", header=0)

In [6]:
# Preview the first five rows (five is the default for head()).
data.head()

Unnamed: 0,matches,label,diff_points,away,home,neutral,losses,against,for,draws,wins
0,AR/BO_2011/7/1,0.0,0.0,0.032756,0.002676,-0.035431,-0.304048,-0.877752,0.817753,0.004506,0.299542
1,AR/BO_2011/11/11,0.0,0.0,0.032756,0.002676,-0.035431,-0.304048,-0.877752,0.817753,0.004506,0.299542
2,CO/BO_2011/7/10,2.0,2.0,0.00295,-0.055155,0.052205,-0.164188,-0.691664,0.13653,0.028377,0.135812
3,CO/BO_2013/3/22,2.0,5.0,0.00295,-0.055155,0.052205,-0.164188,-0.691664,0.13653,0.028377,0.135812
4,EC/BO_2012/9/7,2.0,1.0,-0.002653,-0.039169,0.041821,-0.054642,-0.281079,0.079487,0.013882,0.04076


## Simple logistic regression model

### X features

In [13]:
# Names of the predictor columns handed to the classifier.
features_columns = [
    "away", "home", "neutral",
    "losses", "against", "for",
    "draws", "wins"
]

# Feature matrix: every row, restricted to the predictor columns.
X = data.loc[:, features_columns]

print(X.shape)

(2072, 8)


### y response

In [14]:
# Response vector: the "label" column of the loaded table.
y = data["label"]
print(y.shape)

(2072,)


In [16]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42
)

In [18]:
# Sanity check: shapes of the train and test partitions (features, then labels).
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

((1554, 8), (518, 8))
((1554,), (518,))


## scikit-learn model

In [47]:
# Step 2: build the baseline classifier (L2 penalty, C = 1).
logreg = LogisticRegression(C=1, penalty='l2')

# Step 3: train on the training partition. The call is left as the cell's
# final expression so the fitted estimator is displayed.
logreg.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
# Step 4: predicted labels for both partitions, from the fitted baseline model.
pred_train, pred_test = logreg.predict(X_train), logreg.predict(X_test)

In [49]:
# Fraction of training rows the baseline model labels correctly.
train_accuracy = accuracy_score(y_train, pred_train, normalize=True)
print("Train accuracy: {0}".format(train_accuracy))

Train accuracy: 0.593951093951


In [50]:
# Fraction of held-out rows the baseline model labels correctly.
test_accuracy = accuracy_score(y_test, pred_test, normalize=True)
print("Test accuracy: {0}".format(test_accuracy))

Test accuracy: 0.557915057915


### Grid search with cross-validation

In [81]:
# Show the estimator's documentation (solver / multi_class options) before building the grid.
help(LogisticRegression)

Help on class LogisticRegression in module sklearn.linear_model.logistic:

class LogisticRegression(sklearn.base.BaseEstimator, sklearn.linear_model.base.LinearClassifierMixin, sklearn.linear_model.base.SparseCoefMixin)
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |  
 |  In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
 |  scheme if the 'multi_class' option is set to 'ovr', and uses the cross-
 |  entropy loss if the 'multi_class' option is set to 'multinomial'.
 |  (Currently the 'multinomial' option is supported only by the 'lbfgs',
 |  'sag' and 'newton-cg' solvers.)
 |  
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 'sag' and 'lbfgs' solvers. It can handle
 |  both dense and sparse input. Use C-ordered arrays or CSR matrices
 |  containing 64-bit floats for optimal performance; any other input format
 |  will be converted (and copied).
 |  
 |  The 'newton-cg', 'sag', and 'lbfgs' solve

In [34]:
# help(GridSearchCV)

In [73]:
# Search space, expressed as two sub-grids so that each penalty is paired with
# its own list of solvers.
l2_grid = dict(
    penalty=['l2'],
    C=[1, 10, 100, 1000],
    solver=['newton-cg', 'sag', 'lbfgs'],
    fit_intercept=[True, False]
)
l1_grid = dict(
    penalty=['l1'],
    C=[4.0, 2.0, 1.333, 1.0],
    solver=['liblinear', 'saga'],
    fit_intercept=[True, False]
)

tuned_parameters = [l2_grid, l1_grid]

In [74]:
# Grid-search wrapper around logistic regression: 4-fold cross-validation,
# candidates ranked by accuracy.
# random_state is pinned on the base estimator because the 'sag' and 'saga'
# solvers listed in tuned_parameters shuffle the data; without a seed the
# selected parameters and scores can change from one run to the next.
clf = GridSearchCV(
    LogisticRegression(max_iter=200, random_state=42),
    tuned_parameters,
    cv=4,
    scoring='accuracy'
)

In [75]:
# Run the cross-validated search on the training partition only; the test partition is not passed in.
clf.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'penalty': ['l2'], 'C': [1, 10, 100, 1000], 'fit_intercept': [True, False], 'solver': ['newton-cg', 'sag', 'lbfgs']}, {'penalty': ['l1'], 'C': [4.0, 2.0, 1.333, 1.0], 'fit_intercept': [True, False], 'solver': ['liblinear', 'saga']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [76]:
# Parameter combination selected by the grid search.
print(clf.best_params_)

{'penalty': 'l2', 'C': 1, 'solver': 'newton-cg', 'fit_intercept': True}


In [77]:
# Accuracy of the grid-searched model on the rows it was trained on.
y_pred = clf.predict(X_train)
grid_train_accuracy = accuracy_score(y_train, y_pred, normalize=True)
print("Train accuracy: {0}".format(grid_train_accuracy))

Train accuracy: 0.593951093951


In [80]:
# Accuracy of the grid-searched model on the held-out partition.
y_pred = clf.predict(X_test)
# The label says "Test" because the score is computed from X_test / y_test;
# it previously read "Train accuracy", which mislabelled the number.
print("Test accuracy: {0}".format(accuracy_score(y_test, y_pred, normalize=True)))

Train accuracy: 0.559845559846
