# Import libraries

In [1]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful

# Download data from competition's page

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/data

# Load data using pandas

In [2]:
# Read the three competition files from the working directory:
# the labelled training games, the games to predict, and the submission template.
train, test, sample_submission = [
    pd.read_csv(filename)
    for filename in ('train2.csv', 'test2.csv', 'sample_submission.csv')
]

# Data

In [3]:
# print first row
train[:1]

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True


In [4]:
test[:1]

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161


In [5]:
# Target variable is "target" and this means we will be predicting it
sample_submission[:1]

Unnamed: 0,Id,target
0,0,0.5


## Quick look at the unique values in data...

In [6]:
# Sorted list of the distinct team ids appearing as "team2" in train.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(sorted(train['team2'].unique()))

[2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 114, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 2

In [7]:
# Sorted list of the distinct team ids appearing as "team2" in test.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(sorted(test['team2'].unique()))

[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 230, 

# Cross-validation

### Let's split the data randomly into train and validation sets. We will train our algorithms on the selected train set and validate them on the validation set. Easy as can be!

In [8]:
# train size
train.shape 

(101609, 7)

train is quite big, so for example purposes we'll sample only part of it

In [9]:
# sklearn.cross_validation is deprecated (and removed in newer scikit-learn);
# ShuffleSplit now lives in sklearn.model_selection.
from sklearn.model_selection import ShuffleSplit

# One random split: 40% of the rows for training, 10% for validation.
# random_state is fixed so the same rows are drawn on every run.
splitter = ShuffleSplit(n_splits=1, train_size=0.4, test_size=0.1, random_state=0)

# split() yields (train_indices, validation_indices) pairs of positional
# row indices; we only asked for one split, so take the first pair.
itr, ite = next(splitter.split(train))



information about all functions can be found on the internet, for example

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html

In [10]:
# or you can open it in you Jupyter notebook executing function in this manner
?ShuffleSplit()

In [11]:
len(itr), len(ite)

(40643, 10161)

In [12]:
itr[:5], ite[:5]

(array([22710, 41665, 91975, 57348, 39931]),
 array([ 37078, 101474,  29858,  61674,   1049]))

now we have validation set "ite" to check the quality of our solution

# features and target

In [17]:
sample_submission[:2]

Unnamed: 0,Id,target
0,0,0.5
1,1,0.5


we need to change 'target' column in "sample_submission" to our predictions.

For now we will select only features that are present in both train and test:

In [18]:
# Collect the columns usable as model inputs: they must exist in both
# train and test, and must not be the label column itself.
features = []
for c in train.columns:
    if c in test.columns and c != 'target':
        features.append(c)
        # print(...) with one argument works on both Python 2 and Python 3
        print('"{}" is present in test and train'.format(c))
    else:
        print('"{}" is NOT present in test'.format(c))

# show the resulting feature list as the cell output
features

"year" is present in test and train
"day" is NOT present in test
"team1" is present in test and train
"team2" is present in test and train
"score1" is NOT present in test
"score2" is NOT present in test
"target" is NOT present in test


['year', 'team1', 'team2']

Here we split train into "train" and "validation" parts:

In [19]:
# itr / ite are positional row numbers produced by ShuffleSplit, so rows are
# selected with .iloc (position-based). Label-based .loc would only give the
# same rows while train keeps its default 0..n-1 index.
xtrain = train[features].iloc[itr]
ytrain = train['target'].iloc[itr]

xval = train[features].iloc[ite]
yval = train['target'].iloc[ite]

# Baseline solution

Let's build a baseline first by predicting the mean value of the target.

In [20]:
train.target.mean()

0.50096940231672393

In [21]:
# Baseline: one prediction per validation row, each equal to the mean of the
# target column over the whole train frame.
constant_prediction = np.full(len(yval), train.target.mean())
constant_prediction

array([ 0.5009694,  0.5009694,  0.5009694, ...,  0.5009694,  0.5009694,
        0.5009694])

In [22]:
log_loss(yval, constant_prediction)

0.6931565015839517

In [23]:
# Start from a copy of the submission template so sample_submission itself stays untouched.
submission = sample_submission.copy()
# Fill every row with the mean of the train target (a scalar is broadcast to the whole column).
submission.target = train['target'].mean() # note: a column can be accessed in two ways - as an attribute (submission.target) or by key (train['target'])
# Write the file to upload; index=False keeps the DataFrame index out of the CSV.
submission.to_csv('constant_submission.csv', index=False)

In [24]:
submission.head(4)

Unnamed: 0,Id,target
0,0,0.500969
1,1,0.500969
2,2,0.500969
3,3,0.500969


Now this should score like "Baseline - Constant" on Leaderboard!
You can submit this by going to 

https://inclass.kaggle.com/c/data-mining-in-action-2016-competitions-01/submissions/attach

# Machine learning

Finally, let's try machine learning!

In [25]:
alg = linear_model.LogisticRegression()
alg.fit(xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
prediction = alg.predict_proba(xval)[:,1]

In [27]:
log_loss(yval, prediction)

0.69278081748011511

### Well, not so far from the constant solution... Let's try to understand why.

What a linear model such as LogisticRegression tries to do is multiply each variable by some coefficient and add it all up; in our case:

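y_predicted = sigmoid(column1 \* coef1 + column2 \* coef2 + column3 \* coef3 + bias)

where sigmoid(z) = 1 / (1 + exp(-z)) squashes the linear sum into a probability between 0 and 1.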

We can print coefficients and bias:

In [28]:
alg.coef_, alg.intercept_

(array([[  1.78001477e-07,   3.28971478e-04,  -2.98418706e-04]]),
 array([  3.70539525e-09]))

But clearly, "team1" and "team2" are _categorical_ columns, just like names of the teams. 

So we need to turn the "team" columns into something a linear algorithm can work with. For example, we want to go from the first few rows here:

In [29]:
train.head(3)

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True
1,2998,28,61,29,301,259,True
2,2998,28,110,141,359,267,True


In [30]:
train.loc[:5, 'team1']

0    317
1     61
2    110
3    352
4    229
5    164
Name: team1, dtype: int64

To this:

In [31]:
pd.get_dummies(train.loc[:5, 'team1'])

Unnamed: 0,61,110,164,229,317,352
0,0,0,0,0,1,0
1,1,0,0,0,0,0
2,0,1,0,0,0,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,1,0,0,0


So each team name now has its own column. Read about "pd.get_dummies" here:

http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html

### But let's come back to more interesting stuff for now
### We are competition solvers, remember? Let's dive into the space of more complicated models!

In [32]:
# Random forest with 15 trees, trained on 4 cores.
# random_state is fixed so the validation score below is reproducible between runs.
alg = ensemble.RandomForestClassifier(n_estimators=15, n_jobs=4, random_state=0)
alg.fit(xtrain, ytrain)
# predict_proba returns one column per class; column 1 is the second class
# (target == True for a boolean target).
prediction = alg.predict_proba(xval)[:,1]

In [33]:
log_loss(yval, prediction)

1.1141510822605858

Surprisingly, this doesn't work very well. Now, like a competition pro, let's make our models bigger!

In [34]:
# Same model as before but ten times bigger: 150 trees, trained on 4 cores.
# random_state is fixed so the validation score below is reproducible between runs.
alg = ensemble.RandomForestClassifier(n_estimators=150, n_jobs=4, random_state=0)
alg.fit(xtrain, ytrain)
# predict_proba returns one column per class; column 1 is the second class
# (target == True for a boolean target).
prediction = alg.predict_proba(xval)[:,1]

In [35]:
log_loss(yval, prediction)

0.74357620371501032

### Almost there! But for now let's skip this model too and go to _real_ competitions stuff

In [36]:
import xgboost

In [37]:
# XGBoost booster settings, kept in one dict literal.
param = {
    'max_depth': 8,
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.1,
}

# number of boosting rounds
numround = 100

In [38]:
x = xgboost.XGBClassifier()

Xgboost parameters

https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

In [41]:
# Peek at the first rows only; displaying the whole frame floods the notebook output.
xtrain.head(10)

Unnamed: 0,year,team1,team2
22710,3003,45,289
41665,3007,229,228
91975,3018,66,261
57348,3011,289,251
39931,3007,173,139
46413,3008,228,143
15490,3001,63,337
13518,3001,347,284
47757,3009,44,119
88958,3017,250,73


In [40]:
# Wrap the train and validation parts in XGBoost's DMatrix container.
Xdatatrain = xgboost.DMatrix(data=xtrain, label=ytrain)
Xdatatest = xgboost.DMatrix(data=xval, label=yval)

# Parameters as a list of (name, value) pairs.
plst = list(param.items())
# Datasets whose metric is reported while training, with their display names.
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]

# Train the booster and report the metric every 10 rounds. This call must
# actually run: the submission cell further down predicts with `bst`.
bst = xgboost.train(plst, Xdatatrain, numround, evals=watchlist, verbose_eval=10)

Wow! Finally our model is better than the constant prediction! Congratulations! Don't hesitate, submit!

In [95]:
# Score the test set with the trained booster, using the same feature columns as in training.
test_matrix = xgboost.DMatrix(test[features])

# Fill a copy of the submission template with the predictions and save it.
ss = sample_submission.copy()
ss['target'] = bst.predict(test_matrix)
ss.to_csv('mighty_xgboost.csv', index=False)

### Strange, but it seems like we got 0.658 instead of 0.649! 

### What could it be? Perhaps we need to train on all the data instead of just 40% of it? Or maybe we should rethink our cross-validation process?

### Let's overview now what we just did here:
1) made cross-validation

2) tried linear models, they didn't work, but we figured out how to tackle this problem

3) tried random forest and almost beat constant benchmark

4) tried xgboost and finally beat constant prediction!

### But there is one last thing you must know before you start this challenge by trying to do the most thorough parameter tuning: the data has its secrets, and those who find them will be generously rewarded...

### now, good luck with it!