### One-Hot Encoder with all features, including group_1, char_10, char_48

In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import hstack

from sklearn.preprocessing import OneHotEncoder
from sklearn import cross_validation as cv
from sklearn.metrics import roc_auc_score
import xgboost as xgb

from helper_function import process
from helper_function import uniqueValue
from helper_function import visualizeDist

%matplotlib inline

# Process the data

### Load the dataset

In [2]:
# Load the training and test activity files through the project helper.
# Each process() call returns a pair, unpacked here as (ids, table).
(train_id, train_total), (test_id, test_total) = [
    process(csv_path)
    for csv_path in ('./data/act_train.csv', './data/act_test.csv')
]

### Study the characteristics of the dataset

In [4]:
# Columns that will be one-hot encoded: the activity category, group_1,
# the date parts of the activity and of the person, and the char_* columns
# char_1..char_10, char_12..char_19 and char_48.
hotFeatures = [
    'activity_category', 'group_1', 'char_10', 'char_48',
    'activity_year', 'activity_month', 'activity_day',
    'people_year', 'people_month', 'people_day',
]
hotFeatures += ['char_%d' % n for n in range(1, 10)]
hotFeatures += ['char_%d' % n for n in range(12, 20)]

# Every other column except the label is passed through without encoding.
noFeatures = [
    col for col in train_total.columns
    if col != 'outcome' and col not in hotFeatures
]

In [5]:
# One-hot encode the categorical columns. The encoder is fitted on the
# training rows only and then reused for the test rows, so both matrices
# share the same column layout.
# handle_unknown='ignore' makes a category that occurs only in the test set
# encode as all zeros instead of aborting transform() with a ValueError.
encorder = OneHotEncoder(handle_unknown='ignore')
hotTrain = encorder.fit_transform(train_total[hotFeatures])
hotTest = encorder.transform(test_total[hotFeatures])

# Append the remaining (non-encoded) columns to the right of the encoded ones.
X_train_total = hstack((hotTrain, train_total[noFeatures]))
y_train_total = train_total['outcome']
X_test_total = hstack((hotTest, test_total[noFeatures]))

In [6]:
# Sanity check: both matrices must have the same number of columns.
for label, matrix in (("Training set: ", X_train_total), ("Test set: ", X_test_total)):
    print(label, matrix.shape)

Training set:  (2197291, 36942)
Test set:  (498687, 36942)


# Start Learning

### Random State

In [7]:
# Seed handed to the train/CV split so the partition is reproducible.
random_state = 42

### Split the training set

In [8]:
# Hold out 30% of the encoded training rows as a validation ("CV") set.
X_train, X_cv, y_train, y_cv = cv.train_test_split(
    X_train_total,
    y_train_total,
    test_size=0.3,
    random_state=random_state,
)

### Train the classifier

In [9]:
# Wrap both partitions in XGBoost DMatrix containers with labels attached.
xg_train = xgb.DMatrix(data=X_train, label=y_train)
xg_cv = xgb.DMatrix(data=X_cv, label=y_cv)

In [10]:
# Booster configuration.
# NOTE(review): the booster is 'gblinear', while max_depth, colsample_*,
# subsample and min_child_weight are documented by XGBoost as tree-booster
# parameters -- presumably they have no effect here; confirm whether
# 'gbtree' was intended.
param = {
    'booster': 'gblinear',
    'max_depth': 11,
    'eta': 0.05,
    'silent': 0,
    'objective': 'binary:logistic',
    'nthread': 2,
    'eval_metric': 'auc',
    'colsample_bytree': 0.92,
    'colsample_bylevel': 0.9,
    'subsample': 0.85,
    'min_child_weight': 0,
    # Regularisation settings currently disabled:
    #   'lambda': 5, 'lambda_bias': 0, 'alpha': 1
}

# Train for a fixed number of boosting rounds, reporting AUC on both the
# training partition and the held-out CV partition after every round.
num_round = 325
watchlist = [(xg_train, 'train'), (xg_cv, 'CV')]
bst = xgb.train(
    params=param,
    dtrain=xg_train,
    num_boost_round=num_round,
    evals=watchlist,
)

[0]	train-auc:0.919403	CV-auc:0.919078
[1]	train-auc:0.937581	CV-auc:0.937165
[2]	train-auc:0.952279	CV-auc:0.951705
[3]	train-auc:0.964311	CV-auc:0.963607
[4]	train-auc:0.973731	CV-auc:0.972980
[5]	train-auc:0.980440	CV-auc:0.979675
[6]	train-auc:0.984842	CV-auc:0.984067
[7]	train-auc:0.987680	CV-auc:0.986901
[8]	train-auc:0.989590	CV-auc:0.988813
[9]	train-auc:0.990956	CV-auc:0.990181
[10]	train-auc:0.991977	CV-auc:0.991207
[11]	train-auc:0.992770	CV-auc:0.992002
[12]	train-auc:0.993403	CV-auc:0.992637
[13]	train-auc:0.993922	CV-auc:0.993156
[14]	train-auc:0.994354	CV-auc:0.993590
[15]	train-auc:0.994720	CV-auc:0.993957
[16]	train-auc:0.995032	CV-auc:0.994271
[17]	train-auc:0.995301	CV-auc:0.994542
[18]	train-auc:0.995534	CV-auc:0.994778
[19]	train-auc:0.995738	CV-auc:0.994985
[20]	train-auc:0.995917	CV-auc:0.995167
[21]	train-auc:0.996075	CV-auc:0.995327
[22]	train-auc:0.996216	CV-auc:0.995470
[23]	train-auc:0.996341	CV-auc:0.995598
[24]	train-auc:0.996454	CV-auc:0.995712
[25]	train

In [11]:
# Score both partitions with the fitted booster and report their ROC AUC.
pred_train = bst.predict(xg_train)
pred_cv = bst.predict(xg_cv)
for split_name, truth, scores in (("Train: ", y_train, pred_train),
                                  ("CV: ", y_cv, pred_cv)):
    print(split_name, roc_auc_score(truth, scores))

Train:  0.997760477782
CV:  0.996877319652


### Create output file

In [12]:
# Refit on the complete training set, then score the test set.
# The round count is passed explicitly: xgb.train() falls back to its default
# of 10 boosting rounds when it is omitted, which would make the submitted
# model differ from the one validated above with num_round rounds.
xg_train_total = xgb.DMatrix(X_train_total, label=y_train_total)
xg_test_total = xgb.DMatrix(X_test_total)
bst = xgb.train(param, xg_train_total, num_boost_round=num_round)
pred_test = bst.predict(xg_test_total)

In [13]:
# Write the submission file: one row per test activity id with its predicted
# outcome score. The target directory is created first so that a missing
# ./result folder cannot discard the predictions after training has finished.
os.makedirs('./result', exist_ok=True)
result = pd.DataFrame({'activity_id': test_id, 'outcome': pred_test})
# index=False is the documented way to omit the row index from the CSV.
result.to_csv('./result/xgb_result.csv', index=False)