# AdaBoost Regression with all features

In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import hstack

from sklearn.preprocessing import OneHotEncoder
from sklearn import cross_validation as cv
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostRegressor

from helper_function import process
from helper_function import uniqueValue
from helper_function import visualizeDist

%matplotlib inline

# Process the data

### Load the dataset

In [2]:
# Locations of the raw activity tables, relative to the notebook directory.
train_path = './data/act_train.csv'
test_path = './data/act_test.csv'

# `process` (from helper_function) returns a pair for each file: the ids and
# the prepared feature table.
# NOTE(review): the exact preprocessing done by `process` is not visible here.
train_id, train_total = process(train_path)
test_id, test_total = process(test_path)

### One-Hot Encoder

In [3]:
# Columns to be one-hot encoded: the explicitly named columns below, plus
# char_1 .. char_9 and char_12 .. char_19.
hotFeatures = ['activity_category', 'group_1', 'char_10', 'char_48',
               'activity_year', 'activity_month', 'activity_day',
               'people_year', 'people_month', 'people_day']
hotFeatures += ['char_%d' % i for i in range(1, 10)]
hotFeatures += ['char_%d' % i for i in range(12, 20)]

# Every other column, except the target 'outcome', is passed through unencoded.
noFeatures = [col for col in train_total.columns
              if not (col in hotFeatures or col == 'outcome')]

In [4]:
# One-hot encode the categorical columns.
# The encoder is fitted on the training set only and then reused for the test
# set, so both encoded matrices share the same column layout.
encorder = OneHotEncoder()
hotTrain = encorder.fit_transform(train_total[hotFeatures])
hotTest = encorder.transform(test_total[hotFeatures])

# Append the unencoded columns to the right of the encoded ones.
# scipy's hstack returns a COO matrix by default, which has no row indexing,
# so downstream consumers would have to convert it; request CSR once here
# instead.
X_train_total = hstack((hotTrain, train_total[noFeatures]), format='csr')
y_train_total = train_total['outcome']
X_test_total = hstack((hotTest, test_total[noFeatures]), format='csr')

# The raw frames are not used by the later cells shown in this notebook;
# uncomment to free memory:
# del train_total, test_total

In [5]:
# Sanity check: the train and test matrices should report the same number of
# columns.
for label, matrix in (("Training set: ", X_train_total),
                      ("Test set: ", X_test_total)):
    print(label, matrix.shape)

Training set:  (2197291, 36942)
Test set:  (498687, 36942)


# Start Learning

### Random State

In [6]:
# Seed passed to both the train/validation split and the AdaBoost model so
# that repeated runs use the same random choices.
random_state = 42

### Split the training set

In [7]:
# Hold out 30% of the labelled rows as a validation ("cv") set; the remaining
# 70% is used for fitting.
# NOTE(review): `sklearn.cross_validation` was deprecated in scikit-learn 0.18
# and removed in 0.20; `sklearn.model_selection.train_test_split` is the
# replacement when the notebook's imports are migrated.
X_train, X_cv, y_train, y_cv = cv.train_test_split(
    X_train_total, y_train_total,
    test_size=0.3, random_state=random_state,
)

### Train the regressor

In [17]:
# AdaBoost regressor with the library's default base learner: 50 boosting
# rounds, shrinkage of 0.1 and squared loss for the per-sample error weights.
# `base_estimator=None` is omitted on purpose: None is already the default,
# and the keyword was renamed to `estimator` in scikit-learn 1.2 and removed
# in 1.4, so passing it explicitly breaks on current releases.
clf = AdaBoostRegressor(
    n_estimators=50,
    learning_rate=0.1,
    loss='square',
    random_state=random_state,
)

In [18]:
# Fit the AdaBoost regressor on the training portion of the split.
# Kept as the cell's last expression so the fitted estimator's repr is shown.
clf.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=0.1, loss='square',
         n_estimators=50, random_state=42)

In [19]:
# Evaluate on both the training and the held-out split.
# The regressor's continuous predictions are used directly as ranking scores
# for ROC AUC against the 'outcome' labels.
pred_train = clf.predict(X_train)
pred_cv = clf.predict(X_cv)

auc_train = roc_auc_score(y_train, pred_train)
auc_cv = roc_auc_score(y_cv, pred_cv)

print("Train: ", auc_train)
print("CV: ", auc_cv)

Train:  0.896099700356
CV:  0.896890092118


### Predict on the test set

In [20]:
# Score the full encoded test matrix with the fitted model.
pred_test = clf.predict(X_test_total)

### Create output file

In [21]:
# Write the predictions as a two-column CSV: activity_id, outcome.
output_path = './result/AdaB_result.csv'

# DataFrame.to_csv does not create missing parent directories. Make sure the
# target directory exists so a long training run is not lost at the last step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

result = pd.DataFrame({'activity_id': test_id, 'outcome': pred_test})
# index=False keeps the DataFrame's row index out of the file.
result.to_csv(output_path, index=False)