# Random Forest Regression with all features

In [2]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import hstack

from sklearn.preprocessing import OneHotEncoder
from sklearn import cross_validation as cv
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor

from helper_function import process
from helper_function import uniqueValue
from helper_function import visualizeDist

%matplotlib inline

# Process the data

### Load the dataset

In [3]:
# Load the training and test activity tables through the project helper.
# Each `process` call yields a pair: the id column and the feature table
# (presumably a pandas DataFrame -- confirm in helper_function).
train_path = './data/act_train.csv'
test_path = './data/act_test.csv'

train_id, train_total = process(train_path)
test_id, test_total = process(test_path)

### One-Hot Encoder

In [8]:
# Columns that get one-hot encoded: the named categorical/date columns,
# followed by the numbered char_1..char_9 and char_12..char_19 columns.
numberedChars = list(range(1, 10)) + list(range(12, 20))
hotFeatures = [
    'activity_category', 'group_1', 'char_10', 'char_48',
    'activity_year', 'activity_month', 'activity_day',
    'people_year', 'people_month', 'people_day',
] + ['char_%d' % n for n in numberedChars]

# Every remaining column except the target is passed through un-encoded,
# in the order it appears in the training table.
noFeatures = [col for col in train_total.columns
              if not (col in hotFeatures or col == 'outcome')]

In [9]:
# Target column, taken straight from the training table.
y_train_total = train_total['outcome']

# Fit the one-hot encoder on the training columns only, then apply the same
# fitted mapping to the test columns so both matrices share one column layout.
encorder = OneHotEncoder()
hotTrain = encorder.fit_transform(train_total[hotFeatures])
hotTest = encorder.transform(test_total[hotFeatures])

# Append the pass-through columns to the right of the encoded block.
trainPassThrough = train_total[noFeatures]
testPassThrough = test_total[noFeatures]
X_train_total = hstack((hotTrain, trainPassThrough))
X_test_total = hstack((hotTest, testPassThrough))

# NOTE: train_total / test_total are not referenced again in the cells shown
# here; they are deliberately left alive (no `del`) in this version.

In [10]:
# Sanity check: both matrices must end up with the same number of columns.
for label, matrix in (("Training set: ", X_train_total),
                      ("Test set: ", X_test_total)):
    print(label, matrix.shape)

Training set:  (2197291, 36942)
Test set:  (498687, 36942)


# Start Learning

### Random State

In [11]:
# Single seed for the notebook: it is passed both to the train/validation
# split and to the random forest constructor further down.
random_state = 42

### Split the training set

In [12]:
# Hold out 30% of the training rows as a validation ("cv") set; the shared
# seed is passed in as random_state.
split = cv.train_test_split(X_train_total, y_train_total,
                            test_size=0.3, random_state=random_state)
X_train, X_cv, y_train, y_cv = split

### Train the random forest regressor

In [13]:
# Forest configuration, written out in full so the run is self-describing:
# 10 trees grown across 6 parallel jobs, seeded with the shared random_state.
# (The saved run log reports roughly 223 minutes for this fit.)
rf_params = dict(
    n_estimators=10,
    criterion='mse',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=6,
    random_state=random_state,
    verbose=1,
    warm_start=False,
)
clf = RandomForestRegressor(**rf_params)

# Kept as the last expression so the notebook displays the fitted estimator.
clf.fit(X_train, y_train)

[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 130.8min remaining:  -713.3s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 133.1min remaining:  -725.9s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 137.4min remaining:  -749.6s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 138.4min remaining:  -754.7s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 143.7min remaining:  -783.6s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 215.3min remaining:  -1174.4s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 219.7min remaining:  -1198.3s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 222.3min remaining:  -1212.6s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 222.9min remaining:  -1216.1s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed: 222.9min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=6, oob_score=False, random_state=42,
           verbose=1, warm_start=False)

In [14]:
# Score the fitted forest on the rows it was trained on and on the held-out
# validation rows. ROC AUC is computed directly on the regressor's continuous
# predictions (the 'outcome' target is presumably binary -- confirm).
pred_train = clf.predict(X_train)
pred_cv = clf.predict(X_cv)

auc_train = roc_auc_score(y_train, pred_train)
auc_cv = roc_auc_score(y_cv, pred_cv)

print("Train: ", auc_train)
print("CV: ", auc_cv)

[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    1.4s remaining:   -0.1s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    1.4s remaining:   -0.1s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    1.5s remaining:   -0.1s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    1.5s remaining:   -0.1s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    1.6s remaining:   -0.1s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    2.3s remaining:   -0.2s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    2.5s remaining:   -0.2s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    2.5s remaining:   -0.2s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    2.6s remaining:   -0.2s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:    2.6s finished
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    0.5s remaining:   -0.0s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed:    0.6s remaining:   -0.1s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapse

Train:  0.999998505789
CV:  0.998151016528


### Predict on the test set

In [15]:
# Score the encoded test matrix with the fitted forest; pred_test is what the
# output-file cell writes into the 'outcome' column.
pred_test = clf.predict(X_test_total)

[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 131.1min remaining:  -715.0s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 131.2min remaining:  -715.9s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 131.7min remaining:  -718.1s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 132.8min remaining:  -724.3s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 140.3min remaining:  -765.4s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 214.2min remaining:  -1168.3s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 218.2min remaining:  -1190.2s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 219.5min remaining:  -1197.0s
[Parallel(n_jobs=6)]: Done  11 out of  10 | elapsed: 220.8min remaining:  -1204.2s
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed: 220.8min finished


NameError: name 'X_test' is not defined

### Create output file

In [23]:
# Write the submission file: one row per test activity id with its predicted
# outcome score.
result_path = './result/RF_result.csv'

# Create the output directory up front: to_csv does not create missing
# directories, and failing here would waste the long fit/predict run above.
os.makedirs(os.path.dirname(result_path), exist_ok=True)

result = pd.DataFrame({'activity_id': test_id, 'outcome': pred_test})

# index=False is the documented way to omit the row index (the previous
# index=None only worked because None is falsy).
result.to_csv(result_path, index=False)