## Preprocessing/Merging People and Activities

This notebook converts the features in the people and activities tables into integers, then merges everything into a single table. This makes the data easy to drop into classifiers from scikit-learn or XGBoost. 

Conveniently, most of the data can be easily encoded to numeric values with simple string splitting. 

Scored ~0.944 with Random Forest Classifier in Sklearn out of the box. 


In [2]:
import numpy as np
import pandas as pd

# Directory holding the three competition CSV files. It is the only
# machine-specific value in the notebook: change it here to run elsewhere.
DATA_DIR = "D:/kaggle/Predicting Red Hat Business Value"

act_train = pd.read_csv(DATA_DIR + "/act_train.csv")
act_test = pd.read_csv(DATA_DIR + "/act_test.csv")
people = pd.read_csv(DATA_DIR + "/people.csv")

In [3]:
# Keep the test-set activity ids aside: preprocessing drops this column,
# but the Kaggle submission file needs it.
test_ids = act_test.loc[:, 'activity_id']

def preprocess_acts(data, train_set=True):
    """Encode an activities table (act_train / act_test) as integers.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw activities frame. Must contain 'date', 'activity_id' and
        'people_id' (and 'outcome' when ``train_set`` is True). Every other
        column is expected to hold strings shaped like "<word> <number>"
        (e.g. "type 3") or missing values.
    train_set : bool, default True
        When True, the 'outcome' label column is dropped as well.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the dropped columns.
        'people_id' is reduced to the number after the underscore, and every
        other column to the number after the space; missing values become 0.

    Raises
    ------
    KeyError
        If one of the columns to drop is absent.
    """
    # Getting rid of the date feature for now
    unused = ['date', 'activity_id']
    if train_set:
        unused.append('outcome')
    data = data.drop(unused, axis=1)

    # "ppl_100" -> 100
    data['people_id'] = pd.to_numeric(
        data['people_id'].str.split('_').str[1]).astype(int)

    # Pick the remaining columns by name rather than by position, so the
    # result does not depend on 'people_id' being the first column.
    for col in [name for name in data.columns if name != 'people_id']:
        # Missing values are encoded as 0 via the placeholder 'type 0'.
        filled = data[col].fillna('type 0')
        # "type 3" -> 3
        data[col] = pd.to_numeric(filled.str.split(' ').str[1]).astype(int)
    return data


In [4]:

def preprocess_people(data):
    """Encode the people table as integers.

    The 'date' column is dropped and 'people_id' is reduced to the number
    after the underscore. Columns are then handled by position: the ten
    columns after 'people_id' are "<word> <number>" strings reduced to the
    number (missing values become 0), and every column from position 11
    onwards is cast to int (booleans, per the original note).

    Returns a new DataFrame; the input frame is not modified.
    """
    def _number_after(series, sep):
        # Take the second token of each string after splitting on `sep`
        # and convert the result to int.
        tokens = series.map(lambda value: value.split(sep)[1])
        return pd.to_numeric(tokens).astype(int)

    # TODO refactor this duplication (shared with the activities preprocessing)
    frame = data.drop(columns=['date'])
    frame['people_id'] = _number_after(frame['people_id'], '_')

    # The people frame holds both booleans and strings, told apart by position
    names = frame.columns.tolist()
    string_cols, bool_cols = names[1:11], names[11:]

    for name in bool_cols:
        frame[name] = pd.to_numeric(frame[name]).astype(int)
    for name in string_cols:
        frame[name] = _number_after(frame[name].fillna('type 0'), ' ')
    return frame

In [5]:
# Encode the raw tables. The training activities lose their 'outcome'
# label column; the test activities have none to drop.
actions_train = preprocess_acts(act_train, train_set=True)
actions_test = preprocess_acts(act_test, train_set=False)
peeps = preprocess_people(people)

In [6]:
# Attach the person-level features to every activity row.
# validate='many_to_one' makes pandas raise a MergeError if 'people_id' is not
# unique in `peeps`; a duplicated id would otherwise fan out activity rows and
# silently break the positional alignment between `features` and `labels`.
features = actions_train.merge(peeps, how='left', on='people_id', validate='many_to_one')
labels = act_train['outcome']
test = actions_test.merge(peeps, how='left', on='people_id', validate='many_to_one')

# Labels are matched to features purely by row position, so the counts must agree.
assert len(features) == len(labels), "features and labels are misaligned"
features.sample(10)

Unnamed: 0,people_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
957725,262144,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31
1243162,302975,2,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,56
499244,182260,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,59
673277,213140,1,6,6,4,3,1,2,2,4,...,0,0,0,0,0,0,0,0,0,86
1207283,29680,2,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,1,98
1121765,290989,5,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,95
593863,198785,1,12,1,1,3,2,2,3,4,...,0,0,0,0,0,0,0,0,0,0
655116,210061,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,48
513534,184849,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,65
1343715,320999,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31


In [7]:
## Split Training Data
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; the same names live in sklearn.model_selection (0.18+).
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:  # scikit-learn older than 0.18
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

# Fraction of the training rows held out for validation
num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=num_test, random_state=23)

## Out-of-the-box gradient boosting (100 depth-1 trees)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, 
                                 max_depth=1, random_state=0).fit(X_train, y_train)
# Mean accuracy on the held-out split
clf.score(X_test, y_test) 



0.85779105673111711

In [8]:
## Predictions on the held-out split
proba = clf.predict_proba(X_test)
# Column 1 of predict_proba is the probability of the second class
preds = proba[:, 1]
score = roc_auc_score(y_true=y_test, y_score=preds)
print("Area under ROC {0}".format(score))

Area under ROC 0.9260733458255788


In [9]:
# Test Set Predictions
test_proba = clf.predict_proba(test)
# Column 1 of predict_proba is the probability of the second class
test_preds = test_proba[:,1]

# Format for submission: one row per test activity id with its predicted probability
output = pd.DataFrame({ 'activity_id' : test_ids, 'outcome': test_preds })
output.to_csv('gauty.csv', index = False)

# Preview the submission; only the last expression of a cell is displayed,
# so this has to come after the file is written.
output.head()