In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload

# Load data

In [2]:
# Read the full labelled data set and keep its column names and row/column
# counts around for the later cells.
train_data = pd.read_csv('data/train_v2.csv')
columns = train_data.columns.tolist()
N_original = len(train_data)
M_original = len(columns)
(columns, N_original)

(['id',
  'timestamp',
  'product_id',
  'product_department',
  'product_category',
  'card_id',
  'user_id',
  'C15',
  'C16',
  'C17',
  'C18',
  'C19',
  'C20',
  'C21',
  'amount',
  'isfraud'],
 32369524)

# Data pipeline
* With trees, normalizing features is not necessary

In [99]:
%autoreload
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from include.DatetimeFromTimestamp import DatetimeFromTimestamp
#from include.HourOfDay import HourOfDay
#from include.DataFrameDropper import DataFrameDropper
from include.DataFrameSelector import DataFrameSelector
from include.FilterNMostCommon import FilterNMostCommon
from include.UserEvaluator import UserEvaluator

# Columns forwarded unchanged to the model. The one-hot columns produced by
# `pipeline_1hot` are used as well: FeatureUnion places them next to these.
columns_to_use = ['C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'amount']

# Branch 1: project transformers followed by a column selection.
# The HourOfDay step that used to sit here was dropped; the notes in the
# "Test" section record a lower score with the `hour` feature.
pipeline_normal = Pipeline([
    ('datetime_creator', DatetimeFromTimestamp()),
    # col_names = every column of train_data except the trailing label, plus
    # 'datetime' (presumably the column added by DatetimeFromTimestamp - confirm).
    ('user_evaluator', UserEvaluator(col_names=list(train_data)[:-1] + ['datetime'])),
    ('dataframe_selector', DataFrameSelector(attribute_names=columns_to_use)),
])

# Branch 2: one-hot encoding of `product_category`.
# FilterNMostCommon is a project class; presumably it restricts the column to
# its N most frequent values - confirm against include/FilterNMostCommon.
pipeline_1hot = Pipeline([
    ('dataframe_selector', DataFrameSelector(['product_category'])),
    ('filter_n_most_common', FilterNMostCommon(N=5, attribute_name='product_category')),
    # handle_unknown='ignore': a category that was absent at fit time is encoded
    # as an all-zero row instead of raising during transform(). The hold-out and
    # submit rows come later in time than the rows the encoder is fitted on.
    # NOTE: `sparse` was renamed `sparse_output` in scikit-learn 1.2; keep the
    # spelling that matches the installed version.
    ('1hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Both branches receive the same input frame; their outputs are concatenated
# column-wise.
pipeline_full = FeatureUnion(transformer_list=[
    ('pipeline_normal', pipeline_normal),
    ('pipeline_1hot', pipeline_1hot),
])

# Seed passed as random_state when the model is instantiated.
random_seed = 42

# Train/Test split
* Normally I would use random sampling, stratified by an attribute of major relevance. In this case, however, the test data that was given follows the train data in time. To reproduce that scenario for local testing, my first guess is that it is better to split the data in the order it is already sorted (by time): the earlier rows become the local train set and the later rows the local test set.
* Cross-validation is not necessary, given that the test set is big enough.

In [100]:
split_by = 2  # NOTE(review): not referenced by any cell visible in this notebook
N_train = 1000000
N_test = 6000000

# Local evaluation uses the last N_train + N_test rows of the file, in file
# order: the first N_train of them for fitting, the remaining N_test for scoring.
start_at = N_original - N_train - N_test
if start_at < 0:
    # A negative start makes iloc count from the end of the frame, which would
    # silently yield mis-sized (possibly empty) splits instead of failing.
    raise ValueError(
        'N_train + N_test (%d) exceeds the %d available rows'
        % (N_train + N_test, N_original)
    )
split_at = start_at + N_train

# Features are every column but the last; the last column is the label.
# pd.DataFrame(...) re-wraps the slice in a new frame object.
# NOTE(review): presumably done so that transformers adding columns do not
# trigger SettingWithCopyWarning - confirm.
train_X = pd.DataFrame(train_data.iloc[start_at:split_at, :-1])
train_Y = train_data.iloc[start_at:split_at, -1]
test_X = pd.DataFrame(train_data.iloc[split_at:, :-1])
test_Y = train_data.iloc[split_at:, -1]

train_X.shape, train_Y.shape, test_X.shape, test_Y.shape

((1000000, 15), (1000000,), (6000000, 15), (6000000,))

# Test
1. 0.6654 - 1 hot encoding of `product_category`
2. 0.6264 - `hour` actually decreases the score. It is removed for now; however, it might be useful when combined with other attributes.

In [101]:
# Rows per user_id in the local training slice. In the recorded output a single
# id accounts for 780,220 of the 1,000,000 rows.
train_X.loc[:, 'user_id'].value_counts()

a99f214a    780220
73b81e30       563
cafca232       316
f01ffaa6       316
6d6314dc       313
afeffc18       311
c357dbff       297
907e894c       270
eec6d022       244
8e1fb6c6       199
936e92fb       148
6d144653       138
987552d1       130
91436dab       116
b09da1c4       111
d857ffbb       109
754f30e7       107
ba3bf9e8       105
f58a1c3b        95
db99ea17        92
41cfe06b        91
3b34164a        89
a4eb98c9        86
56b07489        82
ad2af41a        80
53c3ad4d        78
03559b29        77
98326b84        75
6b5b2eea        74
8ed6e9a8        69
             ...  
972215d4         1
39f8cf5e         1
b5ed975d         1
39ba8327         1
dbbda7c2         1
63b9a766         1
7c54a420         1
ecf343eb         1
fbe378c9         1
a3c8daee         1
f140f050         1
2956813a         1
f7dda743         1
7f864095         1
3d66433e         1
0be5d4ba         1
fc227582         1
875710b4         1
40cdcb33         1
bdb5032e         1
101daa91         1
a5f9896c    

In [102]:
# Fit the feature pipeline on the local training slice, then apply the fitted
# pipeline (transform only) to the hold-out slice.
train_X_treated = pipeline_full.fit_transform(train_X)
test_X_treated = pipeline_full.transform(test_X)

tuple(matrix.shape for matrix in (train_X_treated, test_X_treated))

{25389332: (Timestamp('2014-10-27 12:28:05.439000'), 191.77), 25404001: (Timestamp('2014-10-27 12:01:42.091000'), 191.77), 25414517: (Timestamp('2014-10-27 12:11:44.100000'), 191.77), 25426667: (Timestamp('2014-10-27 12:00:38.267000'), 191.77), 25439461: (Timestamp('2014-10-27 12:22:28.310000'), 191.77), 25448002: (Timestamp('2014-10-27 12:54:05.069000'), 191.77), 25451177: (Timestamp('2014-10-27 12:43:55.481000'), 191.77), 25454799: (Timestamp('2014-10-27 12:55:30.932000'), 191.77), 25467938: (Timestamp('2014-10-27 12:10:57.018000'), 191.77), 25470944: (Timestamp('2014-10-27 12:05:38.448000'), 191.77), 25473017: (Timestamp('2014-10-27 12:05:33.017000'), 191.77), 25474884: (Timestamp('2014-10-27 12:13:14.273000'), 191.77), 25475026: (Timestamp('2014-10-27 12:55:30.220000'), 191.77), 25487159: (Timestamp('2014-10-27 12:01:25.663000'), 191.77), 25503259: (Timestamp('2014-10-27 12:54:16.285000'), 191.77), 25507217: (Timestamp('2014-10-27 12:22:14.150000'), 191.77), 25520726: (Timestamp('2

((1000000, 14), (6000000, 14))

In [73]:
from sklearn.tree import DecisionTreeClassifier

# Estimator class (not an instance). Later cells call it as
# current_model(random_state=random_seed) to build a fresh model each time.
current_model = DecisionTreeClassifier

In [74]:
from sklearn.metrics import roc_auc_score

# Train on the local training slice.
model = current_model(random_state=random_seed).fit(train_X_treated, train_Y)

# Column 1 of predict_proba is used as the positive-class score for ROC AUC
# on the hold-out slice.
test_pred_prob = model.predict_proba(test_X_treated)[:, 1]
roc_auc_score(y_true=test_Y, y_score=test_pred_prob)

0.6263779372728707

# Submit

### Load submit data

In [75]:
# Rows to be scored for the submission; their 'id' column is copied into the
# output file further down.
submit_data = pd.read_csv('data/test_v2.csv')

### Prepare train and submit data

In [76]:
# The final model is trained on every labelled row: all columns but the last
# are features, the last column is the target.
train_data_X, train_data_Y = train_data.iloc[:, :-1], train_data.iloc[:, -1]
(train_data_X.shape, train_data_Y.shape)

((32369524, 15), (32369524,))

In [77]:
# Re-fits `pipeline_full` (the same object that was fitted on the local training
# slice earlier) on the complete labelled data. The recorded output has 15
# columns here versus 14 for the local slice.
train_data_X_treated = pipeline_full.fit_transform(train_data_X)
train_data_X_treated.shape

(32369524, 15)

In [78]:
# transform() only: reuses whatever the most recent fit_transform call on
# `pipeline_full` learned, without re-fitting on the submit rows.
submit_data_treated = pipeline_full.transform(submit_data)

### Train model & predict

In [79]:
# Fresh estimator with the same seed, trained on the full labelled data.
model = current_model(random_state=random_seed).fit(train_data_X_treated, train_data_Y)
model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [80]:
# Keep column 1 of predict_proba as the score written to the 'isfraud' column
# (assumes the labels are 0/1 so that column 1 is the positive class - confirm).
pred_prob = model.predict_proba(submit_data_treated)[:,1]

In [81]:
# Two-column frame: the ids from the submit file next to the predicted scores.
submission = pd.DataFrame({'id': submit_data['id'], 'isfraud': pred_prob})
submission.head()

Unnamed: 0,id,isfraud
0,32263877,0.018971
1,32263886,0.0
2,32263890,0.319438
3,32263895,0.319438
4,32263896,0.034431


In [82]:
# Write the submission file without the index column.
submission.to_csv('data/submit.csv', index=False)