In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload

# Load data

In [2]:
# Read the complete labelled training set from disk.
train_data = pd.read_csv('data/train_v2.csv')

# Keep the column labels and the frame dimensions (rows, columns) around for later cells.
columns = train_data.columns.tolist()
N_original, M_original = len(train_data.index), len(train_data.columns)

# Cell output: the column names and the number of rows.
columns, N_original

(['id',
  'timestamp',
  'product_id',
  'product_department',
  'product_category',
  'card_id',
  'user_id',
  'C15',
  'C16',
  'C17',
  'C18',
  'C19',
  'C20',
  'C21',
  'amount',
  'isfraud'],
 32369524)

# Data pipeline
* With trees, normalizing features is not necessary

In [105]:
%autoreload
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
#from include.DatetimeFromTimestamp import DatetimeFromTimestamp
#from include.HourOfDay import HourOfDay
#from include.DataFrameDropper import DataFrameDropper
from include.DataFrameSelector import DataFrameSelector
from include.FilterNMostCommon import FilterNMostCommon
from include.UserEvaluator import UserEvaluator

# Seed shared by every estimator created further down the notebook.
random_seed = 42

# Raw columns passed straight through to the model; the one-hot encoded
# product_category columns are appended by the second branch below.
# (id, timestamp, product_*, card_id and user_id are not selected here.)
columns_to_use = [
    'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
    'amount',
]

# Branch 1: run the user evaluator, then keep only the columns listed above.
# The hour-of-day step is intentionally left out of this branch (see the
# "Test" notes: it lowered the score).
pipeline_normal = Pipeline(steps=[
    ('user_evaluator', UserEvaluator(column_names=list(train_data))),
    ('dataframe_selector', DataFrameSelector(attribute_names=columns_to_use)),
])

# Branch 2: select product_category, filter it with FilterNMostCommon(N=5)
# and one-hot encode the result as a dense array.
pipeline_1hot = Pipeline(steps=[
    ('dataframe_selector', DataFrameSelector(['product_category'])),
    ('filter_n_most_common', FilterNMostCommon(N=5, attribute_name='product_category')),
    ('1hot_encoder', OneHotEncoder(sparse=False)),
])

# Final feature matrix: outputs of both branches concatenated side by side.
pipeline_full = FeatureUnion(transformer_list=[
    ('pipeline_normal', pipeline_normal),
    ('pipeline_1hot', pipeline_1hot),
])

# Train/Test split
* Normally I would use random sampling, stratified by an attribute of major relevance. In this case, however, the test data that was given follows the train data in time. To reproduce that scenario for local testing, my first guess is that it is better to keep the data in its existing (time-sorted) order and simply split it at a cut-off row, so that the local test set also comes after the local train set.
* Cross-validation is not necessary because the held-out test set is large enough.

In [106]:
# Time-ordered hold-out split: the last N_train + N_test rows of train_data are
# used, the first N_train of them for fitting and the remaining N_test for
# local evaluation. The last column of train_data is treated as the label.
split_by = 2  # not referenced anywhere else in this cell
N_train = 1000000
N_test = 6000000

# Without this check a too-small dataset would make start_at negative, and
# iloc would then silently count from the end of the frame and return
# slices of the wrong size instead of failing.
if N_train + N_test > N_original:
    raise ValueError(
        'N_train + N_test (%d) exceeds the number of available rows (%d)'
        % (N_train + N_test, N_original)
    )

start_at = N_original - N_train - N_test
split_at = start_at + N_train

# Features are every column but the last; the label is the last column.
train_X = pd.DataFrame(train_data.iloc[start_at:split_at,:-1])
train_Y = train_data.iloc[start_at:split_at,-1]
test_X = pd.DataFrame(train_data.iloc[split_at:,:-1])
test_Y = train_data.iloc[split_at:,-1]

# Cell output: shapes of the four pieces, as a sanity check.
train_X.shape, train_Y.shape, test_X.shape, test_Y.shape

((1000000, 15), (1000000,), (6000000, 15), (6000000,))

In [107]:
# Count how many rows of the training slice each card_id accounts for
# (value_counts lists the most frequent values first).
train_X['card_id'].value_counts()

ecad2386    685084
92f5800b     79772
e2fcccd2     30650
9c13b419     13975
03528b27     12199
54c5d545     11801
03a08c3f     10436
febd1138      9582
7358e05e      8331
98fed791      8070
e9739828      7568
e2a1ca37      6472
1dc72b4d      6178
ce183bbd      5139
f0d41ff1      4623
51cedd4e      4606
d36838b1      3910
cf0327f9      3283
d44c074c      3267
de97da65      2981
3c4b944d      2705
f888bf4c      2310
39947756      2040
3e2bf98d      1909
53de0284      1752
75076517      1748
a5184c22      1638
c8e3e3c1      1603
a37bf1e4      1546
442cfede      1528
             ...  
abe9b3fe         1
d65f9e62         1
6bceaf84         1
e9ec1d4d         1
33954382         1
444d80aa         1
d84afa2e         1
2ab2e91a         1
d3010987         1
4d15bfba         1
5353a7c1         1
dfa6b076         1
d6038c05         1
6837c486         1
5e9ef6fe         1
22018773         1
9d9e17df         1
8e4b0a7f         1
16fd08fc         1
448619a0         1
dd1f7fd3         1
baabf9fe    

# Test
1. 0.6654 - 1 hot encoding of `product_category`
2. 0.6264 - `hour` actually decreases the score. It will be removed for now; however, it might be useful when combined with other attributes.

In [108]:
# Fit the feature pipeline on the training slice and transform it in one call,
# then apply the already-fitted pipeline (transform only) to the held-out slice.
train_X_treated = pipeline_full.fit_transform(train_X)
test_X_treated = pipeline_full.transform(test_X)

# Cell output: shapes of both treated matrices.
train_X_treated.shape, test_X_treated.shape

{'50e219e0': (424776, 30890745), 'f028772b': (305212, 19154036), '3e814130': (89952, 4374429), '28905ebd': (168851, 10156417), 'f66779e6': (3483, 194148), '75fa27f6': (2870, 149373), 'c0dd3be3': (1772, 59456), '335d28a8': (2023, 114828), '72722551': (201, 11075), 'dedf689d': (61, 3438), '76b2941d': (115, 5993), '0569f928': (337, 19204), '42a36e14': (119, 5762), '5378d028': (23, 1817), '9ccfa2ea': (19, 1159), '8fd0aea4': (13, 778), 'bcf865d9': (54, 3294), 'a818d37a': (63, 5075), '70fb0e29': (56, 1869)}


((1000000, 14), (6000000, 14))

In [73]:
from sklearn.tree import DecisionTreeClassifier

# Keep a reference to the estimator class itself (not an instance); later cells
# build models by calling current_model(random_state=...).
current_model = DecisionTreeClassifier

In [74]:
from sklearn.metrics import roc_auc_score

# Train a seeded estimator on the treated training slice.
model = current_model(random_state=random_seed)
model.fit(train_X_treated, train_Y)

# Score the held-out slice with column 1 of predict_proba
# (assumes model.classes_ is [0, 1], i.e. column 1 is the fraud class — TODO confirm).
test_pred_prob = model.predict_proba(test_X_treated)[:, 1]

# Cell output: ROC AUC of those scores against the held-out labels.
roc_auc_score(y_true=test_Y, y_score=test_pred_prob)

0.6263779372728707

# Submit

### Load submit data

In [75]:
# Read the rows that will be scored for the submission file.
submit_data = pd.read_csv('data/test_v2.csv')

### Prepare train and submit data

In [76]:
# Use every row of train_data for the final fit: the last column is the
# label, all preceding columns are the features.
train_data_Y = train_data.iloc[:, -1]
train_data_X = train_data.iloc[:, :-1]

# Cell output: shapes of the feature frame and the label series.
train_data_X.shape, train_data_Y.shape

((32369524, 15), (32369524,))

In [77]:
# Call fit_transform again on the same pipeline_full object, this time with the
# complete training set (presumably this overwrites the state fitted on the
# smaller split — depends on the custom transformers, TODO confirm).
train_data_X_treated = pipeline_full.fit_transform(train_data_X)
# Cell output: shape of the treated full training matrix.
train_data_X_treated.shape

(32369524, 15)

In [78]:
# Transform the submission rows with pipeline_full without fitting it again.
submit_data_treated = pipeline_full.transform(submit_data)

### Train model & predict

In [79]:
# Build a fresh estimator with the same seed and train it on the complete
# treated training set. Note: this rebinds `model`, the same name the
# evaluation cell uses for the estimator trained on the smaller split.
model = current_model(random_state=random_seed)
model.fit(train_data_X_treated, train_data_Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [80]:
# Keep column 1 of predict_proba as the fraud score for each submission row
# (assumes model.classes_ is [0, 1] so column 1 is the positive class — TODO confirm).
pred_prob = model.predict_proba(submit_data_treated)[:,1]

In [81]:
# Assemble the submission table in one step: the ids come from the submission
# file and the scores from the model, in the same row order.
submission = pd.DataFrame(
    {'id': submit_data['id'], 'isfraud': pred_prob},
    columns=['id', 'isfraud'],
)

# Cell output: first rows, as a quick visual check.
submission.head()

Unnamed: 0,id,isfraud
0,32263877,0.018971
1,32263886,0.0
2,32263890,0.319438
3,32263895,0.319438
4,32263896,0.034431


In [82]:
# Write the submission table to disk without the DataFrame index column.
submission.to_csv('data/submit.csv', index=False)