In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload

# Load data

In [2]:
train_data = pd.read_csv('data/train_v2.csv')
columns = list(train_data)
columns

['id',
 'timestamp',
 'product_id',
 'product_department',
 'product_category',
 'card_id',
 'user_id',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'amount',
 'isfraud']

# Train/Test split
* Normally I would use random sampling, stratified by an attribute of major relevance, however, in this case the test data that was given follows the train data in time. Therefore, in order to do local testing my first guess would be that it is better to remake that scenario and sample the data by simply splitting it sorted as it is, by time.
* Cross validation is not necessary given that we have a test set big enough

In [3]:
split_by = 2
N_train = 100000

N_original, M_original = train_data.shape
start_at = N_original - N_train * split_by
split_at = start_at + N_train

train_X = pd.DataFrame(train_data.iloc[start_at:split_at-1,:-1])
train_Y = train_data.iloc[start_at:split_at-1,-1]
test_X = pd.DataFrame(train_data.iloc[split_at:,:-1])
test_Y = train_data.iloc[split_at:,-1]

train_X.shape, train_Y.shape, test_X.shape, test_Y.shape

((99999, 15), (99999,), (100000, 15), (100000,))

# Data pipeline
* With trees, normalizing features is not necessary

In [6]:
%autoreload
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from include.DatetimeFromTimestamp import DatetimeFromTimestamp
from include.DataFrameDropper import DataFrameDropper
from include.DataFrameSelector import DataFrameSelector
from include.FilterNMostCommon import FilterNMostCommon

columns_to_drop = ['id', 'timestamp', 'product_id', 'product_department', 'product_category', 'card_id', 'user_id', 'datetime']

pipeline_normal = Pipeline([
    ('datetime_creator', DatetimeFromTimestamp()),
    ('dataframe_dropper', DataFrameDropper(attribute_names=columns_to_drop)),
])

pipeline_1hot = Pipeline([
    ('dataframe_selector', DataFrameSelector(['product_category'])),
    ('filter_n_most_common', FilterNMostCommon(N=5, attribute_name='product_category')),
    ('1hot_encoder', OneHotEncoder(sparse = False))
])

pipeline_full = FeatureUnion(transformer_list=[
    ('pipeline_normal', pipeline_normal),
    ('pipeline_1hot', pipeline_1hot),
])

train_X_treated = pipeline_full.fit_transform(train_X)
test_X_treated = pipeline_full.transform(test_X)

train_X_treated.shape, test_X_treated.shape

((99999, 14), (100000, 14))

# Test

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

model = DecisionTreeClassifier()
model.fit(train_X_treated, train_Y)
pred_prob = model.predict_proba(test_X_treated)[:,1]

roc_auc_score(test_Y, pred_prob)

0.7592812983899936

# Submit

In [8]:
vc = train_sample['product_category'].value_counts()
vc

50e219e0    44810
f028772b    29437
28905ebd    17342
3e814130     4551
75fa27f6     1554
76b2941d     1207
335d28a8      538
f66779e6      525
c0dd3be3       16
0569f928        7
a818d37a        5
70fb0e29        3
bcf865d9        3
dedf689d        1
72722551        1
Name: product_category, dtype: int64

In [None]:
Data