In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload

# Load data

In [2]:
train_data = pd.read_csv('data/train_v2.csv')
columns = list(train_data)
N_original, M_original = train_data.shape
columns, N_original

(['id',
  'timestamp',
  'product_id',
  'product_department',
  'product_category',
  'card_id',
  'user_id',
  'C15',
  'C16',
  'C17',
  'C18',
  'C19',
  'C20',
  'C21',
  'amount',
  'isfraud'],
 32369524)

# Data pipeline
* With trees, normalizing features is not necessary

In [4]:
%autoreload
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from include.DatetimeFromTimestamp import DatetimeFromTimestamp
#from include.HourOfDay import HourOfDay
#from include.DataFrameDropper import DataFrameDropper
from include.DataFrameSelector import DataFrameSelector
from include.FilterNMostCommon import FilterNMostCommon
#from include.UserEvaluator import UserEvaluator
from include.ConcatEncoder import ConcatEncoder

#columns_to_drop = ['id', 'timestamp', 'product_id', 'product_department', 'product_category', 'card_id', 'user_id']
columns_to_use = ['C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'amount', 'cat_dep_id', 'card_user']
mrf_prod = 1e-5
mrf_card = 1e-5
#the hot encoded attributes are also used
pipeline_normal = Pipeline([
    #('hour_creator', HourOfDay()),
    #('datetime_creator', DatetimeFromTimestamp()),
    #('user_evaluator', UserEvaluator()),
    ('concat_encoder_product', ConcatEncoder(['product_category', 'product_department', 'product_id'], attr_name='cat_dep_id', min_rel_freq=mrf_prod)),
    ('concat_encoder_card', ConcatEncoder(['card_id', 'user_id'], attr_name='card_user', min_rel_freq=mrf_card)),
    ('dataframe_selector', DataFrameSelector(attribute_names=columns_to_use)),
])

pipeline_1hot = Pipeline([
    ('dataframe_selector', DataFrameSelector(['product_category'])),
    ('filter_n_most_common', FilterNMostCommon(N=5, attribute_name='product_category', minRelFreq=0.05)),
    ('1hot_encoder', OneHotEncoder(sparse = False))
])

pipeline_full = FeatureUnion(transformer_list=[
    ('pipeline_normal', pipeline_normal),
    ('pipeline_1hot', pipeline_1hot),
])

random_seed = 42

In [5]:
#from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

current_model = DecisionTreeClassifier

# Train/Test split
* Normally I would use random sampling, stratified by an attribute of major relevance, however, in this case the test data that was given follows the train data in time. Therefore, in order to do local testing my first guess would be that it is better to remake that scenario and sample the data by simply splitting it sorted as it is, by time.
* Cross validation is not necessary given that we have a test set big enough

In [35]:
split_by = 2
N_train = 1000000
N_test = 6000000

start_at = N_original - N_train - N_test
split_at = start_at + N_train

train_X = pd.DataFrame(train_data.iloc[start_at:split_at,:-1])
train_Y = train_data.iloc[start_at:split_at,-1]
test_X = pd.DataFrame(train_data.iloc[split_at:,:-1])
test_Y = train_data.iloc[split_at:,-1]

train_X.shape, train_Y.shape, test_X.shape, test_Y.shape

((1000000, 15), (1000000,), (6000000, 15), (6000000,))

# Test


In [9]:
pipeline_normal.fit(train_data, train_data['isfraud'])
train_X_treated = pipeline_normal.transform(train_X)
test_X_treated = pipeline_normal.transform(test_X)

train_X_treated.shape, test_X_treated.shape

((1000000, 9), (6000000, 9))

In [10]:
#from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

current_model = DecisionTreeClassifier

In [11]:
from sklearn.metrics import roc_auc_score

model = current_model(random_state=random_seed)
model.fit(train_X_treated, train_Y)
test_pred_prob = model.predict_proba(test_X_treated)[:,1]

roc_auc_score(test_Y, test_pred_prob)

0.6194719072846742

# Submit
1. 0.6654 - 1 hot encoding of `product_category`
2. 0.6264 - `hour` actualy decreases score. It will be removed for now, however it might be useful while combined with other attributes.
3. A lot of insuccessful attempts

...

7. with the ratios were made.
8. 0.6594 - Decision tree with label encoding of product stuff
9. 0.6760 - also with label encoding of card and user stuff

### Load submit data

In [6]:
submit_data = pd.read_csv('data/test_v2.csv')

### Prepare train and submit data

In [7]:
train_data_X = train_data.iloc[:,:-1]
train_data_Y = train_data.iloc[:,-1]
train_data_X.shape, train_data_Y.shape

((32369524, 15), (32369524,))

In [8]:
pipeline_normal.fit(train_data, train_data['isfraud'])
train_data_X_treated = pipeline_normal.transform(train_data_X)
del train_data_X
train_data_X_treated.shape

(32369524, 10)

In [9]:
submit_data_treated = pipeline_normal.transform(submit_data)

### Train model & predict

In [None]:
model = current_model(random_state=random_seed)
model.fit(train_data_X_treated, train_data_Y)

In [None]:
pred_prob = model.predict_proba(submit_data_treated)[:,1]

In [None]:
submission = pd.DataFrame()
submission['id'] = submit_data['id']
submission['isfraud'] = pred_prob
submission.head()

In [None]:
submission.to_csv(path_or_buf = 'data/submit.csv', index = False)

# Visualize the Tree

In [None]:
print('Node count:', model.tree_.node_count)