In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload

In [4]:
a =[1,2,3,3,1,2,0,5]
set(a)


{0, 1, 2, 3, 5}

# Load data

In [2]:
train_data = pd.read_csv('data/train_v2.csv')
columns = list(train_data)
N_original, M_original = train_data.shape
columns, N_original

(['id',
  'timestamp',
  'product_id',
  'product_department',
  'product_category',
  'card_id',
  'user_id',
  'C15',
  'C16',
  'C17',
  'C18',
  'C19',
  'C20',
  'C21',
  'amount',
  'isfraud'],
 32369524)

# Data pipeline
* With trees, normalizing features is not necessary

In [47]:
%autoreload
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from include.DatetimeFromTimestamp import DatetimeFromTimestamp
#from include.HourOfDay import HourOfDay
#from include.DataFrameDropper import DataFrameDropper
from include.DataFrameSelector import DataFrameSelector
from include.FilterNMostCommon import FilterNMostCommon
#from include.UserEvaluator import UserEvaluator

#columns_to_drop = ['id', 'timestamp', 'product_id', 'product_department', 'product_category', 'card_id', 'user_id']
columns_to_use = ['C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'amount', 'daily_transactions_ratio', 'amount_transactions_ratio']
#the hot encoded attributes are also used
pipeline_normal = Pipeline([
    #('hour_creator', HourOfDay()),
    ('datetime_creator', DatetimeFromTimestamp()),
    #('user_evaluator', UserEvaluator()),
    #('dataframe_selector', DataFrameSelector(attribute_names=columns_to_use)),
])

pipeline_1hot = Pipeline([
    ('dataframe_selector', DataFrameSelector(['product_category'])),
    ('filter_n_most_common', FilterNMostCommon(N=5, attribute_name='product_category', minRelFreq=0.05)),
    ('1hot_encoder', OneHotEncoder(sparse = False))
])

pipeline_full = FeatureUnion(transformer_list=[
    ('pipeline_normal', pipeline_normal),
    ('pipeline_1hot', pipeline_1hot),
])

random_seed = 42

# Train/Test split
* Normally I would use random sampling, stratified by an attribute of major relevance, however, in this case the test data that was given follows the train data in time. Therefore, in order to do local testing my first guess would be that it is better to remake that scenario and sample the data by simply splitting it sorted as it is, by time.
* Cross validation is not necessary given that we have a test set big enough

In [207]:
split_by = 2
N_train = 1000000
N_test = 6000000

start_at = N_original - N_train - N_test
split_at = start_at + N_train

train_X = pd.DataFrame(train_data.iloc[start_at:split_at,:-1])
train_Y = train_data.iloc[start_at:split_at,-1]
test_X = pd.DataFrame(train_data.iloc[split_at:,:-1])
test_Y = train_data.iloc[split_at:,-1]

train_X.shape, train_Y.shape, test_X.shape, test_Y.shape

((1000000, 15), (1000000,), (6000000, 15), (6000000,))

# Test
1. 0.6654 - 1 hot encoding of `product_category`
2. 0.6264 - `hour` actualy decreases score. It will be removed for now, however it might be useful while combined with other attributes.

In [208]:
train_X_treated = pipeline_full.fit_transform(train_X)
test_X_treated = pipeline_full.transform(test_X)

train_X_treated.shape, test_X_treated.shape

((1000000, 15), (6000000, 15))

In [19]:
from sklearn.ensemble import RandomForestClassifier

current_model = DecisionTreeClassifier

In [167]:
from sklearn.metrics import roc_auc_score

model = current_model(random_state=random_seed)
model.fit(train_X_treated, train_Y)
test_pred_prob = model.predict_proba(test_X_treated)[:,1]

roc_auc_score(test_Y, test_pred_prob)

0.5858859085912868

# Submit

### Load submit data

In [7]:
submit_data = pd.read_csv('data/test_v2.csv')

### Prepare train and submit data

In [52]:
train_data_X = train_data.iloc[-500000:,:-1]
train_data_Y = train_data.iloc[-500000:,-1]
train_data_X.shape, train_data_Y.shape

((500000, 15), (500000,))

In [53]:
pipeline_full.fit(train_data)
train_data_X_treated = pipeline_normal.transform(train_data_X)
del train_data_X
train_data_X_treated.shape

(500000, 18)

In [34]:
submit_data_treated = pipeline_full.transform(submit_data)

### Train model & predict

In [35]:
model = current_model(random_state=random_seed)
model.fit(train_data_X_treated, train_data_Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [36]:
pred_prob = model.predict_proba(submit_data_treated)[:,1]

In [37]:
submission = pd.DataFrame()
submission['id'] = submit_data['id']
submission['isfraud'] = pred_prob
submission.head()

Unnamed: 0,id,isfraud
0,32263877,0.01869
1,32263886,0.05679
2,32263890,0.319111
3,32263895,0.319111
4,32263896,0.038599


In [38]:
submission.to_csv(path_or_buf = 'data/submit.csv', index = False)

In [56]:
train_data_X_treated

Unnamed: 0,id,timestamp,product_id,product_department,product_category,card_id,user_id,C15,C16,C17,C18,C19,C20,C21,amount,datetime,daily_transactions_ratio,amount_transactions_ratio,isfraud
31869524,31869524,1414529257314,c4e18dd6,85f751fd,50e219e0,d59101cb,d764c3f0,320,50,2348,3,427,100004,61,191.77,2014-10-28 20:47:37.314,2.100000,2.100000,0
31869525,31869525,1414529750888,f3845767,1fbe01fe,28905ebd,ecad2386,a99f214a,320,50,2617,0,35,-1,51,184.09,2014-10-28 20:55:50.888,1.355458,1.363393,0
31869526,31869526,1414526738907,c4e18dd6,85f751fd,50e219e0,13684a79,8ca254bd,320,50,2009,0,555,100233,102,191.77,2014-10-28 20:05:38.907,1.325153,1.325153,0
31869527,31869527,1414528139338,5c9ae867,83a0ad1a,f028772b,ecad2386,a99f214a,320,50,2323,0,687,100081,48,198.20,2014-10-28 20:28:59.338,1.355458,1.363393,0
31869528,31869528,1414528558170,28f93029,6256f5b4,f028772b,ecad2386,a99f214a,320,50,1863,3,39,-1,23,178.23,2014-10-28 20:35:58.170,1.355458,1.363393,0
31869529,31869529,1414527612977,c4e18dd6,85f751fd,50e219e0,febd1138,a99f214a,320,50,2351,3,163,-1,61,191.77,2014-10-28 20:20:12.977,1.355458,1.363393,0
31869530,31869530,1414526676617,7687a86e,5b08c53b,3e814130,ecad2386,a99f214a,300,250,2295,2,35,100074,23,190.23,2014-10-28 20:04:36.617,1.355458,1.363393,1
31869531,31869531,1414526758593,c4e18dd6,85f751fd,50e219e0,cf0327f9,e00be79f,300,50,2657,3,35,100013,23,191.77,2014-10-28 20:05:58.593,1.581818,1.581818,0
31869532,31869532,1414529717420,863fa89d,7294ea0f,3e814130,ecad2386,a99f214a,320,50,1973,3,39,-1,23,181.43,2014-10-28 20:55:17.420,1.355458,1.363393,0
31869533,31869533,1414529185436,c4e18dd6,85f751fd,50e219e0,feb2dcdb,33ab65db,320,50,1882,3,35,-1,13,191.77,2014-10-28 20:46:25.436,1.000000,1.000000,0


In [62]:
train_data[train_data['user_id'] == '440f1c2c']

Unnamed: 0,id,timestamp,product_id,product_department,product_category,card_id,user_id,C15,C16,C17,C18,C19,C20,C21,amount,isfraud,datetime
31845747,31845747,1414529749641,c4e18dd6,85f751fd,50e219e0,13684a79,440f1c2c,320,50,2654,3,38,-1,23,191.77,0,2014-10-28 20:55:49.641
31869552,31869552,1414529253033,c4e18dd6,85f751fd,50e219e0,13684a79,440f1c2c,320,50,2654,3,38,-1,23,191.77,0,2014-10-28 20:47:33.033
31952979,31952979,1414529711410,c4e18dd6,85f751fd,50e219e0,13684a79,440f1c2c,320,50,2654,3,38,-1,23,191.77,0,2014-10-28 20:55:11.410
32039016,32039016,1414533218849,c4e18dd6,85f751fd,50e219e0,13684a79,440f1c2c,320,50,2654,3,38,-1,23,191.77,0,2014-10-28 21:53:38.849
32055779,32055779,1414531736306,c4e18dd6,85f751fd,50e219e0,13684a79,440f1c2c,320,50,2654,3,38,-1,23,191.77,0,2014-10-28 21:28:56.306
32092346,32092346,1414533058534,c4e18dd6,85f751fd,50e219e0,13684a79,440f1c2c,320,50,2657,3,38,-1,23,191.77,0,2014-10-28 21:50:58.534
32100009,32100009,1414533515315,c4e18dd6,85f751fd,50e219e0,13684a79,440f1c2c,320,50,2654,3,38,-1,23,191.77,0,2014-10-28 21:58:35.315
32106406,32106406,1414533556700,c4e18dd6,85f751fd,50e219e0,13684a79,440f1c2c,320,50,2654,3,38,-1,23,191.77,0,2014-10-28 21:59:16.700
