In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload

# Load data

In [2]:
# Read the labelled training set and record its dimensions; N_original is
# used further down to position the local train/test window.
train_data = pd.read_csv(filepath_or_buffer='data/train_v2.csv')
N_original, M_original = train_data.shape
columns = train_data.columns.tolist()
# Cell output: the column names together with the number of rows.
columns, N_original

(['id',
  'timestamp',
  'product_id',
  'product_department',
  'product_category',
  'card_id',
  'user_id',
  'C15',
  'C16',
  'C17',
  'C18',
  'C19',
  'C20',
  'C21',
  'amount',
  'isfraud'],
 32369524)

# Data pipeline
* With trees, normalizing features is not necessary

In [152]:
%autoreload
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from include.DatetimeFromTimestamp import DatetimeFromTimestamp
#from include.HourOfDay import HourOfDay
#from include.DataFrameDropper import DataFrameDropper
from include.DataFrameSelector import DataFrameSelector
from include.FilterNMostCommon import FilterNMostCommon
from include.UserEvaluator import UserEvaluator

#columns_to_drop = ['id', 'timestamp', 'product_id', 'product_department', 'product_category', 'card_id', 'user_id']
columns_to_use = ['C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'amount'] #the hot encoded attributes are also used
pipeline_normal = Pipeline([
    #('hour_creator', HourOfDay()),
    ('datetime_creator', DatetimeFromTimestamp()),
    ('user_evaluator', UserEvaluator()),
    #('dataframe_selector', DataFrameSelector(attribute_names=columns_to_use)),
])

pipeline_1hot = Pipeline([
    ('dataframe_selector', DataFrameSelector(['product_category'])),
    ('filter_n_most_common', FilterNMostCommon(N=5, attribute_name='product_category', minRelFreq=0.05)),
    ('1hot_encoder', OneHotEncoder(sparse = False))
])

pipeline_full = FeatureUnion(transformer_list=[
    ('pipeline_normal', pipeline_normal),
    ('pipeline_1hot', pipeline_1hot),
])

random_seed = 42

# Train/Test split
* Normally I would use random sampling, stratified by an attribute of major relevance. In this case, however, the test data that was given follows the train data in time. Therefore, for local testing, my first guess is that it is better to reproduce that scenario: keep the data in its existing (time) order and split it at a single point, so that the local test set follows the local train set in time.
* Cross-validation is not necessary, given that the test set is big enough

In [153]:
# Sizes, in rows, of the local train and test windows.
split_by = 2  # NOTE(review): not referenced anywhere in this cell
N_train = 1000000
N_test = 6000000

# The windows are anchored to the END of the data: the last N_test rows are
# the local test set and the N_train rows immediately before them are the
# local train set.
start_at = N_original - N_train - N_test
split_at = start_at + N_train

# A negative start_at would not fail: iloc would treat it as an offset from
# the end and silently return windows of the wrong size. Fail loudly instead.
if start_at < 0:
    raise ValueError(
        'N_train + N_test ({}) exceeds the number of available rows ({})'.format(
            N_train + N_test, N_original
        )
    )

# Features are every column except the last one; the last column is the target.
train_X = pd.DataFrame(train_data.iloc[start_at:split_at,:-1])
train_Y = train_data.iloc[start_at:split_at,-1]
test_X = pd.DataFrame(train_data.iloc[split_at:,:-1])
test_Y = train_data.iloc[split_at:,-1]

# Cell output: shapes of the four pieces, as a quick sanity check.
train_X.shape, train_Y.shape, test_X.shape, test_Y.shape

((1000000, 15), (1000000,), (6000000, 15), (6000000,))

# Test
1. 0.6654 - 1 hot encoding of `product_category`
2. 0.6264 - `hour` actually decreases the score. It will be removed for now; however, it might be useful when combined with other attributes.

In [154]:
# Scratch check: walk a dict's (key, value) pairs and print each value.
d = dict(a=1, b=2)
for lol, ok in d.items():
    print(ok)

1
2


In [155]:
# Fit the feature pipeline on the local train window and transform it, then
# apply the same fitted pipeline to the local test window (no refit).
# NOTE(review): this uses pipeline_normal, whereas the submission cells fit
# pipeline_full -- confirm the difference is intended.
train_X_treated = pipeline_normal.fit_transform(train_X)
test_X_treated = pipeline_normal.transform(test_X)

# Cell output: shapes of both transformed sets.
train_X_treated.shape, test_X_treated.shape

((1000000, 18), (6000000, 18))

In [156]:
# Preview only the first rows instead of asking the notebook to render the
# whole transformed frame.
train_X_treated.head(10)

Unnamed: 0,id,timestamp,product_id,product_department,product_category,card_id,user_id,C15,C16,C17,C18,C19,C20,C21,amount,datetime,daily_transactions_ratio,amount_transactions_ratio
25369524,25369524,1414410716446,c4e18dd6,85f751fd,50e219e0,92f5800b,a99f214a,320,50,2424,1,161,100190,71,191.77,2014-10-27 11:51:56.446,780220.0,778003.882184
25369525,25369525,1414407969408,5b626596,57fe1b20,f028772b,ecad2386,a99f214a,320,50,2528,0,39,-1,221,185.01,2014-10-27 11:06:09.408,780220.0,778003.882184
25369526,25369526,1414410932397,e2a5dc06,26fa1946,3e814130,ecad2386,a99f214a,320,50,2283,0,163,100076,95,186.61,2014-10-27 11:55:32.397,780220.0,778003.882184
25369527,25369527,1414409928158,bb1ef334,d6137915,f028772b,ecad2386,a99f214a,320,50,2316,0,167,-1,16,188.02,2014-10-27 11:38:48.158,780220.0,778003.882184
25369528,25369528,1414408628280,c4342784,e8f79e60,f028772b,ecad2386,a99f214a,320,50,2528,0,39,100077,221,186.44,2014-10-27 11:17:08.280,780220.0,778003.882184
25369529,25369529,1414407776069,a434fa42,9a977531,f028772b,ecad2386,a99f214a,320,50,2227,0,935,100079,48,196.63,2014-10-27 11:02:56.069,780220.0,778003.882184
25369530,25369530,1414409252481,1e0acfb4,a38dc379,50e219e0,ecad2386,2233417b,320,50,1921,3,47,-1,23,167.88,2014-10-27 11:27:32.481,2.0,1.732021
25369531,25369531,1414408452915,f3845767,1fbe01fe,28905ebd,ecad2386,a99f214a,320,50,1722,0,35,-1,79,184.09,2014-10-27 11:14:12.915,780220.0,778003.882184
25369532,25369532,1414410629077,c4e18dd6,85f751fd,50e219e0,92f5800b,a99f214a,320,50,2424,1,161,100189,71,191.77,2014-10-27 11:50:29.077,780220.0,778003.882184
25369533,25369533,1414408502412,51d8d3b4,fa2fd0b3,f028772b,ecad2386,a99f214a,320,50,2227,0,935,100077,48,196.67,2014-10-27 11:15:02.412,780220.0,778003.882184


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Holds the estimator CLASS, not an instance: the training cells instantiate
# it themselves via current_model(random_state=random_seed).
current_model = DecisionTreeClassifier

In [74]:
from sklearn.metrics import roc_auc_score

# Train a fresh model on the local train window (fit returns the estimator).
model = current_model(random_state=random_seed).fit(train_X_treated, train_Y)

# Score the local test window, keeping column 1 of predict_proba.
test_pred_prob = model.predict_proba(test_X_treated)[:, 1]

# Cell output: ROC AUC of those scores against the true labels.
roc_auc_score(y_true=test_Y, y_score=test_pred_prob)

0.6263779372728707

In [147]:
# Scratch check: "unzip" a list of pairs into two tuples with zip(*...).
l = [(1, 2), (3, 5), (6, 7), (8, 9)]
# l = [1, 3], [2, 5]
a, b = zip(*l)
print(a, b, sep='\n')

(1, 3, 6, 8)
(2, 5, 7, 9)


# Submit

### Load submit data

In [75]:
# Load the data to be scored for the submission; its 'id' column is reused
# when the submission frame is built.
submit_data = pd.read_csv('data/test_v2.csv')

### Prepare train and submit data

In [76]:
# Split the full training set into features (every column but the last) and
# target (the last column).
train_data_X, train_data_Y = train_data.iloc[:, :-1], train_data.iloc[:, -1]
# Cell output: shapes of both pieces.
train_data_X.shape, train_data_Y.shape

((32369524, 15), (32369524,))

In [77]:
# Fit the combined feature pipeline on the full training features and
# transform them in the same pass.
train_data_X_treated = pipeline_full.fit_transform(train_data_X)
# Cell output: shape of the transformed result.
train_data_X_treated.shape

(32369524, 15)

In [78]:
# Transform only (no refit): relies on pipeline_full having been fitted on the
# full training features beforehand.
submit_data_treated = pipeline_full.transform(submit_data)

### Train model & predict

In [79]:
# Train the submission model on the full transformed training set; this
# rebinds `model`, replacing any previously trained instance.
model = current_model(random_state=random_seed).fit(train_data_X_treated, train_data_Y)
# Cell output: the fitted estimator's repr.
model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [80]:
# Keep column 1 of predict_proba for every submission row
# (presumably the probability of isfraud == 1 -- confirm via model.classes_).
pred_prob = model.predict_proba(submit_data_treated)[:,1]

In [81]:
# Assemble the submission frame in one step: the ids from the submission data
# next to the predicted scores, in that column order.
submission = pd.DataFrame(
    {'id': submit_data['id'], 'isfraud': pred_prob},
    columns=['id', 'isfraud'],
)
# Cell output: preview of the first rows.
submission.head()

Unnamed: 0,id,isfraud
0,32263877,0.018971
1,32263886,0.0
2,32263890,0.319438
3,32263895,0.319438
4,32263896,0.034431


In [82]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('data/submit.csv', index=False)