# data

In [1]:
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt

#### load data

In [2]:
# Load the per-transaction user id table; only the two listed columns are read.
# NOTE(review): `uids` is not referenced again in the cells visible here — confirm
# whether it is still needed or whether 'uid' already lives in X_train / X_test.
uids = pd.read_csv(
    "data/uids_full_v6.csv",
    usecols=["TransactionID", "uid"],
)

In [3]:
# Load the training features, the training labels and the test features.
X_train = pd.read_csv("data/X_train.csv")
Y_train = pd.read_csv("data/Y_train.csv")
X_test = pd.read_csv("data/X_test.csv")

#### normalize

In [4]:
# Min-max scale every feature column using statistics taken from X_train only,
# so that train and test share exactly the same scaling. The identifier columns
# 'TransactionID' and 'uid' are left untouched.
cols = X_train.columns
for c in cols:
    if c in ['TransactionID','uid']:
        continue
    # Capture the training minimum and range once, before X_train[c] is overwritten.
    c_min = X_train[c].min()
    c_range = X_train[c].max() - c_min
    # A column that is constant in X_train has a zero range; dividing by it would
    # fill the column with NaN/inf. Use a range of 1 instead, so the training values
    # become 0 and the test values are only shifted by the training minimum.
    if c_range == 0:
        c_range = 1
    X_test[c] = (X_test[c] - c_min) / c_range
    X_train[c] = (X_train[c] - c_min) / c_range

# make RNN training data

In [5]:
# Attach the labels in Y_train to the training features, matching on TransactionID.
merge = X_train.merge(Y_train, on='TransactionID')
# Put all transactions that belong to the same user (uid) into one group.
merge_group = merge.groupby("uid")
# Look at the distribution of the number of transactions per user.
# In the recorded output the 75th and 80th percentiles are 3 => RNN time step of 3.
txn_per_user = merge_group.size().values
np.percentile(txn_per_user, [25, 50, 75, 80, 85, 90])

array([1., 1., 3., 3., 4., 6.])

## train data

In [6]:
# Settings for turning each user's transactions into fixed-length sequences.
TIME_STEPS = 3    # number of transactions per sequence
PAD_VALUE = -99   # value written into every feature of a padded time step
PAD_LABEL = 0     # isFraud label assigned to a padded time step

# Number of feature columns: everything in `merge` except 'uid', 'isFraud', 'TransactionID'.
F_Num = len(merge.columns)-3
X_Train_Seq = []
Y_Train_Seq = []
# One padded time step (a row of PAD_VALUE); the test-data cell pads with this same row.
p = [PAD_VALUE for i in range(F_Num)]
for gn, data in merge_group:
    # y
    data_y = data.isFraud.values
    # x
    data_value = data.drop(['uid','isFraud','TransactionID'], axis=1).values

    # Cut the user's transactions into consecutive windows of TIME_STEPS rows.
    # Only the last window of a user can be short; it is filled up with padded steps.
    # This single loop covers users with fewer than, exactly, and more than
    # TIME_STEPS transactions.
    for start in range(0, len(data_value), TIME_STEPS):
        x_window = data_value[start:start+TIME_STEPS]
        y_window = data_y[start:start+TIME_STEPS]
        padding_num = TIME_STEPS - len(x_window)
        if padding_num > 0:
            padding = np.array([p for i in range(padding_num)])
            x_window = np.append(x_window, padding, axis=0)
            y_window = np.append(y_window, [PAD_LABEL for i in range(padding_num)])
        X_Train_Seq.append(x_window)
        Y_Train_Seq.append(y_window)

## test data

In [10]:
# Display X_test for a visual check; the recorded output shows scaled feature
# values with TransactionID left unscaled.
# NOTE(review): this cell's execution count (In [10]) is out of sequence with the
# cells around it — re-run the notebook top to bottom to confirm it reproduces.
X_test

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card5,card6,addr1,addr2,...,uid_cents_ct,C14_uid_std,uid_C13_ct,uid_V314_ct,uid_V127_ct,uid_V136_ct,uid_V309_ct,uid_V307_ct,uid_V320_ct,outsider15
0,3663549.0,0.000993,1.0,0.540871,0.023952,0.386364,0.920290,0.75,0.160998,0.838710,...,0.023121,0.003010,0.051724,0.000000,0.027901,0.007605,0.000000,0.027778,0.007435,1.0
1,3663550.0,0.001526,1.0,0.188089,0.023952,0.386364,0.920290,0.75,0.453515,0.838710,...,0.005780,0.001479,0.051724,0.023810,0.017121,0.000000,0.000000,0.017045,0.000000,1.0
2,3663551.0,0.005346,1.0,0.199816,0.948104,0.386364,0.920290,0.75,0.845805,0.838710,...,0.000000,0.001348,0.022414,0.071429,0.010146,0.007605,0.016129,0.010732,0.003717,1.0
3,3663552.0,0.008914,1.0,0.574212,0.520958,0.386364,0.485507,0.75,0.240363,0.838710,...,0.005780,0.001024,0.005172,0.023810,0.000634,0.000000,0.000000,0.000631,0.000000,1.0
4,3663553.0,0.002120,1.0,0.978271,0.704591,0.386364,0.130435,0.75,0.374150,0.838710,...,0.005780,0.002968,0.017241,0.071429,0.004439,0.000000,0.016129,0.004419,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506686,4170235.0,0.002957,0.0,0.737641,0.550898,0.651515,0.905797,0.75,0.419501,0.548387,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
506687,4170236.0,0.000373,0.0,0.123822,0.616766,0.651515,0.905797,0.75,0.000000,0.000000,...,0.190751,0.023958,0.013793,0.190476,0.000634,0.000000,0.021505,0.005682,0.000000,0.0
506688,4170237.0,0.001526,1.0,0.900264,0.780439,0.386364,0.920290,0.75,0.517007,0.838710,...,0.005780,0.002049,0.001724,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
506689,4170238.0,0.006317,1.0,0.897965,0.832335,0.386364,0.905797,0.75,0.176871,0.838710,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [7]:
# Number of transactions per sequence; must equal the window length used for the
# training sequences.
TIME_STEPS = 3

# NOTE(review): groupby drops rows whose 'uid' is NaN by default — confirm every
# test transaction has a uid, otherwise it gets no sequence and no TransactionID entry.
test_group = X_test.groupby("uid")
X_Test_Seq = []
# Flat list of TransactionIDs in the same order as the time steps of X_Test_Seq;
# padded time steps are recorded as -1.
TranID_Seq = []
for gn, data in test_group:
    tran_ids = list(data.TransactionID.values)
    # x
    data_value = data.drop(['uid','TransactionID'], axis=1).values

    # Cut the user's transactions into consecutive windows of TIME_STEPS rows.
    # Only the last window of a user can be short; it is filled up with padded steps.
    for start in range(0, len(data_value), TIME_STEPS):
        window = data_value[start:start+TIME_STEPS]
        TranID_Seq.extend(tran_ids[start:start+TIME_STEPS])
        padding_num = TIME_STEPS - len(window)
        if padding_num > 0:
            TranID_Seq.extend([-1 for i in range(padding_num)])
            # `p` is the single padded time step defined in the train-data cell.
            padding = np.array([p for i in range(padding_num)])
            window = np.append(window, padding, axis=0)
        X_Test_Seq.append(window)

## save pickle

In [9]:
import pickle as pkl

# Write each sequence list to its own pickle file. The `with` block closes the
# file even when pkl.dump raises, which the manual open()/close() pairs did not.
for path, obj in (
    ("data/X_Train_Seq", X_Train_Seq),
    ("data/Y_Train_Seq", Y_Train_Seq),
    ("data/X_Test_Seq", X_Test_Seq),
    ("data/TranID_Seq", TranID_Seq),
):
    with open(path, 'wb') as fileObject:
        pkl.dump(obj, fileObject)