# data

In [1]:
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt

#### load data

In [2]:
# Load the TransactionID -> uid lookup table; only these two columns are read.
uids = pd.read_csv(
    "data/uids_full_v6.csv",
    usecols=['TransactionID', 'uid'],
)

In [3]:
# Load the training features, the training labels and the test features.
X_train = pd.read_csv("data/X_train.csv")
Y_train = pd.read_csv("data/Y_train.csv")
X_test = pd.read_csv("data/X_test.csv")

#### normalize

In [4]:
# Min-max scale every feature column (everything except the TransactionID and
# uid identifier columns). The minimum and range are always taken from
# X_train, and the same values are applied to X_test so that both frames are
# transformed identically.
cols = X_train.columns
for c in cols:
    if c in ('TransactionID', 'uid'):
        continue
    # Compute the training statistics once, *before* X_train[c] is overwritten.
    col_min = X_train[c].min()
    col_range = X_train[c].max() - col_min
    # A column that is constant in the training data has a zero range; dividing
    # by it would produce NaN (0/0) or +/-inf. Use a unit range instead so the
    # column maps to 0 in X_train and to (value - min) in X_test.
    if col_range == 0:
        col_range = 1
    X_test[c] = (X_test[c] - col_min) / col_range
    X_train[c] = (X_train[c] - col_min) / col_range

# make RNN training data

In [5]:
# Join the training features with their labels on TransactionID.
merge = X_train.merge(Y_train, on='TransactionID')
# Put all transactions that belong to the same user (uid) into one group.
merge_group = merge.groupby("uid")
# Look at the distribution of the number of transactions per user.
# RNN time step => 3
transactions_per_user = merge_group.size().values
np.percentile(transactions_per_user, [25, 50, 75, 80, 85, 90])

array([1., 1., 3., 3., 4., 6.])

## train data

In [6]:
# Number of feature columns: every column of `merge` except uid, isFraud and
# TransactionID (the three columns dropped inside the loop).
F_Num = len(merge.columns) - 3
X_Train_Seq = []
Y_Train_Seq = []
# Feature row of -99 values used to fill the missing time steps of a short
# sequence. It is also read by the test-data cell.
p = [-99] * F_Num
# NOTE(review): rows are consumed in whatever order they have inside each
# group -- confirm the input is already in chronological order per user.
for _uid, user_rows in merge_group:
    # y
    labels = user_rows.isFraud.values
    # x
    features = user_rows.drop(['uid', 'isFraud', 'TransactionID'], axis=1).values
    full_chunks, leftover = divmod(len(user_rows), 3)

    # Split the user's transactions into consecutive sequences of three.
    for k in range(full_chunks):
        X_Train_Seq.append(features[3 * k:3 * k + 3])
        Y_Train_Seq.append(labels[3 * k:3 * k + 3])

    # The remaining one or two transactions form a final sequence that is
    # padded up to three steps: -99 feature rows for x and 0 labels for y.
    if leftover:
        n_pad = 3 - leftover
        pad_rows = np.array([p] * n_pad)
        X_Train_Seq.append(np.append(features[-leftover:], pad_rows, axis=0))
        Y_Train_Seq.append(np.append(labels[-leftover:], [0] * n_pad))

## test data

In [7]:
# Build fixed-length (3-step) sequences per user for the test set, and keep a
# flat list of TransactionIDs that lines up with the flattened sequence rows.
test_group = X_test.groupby("uid")
X_Test_Seq = []
TranID_Seq = []
for _uid, user_rows in test_group:
    # Record every real TransactionID of this user first, in row order.
    TranID_Seq.extend(list(user_rows.TransactionID.values))
    # x
    features = user_rows.drop(['uid', 'TransactionID'], axis=1).values
    full_chunks, leftover = divmod(len(user_rows), 3)

    # Split the user's transactions into consecutive sequences of three.
    for k in range(full_chunks):
        X_Test_Seq.append(features[3 * k:3 * k + 3])

    # The remaining one or two transactions form a final padded sequence.
    # Each padded step gets TransactionID -1 and the padding feature row `p`
    # (defined in the training-data cell).
    if leftover:
        n_pad = 3 - leftover
        TranID_Seq.extend([-1] * n_pad)
        pad_rows = np.array([p] * n_pad)
        X_Test_Seq.append(np.append(features[-leftover:], pad_rows, axis=0))

## save pickle

In [9]:
import pickle as pkl

# Persist the generated sequences (and the TransactionID alignment list) as
# pickle files. Each file is opened in a `with` block so the handle is closed
# even if pkl.dump raises part-way through.
for path, payload in (
    ("data/X_Train_Seq", X_Train_Seq),
    ("data/Y_Train_Seq", Y_Train_Seq),
    ("data/X_Test_Seq", X_Test_Seq),
    ("data/TranID_Seq", TranID_Seq),
):
    with open(path, 'wb') as fileObject:
        pkl.dump(payload, fileObject)