## 1. RNN for structured data

In [75]:
import pickle
import numpy as np
with open('data.pickle','rb') as load:
    data=pickle.load(load)
with open('le.pickle','rb') as load:
    le=pickle.load(load)
with open('ohe.pickle','rb') as load:
    ohe=pickle.load(load)
with open('train_test_index.pickle','rb') as load:
    train_test_index=pickle.load(load)
with open('feature_final.pickle','rb') as load:
    feature_final=pickle.load(load)



### I will use transaction date time feature as the time step. 
### Previously, all samples are taken as iid. By involving the time step feature, all data can be seen as time series data!
### The transaction sequence probably contain some sequential pattern. When we train RF and Xgboost this pattern may lost.
### In order to feed the sequential data into RNN, we have to reshape the samples to 3D: (samples, time_step, input_features)
### Now let's reshape sample first! 

In [76]:
import pandas as pd
data_raw=pd.read_json('transactions.txt',lines=True)
data=pd.concat([data,data_raw['transactionDateTime']],axis=1)
data=pd.concat([data,data_raw['isFraud']],axis=1)
data.head(3)

Unnamed: 0,customerId,acqCountry,cardPresent,merchantCategoryCode,merchantCountryCode,cardCVV,cardLast4Digits,merchantName,posConditionCode,posEntryMode,...,dateOfLastAddressChange_year,dateOfLastAddressChange_month,transactionDateTime_year,transactionDateTime_month,transactionDateTime_time,currentBalance,transactionAmount,creditLimit,transactionDateTime,isFraud
0,733493772,US,False,rideshare,US,492,9184,Lyft,1,5,...,2014,8,2016,1,19,0.0,111.33,5000,2016-01-08T19:04:50,True
1,733493772,US,False,rideshare,US,492,9184,Uber,1,9,...,2014,8,2016,1,22,111.33,24.75,5000,2016-01-09T22:32:39,False
2,733493772,US,False,rideshare,US,492,9184,Lyft,1,5,...,2014,8,2016,1,13,136.08,187.4,5000,2016-01-11T13:36:55,False


### Sort data by transactionDateTime so that all txn are time ordered(Txn is the short name of Transaction)

In [77]:
data=data.sort_values(by='transactionDateTime')

### Divid entire data into pieses groupby customer

In [81]:
g=data.groupby('customerId')
cust_iter=g.__iter__()
cust=[i[1] for i in cust_iter]

In [87]:
print('Each element in the list cust is the collection of txn under each customer!')
print('There are totally %d distinct customers in the dataset!'%len(cust))
print('The first customer has following txn:')
cust[0]

Each element in the list cust is the collection of txn under each customer!
There are totally 5000 distinct customers in the dataset!
The first customer has following txn:


Unnamed: 0,customerId,acqCountry,cardPresent,merchantCategoryCode,merchantCountryCode,cardCVV,cardLast4Digits,merchantName,posConditionCode,posEntryMode,...,dateOfLastAddressChange_year,dateOfLastAddressChange_month,transactionDateTime_year,transactionDateTime_month,transactionDateTime_time,currentBalance,transactionAmount,creditLimit,transactionDateTime,isFraud
583485,100547107,US,True,fastfood,US,786,1222,Popeyes #414406,01,09,...,2011,10,2016,01,01,0.00,64.66,2500,2016-01-02T01:47:46,False
583486,100547107,US,True,fastfood,US,786,1222,Popeyes #700785,01,80,...,2011,10,2016,01,23,64.66,5.31,2500,2016-01-16T23:35:41,False
583487,100547107,US,True,fastfood,US,786,1222,Arbys #46046,01,02,...,2011,10,2016,01,21,69.97,204.23,2500,2016-01-17T21:50:04,False
583488,100547107,US,True,fastfood,US,786,1222,Shake Shack #968081,08,05,...,2011,10,2016,01,05,274.20,155.28,2500,2016-01-29T05:19:50,False
583489,100547107,US,True,fastfood,US,786,1222,In-N-Out #27394,01,09,...,2011,10,2016,01,20,429.48,91.14,2500,2016-01-30T20:49:10,False
583490,100547107,US,True,fastfood,US,786,1222,Quizno's #205329,01,02,...,2011,10,2016,02,22,0.00,2.80,2500,2016-02-06T22:31:44,False
583491,100547107,US,True,fastfood,US,786,1222,Domino's Pizza #962710,01,05,...,2011,10,2016,02,06,2.80,119.77,2500,2016-02-10T06:48:23,False
583492,100547107,US,True,fastfood,US,786,1222,Domino's Pizza #660134,01,05,...,2011,10,2016,02,08,122.57,211.11,2500,2016-02-11T08:28:07,False
583493,100547107,US,True,fastfood,US,786,1222,Krispy Kreme #122012,01,09,...,2011,10,2016,02,01,333.68,66.23,2500,2016-02-20T01:01:47,False
583494,100547107,US,True,fastfood,US,786,1222,Wendys #327348,01,02,...,2011,10,2016,02,04,399.91,1.06,2500,2016-02-26T04:02:55,False


### Txn Count Distribution 

In [86]:
#Cust_len is a list with the txn_count value for each customer 
cust_len=np.array([len(i) for i in cust])
print('Max of txn_count:%d'%cust_len.max())
print('Min of txn_count:%d'%cust_len.min())
print('Median of txn_count:%d'%np.median(cust_len))
print('The top 10 txn_count value is as following, the left column is txn_count, right is count_value')
print(pd.Series(cust_len).value_counts()[:10])

Max of txn_count:10034
Min of txn_count:1
Median of txn_count:47
The top 10 txn_count value is as following, the left column is txn_count, right is count_value
11    79
14    79
20    78
10    75
23    72
15    72
13    71
27    68
22    67
19    66
dtype: int64


### Generate new features rolling average & deviance for each customer 

In [109]:
from tqdm import tqdm_notebook as tqdm
cust_new=[]
for i in tqdm(cust):
    i['win_5_mean']=i['transactionAmount'].rolling(5,center=True,min_periods=1).mean()
    i['win_10_mean']=i['transactionAmount'].rolling(10,center=True,min_periods=1).mean()
    i['dev_5']=np.abs(i['transactionAmount']-i['win_5_mean'])
    i['dev_10']=np.abs(i['transactionAmount']-i['win_10_mean'])
    cust_new.append(i)
with open('cust_new.pickle','wb') as save:
    pickle.dump(cust_new,save)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

### For any customer with txn_count<10, expend txn_count to 10 by adding all columns as 0.

In [111]:
#Load cust_new from disk to avoid running above new feture generation process again
with open('cust_new.pickle','rb') as load:
    cust_new=pickle.load(load)

In [153]:
#Padding for customer with txn_count<10
padding_index=np.where(cust_len<10)[0] #index of customers with txn_count<10
cols=cust_new[0].columns #column_names of cust_new
for i in padding_index:
    padding_row_count=10-len(cust_new[i]) #how many padding rows need to be append for this customer i
    padding_df=pd.DataFrame(data=np.zeros((padding_row_count,len(cols))),columns=cols) #generate padding dataframe 
    cust_new[i]=cust_new[i].append(padding_df) #Append padding df to customer i

In [156]:
#Test if txn_count>10 for all customers
any([len(i)<10 for i in cust_new]) # All customers have at least 10 txn now!

False

### Reshape samples with time_step=10 for each customer

In [193]:
#generate samples(1,time_step,input_feat) based on the single txn_df of each customer with window size=10
#txn_df is just the df in cust_new
def moving_window_sample_generator(df,size=10):
    if len(df)<size:
        print('The length of df is smaller than window size!')
    else:
        sample_list=[]  #new list to contain all moving window samples
                        #move the win_head from first row to last row of df
                        #if the win_tail > last row of df, break the loop
                        #else create a new sample with size=10
        for i in range(len(df)): 
            win_head=i 
            win_tail=i+size
            if win_tail >= len(df):
                break
            else:
                sample=df.iloc[win_head:win_tail,:]
                # the new sample is not a single txn anymore
                # it is a df with 10 txn 
                # in order to under/over sampleing for such samples, I will use following strategy:
                # if there is one more more than one fraud txn in a sample, I will take it as 1, else 0 as fraud indicator
                # this strategy will make the count of samples containing fraud txn = count of samples without any fraud txn
                sample_fraud_label=int(any(sample['isFraud']))
                sample_list.append((sample,sample_fraud_label))
    return sample_list

In [198]:
#Let's take an example
ex=moving_window_sample_generator(cust_new[0])
print('Based on the above algrithm, the sample count formula for a df is as following:')
print('sample_count = len_df - 9')
print('the txn_count of the first customer in cust_new is %d'%len(cust_new[0]))
print('the count of generated samples for the first customer in cust_new is %d'%len(ex))
print('The dimension of any new sample now is 10*31')
ex[0][0]

Based on the above algrithm, the sample count formula for a df is as following:
sample_count = len_df - 9
the txn_count of the first customer in cust_new is 85
the count of generated samples for the first customer in cust_new is 75
The dimension of any new sample now is 10*31


Unnamed: 0,customerId,acqCountry,cardPresent,merchantCategoryCode,merchantCountryCode,cardCVV,cardLast4Digits,merchantName,posConditionCode,posEntryMode,...,transactionDateTime_time,currentBalance,transactionAmount,creditLimit,transactionDateTime,isFraud,win_5_mean,win_10_mean,dev_5,dev_10
583485,100547107,US,True,fastfood,US,786,1222,Popeyes #414406,1,9,...,1,0.0,64.66,2500,2016-01-02T01:47:46,False,91.4,104.124,26.74,39.464
583486,100547107,US,True,fastfood,US,786,1222,Popeyes #700785,1,80,...,23,64.66,5.31,2500,2016-01-16T23:35:41,False,107.37,87.236667,102.06,81.926667
583487,100547107,US,True,fastfood,US,786,1222,Arbys #46046,1,2,...,21,69.97,204.23,2500,2016-01-17T21:50:04,False,104.124,91.884286,100.106,112.345714
583488,100547107,US,True,fastfood,US,786,1222,Shake Shack #968081,8,5,...,5,274.2,155.28,2500,2016-01-29T05:19:50,False,91.752,106.7875,63.528,48.4925
583489,100547107,US,True,fastfood,US,786,1222,In-N-Out #27394,1,9,...,20,429.48,91.14,2500,2016-01-30T20:49:10,False,114.644,102.281111,23.504,11.141111
583490,100547107,US,True,fastfood,US,786,1222,Quizno's #205329,1,2,...,22,0.0,2.8,2500,2016-02-06T22:31:44,False,116.02,92.159,113.22,89.359
583491,100547107,US,True,fastfood,US,786,1222,Domino's Pizza #962710,1,5,...,6,2.8,119.77,2500,2016-02-10T06:48:23,False,98.21,106.745,21.56,13.025
583492,100547107,US,True,fastfood,US,786,1222,Domino's Pizza #660134,1,5,...,8,122.57,211.11,2500,2016-02-11T08:28:07,False,80.194,166.724,130.916,44.386
583493,100547107,US,True,fastfood,US,786,1222,Krispy Kreme #122012,1,9,...,1,333.68,66.23,2500,2016-02-20T01:01:47,False,121.738,146.301,55.508,80.071
583494,100547107,US,True,fastfood,US,786,1222,Wendys #327348,1,2,...,4,399.91,1.06,2500,2016-02-26T04:02:55,False,218.804,162.909,217.744,161.849


In [229]:
#Generate new samples for all customers
cust_final=[]
for cust in tqdm(cust_new[:30]):
    cust_final.append(moving_window_sample_generator(cust))
with open('cust_final.pickle','wb') as save:
    pickle.dump(cust_final,save)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

In [205]:
#currently, the cust_final has three dimensions
#the length of first dimension is 5000, indicates all customers
#the second dimesnion indicates a list of all samples generated by moving_window for a customer
#the third dimension is a tuple(single sample(10*31),fraud_indicator(scalar))
print('the length of cust_final is: %d'%len(cust_final))
print('the fraud_indicator for this sample is: %d'%cust_final[0][0][1])
print('the sample is as following:')
cust_final[0][0][0]

the length of cust_final is: 30
the fraud_indicator for this sample is: 0
the sample is as following:


Unnamed: 0,customerId,acqCountry,cardPresent,merchantCategoryCode,merchantCountryCode,cardCVV,cardLast4Digits,merchantName,posConditionCode,posEntryMode,...,transactionDateTime_time,currentBalance,transactionAmount,creditLimit,transactionDateTime,isFraud,win_5_mean,win_10_mean,dev_5,dev_10
583485,100547107,US,True,fastfood,US,786,1222,Popeyes #414406,1,9,...,1,0.0,64.66,2500,2016-01-02T01:47:46,False,91.4,104.124,26.74,39.464
583486,100547107,US,True,fastfood,US,786,1222,Popeyes #700785,1,80,...,23,64.66,5.31,2500,2016-01-16T23:35:41,False,107.37,87.236667,102.06,81.926667
583487,100547107,US,True,fastfood,US,786,1222,Arbys #46046,1,2,...,21,69.97,204.23,2500,2016-01-17T21:50:04,False,104.124,91.884286,100.106,112.345714
583488,100547107,US,True,fastfood,US,786,1222,Shake Shack #968081,8,5,...,5,274.2,155.28,2500,2016-01-29T05:19:50,False,91.752,106.7875,63.528,48.4925
583489,100547107,US,True,fastfood,US,786,1222,In-N-Out #27394,1,9,...,20,429.48,91.14,2500,2016-01-30T20:49:10,False,114.644,102.281111,23.504,11.141111
583490,100547107,US,True,fastfood,US,786,1222,Quizno's #205329,1,2,...,22,0.0,2.8,2500,2016-02-06T22:31:44,False,116.02,92.159,113.22,89.359
583491,100547107,US,True,fastfood,US,786,1222,Domino's Pizza #962710,1,5,...,6,2.8,119.77,2500,2016-02-10T06:48:23,False,98.21,106.745,21.56,13.025
583492,100547107,US,True,fastfood,US,786,1222,Domino's Pizza #660134,1,5,...,8,122.57,211.11,2500,2016-02-11T08:28:07,False,80.194,166.724,130.916,44.386
583493,100547107,US,True,fastfood,US,786,1222,Krispy Kreme #122012,1,9,...,1,333.68,66.23,2500,2016-02-20T01:01:47,False,121.738,146.301,55.508,80.071
583494,100547107,US,True,fastfood,US,786,1222,Wendys #327348,1,2,...,4,399.91,1.06,2500,2016-02-26T04:02:55,False,218.804,162.909,217.744,161.849


### Generate train and test data

In [256]:
from random import sample as random_sample
#since current sample is a df not single row, do not use train_test_split
#shuffle samples amoung all customers
shuffle_sample=[]
for cust in cust_final:
        for sample in cust:
            shuffle_sample.append(sample)
#generate sample index, train_index, test_index            
index=[i for i in range(len(shuffle_sample))]
test_len=int(len(index)*0.2)
test_index=random_sample(index,test_len)
train_index=list(set(index).difference(set(test_index)))

In [257]:
train=[shuffle_sample[i] for i in train_index]
test=[shuffle_sample[i] for i in test_index]

### Under Sampleing
### I will use under sampleing here, generate samples with balanced class from training set

In [260]:
#Create a function to generate balanced sample
from random import sample as random_sample
def balanced_sample_generator(data):
    #Shuffle samples amoung all customers
    shuffle_fraud=[]
    shuffle_norm=[]
    for sample in data:
        if sample[1]==1:
            shuffle_fraud.append(sample[0])
        else:
            shuffle_norm.append(sample[0])

    #random select normal samples to the length of fraud samples
    fraud_len=len(shuffle_fraud)
    under_norm=random_sample(shuffle_norm,fraud_len)
    print('fraud sample length is: %d'%len(shuffle_fraud))
    print('normal sample length is: %d'%len(under_norm))

    #Combine fraud and normal
    sample_final=shuffle_fraud.copy()
    sample_final.extend(under_norm)
    print('final sample length is: %d'%len(sample_final))
    return sample_final
    
train_balanced=balanced_sample_generator(train)

fraud sample length is: 216
normal sample length is: 216
final sample length is: 432


### Encoding for balanced dataset

In [262]:
with open('le.pickle','rb') as load:
    le=pickle.load(load)
with open('ohe.pickle','rb') as load:
    ohe=pickle.load(load)
with open('feature_final.pickle','rb') as load:
    feature_final=pickle.load(load)



In [271]:
feature_category=feature_final[:22]#the ahead 22 features are categorical, rest are interval
feature_category

['customerId',
 'acqCountry',
 'cardPresent',
 'merchantCategoryCode',
 'merchantCountryCode',
 'cardCVV',
 'cardLast4Digits',
 'merchantName',
 'posConditionCode',
 'posEntryMode',
 'transactionType',
 'expirationDateKeyInMatch',
 'CVVInMatch',
 'accountOpenDate_year',
 'accountOpenDate_month',
 'currentExpDate_year',
 'currentExpDate_month',
 'dateOfLastAddressChange_year',
 'dateOfLastAddressChange_month',
 'transactionDateTime_year',
 'transactionDateTime_month',
 'transactionDateTime_time']

In [297]:
feature_interval=['currentBalance','transactionAmount','creditLimit','win_5_mean','win_10_mean','dev_5','dev_10']

In [315]:
#To explain more clearly, not use pipeline here, encoding step by step
#ordinal encoding for all categorical features & combine interval features:
train_le=[np.append(le.transform(sample[feature_category]), #transform categorical features to numeric
                    np.array(sample[feature_interval]),     #combine interval features
                    axis=1) for sample in train_balanced]
#train_y is a 2D array
train_y=np.array([sample['isFraud'] for sample in train_balanced])

#onehot encoding
train_ohe=[ohe.transform(sample) for sample in train_le]

#take a look at sample
train_ohe[0]

# Dimension Deduction & StandardScaler can be also used here

<10x13682 sparse matrix of type '<class 'numpy.float64'>'
	with 287 stored elements in COOrdinate format>

In [302]:
with open('LSTM_train_X.pickle','wb') as save:
    pickle.dump(train_ohe,save)
with open('LSTM_train_y.pickle','wb') as save:
    pickle.dump(train_y,save)

### Modeling

In [316]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, TimeDistributed, Dropout
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint
n_steps = 10
n_inputs = 13682
n_neurons = 10
n_outputs = 1
keep_prob=0.9
lr=0.001
epochs=10
batch_size=300

model=Sequential()
model.add(LSTM(100,return_sequences=True,input_shape=(n_steps,n_inputs)))
model.add(LSTM(50,return_sequences=True))
model.add(LSTM(30,return_sequences=True))
model.add(TimeDistributed(Dense(1,activation='sigmoid')))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy',recall,precision])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 10, 100)           5513200   
_________________________________________________________________
lstm_5 (LSTM)                (None, 10, 50)            30200     
_________________________________________________________________
lstm_6 (LSTM)                (None, 10, 30)            9720      
_________________________________________________________________
time_distributed_2 (TimeDist (None, 10, 1)             31        
Total params: 5,553,151
Trainable params: 5,553,151
Non-trainable params: 0
_________________________________________________________________
None


In [317]:
##earlystoping
es=EarlyStopping(monitor='val_loss',mode='auto',verbose=1,patience=3)
##save best model
mc=ModelCheckpoint('lstm_best.h5',monitor='val_recall',mode='min',verbose=1,save_best_only=True)
###training
model.fit(train_ohe,train_y,epochs=epochs,validation_split=0.3,
          batch_size=batch_size,verbose=2,callbacks=[mc,es],
          ) 

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 432 arrays: [<10x13682 sparse matrix of type '<class 'numpy.float64'>'
	with 287 stored elements in COOrdinate format>, <10x13682 sparse matrix of type '<class 'numpy.float64'>'
	with 287 stored elements in COOrd...