In [2]:
from datetime import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.sequence import pad_sequences

In [3]:
# Fixed vocabulary of page names seen in the behaviour logs.
# 'unknown' is part of the vocabulary so it gets its own integer code.
page_names = [
    'bind_debit_card',
    'biometric_auto',
    'contacts_info',
    'id_verify',
    'loan_index',
    'loan_submission',
    'login',
    'operator',
    'personal_info',
    'register',
    'unknown',
]

# Module-level encoder used later to turn page names into integer codes.
le = LabelEncoder()
le.fit(page_names)

LabelEncoder()

In [4]:
# data generator body

def file_generator(filename):
    '''
    Cycle endlessly over a sequence of file names.

    Parameters
    ----------
    filename : sequence
        The file names to cycle through, in order. Despite the singular
        name this is a sequence (the caller passes a list).

    Yields
    ------
    The items of `filename` in order; after the last one the cycle
    restarts from the first item.

    Raises
    ------
    ValueError
        If `filename` is empty. Without this check the loop below would
        spin forever without ever yielding.
    '''
    if len(filename) == 0:
        raise ValueError('file_generator needs at least one file name')
    while True:
        for name in filename:
            yield name
            
def data_generator(file_list, batch_size = 32, total_batch = 1000):
    '''
    Endlessly yield batches of padded page sequences with their labels.

    Every line of an input file is parsed with ``json.loads`` and is read as
    ``[user_id, {'order_info': {'order_time': ..., 'label': ...}, 'data': [...]}]``.
    The ``'data'`` entry is handed to ``data_process``.

    Parameters
    ----------
    file_list : str or list/tuple of str
        One file path, or several paths that are cycled through forever.
    batch_size : int
        Maximum number of lines (applications) per yielded batch.
    total_batch : int
        Maximum number of batches taken from a file before moving on to the
        next file in `file_list`.

    Yields
    ------
    (b_sequence, labels)
        `b_sequence` is the output of ``pad_sequences(..., maxlen=20,
        dtype='int64', padding='pre')`` on the encoded page sequences;
        `labels` is the list of ``order_info['label']`` values, one per row.
        The last batch of a file may hold fewer than `batch_size` rows.

    Raises
    ------
    ValueError
        If `batch_size` or `total_batch` is smaller than 1, or if a file
        contains no lines at all (which would otherwise loop forever).

    Notes
    -----
    The previous version never reached its "open the next file" statements
    (they followed an unconditional ``while True``), never closed the file,
    and let ``next(f)`` raise ``StopIteration`` at end of file, which ends the
    generator silently on Python 3.6 and is a ``RuntimeError`` from 3.7 on.
    A file is now left either at end of file or after `total_batch` batches,
    and is re-read from its beginning the next time its turn comes.
    '''
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    if total_batch < 1:
        raise ValueError('total_batch must be at least 1')

    if isinstance(file_list, (list, tuple)):
        file_gen = file_generator(list(file_list))
    else:
        file_gen = file_generator([file_list])

    while True:
        file_name = next(file_gen)
        # `with` guarantees the handle is closed when we move to the next
        # file or when the generator itself is closed / garbage collected.
        with open(file_name, 'r') as f:
            n_batch = 0
            while n_batch < total_batch:
                b_sequence = []
                p_staytime_sequence = []
                p_lag_sequence = []
                info_sequence = []

                while len(info_sequence) < batch_size:
                    line = f.readline()
                    if not line:
                        # readline() returns '' only at end of file
                        break
                    data_body = json.loads(line)
                    user_id = data_body[0]
                    order_info = data_body[1]['order_info']
                    apply_time, label = order_info['order_time'], order_info['label']
                    page_sequence, page_stay_time, page_lagg_time = data_process(data_body[1]['data'])
                    b_sequence.append(page_sequence)
                    p_staytime_sequence.append(page_stay_time)
                    p_lag_sequence.append(page_lagg_time)
                    info_sequence.append([user_id, label, apply_time])

                if not info_sequence:
                    if n_batch == 0:
                        # An empty file would make the outer loop spin forever.
                        raise ValueError('no records found in ' + str(file_name))
                    break  # file exhausted exactly on a batch boundary

                n_batch += 1
                b_sequence = pad_sequences(b_sequence, maxlen=20, dtype='int64', padding='pre')
                # Stay-time and lag sequences are padded but not yielded yet;
                # they are kept so they can be added to the yield below.
                # NOTE(review): dtype='int64' truncates the fractional seconds.
                p_staytime_sequence = pad_sequences(p_staytime_sequence, maxlen=20, dtype='int64', padding='pre')
                p_lag_sequence = pad_sequences(p_lag_sequence, maxlen=20, dtype='int64', padding='pre')
                yield b_sequence, [x[1] for x in info_sequence]
                     # p_staytime_sequence,\
                     # p_lag_sequence,info_sequence

                if len(info_sequence) < batch_size:
                    break  # short batch means end of file was reached

In [5]:
def data_process(sequence_for_a_single_application):
    '''
    Turn the page-view events of one application into model inputs.

    Parameters
    ----------
    sequence_for_a_single_application : iterable of dict
        Each event must provide the keys 'pname' (page name), 'pstime'
        (page start time) and 'petime' (page end time).
        Timestamps are presumably in milliseconds: the code divides by 1000
        and the original comment equates the 600 threshold with 10 minutes.

    Returns
    -------
    page_sequence
        Page names encoded by the module-level `le`, ordered by 'petime'.
    page_stay_time : list
        (petime - pstime) / 1000 for every event.
    page_lagg_time : list
        For each event after the first, (its pstime - previous petime) / 1000,
        or -1 when that gap floor-divided by 1000 is 600 or more.
        This list is one element shorter than the other two.

    The input is no longer sorted in place; the caller's list is left
    untouched.
    '''
    # sorted() returns a new list, so the argument is not mutated
    events = sorted(sequence_for_a_single_application, key=lambda x : x['petime'])

    page_sequence = [x['pname'] for x in events]
    pstart = [x['pstime'] for x in events]
    pend = [x['petime'] for x in events]

    page_stay_time = [(y-x)/1000 for x,y in zip(pstart, pend)]

    # Gap between the end of the previous action and the start of the
    # current one. Gaps of 10 minutes or more are replaced by the sentinel -1
    # because their exact size is not considered meaningful.
    page_lagg_time = [(x-y)/1000 if (x-y)//1000<600 else -1 for x,y in zip(pstart[1:], pend[:-1])]

    page_sequence = le.transform(page_sequence)
    return page_sequence, page_stay_time, page_lagg_time

In [6]:
# Lazily read and process the raw sequential-behaviour training file.
# batch_size controls how many records are processed per iteration.
a = data_generator(
    ['../data/raw/dataForSequentialEmbedding/data_train.json'],
    batch_size=16,
)

In [7]:
# Pull a single batch from the generator to inspect what it yields.
test = next(a)

In [8]:
test

(array([[4, 4, 4, 5, 5, 6, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 6, 4, 4, 5],
        [4, 5, 4, 4, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 4],
        [4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 7, 7, 4, 4, 4, 5],
        [1, 1, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 7, 7, 4, 0, 0, 4, 5],
        [8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 7, 7, 4, 4, 0, 0, 4, 5, 4, 4],
        [6, 4, 4, 4, 5, 4, 4, 5, 4, 4, 4, 5, 5, 4, 5, 4, 5, 4, 5, 4],
        [2, 2, 2, 7, 4, 4, 4, 4, 0, 0, 4, 5, 4, 4, 5, 4, 4, 5, 5, 4],
        [4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 7, 4, 4, 5, 4, 4, 4, 5, 5, 4],
        [8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 7, 4, 4, 0, 0, 0, 4, 5],
        [4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5],
        [1, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 7, 4, 4, 0, 0, 4, 5, 4],
        [5, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4],
        [4, 5, 5, 4, 4, 4, 7, 4, 4, 5, 4, 4, 4, 5, 5, 4, 4, 4, 5, 4],
        [0, 0, 3, 3, 8, 8, 2, 2, 7, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4],
        [4, 4, 4, 4,

## mtf trial

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout,Reshape,Input
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers import Conv1D,Conv2D, Reshape
from keras.layers import MaxPooling1D, Embedding,LSTM, MaxPooling2D, Bidirectional, SimpleRNN, UpSampling1D
from keras.models import Model,Sequential
import sys
from keras import optimizers
from keras.layers.advanced_activations import PReLU
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from sklearn.preprocessing import StandardScaler

In [10]:
# Load the "mtf" table (placeholder file for now).
# The matrix-building code below looks rows up by the label `to_<page name>`
# and copies each row's values into an 11-wide matrix row.
# NOTE(review): column order is assumed to match le.classes_ — confirm.
mtf = pd.read_parquet('mtf.parquet')

In [11]:
# Build the initial embedding matrix: one row per encoded page label.
mtf_matrix = np.zeros(shape=(11,11))

for n, item in enumerate(le.classes_):
    # 'unknown' has no entry in the mtf table; its row stays all zeros
    if item == 'unknown':
        continue
    mtf_matrix[n,:] = mtf.loc[f'to_{item}'].values
    print(n, item)

# Standardise every column before using the matrix as embedding weights.
mtf_matrix = StandardScaler().fit_transform(mtf_matrix)

0 bind_debit_card
1 biometric_auto
2 contacts_info
3 id_verify
4 loan_index
5 loan_submission
6 login
7 operator
8 personal_info
9 register


In [12]:
# Embedding over the 11 page codes, initialised from mtf_matrix and
# left trainable so the weights can be refined during training.
embedding_layer = Embedding(
    input_dim=11,
    output_dim=11,
    input_length=20,
    weights=[mtf_matrix],
    trainable=True,
)

W1110 20:05:24.062738 140605509134144 deprecation_wrapper.py:119] From /home/craiditx/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [13]:
input_tensor = Input(shape = (20,)) # sequence with 20 sequential behavior
# Identity model around the input so it can be added as the first entry
# of the Sequential model below.
input_layer = Model(input_tensor, input_tensor)

embedding_model = Sequential()
embedding_model.add(input_layer)
embedding_model.add(embedding_layer) # embedded behavior. (None, n_timestamp, n_dim_embedding)

# Add a channel axis so the (timestep, embedding) grid can go through Conv2D.
embedding_model.add(Reshape((20,11,1)))
embedding_model.add(Conv2D(16, (2,2), padding='same'))
embedding_model.add(Conv2D(8, (2,2), padding='same'))
embedding_model.add(Conv2D(4, (2,2), padding='same'))
embedding_model.add(Conv2D(1, (2,2), padding='same'))

# Drop the channel axis again, keeping (n_timestamp, n_dim_embedding).
# Reshape((11,20)) was used here before, but Reshape does not transpose:
# it re-reads the same values row by row, so every "timestep" fed to the
# LSTM mixed values from different timesteps and embedding dimensions.
# NOTE(review): if iterating over the embedding axis was intended, add
# Permute((2,1)) after this layer instead of reshaping to (11,20).
embedding_model.add(Reshape((20,11)))

embedding_model.add(LSTM(20,return_sequences=True))
embedding_model.add(LSTM(16,return_sequences=True))
embedding_model.add(LSTM(8,return_sequences=False))

embedding_model.summary()

W1110 20:05:24.246940 140605509134144 deprecation_wrapper.py:119] From /home/craiditx/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1110 20:05:24.256791 140605509134144 deprecation_wrapper.py:119] From /home/craiditx/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1110 20:05:24.270979 140605509134144 deprecation_wrapper.py:119] From /home/craiditx/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W1110 20:05:24.272225 140605509134144 deprecation_wrapper.py:119] From /home/craiditx/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
model_1 (Model)              (None, 20)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 20, 11)            121       
_________________________________________________________________
reshape_1 (Reshape)          (None, 20, 11, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 20, 11, 16)        80        
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 20, 11, 8)         520       
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 20, 11, 4)         132       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 20, 11, 1)         17        
__________

In [None]:
# After the feature-extracting layers you can add any other kind of layer.
# Normally, after this point, we combine the page stay time and lag sequences
# as weights for an attention layer, or feed them into a merged layer.

# Trick 1: you can mask all the '-1' values in the lag sequence.
# Trick 2: try the Keras functional (Model) API instead of the Sequential model, because the state carried by the LSTM can sometimes be useful.