In [159]:
from datetime import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from sklearn.preprocessing import LabelEncoder

## data loading for sequential embedding

In [2]:
%%time
sequential_data = []

with open('../data/raw/dataForSequentialEmbedding/data_train.json', 'r') as f:
    for line in f:
        sequential_data.append(json.loads(line))

CPU times: user 10.3 s, sys: 1.55 s, total: 11.9 s
Wall time: 11.9 s


- the unique key for each sample is the combination of userid and application time
    - applicants might apply more than one time (same userid, differenty application time)

In [26]:
%%time
sequential_driver = {} 

sequential_behavior = {}

for item in sequential_data:
    
    user_id = item[0]
    
    application_time = int(item[1]['order_info']['order_time'])
    
    sequential_driver.update({f"{user_id}|{application_time}" : item[1]['order_info']})
    
    sub_data = [x for x in item[1]['data'] if x['petime']<=application_time-100]
    # we only keep data occurs before application time. "-100" is not neccessary for offline data cleaning.
    ## but sometimes we use this trick for online calculation to avoid network slowdown
    
    sequential_behavior.update({f"{user_id}|{application_time}":sub_data})

CPU times: user 2.51 s, sys: 30.3 ms, total: 2.54 s
Wall time: 2.54 s


### driver understanding

In [28]:
driver = pd.DataFrame(sequential_driver).T.reset_index()
driver['user_id'] = driver['index'].apply(lambda x : x.split('|')[0])
driver['application_time'] = driver['index'].apply(lambda x : x.split('|')[1])

In [30]:
driver['new_client'].value_counts()

# we normally separate new client (0) and old client (1) because they have different data pattern and data density

0.0    121429
1.0     78571
Name: new_client, dtype: int64

In [34]:
# converting unix time to real time

driver['application_date'] = pd.to_datetime(driver['order_time'],unit='ms')

In [35]:
driver.head()

Unnamed: 0,index,overdue,new_client,order_time,label,user_id,application_time,application_date
0,56f889ee11df4a72955147cb2f29a638|1509322980000,0.0,0.0,1509323000000.0,0.0,56f889ee11df4a72955147cb2f29a638,1509322980000,2017-10-30 00:23:00
1,82ba63c78d5543b7b2fd1b44412ea954|1507609140000,1.0,0.0,1507609000000.0,0.0,82ba63c78d5543b7b2fd1b44412ea954,1507609140000,2017-10-10 04:19:00
2,d84540c274cc43b894997f633fcf47b9|1509373080000,1.0,0.0,1509373000000.0,0.0,d84540c274cc43b894997f633fcf47b9,1509373080000,2017-10-30 14:18:00
3,b8206ff0ea1f4cf4abda18d0e0145497|1507529520000,0.0,1.0,1507530000000.0,0.0,b8206ff0ea1f4cf4abda18d0e0145497,1507529520000,2017-10-09 06:12:00
4,6e5f6b151edd4d40b51ca3b75e392f8d|1506805920000,0.0,1.0,1506806000000.0,0.0,6e5f6b151edd4d40b51ca3b75e392f8d,1506805920000,2017-09-30 21:12:00


In [36]:
driver['label'].value_counts()

0.0    183986
1.0     16014
Name: label, dtype: int64

In [37]:
driver.groupby('label')['overdue'].describe()

# we only regard sample with more than 5 overdue days as bad sample (1)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,183986.0,0.311698,0.753309,0.0,0.0,0.0,0.0,5.0
1.0,16014.0,21.055264,10.136408,6.0,12.0,20.0,30.0,38.0


In [38]:
driver['user_id'].nunique()

150644

### data understanding

In [46]:
%%time
# converting data into dataframe 
## this is not neccessary during modeling phase cause we can use data_generator (presented in data usage)

behavior = []

for user_keys in sequential_behavior:
    
    user_id, application_time = user_keys.split("|")
    
    for item in sequential_behavior[user_keys]:
        
        subitem = item.copy()
        subitem.update({"user_id":user_id, "application_time":application_time})
        behavior.append(subitem)

CPU times: user 3.54 s, sys: 544 ms, total: 4.09 s
Wall time: 4.08 s


In [47]:
behavior = pd.DataFrame(behavior)

In [53]:
behavior.shape, behavior.user_id.nunique()

((5521027, 7), 117708)

In [58]:
len([x.split('|')[0] for x in sequential_behavior if len(sequential_behavior[x])==0])

# there might be several application without any behavior data due to many reasons (which can be ignored)
# data inclusion logic for keys with empty data will be presented in data usage

50084

In [60]:
behavior = behavior.sort_values(['user_id', 'application_time', 'petime'])

In [64]:
behavior['pname'].value_counts()

loan_index         2612879
personal_info       626520
id_verify           580234
contacts_info       452028
loan_submission     340359
operator            297844
bind_debit_card     253188
biometric_auto      168364
login               161418
register             22891
biometric_auth        5302
Name: pname, dtype: int64

### page view path overview

In [65]:
page_name = set(behavior['pname'].value_counts().index)

In [97]:
page_trannsition_overview = {
    f"from_{x}" : {
        f"to_{y}": 0 for y in page_name
    }
    for x in page_name
}

In [99]:
%%time
for keys in sequential_behavior:
    
    subdata = sequential_behavior[keys]
    subdata.sort(key=lambda x : x['petime'])
    
    if len(subdata)<2:
        pass
    
    else:

        prev_state = subdata[0]['pname']

        for item in subdata[1:]:
            current_state = item['pname']

            page_trannsition_overview[f"from_{prev_state}"][f"to_{current_state}"] += 1

            prev_state = item['pname']

CPU times: user 2.64 s, sys: 3.71 ms, total: 2.65 s
Wall time: 2.64 s


In [100]:
pd.DataFrame(page_trannsition_overview)

Unnamed: 0,from_biometric_auth,from_loan_submission,from_contacts_info,from_login,from_bind_debit_card,from_register,from_loan_index,from_id_verify,from_personal_info,from_operator,from_biometric_auto
to_biometric_auth,2366,0,5,41,1,1,139,2514,6,1,0
to_loan_submission,3,74836,9,87,2304,1,262420,26,11,475,0
to_contacts_info,118,0,345545,3,3,0,10193,40,96093,17,2
to_login,7,5533,335,2603,210,54,47000,1385,68,533,310
to_bind_debit_card,8,3660,14,11,133870,1,114911,7,3,687,0
to_register,0,0,0,57,1,77,30,0,0,1,0
to_loan_index,210,236415,16030,155895,115308,22732,1782315,69186,2209,162682,28680
to_id_verify,119,20,20,404,9,8,169596,408229,15,38,29
to_personal_info,2459,4,19,34,0,2,2341,1486,527990,3,92001
to_operator,6,49,89755,94,51,0,76086,11,18,131560,5


## data usage

In [41]:
le = LabelEncoder()
le.fit(['bind_credit_card',
 'bind_debit_card',
 'biometric_auto',
 'contacts_info',
 'id_verify',
 'loan_index',
 'loan_submission',
 'login',
 'operator',
 'personal_info',
 'register',
 'unknown'])

LabelEncoder()

In [125]:
## here's the logic for which data should be dropped entirely and which data should be included as missing valued data

user_id_with_data = set([x.split("|")[0] for x in sequential_behavior if len(sequential_behavior[x])!=0])

user_id_without_data = set([x.split("|")[0] for x in sequential_behavior if len(sequential_behavior[x])==0])

user_id_without_data_should_keep = [x for x in user_id_without_data if x in user_id_with_data]



In [126]:
final_user_id = list(user_id_with_data) + list(set(user_id_without_data_should_keep))

In [155]:
a = sequential_behavior[list(sequential_behavior.keys())[0]]

In [166]:
pstart = [x['pstime'] for x in a]
    
pend = ([x['petime'] for x in a])

In [177]:
data_process(a)

(array([5, 5, 5, 5, 8, 8, 5, 5, 5, 6, 6, 7, 5, 5, 5, 5, 5, 5, 5, 6, 5, 6]),
 [67.298,
  10.332,
  1.877,
  10.442,
  41.694,
  38.152,
  6.247,
  32.896,
  11.789,
  10.51,
  4.067,
  12.344,
  9.378,
  4.574,
  2.113,
  2.6,
  2.391,
  46.153,
  12.243,
  13.896,
  2.084,
  3.855],
 [23.163,
  -1,
  16.459,
  0.135,
  21.678,
  0.153,
  12.238,
  24.046,
  0.13,
  50.582,
  -1,
  0.373,
  2.91,
  -1,
  8.764,
  546.56,
  20.849,
  95.426,
  0.18,
  -1,
  0.172])

In [178]:
def data_process(sequence_for_a_single_application):
    sequence_for_a_single_application.sort(key=lambda x : x['petime'])
    
    page_sequence = [x['pname'] for x in sequence_for_a_single_application]
    
    pstart = [x['pstime'] for x in sequence_for_a_single_application]
    
    pend = ([x['petime'] for x in sequence_for_a_single_application])
    
    page_stay_time = [(y-x)/1000 for x,y in zip(pstart, pend)]
    
    page_lagg_time = [(x-y)/1000 if (x-y)//1000<600 else -1 for x,y in zip(pstart[1:], pend[:-1])] 
    # calculate the duration between the end of last action and the start of current action
    # if this lag is more than 10 minutes we ignore the quantitative meaning of this value
    
    page_sequence = le.transform(page_sequence)
    
    return page_sequence, page_stay_time, page_lagg_time

In [200]:
# data generator body

def file_generator(data):
    '''
    create a generator for files
    '''
    i = -1
    while True:
        i+=1
        if i<len(data):
            yield list(set(data[i].keys()))
        else:
            i=-1

def data_generator(original_data, batch_size = 32):
    if type(original_data)!=list:
        file_gen = file_generator([original_data])
    else:
        file_gen = file_generator(original_data)
    data_keys = next(file_gen)
    while True:
        total_batch = len(data_keys)//batch_size
        if total_batch < len(data_keys)/batch_size:
            total_batch+=1
        n_batch = 0
        while n_batch<total_batch:
            b_sequence = []
            p_staytime_sequence = []
            p_lag_sequence = []
            start = n_batch*batch_size
            end = (n_batch+1)*batch_size
            for keys in data_keys[start:end]:
                page_sequence, page_stay_time, page_lagg_time = data_process(original_data[keys])
                b_sequence.append(page_sequence)
                p_staytime_sequence.append(page_stay_time)
                p_lag_sequence.append(page_lagg_time)
            n_batch +=1
            yield b_sequence,p_staytime_sequence,p_lag_sequence
        data_keys = next(file_gen)

In [201]:
a = data_generator(sequential_behavior, batch_size = 1) # iteratively read and process data from raw sequential_behavior
# batch_size controls how much data it process each iteration

In [202]:
next(a) # these three vectors are what we are studying previously

# first, the sequence of all the page
# second, the sequence of all the page stay time
# third, the sequence of all the page transition time

([array([10,  5,  4,  4,  4,  4,  4,  4,  4,  4,  5,  4,  4,  4,  5,  5,  4,
          5,  4,  4,  4,  2,  9,  9,  9,  9,  9,  9,  9,  3,  3,  3,  3,  3,
          3,  3,  5,  3,  3,  3,  3,  8,  5,  5,  5,  5,  5,  5,  1,  1,  5,
          6,  5,  5,  5,  5])],
 [[29.671,
   10.822,
   34.145,
   0.835,
   1.639,
   4.874,
   7.986,
   6.582,
   6.566,
   1.909,
   2.179,
   2.112,
   4.68,
   13.065,
   6.679,
   1.942,
   5.177,
   2.413,
   6.79,
   4.339,
   1.583,
   18.637,
   6.894,
   1.052,
   1.05,
   1.582,
   1.091,
   1.319,
   2.192,
   0.056,
   5.667,
   2.181,
   5.288,
   2.636,
   1.778,
   5.393,
   2.001,
   0.154,
   4.81,
   4.42,
   1.663,
   17.377,
   10.796,
   2.136,
   5.758,
   4.335,
   6.326,
   3.216,
   23.61,
   0.308,
   3.809,
   12.566,
   15.64,
   3.02,
   5.198,
   14.566]],
 [[0.647,
   2.252,
   23.021,
   0.537,
   13.204,
   14.492,
   3.224,
   15.113,
   1.58,
   0.099,
   7.015,
   18.934,
   7.619,
   20.357,
   33.488,
   1.627,
   0.0