In [1]:
import pandas as pd
import numpy as np
import json
from collections import Counter
from datetime import datetime

In [2]:
%%time
graph_data = []
with open('../data/raw/dataForGraph/dataForGraph.txt', 'r') as f:
    for line in f:
        graph_data.append(json.loads(line))

CPU times: user 46.7 s, sys: 9.13 s, total: 55.8 s
Wall time: 55.9 s


In [3]:
graph_data[0].keys()

dict_keys(['apply_info', 'device_info', 'contacts_info', 'calls_info'])

# construct relations

## Constructing bi-parties graph

- Node Set
    - Center Node - user_id
    - Info Node
        - application (need to generate from data)
        - device_id
        - imsi
        - phone
        - mac
        - idfv
        - idfa
- Edges Set
    - user_id -[application]-> application
    - user_id -[apply_device_id]-> device_id
    - user_id -[device]-> device_id
    - user_id -[device_imsi]-> imsi
    - user_id -[device_mac]-> mac
    - user_id -[device_idfv]-> idfv
    - user_id -[device_idfa]-> idfa
    - user_id -[device_phone]-> phone
    - user_id -[contacts]-> phone
    - user_id -[call_in]-> phone
    - user_id -[call_out]-> phone
    
- Node properties
    - user_id [is_new_client/num_of_applications]
    - application [is_reject/is_overdue/overdue_day/application_time]
- Edge properties
    - apply_device_id [update_time]
    - device_id [update_time]
    - device_imsi [update_time]
    - device_mac [update_time]
    - device_idfv [update_time]
    - device_idfa [update_time]
    - device_phone [update_time]
    - contacts [insert_time/update_time]
    - call_in / call_out
        - the properties for call_in/call_out edges are the important properties to measure social behavior weights in this graph
        - update time
        - last_{n}d_call_cnt : n=7/14/21/28 (flexible options) 
        - last_{n}d_call_duration : n=7/14/21/28 (flexible options) 
        - last_{n}d_call_average_duration : n=7/14/21/28 (flexible options) 
        - etc.

In [80]:
test_data = graph_data[0:10] # using ten samples as an example

userid_node_set = [] # node type, node value, is_new_client, num_of_applications
application_node_set = [] # node type, node value, is_reject, is_overdue, overdue_day
device_info_node_set = [] # node type, node value, update time
phone_node_set = [] # node type, node value, update time

apply_info_edge_set = [] # edge type, $START, $END, updatetime
contacts_edge_set = [] # edge type, $START, $END,insert time, update time
calls_edge_set = [] # edge type, $START, $END, update time, last_15d_call_cnt, last_15d_call_duration_sum, 
                    # last15d_call_duration_avg

In [81]:
def get_feature(df_series):
    # just a sample for feature extraction
    cnt = df_series.shape[0]
    summation = df_series['duration'].astype(int).sum()
    avg = df_series['duration'].astype(int).mean()
    return cnt, summation, avg

In [82]:
for item in test_data:
    application = item['apply_info']
    device = item['device_info']
    contacts = item['contacts_info']
    callLogs = [x for x in item['calls_info'] if 'num_md5' in x] 
    # there might be some errors that calling numbers are missing in the data
    
    user_id = application['apply_user_id']
    application_no = "|".join([application['apply_user_id'], application['apply_time']])
    apply_device = application['apply_device_id']
    apply_phone = device['phone']
    apply_time = int(datetime.timestamp(datetime.strptime('2017-10-29 21:12:11', '%Y-%m-%d %H:%M:%S'))*1000)
    
    userid_node_set.append(('userid', application['apply_user_id'],
                            application['is_new_client'],application['num_of_applications']))
    application_node_set.append(('application', application_no, application['is_reject'],
                            application['is_overdue'], application['overdue_day']))
    apply_info_edge_set.append(('application', user_id, application_no, apply_time))
    
    
    # organize device id relations
    apply_info_edge_set.append(('apply_device_id', user_id, apply_device, apply_time))
    device_id_set = set([apply_device] + device['device_id'])
    for did in device_id_set:
        if did!=apply_device:
            apply_info_edge_set.append((f'device_{did}', user_id, did, apply_time))
        device_info_node_set.append(('device_id', did, apply_time))
 
    # organize other device relations
    for dtype in ['imsi', 'mac', 'idfv', 'idfa']:
        device_info_node_set.extend([(dtype, x, apply_time) for x in device[dtype]])
        apply_info_edge_set.extend([(f'device_{dtype}', user_id, x, apply_time) for x in device[dtype]])
    
    # organize device phone number relations
    phone_node_set.extend([('phone', x, apply_time) for x in apply_phone])
    apply_info_edge_set.extend([('device_apply_phone', user_id, x, apply_time) for x in apply_phone])
    
    # organize contact relations
    phone_node_set.extend([('phone', x, apply_time) for x in contacts])
    contacts_edge_set.extend([('contacts', user_id, x, contacts[x][0], contacts[x][1]) for x in contacts])
    
    # organize call Logs relations
    phone_node_set.extend([('phone', x['num_md5'], apply_time) for x in callLogs])
    call_in_relation = [x for x in callLogs if x['type']=='1']
    call_out_relation = [x for x in callLogs if x['type']=='2']
    ## call_in relation
    if len(call_in_relation)==0:
        pass
    else:
        feature_dict = pd.DataFrame(call_in_relation)
        feature_dict = feature_dict[(feature_dict.date.astype(int)>=apply_time-15*24*60*60*1000)&\
                                    (feature_dict.date.astype(int)<apply_time)].groupby('num_md5').apply(get_feature).to_dict()
        
        calls_edge_set.extend([('call_in', user_id, x, 
                                feature_dict.get(x,[0,np.nan,np.nan])[0], 
                                feature_dict.get(x,[0,np.nan,np.nan])[1],
                                feature_dict.get(x,[0,np.nan,np.nan])[2], 
                                apply_time)
                               for x in set([n['num_md5'] for n in call_in_relation])])
    ## call_out relation
    
    if len(call_out_relation)==0:
        pass
    else:
        feature_dict = pd.DataFrame(call_out_relation)
        feature_dict = feature_dict[(feature_dict.date.astype(int)>=apply_time-15*24*60*60*1000)&\
                                    (feature_dict.date.astype(int)<apply_time)].groupby('num_md5').apply(get_feature).to_dict()
        
        calls_edge_set.extend([('call_out', user_id, x, 
                                feature_dict.get(x,[0,np.nan,np.nan])[0], 
                                feature_dict.get(x,[0,np.nan,np.nan])[1],
                                feature_dict.get(x,[0,np.nan,np.nan])[2], 
                                apply_time)
                               for x in set([n['num_md5'] for n in call_out_relation])])
    
    

In [83]:
%%time
userid_node_set =  set(userid_node_set) # node type, node value, is_new_client, num_of_applications
application_node_set =  set(application_node_set) # node type, node value, is_reject, is_overdue, overdue_day
device_info_node_set =  set(device_info_node_set) # node type, node value, update time
phone_node_set =  set(phone_node_set) # node type, node value, update time

apply_info_edge_set =  set(apply_info_edge_set) # edge type, $START, $END, updatetime
contacts_edge_set =  set(contacts_edge_set) # edge type, $START, $END,insert time, update time
calls_edge_set = set(calls_edge_set) # edge type, $START, $END, update time, last_15d_call_cnt, last_15d_call_duration_sum, 


CPU times: user 6.55 ms, sys: 319 µs, total: 6.87 ms
Wall time: 6.92 ms


In [84]:
len(contacts_edge_set),len(apply_info_edge_set), len(calls_edge_set)

(1643, 49, 1982)

In [85]:
len(phone_node_set)

3109

In [86]:
import networkx as nx

In [87]:
graph = nx.Graph()

In [88]:
for item in userid_node_set:
    graph.add_node(item[1], node_type=item[0], is_new_client=item[2],num_of_applications=item[3] )
    
for item in application_node_set:
    graph.add_node(item[1], node_type=item[0],is_reject =item[2], is_overdue=item[3], overdue_day=item[4])
    
for item in set(list(device_info_node_set)+list(phone_node_set)):
    graph.add_node(item[1], node_type=item[0],update_time=item[2])

In [89]:
for item in apply_info_edge_set:
    graph.add_edge(item[1], item[2], edge_type=item[0], update_time=item[3])
    
for item in contacts_edge_set:
    graph.add_edge(item[1], item[2], edge_type=item[0], insert_time=item[3], update_time=item[4])
    
for item in calls_edge_set:
    graph.add_edge(item[1], item[2], edge_type=item[0], call_cnt=item[3], call_sum=item[4], call_avg=item[5],
                   update_time=item[6])

In [90]:
graph.number_of_edges()

2852

In [91]:
graph.number_of_nodes()

3158

In [92]:
graph.node['74ffce7e86634f9fba64055759e2a5a2']

{'node_type': 'phone', 'update_time': 1509282731000}

In [95]:
nx.degree(graph)

DegreeView({'7432f6cc97ca4de79bbbb29378e48e3f': 716, 'c4a91252cdbf494fb0a80e6da6097132': 122, '25f61ac95a464bc79387a7f83f1a3d20': 453, '029b50315c72439480d6d7d5a200f96c': 346, '8c888c27711646cc96ac4c7462736b18': 298, 'e3ba997de423487f827f9f21934174bd': 143, 'fcec99b494fb45889408448d5d85866a': 67, '61c243d23e5e4dac8a8d3686924a1572': 253, '08b9ef5bd9bb4a208c8364ef3bb662e7': 287, '381d66e67b9d4ef0aea850a37d744f4a': 167, '8c888c27711646cc96ac4c7462736b18|2017-10-29 03:09:42': 1, '7432f6cc97ca4de79bbbb29378e48e3f|2017-10-29 03:12:04': 1, '25f61ac95a464bc79387a7f83f1a3d20|2017-10-29 03:12:00': 1, 'e3ba997de423487f827f9f21934174bd|2017-10-29 21:12:11': 1, '381d66e67b9d4ef0aea850a37d744f4a|2017-10-29 03:13:31': 1, '61c243d23e5e4dac8a8d3686924a1572|2017-10-29 21:11:58': 1, 'fcec99b494fb45889408448d5d85866a|2017-10-29 03:12:33': 1, '08b9ef5bd9bb4a208c8364ef3bb662e7|2017-10-29 21:11:44': 1, 'c4a91252cdbf494fb0a80e6da6097132|2017-10-29 21:11:41': 1, '029b50315c72439480d6d7d5a200f96c|2017-10-29 03: