In [59]:
import networkx as nx
import numpy as np
import pandas as pd

from tqdm import tqdm

In [60]:
# Read the transaction log and remove every single-quote character from the
# string cells (a regex replacement applied across the whole frame).
transactions_df = (
    pd.read_csv('banksim_dataset/bs140513_032310.csv')
    .replace(to_replace="'", value='', regex=True)
)
transactions_df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,C1093826151,4,M,28007,M348934600,28007,es_transportation,4.55,0
1,0,C352968107,2,M,28007,M348934600,28007,es_transportation,39.68,0
2,0,C2054744914,4,F,28007,M1823072687,28007,es_transportation,26.89,0
3,0,C1760612790,3,M,28007,M348934600,28007,es_transportation,17.25,0
4,0,C757503768,5,M,28007,M348934600,28007,es_transportation,35.72,0


In [61]:
# Build a directed graph with one path per row: customer -> transaction -> merchant.
# Every row of transactions_df gets its own transaction node named 'T<row label>'.
G = nx.DiGraph()

# Columns are read by name via itertuples. The frame's column order is
# step, customer, age, gender, zipcodeOri, merchant, zipMerchant, ..., so a
# positional unpack is easy to misalign (e.g. storing zipcodeOri as `gender`).
for row in transactions_df.itertuples():
    transaction = f'T{row.Index}'
    if row.customer not in G:
        # Customer attributes come from the first row in which the customer appears.
        G.add_node(row.customer, age=row.age, gender=row.gender)
    if row.merchant not in G:
        G.add_node(row.merchant)
    # The row label is kept on the node so metrics can be mapped back to the frame.
    G.add_node(transaction, index=row.Index, weight=row.amount, category=row.category)
    G.add_edge(row.customer, transaction)
    G.add_edge(transaction, row.merchant)


In [62]:
# Make sure that all data is there: one node per distinct customer, per distinct
# merchant, and per transaction row.
# Node kinds are told apart by the leading letter of the node name; a prefix test
# is used so a letter occurring elsewhere in an id cannot cause a miscount.
assert len([n for n in G if n.startswith('C')]) == len(transactions_df['customer'].unique()), \
    'customer node count does not match the number of distinct customers'
assert len([n for n in G if n.startswith('M')]) == len(transactions_df['merchant'].unique()), \
    'merchant node count does not match the number of distinct merchants'
assert len([n for n in G if n.startswith('T')]) == len(transactions_df.index), \
    'transaction node count does not match the number of rows'

In [63]:
# Graph features computed over the whole customer/transaction/merchant graph.
degree_centralities = nx.centrality.degree_centrality(G)
page_ranks = nx.pagerank(G)
# Label propagation is run on an undirected copy of the graph. The result is
# materialised into a list because the returned iterable may be single-pass;
# a list can be traversed again when a later cell is re-run.
lpa_communities = list(nx.community.label_propagation_communities(G.to_undirected()))

In [64]:
# Attach the per-transaction graph metrics to the frame.
# Transaction nodes are those whose name starts with 'T'; each one stores the
# label of its originating row in the 'index' node attribute.
transaction_nodes = [n for n in G if n.startswith('T')]
# Wrapping the values in a Series keyed by row label makes the alignment with
# transactions_df.index explicit, instead of relying on how a raw dict is
# interpreted on column assignment.
transactions_df['degree_cent'] = pd.Series(
    {G.nodes[t]['index']: degree_centralities[t] for t in transaction_nodes}
)
transactions_df['page_rank'] = pd.Series(
    {G.nodes[t]['index']: page_ranks[t] for t in transaction_nodes}
)
transactions_df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud,degree_cent,page_rank
0,0,C1093826151,4,M,28007,M348934600,28007,es_transportation,4.55,0,3e-06,1e-06
1,0,C352968107,2,M,28007,M348934600,28007,es_transportation,39.68,0,3e-06,1e-06
2,0,C2054744914,4,F,28007,M1823072687,28007,es_transportation,26.89,0,3e-06,1e-06
3,0,C1760612790,3,M,28007,M348934600,28007,es_transportation,17.25,0,3e-06,1e-06
4,0,C757503768,5,M,28007,M348934600,28007,es_transportation,35.72,0,3e-06,1e-06


In [65]:
# Label every transaction row with the id of the community its node belongs to.
# `communities` is filled by position; rows never visited keep the initial 0.
# NOTE(review): this assumes the row labels embedded in the 'T<label>' node names
# are also valid positions 0..len(transactions_df)-1 — confirm if the frame is
# ever re-indexed or filtered before the graph is built.
communities = np.zeros(len(transactions_df), dtype=int)
for community_id, community in enumerate(lpa_communities):
    # Keep only the transaction nodes and strip the leading 'T' to get the row label.
    transaction_rows = np.array(
        [int(node[1:]) for node in community if node.startswith('T')],
        dtype=np.intp,  # explicit integer dtype keeps an empty selection a valid index array
    )
    # A community without transaction nodes yields an empty selection (a no-op).
    communities[transaction_rows] = community_id
transactions_df['community'] = communities

In [66]:
transactions_df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud,degree_cent,page_rank,community
0,0,C1093826151,4,M,28007,M348934600,28007,es_transportation,4.55,0,3e-06,1e-06,0
1,0,C352968107,2,M,28007,M348934600,28007,es_transportation,39.68,0,3e-06,1e-06,0
2,0,C2054744914,4,F,28007,M1823072687,28007,es_transportation,26.89,0,3e-06,1e-06,1
3,0,C1760612790,3,M,28007,M348934600,28007,es_transportation,17.25,0,3e-06,1e-06,0
4,0,C757503768,5,M,28007,M348934600,28007,es_transportation,35.72,0,3e-06,1e-06,0
