In [95]:
import argparse
import networkx as nx
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [96]:
def build_graph(csv):
    """Takes the original dataset and produces a graph with customers and merchants as nodes and transactions as edges.

    Args:
        data_set (Dataframe): The transaction data, columns are
        [step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud]

    Returns:
        DiGraph: A directed graph with the above properties
    """
    df = pd.read_csv(csv)
    df.drop(columns=['step', 'age', 'gender', 'zipcodeOri', 'zipMerchant', 'category', 'fraud'], inplace=True)
    df.replace("'",'', regex=True, inplace=True) 
    train, test = train_test_split(df, random_state=42)
    G = nx.MultiGraph()

    for _, row in train.iterrows():
        customer, merchant, amount = row
        if customer not in G:
            G.add_node(customer)
        if merchant not in G:
            G.add_node(merchant)
        G.add_edge(customer, merchant, amount=amount)
    return G

In [97]:
def get_lpa_comunities_weighted(G, df, merchant = True, customer = True):
    coms = nx.community.asyn_lpa_communities(G, weight='amount')
    com_dic = {}
    for i, com in enumerate(coms):
        for label in list(com):
            com_dic[label] = i

    if merchant:
        df['lpa_merchant'] = df['merchant'].apply(lambda x: com_dic.get(x))

    if customer:
        df['lpa_customer'] = df['customer'].apply(lambda x: com_dic.get(x))

    return df

In [98]:
def get_degree_centrality(G, df, merchant = True, customer = True):
    degs = nx.degree_centrality(G)
    
    if merchant:
        df['degree_centrality_merchant'] = df['merchant'].apply(lambda x: degs.get(x))

    if customer:
        df['degree_centrality_customer'] = df['customer'].apply(lambda x: degs.get(x))

    return df

In [99]:
def get_page_rank_unweighted(G, df, merchant = True, customer = True):
    ranks = nx.pagerank(G)
    
    if merchant:
        df['page_rank_merchant'] = df['merchant'].apply(lambda x: ranks.get(x))

    if customer:
        df['page_rank_customer'] = df['customer'].apply(lambda x: ranks.get(x))

    return df

In [100]:
G = build_graph('../dataset_makers/original_data.csv')
df = pd.read_csv('../dataset_makers/original_data.csv')
df.drop(columns=['age', 'gender', 'zipcodeOri', 'zipMerchant', 'category', 'fraud'], inplace=True)
df.replace("'",'', regex=True, inplace=True) 

In [101]:
df = get_lpa_comunities_weighted(G, df)
df = get_degree_centrality(G, df)
df = get_page_rank_unweighted(G, df)
df.drop(columns=['customer', 'merchant', 'amount'], inplace=True)
df.columns

Index(['step', 'lpa_merchant', 'lpa_customer', 'degree_centrality_merchant',
       'degree_centrality_customer', 'page_rank_merchant',
       'page_rank_customer'],
      dtype='object')

In [102]:
df.head()

Unnamed: 0,step,lpa_merchant,lpa_customer,degree_centrality_merchant,degree_centrality_customer,page_rank_merchant,page_rank_customer
0,0,2,2,37.038933,0.028359,0.159634,0.000141
1,0,2,0,37.038933,0.031002,0.159634,0.000148
2,0,0,0,54.001442,0.012016,0.222589,8.2e-05
3,0,2,0,37.038933,0.02908,0.159634,0.00014
4,0,2,0,37.038933,0.02908,0.159634,0.000143


In [104]:
df2 = pd.read_csv('./original_data.csv')
df2.replace("'",'', regex=True, inplace=True) 
df2 = pd.concat([df2, df])

ValueError: columns overlap but no suffix specified: Index(['step'], dtype='object')