In [5]:
import json
from collections import Counter

import numpy as np
import networkx as nx
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram
from IPython.display import display

# Preprocess raw data

In [6]:
with open("data/MC3.json", 'r'):
    mc3 = json.load(open("data/MC3.json", 'r'))

In [9]:
nodes = [
    {
        'id': node['id'][0],
        'country': node['country'][0],
        'product_services': node['product_services'][0] if len(node['product_services']) > 0 else "Unknown",
        'revenue_omu': node['revenue_omu'][0] if 'revenue_omu' in node else "None",
        'type': node['type'][0]
    }
    for node in mc3['nodes']
]
links = [
    {
        'source': src,
        'target': link['target'][0],
        'type': link['type'][0],
    }
    for link in mc3['links']
    for src in link['source']
]
df_nodes = pd.DataFrame(nodes)
df_links = pd.DataFrame(links)

In [8]:
display(df_nodes)
df_links

Unnamed: 0,id,country,product_services,revenue_omu,type
0,Jones LLC,ZH,Automobiles,310612303.447,Company
1,"Coleman, Hall and Lopez",ZH,"Passenger cars, trucks, vans, and buses",162734683.9969,Company
2,Aqua Advancements Sashimi SE Express,Oceanus,Holding firm whose subsidiaries are engaged in...,115004666.6728,Company
3,Makumba Ltd. Liability Co,Utoporiana,"Car service, car parts and accessories, automo...",90986412.5191,Company
4,"Taylor, Taylor and Farrell",ZH,Fully electric vehicles (EVs) and electric veh...,81466666.6728,Company
...,...,...,...,...,...
27617,"Johnson, Lee and Rodriguez",ZH,Unknown,,Company Contacts
27618,"Bowman, Rollins and Griffin",ZH,Unknown,,Company Contacts
27619,Hardin Group,ZH,Unknown,,Company Contacts
27620,"Crane, Joyce and Jennings",ZH,Unknown,,Company Contacts


Unnamed: 0,source,target,type
0,Lake Chad Catchers Limited Liability Company ...,Erin Flores,Beneficial Owner
1,Lake Chad Catchers Limited Liability Company ...,Linda Lee,Beneficial Owner
2,Lake Chad Catchers Limited Liability Company ...,Sharon Coleman,Beneficial Owner
3,Lake Chad Catchers Limited Liability Company ...,John Rivera,Beneficial Owner
4,Lake Chad Catchers Limited Liability Company ...,Stephen Carpenter,Beneficial Owner
...,...,...,...
27166,Martinez-Diaz,Mark Thomas,Company Contacts
27167,Garcia Inc,Christopher Wagner,Company Contacts
27168,"Potter, Williams and Peters",Mr. Troy Shields,Company Contacts
27169,Tucker-Willis,Robert James,Company Contacts


#### Merge Duplicated Links

In [10]:
df_unique_links = df_links.drop_duplicates(subset=['source', 'target'])
df_unique_links['weight'] = df_unique_links.apply(lambda x: len(df_links[(df_links['source'] == x['source']) & (df_links['target'] == x['target'])]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique_links['weight'] = df_unique_links.apply(lambda x: len(df_links[(df_links['source'] == x['source']) & (df_links['target'] == x['target'])]), axis=1)


#### Merge Duplicated Nodes

In [11]:
# merge duplicated nodes
nodes_id_count = Counter(df_nodes['id'])
unique_nodes_id = list(nodes_id_count.keys())
unique_nodes = [
    {   
        "id": unique_node,
        "country": df_nodes[df_nodes['id'] == unique_node]['country'].values[0],
        "product_services": list(set(df_nodes[df_nodes['id'] == unique_node]['product_services'])),
        "revenue_omu": list(set(df_nodes[df_nodes['id'] == unique_node]['revenue_omu'])),
        "type": dict(Counter(df_nodes[df_nodes['id'] == unique_node]['type']))
    }   
    for unique_node in unique_nodes_id
]
unique_nodes

[{'id': 'Jones LLC',
  'country': 'ZH',
  'product_services': ['Automobiles', 'Unknown'],
  'revenue_omu': ['None', 310612303.447],
  'type': {'Company': 1, 'Beneficial Owner': 5, 'Company Contacts': 5}},
 {'id': 'Coleman, Hall and Lopez',
  'country': 'ZH',
  'product_services': ['Passenger cars, trucks, vans, and buses'],
  'revenue_omu': [162734683.9969],
  'type': {'Company': 1}},
 {'id': 'Aqua Advancements Sashimi SE Express',
  'country': 'Oceanus',
  'product_services': ['Holding firm whose subsidiaries are engaged in the businesses of refining and chemicals, process and pollution control equipment, minerals, fertilizers, polymers and fibers, commodity trading and services, forest and consumer products, and ranching'],
  'revenue_omu': [115004666.6728],
  'type': {'Company': 1}},
 {'id': 'Makumba Ltd. Liability Co',
  'country': 'Utoporiana',
  'product_services': ['Car service, car parts and accessories, automotive technology, diagnostics for repair shops, antilock braking and 

#### Add Unknown Nodes

In [12]:
# Add unknown nodes
df_unknown_src = df_links[~df_links['source'].isin(df_nodes['id'])]
unknown_src_list = list(set(df_unknown_src['source']))
target_list = list(set(df_links['target']))
unknown_nodes = [
    {
        "id": node,
        "country": "Unknown",
        "product_services": ["Unknown"],
        "revenue_omu": ["Unknown"],
        "type": {"Unknown":1}
    }
    for node in unknown_src_list
] + [
    {
        "id": node,
        "country": "Unknown",
        "product_services": ["Unknown"],
        "revenue_omu": ["Unknown"],
        "type" : dict(Counter(df_links[df_links['target'] == node]['type']))
    }
    for node in target_list
]
unknown_nodes

[{'id': 'Bradford, Franklin and Mcmahon',
  'country': 'Unknown',
  'product_services': ['Unknown'],
  'revenue_omu': ['Unknown'],
  'type': {'Unknown': 1}},
 {'id': 'Graham, Ward and Schmidt',
  'country': 'Unknown',
  'product_services': ['Unknown'],
  'revenue_omu': ['Unknown'],
  'type': {'Unknown': 1}},
 {'id': 'Jones, Owens and Sullivan',
  'country': 'Unknown',
  'product_services': ['Unknown'],
  'revenue_omu': ['Unknown'],
  'type': {'Unknown': 1}},
 {'id': 'Gonzalez-Adams',
  'country': 'Unknown',
  'product_services': ['Unknown'],
  'revenue_omu': ['Unknown'],
  'type': {'Unknown': 1}},
 {'id': 'Martin-Ayala',
  'country': 'Unknown',
  'product_services': ['Unknown'],
  'revenue_omu': ['Unknown'],
  'type': {'Unknown': 1}},
 {'id': 'Dominguez, Willis and White',
  'country': 'Unknown',
  'product_services': ['Unknown'],
  'revenue_omu': ['Unknown'],
  'type': {'Unknown': 1}},
 {'id': 'Bryan-Odonnell',
  'country': 'Unknown',
  'product_services': ['Unknown'],
  'revenue_omu'

In [13]:
df_all_unique_nodes = pd.DataFrame(unique_nodes + unknown_nodes)
df_all_unique_nodes

Unnamed: 0,id,country,product_services,revenue_omu,type
0,Jones LLC,ZH,"[Automobiles, Unknown]","[None, 310612303.447]","{'Company': 1, 'Beneficial Owner': 5, 'Company..."
1,"Coleman, Hall and Lopez",ZH,"[Passenger cars, trucks, vans, and buses]",[162734683.9969],{'Company': 1}
2,Aqua Advancements Sashimi SE Express,Oceanus,[Holding firm whose subsidiaries are engaged i...,[115004666.6728],{'Company': 1}
3,Makumba Ltd. Liability Co,Utoporiana,"[Car service, car parts and accessories, autom...",[90986412.5191],{'Company': 1}
4,"Taylor, Taylor and Farrell",ZH,[Fully electric vehicles (EVs) and electric ve...,[81466666.6728],{'Company': 1}
...,...,...,...,...,...
51726,Robin Bridges,Unknown,[Unknown],[Unknown],{'Beneficial Owner': 1}
51727,John Robbins,Unknown,[Unknown],[Unknown],{'Company Contacts': 1}
51728,Blake Baker,Unknown,[Unknown],[Unknown],{'Beneficial Owner': 1}
51729,Danielle Kelly,Unknown,[Unknown],[Unknown],{'Beneficial Owner': 1}


#### Add node attribute

In [14]:
src_list = list(df_links['source'])
trgt_list = list(df_links['target'])
df_all_unique_nodes['type_length'] = df_all_unique_nodes['type'].apply(lambda x: sum(list(x.values())))
df_all_unique_nodes['is_source'] = df_all_unique_nodes['id'].apply(lambda x: 1 if x in src_list else 0)
df_all_unique_nodes['is_target'] = df_all_unique_nodes['id'].apply(lambda x: 1 if x in trgt_list else 0)
df_all_unique_nodes['total_revenue'] = df_all_unique_nodes['revenue_omu'].apply(lambda x: round(sum(list(map(lambda y: 0 if (y=="Unknown" or y=="None") else int(y), x))),2))


In [16]:
df_all_unique_nodes['neighbor_BO']=df_all_unique_nodes['id'].apply(lambda x: len(df_links[(df_links['source']==x) & (df_links['type']=='Beneficial Owner')]))
df_all_unique_nodes['neighbor_CC']=df_all_unique_nodes['id'].apply(lambda x: len(df_links[(df_links['source']==x) & (df_links['type']=='Company Contacts')]))
df_all_unique_nodes['total_rev_per_BO'] = df_all_unique_nodes['total_revenue']/df_all_unique_nodes['neighbor_BO'].where(df_all_unique_nodes['neighbor_BO'] != 0, 1)

0         62122460.6
1        162734683.0
2         57502333.0
3          7582201.0
4         81466666.0
            ...     
51726            0.0
51727            0.0
51728            0.0
51729            0.0
51730            0.0
Name: total_rev_per_BO, Length: 51731, dtype: float64

In [17]:
df_all_unique_nodes['total_rev_per_BO'] = df_all_unique_nodes['total_rev_per_BO'].round(decimals=2)

In [18]:
df_all_unique_nodes

Unnamed: 0,id,country,product_services,revenue_omu,type,type_length,is_source,is_target,total_revenue,neighbor_BO,neighbor_CC,total_rev_per_BO
0,Jones LLC,ZH,"[Automobiles, Unknown]","[None, 310612303.447]","{'Company': 1, 'Beneficial Owner': 5, 'Company...",11,1,0,310612303,5,3,62122460.6
1,"Coleman, Hall and Lopez",ZH,"[Passenger cars, trucks, vans, and buses]",[162734683.9969],{'Company': 1},1,0,0,162734683,0,0,162734683.0
2,Aqua Advancements Sashimi SE Express,Oceanus,[Holding firm whose subsidiaries are engaged i...,[115004666.6728],{'Company': 1},1,1,0,115004666,2,1,57502333.0
3,Makumba Ltd. Liability Co,Utoporiana,"[Car service, car parts and accessories, autom...",[90986412.5191],{'Company': 1},1,1,0,90986412,12,1,7582201.0
4,"Taylor, Taylor and Farrell",ZH,[Fully electric vehicles (EVs) and electric ve...,[81466666.6728],{'Company': 1},1,0,0,81466666,0,0,81466666.0
...,...,...,...,...,...,...,...,...,...,...,...,...
51726,Robin Bridges,Unknown,[Unknown],[Unknown],{'Beneficial Owner': 1},1,0,1,0,0,0,0.0
51727,John Robbins,Unknown,[Unknown],[Unknown],{'Company Contacts': 1},1,0,1,0,0,0,0.0
51728,Blake Baker,Unknown,[Unknown],[Unknown],{'Beneficial Owner': 1},1,0,1,0,0,0,0.0
51729,Danielle Kelly,Unknown,[Unknown],[Unknown],{'Beneficial Owner': 1},1,0,1,0,0,0,0.0


In [25]:
df_all_unique_nodes.to_csv("data/nodes.csv", index=False)
df_unique_links.to_csv("data/links.csv", index=False)

# Filtering Ocean Nodes

In [24]:
ocean_nodes = pd.read_csv('ocean_nodes.csv')
ocean_nodes_id = set(ocean_nodes['id'].values)
df_all_unique_nodes['is_ocean'] = df_all_unique_nodes['id'].apply(lambda x: 1 if x in ocean_nodes_id else 0)

# Generate Graph Dict

In [26]:
nodes = pd.read_csv('data/nodes.csv')
links = pd.read_csv('data/links.csv')

#### generate graph

In [27]:
connected_nodes = nodes[(nodes['is_source']==1) | (nodes['is_target']==1)]
G = nx.node_link_graph({
    'nodes': connected_nodes.to_dict('records'),
    'links': links.to_dict('records'),
    'multigraph': False,
    'graph':{},
    'directed': False
})

#### divide graph into disconnected subgraphs

In [28]:
subgraphs = list(G.subgraph(c) for c in nx.connected_components(G))
subgraphs.sort(key=len, reverse=True)

#### generate subgraphs dict

In [29]:
subgraphs_dict = [
    {
        'id': i+1,
        'graph': nx.node_link_data(g),
        'num_nodes': g.number_of_nodes(),
        'num_links': g.number_of_edges(),
        'num_ocean_nodes': sum([1 for _, d in g.nodes(data=True) if d['is_ocean'] == 1]),
        'density': round(2*sum([d['weight'] for _, _, d in g.edges(data=True)])/(g.number_of_nodes()*(g.number_of_nodes()-1)),2),
        'average_revenue': round(sum([
            r for _, d in g.nodes(data=True)
                if type(d['revenue_omu']) != float
                    for r in eval(d['revenue_omu'])
                        if type(r) != str
            ])/g.number_of_nodes(),2),
    }
    for i, g in enumerate(subgraphs)
]

In [30]:
# swap source and target which are converted in the wrong way during generating graph using networkx
for g in subgraphs_dict:
    df_nodes = pd.DataFrame(g['graph']['nodes'])
    for l in g['graph']['links']:
        if df_nodes[df_nodes['id'] == l['source']]['is_source'].values[0] == 0:
            s, t = l['source'], l['target']
            l['source'], l['target'] = t, s

In [31]:
with open('data/graphs.json', 'w') as f:
    json.dump({'graphs': subgraphs_dict}, f)

# Reorder Nodes
- reordering sources and targets by hierarchical clustering algorithm

In [32]:
def reorder_nodes(matrix, keys):
    Z = linkage(matrix, method='average', metric='euclidean')
    dn = dendrogram(Z, no_plot=True)['leaves']
    ordered_nodes = [keys[i] for i in dn]
    
    return ordered_nodes

In [33]:
with open('data/graphs.json', 'r') as f:
    graphs = json.load(f)['graphs']

In [34]:
# reorder nodes source first, target last
for g in graphs:
    nodes = g['graph']['nodes']
    links = g['graph']['links']

    sources = set([i['source'] for i in links])
    targets = set([i['target'] for i in links])
    node_list = list(sources) + list(targets)
    
    ordered_nodes = sorted(nodes, key=lambda x: node_list.index(x['id']) if x['id'] in node_list else float('inf'))
    g['graph']['nodes'] = ordered_nodes

In [35]:
# reordering nodes
for g in graphs:
    nodes = g['graph']['nodes']
    links = g['graph']['links']
    src_len = len([i for i,n in enumerate(nodes) if n['is_source'] == 1])
    
    sources = nodes[:src_len]
    targets = nodes[src_len:]
    
    order = []
    
    G = nx.node_link_graph(g['graph'])
    adj_matrix = nx.adjacency_matrix(G)
    if src_len > 3 and len(targets)>2:    
        # reorder sources
        matrix = adj_matrix.todense()[:len(sources), len(sources):].tolist()
        order = reorder_nodes(matrix, [i['id'] for i in sources])
        
        matrix = adj_matrix.todense()[len(sources):, :len(sources)].tolist()
        order = order + reorder_nodes(matrix, [i['id'] for i in targets])
        ordered_nodes = sorted(nodes, key=lambda x: order.index(x['id']) if x['id'] in order else float('inf'))
        g['graph']['nodes'] = ordered_nodes

  adj_matrix = nx.adjacency_matrix(G)


# Add similarity
- Add similarity between two sources
- Add maximum similarity attribute of the nodes

In [36]:
# add similarity between two sources
for g in graphs:
    nodes = g['graph']['nodes']
    links = g['graph']['links']
    sources = nodes[:len(set([i['source'] for i in links]))]

    for n in nodes:
        n['similarity'] = 0

    if len(sources) < 2:
        g['similarity'] = []
        continue
    
    G = nx.node_link_graph(g['graph'])
    matrix = nx.adjacency_matrix(G, weight=None).toarray()[:len(sources), len(sources):]
    similarity = []
    for i in range(len(sources)):
        for j in range(i+1, len(sources)):
            if np.count_nonzero(matrix[i] + matrix[j]) > 1:
                similarity.append([sources[i]['id'], sources[j]['id'], round(np.dot(matrix[i], matrix[j]) / np.count_nonzero(matrix[i] + matrix[j]),2)])
            else:
                similarity.append([sources[i]['id'], sources[j]['id'], 0])
            
            nodes[i]['similarity'] = similarity[-1][2] if similarity[-1][2] > nodes[i]['similarity'] else nodes[i]['similarity']
            nodes[j]['similarity'] = similarity[-1][2] if similarity[-1][2] > nodes[j]['similarity'] else nodes[j]['similarity']        
    
    g['similarity'] = similarity

  matrix = nx.adjacency_matrix(G, weight=None).toarray()[:len(sources), len(sources):]


In [37]:
with open('data/graphs.json', 'w') as f:
    json.dump({'graphs': graphs}, f)