In [10]:
import pandas as pd
import networkx as nx
import json
import csv

In [7]:
# Load the raw challenge graph. The parsed object is expected to be a dict with
# 'nodes' and 'links' lists, which is how the graph-building cell consumes it.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the read does
# not depend on the platform's locale default.
with open('data/raw/mc2_challenge_graph.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [8]:
# Directed multigraph: repeated links between the same pair of nodes are kept
# as separate parallel edges instead of overwriting each other.
G = nx.MultiDiGraph()

# Register every node under its 'id'; the whole record (including 'id') is
# stored as the node's attributes.
for node_record in data['nodes']:
    G.add_node(node_record['id'], **node_record)

# One edge per link record, directed source -> target; the whole record
# (including 'source' and 'target') is stored as the edge's attributes.
for link_record in data['links']:
    G.add_edge(link_record['source'], link_record['target'], **link_record)

In [12]:
# Materialise every (source, target, attributes) triple, then preview the first ten.
edges_list = [*G.edges(data=True)]
print(edges_list[:10])

[("AquaDelight Inc and Son's", 'BaringoAmerica Marine Ges.m.b.H.', {'arrivaldate': '2034-02-12', 'hscode': '630630', 'valueofgoods_omu': 141015.0, 'volumeteu': 0.0, 'weightkg': 4780, 'dataset': 'MC2', 'source': "AquaDelight Inc and Son's", 'target': 'BaringoAmerica Marine Ges.m.b.H.'}), ("AquaDelight Inc and Son's", 'BaringoAmerica Marine Ges.m.b.H.', {'arrivaldate': '2034-03-13', 'hscode': '630630', 'valueofgoods_omu': 141015.0, 'volumeteu': 0.0, 'weightkg': 6125, 'dataset': 'MC2', 'source': "AquaDelight Inc and Son's", 'target': 'BaringoAmerica Marine Ges.m.b.H.'}), ("AquaDelight Inc and Son's", '-15045', {'arrivaldate': '2028-02-07', 'hscode': '470710', 'volumeteu': 0.0, 'weightkg': 10855, 'dataset': 'MC2', 'source': "AquaDelight Inc and Son's", 'target': '-15045'}), ("AquaDelight Inc and Son's", '-15045', {'arrivaldate': '2028-02-23', 'hscode': '470710', 'volumeteu': 0.0, 'weightkg': 11250, 'dataset': 'MC2', 'source': "AquaDelight Inc and Son's", 'target': '-15045'}), ("AquaDelight

In [14]:
# Edge attributes exported to the CSV, in column order. Attributes missing on
# an edge (e.g. 'valueofgoods_omu') are written as empty cells.
EDGE_ATTRIBUTE_COLUMNS = [
    "arrivaldate",
    "hscode",
    "valueofgoods_omu",
    "volumeteu",
    "weightkg",
    "dataset",
    "source",
    "target",
]

# View over all (source, target, attributes) triples of the graph.
edges = G.edges(data=True)

# Write to CSV.
# - newline='' lets the csv module control line endings itself.
# - encoding='utf-8' makes the file independent of the platform's locale
#   default, so it can be read back with pandas' default UTF-8 decoding.
with open('data/processed/edges.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Header: graph endpoints first, then the per-edge attributes.
    writer.writerow(["Source", "Target", *EDGE_ATTRIBUTE_COLUMNS])

    # The rows come from a generator expression so its loop variables stay
    # local to it. A plain `for` loop would rebind notebook-level names, in
    # particular `data`, which holds the loaded JSON the graph-building cell
    # needs on re-run.
    writer.writerows(
        [u, v, *(attrs.get(column) for column in EDGE_ATTRIBUTE_COLUMNS)]
        for u, v, attrs in edges
    )

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Read the exported edge table and convert 'arrivaldate' from text to
# datetime in a single chained expression.
df = pd.read_csv(r"data/processed/edges.csv").assign(
    arrivaldate=lambda frame: pd.to_datetime(frame["arrivaldate"])
)
df

Unnamed: 0,Source,Target,arrivaldate,hscode,valueofgoods_omu,volumeteu,weightkg,dataset,source,target
0,AquaDelight Inc and Son's,BaringoAmerica Marine Ges.m.b.H.,2034-02-12,630630,141015.0,0.0,4780,MC2,AquaDelight Inc and Son's,BaringoAmerica Marine Ges.m.b.H.
1,AquaDelight Inc and Son's,BaringoAmerica Marine Ges.m.b.H.,2034-03-13,630630,141015.0,0.0,6125,MC2,AquaDelight Inc and Son's,BaringoAmerica Marine Ges.m.b.H.
2,AquaDelight Inc and Son's,-15045,2028-02-07,470710,,0.0,10855,MC2,AquaDelight Inc and Son's,-15045
3,AquaDelight Inc and Son's,-15045,2028-02-23,470710,,0.0,11250,MC2,AquaDelight Inc and Son's,-15045
4,AquaDelight Inc and Son's,-15045,2028-09-11,470710,,0.0,11165,MC2,AquaDelight Inc and Son's,-15045
...,...,...,...,...,...,...,...,...,...,...
5464373,Mar del Oeste Ltd. Corporation,Karnataka Ltd. Corporation Manatee,2034-12-25,304740,,0.0,945,MC2,Mar del Oeste Ltd. Corporation,Karnataka Ltd. Corporation Manatee
5464374,Playa del Mar OJSC,Caracola de Coral CJSC,2034-12-26,845430,,0.0,2170,MC2,Playa del Mar OJSC,Caracola de Coral CJSC
5464375,Playa del Mar OJSC,Caracola de Coral CJSC,2034-12-26,845430,,0.0,2170,MC2,Playa del Mar OJSC,Caracola de Coral CJSC
5464376,Costa del Mar NV,Selous Game Reserve Kga,2034-12-28,160414,,0.0,6710,MC2,Costa del Mar NV,Selous Game Reserve Kga


In [22]:
# Print the frame's index range, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5464378 entries, 0 to 5464377
Data columns (total 10 columns):
 #   Column            Dtype         
---  ------            -----         
 0   Source            object        
 1   Target            object        
 2   arrivaldate       datetime64[ns]
 3   hscode            int64         
 4   valueofgoods_omu  float64       
 5   volumeteu         float64       
 6   weightkg          int64         
 7   dataset           object        
 8   source            object        
 9   target            object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(5)
memory usage: 416.9+ MB


In [24]:
def _two_decimals(value):
    """Render a float with exactly two digits after the decimal point."""
    return '%.2f' % value


# Global display option: it stays in effect for every later cell as well.
pd.set_option('display.float_format', _two_decimals)

# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,hscode,valueofgoods_omu,volumeteu,weightkg
count,5464378.0,281.0,4943445.0,5464378.0
mean,610697.74,1665142.3,1.47,37265.71
std,263019.74,4299192.29,7.6,1113041.57
min,100119.0,1100.0,0.0,0.0
25%,350300.0,148130.0,0.0,3060.0
50%,640340.0,504485.0,0.0,10300.0
75%,848340.0,1202560.0,0.0,19730.0
max,999999.0,44744530.0,1215.0,495492485.0


In [25]:
# Number of missing values in each column.
df.isna().sum()

Source                    0
Target                    0
arrivaldate               0
hscode                    0
valueofgoods_omu    5464097
volumeteu            520933
weightkg                  0
dataset                   0
source                    0
target                    0
dtype: int64