In [50]:
import csv
import os
import json
from pathlib import Path
import numpy as np
from datetime import datetime

In [22]:
def check_data_availability(processed_data_dir: Path) -> bool:
    """Report whether both preprocessed CSV files are present.

    Args:
        processed_data_dir: Directory expected to hold the processed output.

    Returns:
        True only if both ``nodes.csv`` and ``edges.csv`` exist as regular
        files inside ``processed_data_dir``; False otherwise.
    """
    required_files = ("nodes.csv", "edges.csv")
    return all((processed_data_dir / name).is_file() for name in required_files)

In [23]:
def write_to_csv(data: list, file_path: Path):
    """Write a list of dictionaries to a UTF-8 encoded CSV file.

    The header is the union of all keys found in ``data``, in the order in
    which each key is first encountered, so the column layout is the same on
    every run. Records missing a key get an empty cell for that column.

    Args:
        data: List of dict records. If empty, no file is written.
        file_path: Destination path of the CSV file (overwritten if present).
    """
    if not data:
        return

    # dict.fromkeys de-duplicates while preserving first-seen order; a plain
    # set would give an arbitrary column order that can differ between runs.
    fieldnames = list(dict.fromkeys(key for record in data for key in record))

    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

In [24]:
def resolve_data_directories(current_path: Path) -> "tuple[Path | None, Path | None]":
    """Locate the project's raw and processed data directories.

    Starting at ``current_path`` and walking upwards through its ancestors,
    the first directory that contains ``data/raw`` is taken as the anchor.

    Args:
        current_path: Directory from which the upward search starts.

    Returns:
        A ``(raw_data_dir, processed_data_dir)`` tuple anchored at the first
        matching directory, or ``(None, None)`` if no candidate contains
        ``data/raw``. The processed directory is not checked for existence.
    """
    raw_data_dir = Path("data/raw")
    processed_data_dir = Path("data/processed")

    # Path.parents excludes the path itself, so include current_path
    # explicitly; otherwise running from the project root finds nothing.
    for candidate in (current_path, *current_path.parents):
        if (candidate / raw_data_dir).exists():
            return candidate / raw_data_dir, candidate / processed_data_dir
    return None, None


In [27]:
def load_json_data(file_path: Path) -> dict:
    """Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document.
    """
    # Pin the encoding: without it open() uses the platform default, which
    # can fail or garble non-ASCII text on systems where that is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [33]:

# Resolve the data directories relative to the current working directory
current_path = Path(os.getcwd())
raw_data_dir, processed_data_dir = resolve_data_directories(current_path=current_path)

# resolve_data_directories returns (None, None) when nothing is found; fail
# here with a clear message instead of a TypeError on `None / "mc2.json"`.
if raw_data_dir is None:
    raise FileNotFoundError(
        f"Could not locate a 'data/raw' directory for {current_path}"
    )

# Load the raw graph file
raw_path = raw_data_dir / "mc2.json"
data = load_json_data(raw_path)

# Extract nodes and edges (edges are stored under the 'links' key)
nodes = data.get('nodes', [])
edges = data.get('links', [])

In [46]:
# Edge attributes to extract, in output column order
keys_edges = [
    "arrivaldate",
    "hscode",
    "valueofgoods_omu",
    "valueofgoodsusd",
    "volumeteu",
    "weightkg",
    "dataset",
    "source",
    "target",
]

# Node attributes to extract, in output column order
keys_nodes = [
    "shpcountry",
    "rcvcountry",
    "dataset",
    "id",
]

In [48]:
def convert_list_to_numpy(dicts: list, keys: list):
    """Convert a list of dictionaries into a 2-D object array.

    Args:
        dicts: Records to convert; each becomes one row.
        keys: Keys to extract from every record; each becomes one column,
            in the given order. Missing keys yield ``None``.

    Returns:
        A ``numpy.ndarray`` of dtype ``object`` with shape
        ``(len(dicts), len(keys))``. The shape is 2-D even for empty input.
    """
    records = list(dicts)
    columns = list(keys)

    # Preallocate and fill cell by cell instead of np.array(list_of_rows):
    # this keeps the (rows, columns) shape for empty input and stops numpy
    # from turning equal-length sequence values into extra dimensions.
    processed = np.empty((len(records), len(columns)), dtype=object)
    for row_idx, record in enumerate(records):
        for col_idx, key in enumerate(columns):
            processed[row_idx, col_idx] = record.get(key)

    return processed

In [49]:
# Build an object array with one row per edge and one column per entry of keys_edges
edges_processed = convert_list_to_numpy(edges, keys_edges)
# Preview the first four rows
edges_processed[:4,:]

array([['2034-02-12', '630630', 141015.0, None, 0.0, 4780, 'MC2',
        "AquaDelight Inc and Son's", 'BaringoAmerica Marine Ges.m.b.H.'],
       ['2034-03-13', '630630', 141015.0, None, 0.0, 6125, 'MC2',
        "AquaDelight Inc and Son's", 'BaringoAmerica Marine Ges.m.b.H.'],
       ['2028-02-07', '470710', None, None, 0.0, 10855, 'MC2',
        "AquaDelight Inc and Son's", '-15045'],
       ['2028-02-23', '470710', None, None, 0.0, 11250, 'MC2',
        "AquaDelight Inc and Son's", '-15045']], dtype=object)

In [52]:
# Column 0 holds "arrivaldate" (first entry of keys_edges)
date_column_index = 0

# Parsing with strptime raises if a value does not match YYYY-MM-DD
dates = [datetime.strptime(date, "%Y-%m-%d") for date in edges_processed[:, date_column_index]]

# Many edges share an arrival date; a stable sort keeps those ties in their
# original order so the result is the same on every run.
sorted_indices = np.argsort(dates, kind="stable")

sorted_data = edges_processed[sorted_indices]

# Preview the four earliest edges
sorted_data[:4, :]

array([['2028-01-01', '870899', None, None, 0.0, 3520, 'MC2',
        'The Salted Pearl AG Marine conservation', 'Pao gan LC Freight '],
       ['2028-01-01', '390799', None, 39920.0, 0.0, 8650, 'MC2',
        'Barco de Plata Seafarer Corp Freight ', 'Pao gan SE Seal'],
       ['2028-01-01', '392410', None, None, 5.0, 18820, 'MC2',
        'Arunachal Pradesh s Kga', 'hǎi dǎn Corporation Wharf'],
       ['2028-01-01', '392410', None, None, 0.0, 2340, 'MC2',
        'Tamil Nadu s Pic Express', '-2220']], dtype=object)

In [None]:
def process_nodes_and_edges(nodes: list, edges: list) -> (list, list):
    """Placeholder for node/edge processing; not implemented yet.

    Args:
        nodes: Node records (currently unused).
        edges: Edge records (currently unused).

    Returns:
        Always the tuple ``(None, None)``.
    """
    # TODO: implement the actual processing; both inputs are ignored for now.
    return (None, None)

In [30]:
from pathlib import Path
import os
import json
import csv

def prepareData2(dataset: str):
    """Convert a raw JSON graph file into ``nodes.csv`` and ``edges.csv``.

    The data directories are resolved from the current working directory.
    If both CSV files already exist in the processed directory, nothing is
    done. Otherwise ``<dataset>.json`` is read from the raw directory, and
    its ``nodes`` and ``links`` entries are written as CSV files.

    Args:
        dataset: Base name (without ``.json``) of the raw file to process.

    Returns:
        None. Progress and failure conditions are reported via ``print``.
    """
    # Resolve the data directories relative to the current working directory
    current_path = Path(os.getcwd())
    raw_data_dir, processed_data_dir = resolve_data_directories(current_path=current_path)

    # resolve_data_directories signals "not found" with (None, None)
    if raw_data_dir is None or processed_data_dir is None:
        print(f"No data directory found for {current_path}")
        return

    # Skip all work if the data is already preprocessed
    if check_data_availability(processed_data_dir):
        print("Data already preprocessed.")
        return

    # Check that the raw source data is available
    raw_path = raw_data_dir / f"{dataset}.json"
    if not raw_path.is_file():
        print(f"No raw data available at {raw_path}")
        # TODO: load dataset from github
        return

    data = load_json_data(raw_path)

    # Extract nodes and edges (edges are stored under the 'links' key)
    nodes = data.get('nodes', [])
    edges = data.get('links', [])

    # Only the raw directory is known to exist; create the output directory
    # so the CSV writes below do not fail on a fresh checkout.
    processed_data_dir.mkdir(parents=True, exist_ok=True)

    # Write nodes to CSV file
    if nodes:
        write_to_csv(nodes, processed_data_dir / "nodes.csv")

    # Write edges to CSV file
    if edges:
        write_to_csv(edges, processed_data_dir / "edges.csv")

# Run preprocessing for the "mc2" dataset
prepareData2("mc2")

Data already preprocessed.
