### Imports

In [1]:
import pandas as pd
import os

### Check which datasets are multigraphs

In [2]:
def is_multigraph(folder, dataset, ori_str, dest_str):
    """Tell whether a dataset's edge file repeats any (origin, destination) pair.

    Reads ``folder + dataset + "_edges.csv"`` (``;``-separated, every column
    loaded as a pandas string) and compares the number of rows with the
    number of distinct ordered ``(ori_str, dest_str)`` pairs.

    Parameters
    ----------
    folder : str
        Path prefix of the edge file; it is concatenated as-is, so it must
        end with a path separator.
    dataset : str
        Dataset name used to build the file name.
    ori_str, dest_str : str
        Names of the columns holding the origin and destination ids.

    Returns
    -------
    bool
        ``True`` when at least one pair occurs more than once (the two counts
        are printed in that case), ``False`` otherwise.
    """
    edge_table = pd.read_csv(folder + dataset + "_edges.csv", sep=';', dtype=pd.StringDtype())
    # Only the two endpoint columns matter for detecting parallel edges.
    endpoint_pairs = edge_table[[ori_str, dest_str]]
    total = endpoint_pairs.shape[0]
    distinct = endpoint_pairs.drop_duplicates().shape[0]
    if distinct >= total:
        return False
    print("{}: total number {} vs unique edges {}".format(dataset, total, distinct))
    return True

In [None]:
# Names of the (origin, destination) id columns in each dataset's edge file.
edge_id_columns = {
    "AUI": ("NDG_ORIGINE", "NDG_DESTINAZIONE"),
    "CTP": ("ID_NODO_ORIGINE", "ID_NODO_DESTINAZIONE"),
    "NEXI": ("ID_NODO_ORIGINE", "ID_NODO_DESTINAZIONE"),
}
# Keep every verdict instead of discarding the return values, so datasets
# that are NOT multigraphs are reported explicitly as well.
multigraph_by_dataset = {
    name: is_multigraph("../data/my_datasets/", name, ori_col, dest_col)
    for name, (ori_col, dest_col) in edge_id_columns.items()
}
multigraph_by_dataset

AUI: total number 1281116 vs unique edges 247321
CTP: total number 1674271 vs unique edges 1519361


### Create subgraphs for faster tests

In [5]:
# Folder prefixes used by make_subgraph. They are concatenated directly with
# the file names, so each one must keep its trailing slash.
input_folder = "../data/my_datasets/"         # full datasets are read from here
output_folder = "../data/my_datasets/small/"  # reduced subgraphs are written here

In [None]:
def make_subgraph(dataset, id_name, id_ori_name, id_dest_name, num_lines,
                  input_dir=None, output_dir=None):
    """Write a reduced copy of a dataset (nodes and edges) for faster tests.

    The first ``num_lines`` rows of ``<dataset>_nodes.csv`` are kept, together
    with the rows of ``<dataset>_edges.csv`` whose origin AND destination ids
    both appear among the kept nodes. Both files are read and written as
    ``;``-separated CSV with every column loaded as a pandas string.

    Parameters
    ----------
    dataset : str
        Dataset name used to build the input and output file names.
    id_name : str
        Column of the nodes file holding the node id.
    id_ori_name, id_dest_name : str
        Columns of the edges file holding the origin and destination ids.
    num_lines : int
        Number of leading node rows to keep.
    input_dir, output_dir : str, optional
        Folder prefixes (concatenated as-is, so they must end with a path
        separator). When omitted, the notebook-level ``input_folder`` and
        ``output_folder`` are used.

    Returns
    -------
    None
        The line count of the written edge file (header plus one line per
        kept edge) is printed.
    """
    in_dir = input_folder if input_dir is None else input_dir
    out_dir = output_folder if output_dir is None else output_dir
    print("Creating "+ dataset + " subgraph of {} nodes".format(num_lines))

    nodes_out = out_dir + dataset + "_nodes.csv"
    edges_out = out_dir + dataset + "_edges.csv"
    # Create the destination folder on first use instead of failing in to_csv.
    out_parent = os.path.dirname(nodes_out)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    # Load the nodes and keep the first num_lines rows, in file order.
    nodes = pd.read_csv(in_dir + dataset + "_nodes.csv", sep=';', dtype=pd.StringDtype())
    kept_nodes = nodes.head(num_lines)
    kept_nodes.to_csv(nodes_out, sep=';', index=False)
    # Ids of the kept nodes (not de-duplicated; isin only tests membership).
    nodes_ids = kept_nodes[id_name].tolist()

    # Load the edges and keep those with both endpoints among the kept nodes.
    edges = pd.read_csv(in_dir + dataset + "_edges.csv", sep=';', dtype=pd.StringDtype())
    both_ends_kept = edges[id_ori_name].isin(nodes_ids) & edges[id_dest_name].isin(nodes_ids)
    kept_edges = edges[both_ends_kept]
    kept_edges.to_csv(edges_out, sep=';', index=False)

    # Report the size of the edge file from Python rather than through
    # `wc -l | cut`, so it is portable and shows up in the notebook output.
    print(len(kept_edges) + 1)

In [None]:
# Delete all CSV files from the output folder so stale subgraphs never survive
# a re-run. Done with os calls rather than a shell `rm`, so it is portable and
# does nothing when the folder does not exist yet.
if os.path.isdir(output_folder):
    for stale_name in os.listdir(output_folder):
        stale_path = os.path.join(output_folder, stale_name)
        # Mirror the shell glob "*.csv": visible names ending in .csv, files only.
        if (stale_name.endswith(".csv") and not stale_name.startswith(".")
                and os.path.isfile(stale_path)):
            os.remove(stale_path)
# Create AUI subgraph of n_nodes nodes
n_nodes = 200
make_subgraph("AUI", "NDG", "NDG_ORIGINE", "NDG_DESTINAZIONE", n_nodes)
# Create CTP subgraph of n_nodes nodes
n_nodes = 500
make_subgraph("CTP", "CD_NDG", "ID_NODO_ORIGINE", "ID_NODO_DESTINAZIONE", n_nodes)
# Create NEXI subgraph of n_nodes nodes
n_nodes = 40000
make_subgraph("NEXI", "ID_NODO", "ID_NODO_ORIGINE", "ID_NODO_DESTINAZIONE", n_nodes)