In [2]:
import config as cg_config
import os
import json
import pickle
import random
import networkx as nx
import pandas as pd
import stellargraph as sg
from tqdm import tqdm
from stellargraph import StellarGraph

# Corrupted graphs 

In [3]:
def prepare_samples(num_per_label):

    file_pairs = []
    numbers = ["0" + str(i) for i in range(10)] + \
              [str(i) for i in range(10, num_per_label)]
    labels = ["pos", "neg"]

    for num in numbers:
        for l in labels:
            file_graph = "pair_graph_sample_" + l +"_" + num +"_graph.json"
            file_pred = "pair_graph_sample_" + l +"_" + num +"_prediction_edge.json"
            file_pairs.append((file_graph, file_pred, l))
    return file_pairs


def generate_atom_graph(file_dir, file_graph, file_pred):

    file_graph_path = os.path.join(file_dir, file_graph)
    file_pred_path = os.path.join(file_dir, file_pred)
    with open(file_graph_path, "r") as g:
        data = json.load(g)
    with open(file_pred_path, "r") as p:
        predicting = json.load(p)
    node_pair_to_predict = predicting["edge"]
    elements = data.get("elements", {})

    nodes = []
    pairs = []
    for node in elements["nodes"]:
        info = node["data"]
        nodes.append((info["id"],
                      {"type": info["type"]}))
    for edge in elements["edges"]:
        structure = edge["data"]
        pairs.append((structure["source"],
                      structure["target"],
                      {"type": structure["type"]}))

    atom_graph = nx.DiGraph()
    atom_graph.add_nodes_from(nodes)
    atom_graph.add_edges_from(pairs)
    return atom_graph, node_pair_to_predict

In [4]:
# adjust this number to control the num of graphs generated
num_per_label = 500

graphs = []
empty_graphs = []
empty_graphs_pairs = []

node_missing_graphs = []
node_missing_graphs_pairs = []

file_pairs = prepare_samples(num_per_label)
directory = "/opt/contextgraph/samples_all"

for file_pair in tqdm(file_pairs, desc="processing file pairs:"):
    file_graph = file_pair[0]
    file_pred = file_pair[1]
    label = file_pair[2]

    graph, node_pair_to_predict = generate_atom_graph(
        file_dir=directory,
        file_graph=file_graph,
        file_pred=file_pred
    )
    
    graphs.append(graph)
    # skip the empty graphs
    if len(graph.nodes) == 0:
        empty_graphs.append(graph)
        empty_graphs_pairs.append(file_pair)
    # check if target nodes exist in the graph only for non-empty graphs
    else:
        nodes_dict = dict(graph.nodes())
        missing_nodes1 = nodes_dict.get(node_pair_to_predict[0], "MISSING")
        missing_nodes2 = nodes_dict.get(node_pair_to_predict[0], "MISSING")
        
        if missing_nodes1 == "MISSING" or missing_nodes2 == "MISSING":
            node_missing_graphs.append((graph, node_pair_to_predict))
            node_missing_graphs_pairs.append(file_pair)

processing file pairs:: 100%|███████████████| 1000/1000 [02:43<00:00,  6.13it/s]


## 1. emtpy graphs

- The samples(files) that lead to empty graphs seem to be the same every time, means that the problem is not in the generating process, but the files themselves. (This holds for node-missing graphs)

In [5]:
# emtpy ratio
len(empty_graphs)/len(graphs)

0.031

In [6]:
count_neg = 0
count_pos = 0
for i in empty_graphs_pairs:
    if i[2] == "neg":
        count_neg += 1
    else:
        count_pos += 1
        
# most empty graphs are negative samples, sometimes even all!
count_neg / count_pos

ZeroDivisionError: division by zero

In [7]:
# take an example of empty graph
empty_pair_example = random.choice(empty_graphs_pairs)

file_graph = empty_pair_example[0]
file_pred = empty_pair_example[1]
label = empty_pair_example[2]

file_graph_path = os.path.join(directory, file_graph)
file_pred_path = os.path.join(directory, file_pred)

with open(file_graph_path, "r") as g:
    data = json.load(g)
with open(file_pred_path, "r") as p:
    predicting = json.load(p)

In [8]:
# there is no content for the key 'data'
data

{'data': [],
 'directed': True,
 'multigraph': False,
 'elements': {'nodes': [], 'edges': []}}

In [9]:
predicting

{'edge': ['pwc:model/unigan', 'pwc:model/tucker'],
 'cooc_pprs': [],
 'cooc_start_year': 2019,
 'cooc_start_month': 7}

In [10]:
label

'neg'

## 2. missing nodes

In [11]:
# missing ratio (empty graphs excluded)
len(node_missing_graphs)/len(graphs)

0.09

In [12]:
# neg/pos ratio in node-missing graphs
count_neg = 0
count_pos = 0
for i in node_missing_graphs_pairs:
    if i[2] == "neg":
        count_neg += 1
    else:
        count_pos += 1
        
# most empty graphs are positive samples
# could also because part of neg samples are empty and 
# therefore got filtered out
count_neg / count_pos

0.45161290322580644

In [13]:
# take an example of empty graph
node_missing_example = random.choice(node_missing_graphs)

node_missing_graph_example = node_missing_example[0]
target_nodes = node_missing_example[1]

nodes_dict = dict(node_missing_graph_example.nodes())

In [14]:
target_nodes

['pwc:model/mask-r-cnn--vil-base--multi-scale--3x-lr-',
 'pwc:task/object-detection']

In [15]:
nodes_dict[target_nodes[0]]

KeyError: 'pwc:model/mask-r-cnn--vil-base--multi-scale--3x-lr-'

In [16]:
nodes_dict[target_nodes[1]]

{'type': 'task'}

- After running the above code many times manually, it looks like that it is always the first node (among two target nodes) which is missing in the graph.

## 3. non-connected target nodes (also in positive samples)

In [17]:
df_degrees_avg= pd.read_csv("samples_all_average_degree_per_graph.csv", index_col=0)

In [18]:
# emtpy graphs
df_degrees_avg[df_degrees_avg["avg_degree_targets"] == 0].shape

(318, 5)

In [20]:
# focus only on positive, i.e., connected target nodes
# notice that there are average degrees that are even lower than 1
df_degrees_avg_pos = df_degrees_avg[df_degrees_avg["label"]=="pos"]
df_degrees_avg_pos["avg_degree_targets"].value_counts().iloc[:5]

0.5    140
1.0    128
2.0    111
1.5    110
2.5     84
Name: avg_degree_targets, dtype: int64

In [27]:
df_degrees_avg_pos[df_degrees_avg_pos["avg_degree_targets"] == 0.5]["graph"]

24        pair_graph_sample_pos_12_graph.json
122       pair_graph_sample_pos_61_graph.json
156       pair_graph_sample_pos_78_graph.json
180       pair_graph_sample_pos_90_graph.json
248      pair_graph_sample_pos_124_graph.json
                        ...                  
9802    pair_graph_sample_pos_4901_graph.json
9820    pair_graph_sample_pos_4910_graph.json
9830    pair_graph_sample_pos_4915_graph.json
9898    pair_graph_sample_pos_4949_graph.json
9966    pair_graph_sample_pos_4983_graph.json
Name: graph, Length: 140, dtype: object

In [28]:
# take the first graph as an example
file_graph = 'pair_graph_sample_pos_12_graph.json'
file_pred = 'pair_graph_sample_pos_12_prediction_edge.json'
g, nodes = generate_atom_graph(directory, file_graph, file_pred)

In [29]:
nodes

['pwc:method/cspdarknet53', 'pwc:model/yolov4-608']

In [30]:
dict(g.degree()).get(nodes[0], 0)

1

In [31]:
dict(g.degree()).get(nodes[1], 0)

0

- There is a certain amount of graphs (averaged degress of target nodes = 0.5), whose target nodes may not be connected to any other node, even in the case of a positive samples. Also some other graphs (averaged degress of target nodes = 1) may have two target nodes that are only connected to each other.

---