I am testing topology based methods at trace level with a different set that has connected components

In [1]:
import torch
from pykeen.evaluation import evaluate, RankBasedEvaluator
from pykeen.metrics.ranking import HitsAtK
import pandas as pd


import logging
from pathlib import Path

import click
import more_click
import torch
from pykeen.evaluation import RankBasedEvaluator
from pykeen.losses import NSSALoss,CrossEntropyLoss
from pykeen.models.inductive import InductiveNodePiece, InductiveNodePieceGNN
from pykeen.trackers import ConsoleResultTracker, WANDBResultTracker, FileResultTracker
from pykeen.training import SLCWATrainingLoop
from pykeen.typing import TESTING, TRAINING, VALIDATION
from pykeen.utils import resolve_device, set_random_seed
from torch.optim import Adam


from pykeen.metrics.ranking import HitsAtK

from pathlib import Path

from pykeen.datasets.inductive.base import DisjointInductivePathDataset
from pykeen.datasets.base import PathDataset, Dataset
from typing_extensions import Literal
import os
from pykeen.hpo import hpo_pipeline
from pykeen.triples import TriplesFactory
from pykeen.models import InductiveNodePiece, TransE, RGCN, ConvE
from pykeen.typing import TESTING, TRAINING, VALIDATION

import time

import platform
import sys

import cpuinfo

import psutil

import subprocess

import zipfile

from tqdm import tqdm

from IPython.display import clear_output

import networkx as nx

import math 

import networkit as nk

import numpy as np
from collections import defaultdict

seed = 1234

In [2]:

# specify the path to the zip file and the destination directory for the unzipped files
zip_file_path = "MSCallGraph_traces.zip"
extract_dir = "MSCallGraph_traces"

# create a ZipFile object and extract all files to the destination directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)


In [3]:
def remove_couples(couples): # it removes a couple that has entities that are already seen in other couples
    
    removed = []
    G = nx.Graph()
    
    services_set= set()
    for couple in couples:
        services_set.add(couple[0])
        services_set.add(couple[1])
    for elem in services_set:
        G.add_node(elem)
    for couple in couples:
        G.add_edge(couple[0], couple[1])    
    
    mydict = defaultdict(int)
    for couple in couples:
        mydict[couple[0]]+=1
        if couple[0] != couple[1]:
            mydict[couple[1]]+=1
    couples_copy = couples.copy()
    for couple in couples:
        if mydict[couple[0]]>=2 and mydict[couple[1]]>=2 and couple[0] != couple[1]: #topology based methods do not support self loops
            try:
                G.remove_edge(couple[0],couple[1]) # the graph needs to remain connected, otherwise topology methods don't work
                if nx.is_connected(G):
                    couples_copy.remove(couple)
                    removed.append([couple[0],couple[1]])
                else:
                    G.add_edge(couple[0],couple[1])
            except:
                continue
    return couples_copy, removed

In [4]:
def compute_topology_metrics(couples):
    
    couples, true_links = remove_couples(couples)
    # Create a graph object
    G = nx.Graph()
    
    services_set= set()
    for couple in couples:
        services_set.add(couple[0])
        services_set.add(couple[1])
    for elem in services_set:
        G.add_node(elem)
    for couple in couples:
        G.add_edge(couple[0], couple[1], label=couple[1])
                
        
        
    
    non_edges = list(set(nx.non_edges(G)))
    common_neighbours = []

    preds = nx.common_neighbor_centrality(G,ebunch = non_edges, alpha=1)
    for u, v, p in preds:
        common_neighbours.append(float(p))
    
    

    sorensen_index_list = []
    for elem in non_edges:
        common_neighbors = list(nx.common_neighbors(G, elem[0], elem[1]))
        sorensen_index = (2 * len(common_neighbors)) / (G.degree(elem[0]) + G.degree(elem[1]))
        sorensen_index_list.append(sorensen_index)
    
    salton_index_list = []
    for elem in non_edges:
        common_neighbors = list(nx.common_neighbors(G, elem[0], elem[1]))
        salton_index = len(common_neighbors) / math.sqrt(G.degree(elem[0]) * G.degree(elem[1]))
        salton_index_list.append(salton_index)
    
    jaccard_index = []
    preds = nx.jaccard_coefficient(G,ebunch = non_edges)
    for u, v, p in preds:
        jaccard_index.append(float(p))
    
    resource_allocation = []
    preds = nx.resource_allocation_index(G,ebunch = non_edges)
    for u, v, p in preds:
        resource_allocation.append(float(p))
    

    adamic_adar = []
    preds = nx.adamic_adar_index(G,ebunch = non_edges)
    for u, v, p in preds:
        adamic_adar.append(float(p))
        
        
    katz_index_list = []
    katz_list = katz_index(G,non_edges)
    for elem in zip(non_edges,katz_list):
        katz_index_list.append(elem[1])
    

    lhn_index_list = []
    for elem in zip(non_edges,katz_list):
        lhn_index = elem[1]*2*len(G)/(G.degree(elem[0][0]) * G.degree(elem [0][1]))
        lhn_index_list.append(lhn_index)
    return true_links, non_edges, common_neighbours, sorensen_index_list, salton_index_list, jaccard_index, resource_allocation, adamic_adar, katz_index_list, lhn_index_list

In [5]:
def katz_index(nx_graph,non_edges):
    to_return = []

    # convert to a Networkit graph
    nk_graph = nk.Graph(len(nx_graph), weighted=True)
    from collections import defaultdict

    # Define a defaultdict with default value as 0
    d = defaultdict(int)
    i = 1
    
    for node in nx_graph.nodes():
        if d[node] == 0:
            i +=1
            d[node] = i
        nk_graph.addNode()
        
    # add edges to Networkit graph
    for edge in nx_graph.edges():
        u, v = edge
        

        nk_graph.addEdge(d[u], d[v])

    katz = nk.linkprediction.KatzIndex(nk_graph)
    
    for elem in non_edges:
        to_return.append(katz.run(d[elem[0]], d[elem[1]]))
    return to_return

In [6]:
def create_couples(df):
    df = df.drop(columns=['rpctype'],axis = 1).drop_duplicates()
    couples = []
    for i in range(len(df)):
        head = df.iloc[i]['dm']
        tail = df.iloc[i]['um']
        couples.append([head,tail])
    return couples

In [7]:
def calculate_rank_max(non_edges, metric_list, true_link):
    sorted_data = sorted(zip(non_edges, metric_list),key=lambda x: x[1],reverse = True)

    # Unzip the sorted data into separate lists
    sorted_non_edges, sorted_metric_list = zip(*sorted_data)
    rank = 0
    while rank < len(sorted_non_edges):
        if sorted_non_edges[rank][0] == true_link[0] and sorted_non_edges[rank][1] == true_link[1]:
            break
        elif sorted_non_edges[rank][0] == true_link[1] and sorted_non_edges[rank][1] == true_link[0]:
            break # the graph is undirected
        else:
            rank+=1
    
    if rank == len(sorted_non_edges):
        return len(sorted_non_edges)
    #using max policy
    i = len(sorted_metric_list)-1
    
    while i >= 0:
        if sorted_metric_list[i] == sorted_metric_list[rank]:
            return i +1
        i+=-1
    return len(sorted_non_edges)

In [8]:
def mean_reciprocal_rank(non_edges,metric_list,true_links):
    ranks = []
    for elem in true_links:
        ranks.append(calculate_rank_max(non_edges,metric_list,elem))
#     if len(ranks) == 0:
#         return 0
    return sum(1/x for x in ranks)/len(ranks)

In [9]:
def hits_at_k(non_edges,metric_list,true_links,k):
    ranks = []
    for elem in true_links:
        ranks.append(calculate_rank_max(non_edges,metric_list,elem))
    
    hits = 0
    for rank in ranks:
        if rank <k:
            hits+=1
#     if len(ranks) == 0:
#         return 0
    return hits/len(ranks)

In [10]:
def test_trace(traceid,metric, metric_name):

    TRACE_TRAIN_PATH = "MSCallGraph_traces/Testing Traces/MSCallGraph_traces/train/" + traceid + "_transductive_train.tsv"
    TRACE_TEST_PATH = "MSCallGraph_traces/Testing Traces/MSCallGraph_traces/test/" +traceid + "_transductive_test.tsv"

    trace_dataset = PathDataset(training_path = TRACE_TRAIN_PATH,
                             testing_path = TRACE_TEST_PATH,
                             validation_path = TRACE_TEST_PATH,

                              eager = True
                             )

    train_couples = trace_dataset.training.mapped_triples
    test_couples = trace_dataset.testing.mapped_triples
    train_df = pd.DataFrame(train_couples, columns=['dm','rpctype','um'])
    test_df = pd.DataFrame(test_couples, columns=['dm','rpctype','um']).drop(columns=['rpctype'],axis = 1)
    total_df = pd.concat([train_df,test_df]).drop_duplicates()
    true_links, non_edges, common_neighbours, sorensen_index_list, salton_index_list, jaccard_index, resource_allocation, adamic_adar, katz_index_list, lhn_index_list= compute_topology_metrics(create_couples(total_df))
    result = pd.DataFrame([[metric(non_edges, common_neighbours, true_links),
    metric(non_edges, sorensen_index_list, true_links),
    metric(non_edges, salton_index_list, true_links),
    metric(non_edges, jaccard_index, true_links),
    metric(non_edges, resource_allocation, true_links),
    metric(non_edges, adamic_adar, true_links),
    metric(non_edges, katz_index_list, true_links),
    metric(non_edges, lhn_index_list, true_links)]],columns = ['common_neighbours', 'sorensen_index', 'salton_index', 'jaccard_index',
                      'resource_allocation', 'adamic_adar', 'katz_index', 'lhn_index'])
    result['metric'] = metric_name
    result.set_index('metric', inplace=True)
    return result

In [11]:
def valid_trace(traceid):

    TRACE_TRAIN_PATH = "MSCallGraph_traces/Testing Traces/MSCallGraph_traces/train/" + traceid + "_transductive_train.tsv"
    TRACE_TEST_PATH = "MSCallGraph_traces/Testing Traces/MSCallGraph_traces/test/" +traceid + "_transductive_test.tsv"

    trace_dataset = PathDataset(training_path = TRACE_TRAIN_PATH,
                             testing_path = TRACE_TEST_PATH,
                             validation_path = TRACE_TEST_PATH,

                              eager = True
                             )

    train_couples = trace_dataset.training.mapped_triples
    test_couples = trace_dataset.testing.mapped_triples
    train_df = pd.DataFrame(train_couples, columns=['dm','rpctype','um'])
    test_df = pd.DataFrame(test_couples, columns=['dm','rpctype','um']).drop(columns=['rpctype'],axis = 1)
    total_df = pd.concat([train_df,test_df]).drop_duplicates()
   
    couples, true_links = remove_couples(create_couples(total_df))
    return not(len(true_links)==0)

In [12]:
def hits_at_10(non_edges,metric_list,true_links):
    return hits_at_k(non_edges,metric_list,true_links,10)

In [13]:
def hits_at_5(non_edges,metric_list,true_links):
    return hits_at_k(non_edges,metric_list,true_links,5)

In [14]:
def hits_at_3(non_edges,metric_list,true_links):
    return hits_at_k(non_edges,metric_list,true_links,3)

In [15]:
def hits_at_1(non_edges,metric_list,true_links):
    return hits_at_k(non_edges,metric_list,true_links,1)

In [16]:
def test_metrics(traceid):
    metric_names = ['mrr','hits_at_1','hits_at_3','hits_at_5','hits_at_10']
    metric_functions = [mean_reciprocal_rank,hits_at_1,hits_at_3,hits_at_5,hits_at_10]
    metric_results = []
    for metric_function,metric_name in zip(metric_functions,metric_names):
        metric_results.append(test_trace(traceid,metric_function,metric_name))
    return pd.concat(metric_results)

In [17]:
def test_on_traces(model_name):
    directory = model_name+"_testing_traces"

    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f'Directory {directory} created successfully!')
    else:
        print(f'Directory {directory} already exists.')
        


    # Specifica il percorso della cartella da cui si vogliono ottenere i nomi dei file
    folder_path =  "MSCallGraph_traces/Testing Traces/MSCallGraph_traces/train/"

    files_list = []
    for filename in os.listdir(folder_path):
        # Ottenere il nome del file
        file_name = os.path.basename(filename)
        files_list.append(file_name)
    
#     files_list = files_list[:10]
#     topology_names = ['common_neighbours', 'sorensen_index', 'salton_index', 'jaccard_index',
#                       'resource_allocation', 'adamic_adar', 'katz_index', 'lhn_index']
#     metric_names = ['mrr','hits_at_1','hits_at_3','hits_at_5','hits_at_10']
#     metric_functions = [mean_reciprocal_rank,hits_at_1,hits_at_3,hits_at_5,hits_at_10]
    all_traces_list = []
    # Scansione di ogni file nella cartella
    with tqdm(desc=f'{model_name} testing traces', total=len(files_list)) as progress_bar:
        for file_name in files_list:
            # Stampa il nome del file
            if valid_trace(file_name[:-23]):
                all_traces_list.append(test_metrics(file_name[:-23]))
            progress_bar.update(1)


    mean_trace_df = sum(all_traces_list)/len(all_traces_list)
    mean_trace_df.to_csv(f"{model_name}_testing_traces/mean_trace_test_max.csv")
    
    display(mean_trace_df)
    def zip_folder(folder_path, output_path):
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(folder_path):
                for file in files:
                    zipf.write(os.path.join(root, file))

    folder_path = model_name+"_testing_traces"
    output_path = f'{folder_path}.zip'

    zip_folder(folder_path, output_path)

In [18]:
test_on_traces('topology_connected')

Directory topology_connected_testing_traces already exists.


topology_connected testing traces: 100%|██████████| 1027/1027 [03:54<00:00,  4.38it/s]


Unnamed: 0_level_0,common_neighbours,sorensen_index,salton_index,jaccard_index,resource_allocation,adamic_adar,katz_index,lhn_index
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mrr,0.037942,0.038857,0.038862,0.039355,0.143154,0.143153,0.137328,0.041754
hits_at_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hits_at_3,0.034091,0.034091,0.034091,0.034091,0.128166,0.128166,0.115694,0.034091
hits_at_5,0.054545,0.054545,0.054545,0.056818,0.248808,0.248808,0.175986,0.056818
hits_at_10,0.067045,0.069318,0.069318,0.073864,0.325675,0.325675,0.273479,0.075
