Here I tried to build a transductive dataset from the single traces of death star dataset, however it is not possible since the traces are so small that need to be all part of the train set in order to be a transductive set. Also trying to put all the traces together doesn't work.

In [1]:
import json
import networkx as nx
import networkit as nk
import matplotlib.pyplot as plt
import math
import numpy as np
from numpy.linalg import inv
import pandas as pd

seed = 1234

In [2]:
def create_triplets(path):
  
    # Opening JSON file
    f = open(path)

    # returns JSON object as 
    # a dictionary
    data = json.load(f)

    # Closing file
    f.close()
    content = data['data']
    first = content[0]
    second = first['spans']

    triplets = set() #head, link, tail ---> process(spanID), operationName, process(references_spanID)
    
    dict_span_process = {}
    for elem in second:
        dict_span_process[elem['spanID']] = first['processes'][elem['processID']]['serviceName']

    root = first['traceID']

    for elem in second:
        head = elem['spanID']
        link = elem['operationName']
        tail = None
        if head != root:
            tail = elem['references'][0]['spanID']
            triplets.add(tuple([dict_span_process[head],link,dict_span_process[tail]]))
    
    return triplets
    


In [3]:
def create_transductive(triplets):
    df = pd.DataFrame(triplets,columns=['dm','rpctype','um'])

    df

    entities = set(df['dm']) | set(df['um'])
    len(entities)
    relations = set(df['rpctype'])
    baseline = []
    for entity in entities:
        baseline.append(df[df['dm'] == entity].head(1))
        baseline.append(df[df['um'] == entity].head(1))

    for relation in relations:
        baseline.append(df[df['rpctype']==relation].head(1))
    baseline_df = pd.concat(baseline)
    baseline_df = baseline_df.drop_duplicates()
    baseline_df

    n_train_items = int(len(df)*0.64)
    n_items_to_add = n_train_items - len(baseline_df)
    print("the number of items to add to the train set is: ",n_items_to_add)

    not_baseline_df = df.merge(baseline_df, how='outer', indicator=True).loc[lambda x : x['_merge']=='left_only'].drop('_merge',axis= 1)
    not_baseline_df
    
    not_baseline_train = df.head(0)
    if n_items_to_add >0:
        not_baseline_train = not_baseline_df.sample(n_items_to_add,random_state = seed)
    train_df = pd.concat([baseline_df, not_baseline_train])
    train_df

    remaining_df = not_baseline_df.drop(not_baseline_train.index) 

    validation_df = remaining_df.sample(frac = 0.44, random_state = seed)
    validation_df

    test_df = remaining_df.drop(validation_df.index)  # drop the sampled rows to get the second DataFram
    test_df

    #Transductive checks

    train_entities = set(train_df['dm']) or set(train_df['um'])
    valid_entities = set(validation_df['dm']) or set(validation_df['um'])
    test_entities = set(test_df['dm']) or set(test_df['um'])

    assert(valid_entities.issubset(train_entities))

    assert(test_entities.issubset(train_entities))

    train_relations = set(train_df['rpctype'])
    valid_relations = set(validation_df['rpctype'])
    test_relations = set(test_df['rpctype'])

    assert(valid_relations.issubset(train_relations))

    assert(test_relations.issubset(train_relations))

    #Check to see if there is test leakage

    assert(0==len(pd.merge(train_df, validation_df)))

    assert(0==len(pd.merge(train_df, test_df)))

    assert(0==len(pd.merge(test_df, validation_df)))
    return train_df, validation_df, test_df

In [16]:
train_df, validation_df, test_df = create_transductive(
#     create_triplets('social-network/sample-rate-100/user-timeline-service.json'))
    create_triplets('social-network/sample-rate-100/user-service.json'))

the number of items to add to the train set is:  -4


In [17]:
train_df

Unnamed: 0,dm,rpctype,um
1,social-graph-service,social_graph_redis_update_client,social-graph-service
0,user-service,user_mmc_get_user_id_client,user-service
6,nginx-web-server,/wrk2-api/user/follow,nginx-web-server
5,social-graph-service,follow_server,social-graph-service
2,user-service,get_user_id_server,social-graph-service
7,social-graph-service,follow_with_username_server,nginx-web-server
3,social-graph-service,mongo_update_client,social-graph-service
4,social-graph-service,social_graph_mongo_update_client,social-graph-service
8,nginx-web-server,Follow,nginx-web-server


In [18]:
validation_df

Unnamed: 0,dm,rpctype,um


In [19]:
test_df

Unnamed: 0,dm,rpctype,um


In [20]:
train_df, validation_df, test_df = create_transductive(
     create_triplets('social-network/sample-rate-100/user-timeline-service.json') |
    create_triplets('social-network/sample-rate-100/social-graph-service.json')|
    create_triplets('social-network/sample-rate-100/nginx-web-server.json')|
    create_triplets('social-network/sample-rate-100/post-storage-service.json')|
     create_triplets('social-network/sample-rate-100/user-service.json') )

the number of items to add to the train set is:  -6


In [21]:
train_df

Unnamed: 0,dm,rpctype,um
0,user-timeline-service,read_user_timeline_server,nginx-web-server
4,post-storage-service,post_storage_read_posts_server,user-timeline-service
8,social-graph-service,social_graph_redis_update_client,social-graph-service
1,user-service,get_user_id_server,social-graph-service
3,user-service,user_mmc_get_user_id_client,user-service
2,nginx-web-server,Follow,nginx-web-server
12,social-graph-service,follow_server,social-graph-service
6,user-timeline-service,user_timeline_redis_find_client,user-timeline-service
7,nginx-web-server,ReadUserTimeline,nginx-web-server
14,social-graph-service,follow_with_username_server,nginx-web-server


In [22]:
validation_df

Unnamed: 0,dm,rpctype,um


In [23]:
test_df

Unnamed: 0,dm,rpctype,um
