With this notebook I am creating the transductive train, validation, and test sets. At the beginning I find the trace ids that do not belong to duplicate calls (traces ids that can identify an unique path from an user request). Then I create a transductive train set that has all the entities in the graph. I add also the other relations in order to have only complete traces in the train set. From the remaining dataframe, I remove all the relations that belong to a trace that have some triplets already seen in the train set. As result, the training set has completed traces (traces that have all their calls), but there is just one triplet in the validation set.

In [1]:
import pandas as pd
import numpy as np

seed = 1234

In [2]:
df = pd.read_csv('MSCallGraph_0.csv')
df=df.drop(['Unnamed: 0','timestamp','rpcid','interface','rt',],axis=1)
df = df.replace('(?)', np.NaN)
df = df.replace('', np.NaN)
df = df.replace('NAN', np.NaN)
df = df.dropna()
df = df.drop_duplicates()
df.nunique()

traceid    130506
um           1474
rpctype         6
dm           7326
dtype: int64

In [3]:
df

Unnamed: 0,traceid,um,rpctype,dm
0,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
11,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
20,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
26,0b133c1915919238193454000e5d37,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
42,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...,...
6088821,0b52069415919238409694000ea18b,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...,rpc,0d4e0aa3996cbc207424a87d8ce16610b7694e2fac37d7...
6088829,015101cf15919238987695000e,35114acfb54c54fb9618f23cd28bbc57c765f597df1409...,mc,9653f5baba69c9fb50bfb30a8571eb04dbceaae7c7f379...
6088839,0b52069415919238409694000ea18b,614c66b178d3cfe299b11254ecb4321f85beca4b2ccac8...,mc,4bba5c2371c3384862e69615979c2aa5b7f1b4fc3ea914...
6088842,0b52069415919238409694000ea18b,614c66b178d3cfe299b11254ecb4321f85beca4b2ccac8...,rpc,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...


In [4]:
traces = list(set(df['traceid']))
unique_trace_ids = []
triplets_df = df.drop(['traceid'], axis = 1)
triplets_df = triplets_df.drop_duplicates()
triplets_df

Unnamed: 0,um,rpctype,dm
0,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
11,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
20,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
26,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
42,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...
6081607,0674d4c8b48f44a836019adff9dffe0cce2ee6e7b93ee0...,db,039ff41dc3636105ffd26b0ec675eec246a965e8102c77...
6081615,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
6081625,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...
6081642,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,b0748973546ca26a9cbfe7802e52924291a76929a704a4...


In [5]:
filtered_df = df.loc[triplets_df.index]
filtered_df

Unnamed: 0,traceid,um,rpctype,dm
0,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
11,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
20,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
26,0b133c1915919238193454000e5d37,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
42,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...,...
6081607,0b5218f915919236934722000ec49b,0674d4c8b48f44a836019adff9dffe0cce2ee6e7b93ee0...,db,039ff41dc3636105ffd26b0ec675eec246a965e8102c77...
6081615,0b5218f915919236934722000ec49b,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
6081625,0b5218f915919236934722000ec49b,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...
6081642,0b5218f915919236934722000ec49b,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,b0748973546ca26a9cbfe7802e52924291a76929a704a4...


In [6]:
filtered_traces =list(set(filtered_df['traceid']))
len(filtered_traces) # theoretical maximum nuber of unique traces

4362

In [7]:
i = 0
import time
import IPython
tot_n_traces = len(filtered_traces)
tot_n_triplets = len(triplets_df)
start = time.time()
for trace in filtered_traces:
    trace_calls_df = df[df['traceid'] == trace]
    trace_calls_df = trace_calls_df.drop(['traceid'],axis = 1)
    trace_calls_df = trace_calls_df.drop_duplicates()
    
    intersection_df = pd.merge(triplets_df, trace_calls_df)
    if len(intersection_df) > 0:
        unique_trace_ids.append(trace)
        # intersection is a subset of triplets
        triplets_df = pd.concat([triplets_df,intersection_df]).drop_duplicates(keep=False)
    
    remaining_triplets = len(triplets_df)
    if remaining_triplets == 0:
        break
    
    i = i+1
    duration = time.time() - start
    remaining_time = duration/(i/tot_n_traces) - duration
    print("% traces completed: ", (i/tot_n_traces) * 100)
    print("Traces remaining time (s): ", remaining_time)
    print("Traces remaining time (m): ", remaining_time/60)
    print("Traces remaining time (h): ", remaining_time/60/60)
    print("Traces remaining time (d): ", remaining_time/60/60/24)
    
    remaining_time_triplets = duration/(1-(remaining_triplets/tot_n_triplets)) - duration
    print("Remaining triplets: ", remaining_triplets)
    print("% triplets completed: ", (1-(remaining_triplets/tot_n_triplets)) *100)
    print("Triplets remaining time (s): ", remaining_time_triplets)
    print("Triplets remaining time (m): ", remaining_time_triplets/60)
    print("Triplets remaining time (h): ", remaining_time_triplets/60/60)
    print("Triplets remaining time (d): ", remaining_time_triplets/60/60/24)
    IPython.display.clear_output(wait = True)

% traces completed:  99.90829894543786
Traces remaining time (s):  0.4505743326097331
Traces remaining time (m):  0.007509572210162218
Traces remaining time (h):  0.00012515953683603698
Traces remaining time (d):  5.214980701501541e-06
Remaining triplets:  1
% triplets completed:  99.99383515196351
Triplets remaining time (s):  0.03026515014659026
Triplets remaining time (m):  0.0005044191691098377
Triplets remaining time (h):  8.406986151830628e-06
Triplets remaining time (d):  3.502910896596095e-07


In [8]:
duration = time.time() - start
print("Time required (s): ",duration)

Time required (s):  491.03484773635864


In [9]:
print("Number of unique microservice calls: ",len(unique_trace_ids))

Number of unique microservice calls:  3398


In [12]:
unique_calls_df = df[df['traceid'].isin(unique_trace_ids)]
unique_calls_df = unique_calls_df.drop_duplicates()

In [None]:
unique_calls_df.to_csv('unique_calls_df_0.csv',index=False)


In [4]:
unique_calls_df = pd.read_csv('unique_calls_df_0.csv')
unique_calls_triplets_df = unique_calls_df.drop(['traceid'],axis = 1)
unique_calls_triplets_df = unique_calls_triplets_df.drop_duplicates()

In [5]:
unique_calls_df

Unnamed: 0,traceid,um,rpctype,dm
0,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...
1,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,f0facca9841a7832c38299f8448958df4dceeadb444631...
2,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
3,0b11355e15919238739898000e4315,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,9f8edf2b85a2974dafdae8d961043bfa2f2c6e27485a00...
4,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,mc,ad418f5c26ee81d1276f05bb9dd3ca6fad69efd73afd2a...
...,...,...,...,...
201355,0b5114df15919237534872000ef4d4,d2c895bd5c7de55f381310fd1b94ce9d518bf4ac22e22f...,userDefined,d2c895bd5c7de55f381310fd1b94ce9d518bf4ac22e22f...
201356,0b5114df15919237534872000ef4d4,81da45152cce70f229b6fe9abdf8e303a5ec88da52bf60...,mq,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...
201357,0b5114df15919237534872000ef4d4,81da45152cce70f229b6fe9abdf8e303a5ec88da52bf60...,mq,afd45c934f07a26cb051f0dcaba4934fed1612f9c8a02d...
201358,0b5114df15919237534872000ef4d4,81da45152cce70f229b6fe9abdf8e303a5ec88da52bf60...,mq,cdc39ad528339be2ad716f4870ce77a0bc2fe2e02ddbdb...


In [6]:
unique_calls_triplets_df

Unnamed: 0,um,rpctype,dm
0,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...
1,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,f0facca9841a7832c38299f8448958df4dceeadb444631...
2,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
3,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,9f8edf2b85a2974dafdae8d961043bfa2f2c6e27485a00...
4,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,mc,ad418f5c26ee81d1276f05bb9dd3ca6fad69efd73afd2a...
...,...,...,...
201148,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
201153,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...
201160,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,b0748973546ca26a9cbfe7802e52924291a76929a704a4...
201318,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,db,31a97907aa00a2d2d1445a5104bbcc47d70fa52a2370b8...


In [7]:
df_copy = unique_calls_df
df = df.drop(['traceid'],axis = 1)
df = df.drop_duplicates()
df

Unnamed: 0,um,rpctype,dm
0,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
11,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
20,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
26,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
42,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...
6081607,0674d4c8b48f44a836019adff9dffe0cce2ee6e7b93ee0...,db,039ff41dc3636105ffd26b0ec675eec246a965e8102c77...
6081615,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
6081625,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...
6081642,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,b0748973546ca26a9cbfe7802e52924291a76929a704a4...


Checking that all unique triplets are present

In [8]:
assert len(unique_calls_triplets_df) == len(df)

In [9]:
df = unique_calls_triplets_df

In [10]:
entities = set(df['dm']) | set(df['um'])
len(entities)

7386

In [11]:
baseline = []
for entity in entities:
    baseline.append(df[df['dm'] == entity].head(1))
    baseline.append(df[df['um'] == entity].head(1))

In [12]:
baseline_df = pd.concat(baseline)
baseline_df = baseline_df.drop_duplicates()
baseline_df

Unnamed: 0,um,rpctype,dm
33284,35114acfb54c54fb9618f23cd28bbc57c765f597df1409...,db,61371691807655fd263df9dfd20af6a2eae9f9045b893a...
29255,86562d64f056f9c8c58aecf349f090a30d1e5dd397fa2c...,rpc,9a3ef4d24dd4e7fb8baaa9e30aea1395caa50f583630f6...
23580,58d745e6fac32dd9ddb2db4a491e3d5deed19311aa0109...,db,ac42ded2623691a629de113206d12dcc2f498991dfaeaf...
40686,35114acfb54c54fb9618f23cd28bbc57c765f597df1409...,rpc,1f7da0c7d9e8bde7610bdaa249b16969cd06e8b3556d09...
40687,1f7da0c7d9e8bde7610bdaa249b16969cd06e8b3556d09...,mc,c51e84520b7fec6e71615c909f102dccde0169cd1e2319...
...,...,...,...
785,d03bb97862607468fe3153b28d41a20de1e3144a566264...,db,7796bb93a1bc340c92b3e60672c812da9e8b422a584850...
9802,d03bb97862607468fe3153b28d41a20de1e3144a566264...,db,f2679ec38619d571cdd1a5537ca3569b3c270b6620b2d4...
49150,0696938ae3212896c34626b3816d92ca5b0bed0cfda981...,db,294829572c16b7de337add46a6728d58fb4cbecfa7e97b...
14943,f765d0662fbe701b90885f25f107ab6528e2feda5f43ce...,db,6089e16a959a0af2de16df2b8bfa7662350e4c0a512bdc...


In [13]:
baseline_traces_df = df_copy.loc[baseline_df.index]
baseline_traces_df

Unnamed: 0,traceid,um,rpctype,dm
33284,0b51059215919236636499000e5b04,35114acfb54c54fb9618f23cd28bbc57c765f597df1409...,db,61371691807655fd263df9dfd20af6a2eae9f9045b893a...
29255,0b5117b215919237608494000ee682,86562d64f056f9c8c58aecf349f090a30d1e5dd397fa2c...,rpc,9a3ef4d24dd4e7fb8baaa9e30aea1395caa50f583630f6...
23580,0b5106ad15919236676023000ea751,58d745e6fac32dd9ddb2db4a491e3d5deed19311aa0109...,db,ac42ded2623691a629de113206d12dcc2f498991dfaeaf...
40686,0b52063d15919236877547000ef32d,35114acfb54c54fb9618f23cd28bbc57c765f597df1409...,rpc,1f7da0c7d9e8bde7610bdaa249b16969cd06e8b3556d09...
40687,0b52063d15919236877547000ef32d,1f7da0c7d9e8bde7610bdaa249b16969cd06e8b3556d09...,mc,c51e84520b7fec6e71615c909f102dccde0169cd1e2319...
...,...,...,...,...
785,0b092ace15919236823227000e31dd,d03bb97862607468fe3153b28d41a20de1e3144a566264...,db,7796bb93a1bc340c92b3e60672c812da9e8b422a584850...
9802,0b12df4215919238711231000ef6d6,d03bb97862607468fe3153b28d41a20de1e3144a566264...,db,f2679ec38619d571cdd1a5537ca3569b3c270b6620b2d4...
49150,0b51058015919238854225000e7b69,0696938ae3212896c34626b3816d92ca5b0bed0cfda981...,db,294829572c16b7de337add46a6728d58fb4cbecfa7e97b...
14943,0b123e1615919238869569000d0a6b,f765d0662fbe701b90885f25f107ab6528e2feda5f43ce...,db,6089e16a959a0af2de16df2b8bfa7662350e4c0a512bdc...


In [14]:
training_traces = set(baseline_traces_df['traceid'])
len(training_traces)

2424

In [15]:
total_train_traces_df = unique_calls_df[unique_calls_df['traceid'].isin(training_traces)]
total_train_traces_df

Unnamed: 0,traceid,um,rpctype,dm
0,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...
1,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,f0facca9841a7832c38299f8448958df4dceeadb444631...
2,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
3,0b11355e15919238739898000e4315,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,9f8edf2b85a2974dafdae8d961043bfa2f2c6e27485a00...
4,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,mc,ad418f5c26ee81d1276f05bb9dd3ca6fad69efd73afd2a...
...,...,...,...,...
201355,0b5114df15919237534872000ef4d4,d2c895bd5c7de55f381310fd1b94ce9d518bf4ac22e22f...,userDefined,d2c895bd5c7de55f381310fd1b94ce9d518bf4ac22e22f...
201356,0b5114df15919237534872000ef4d4,81da45152cce70f229b6fe9abdf8e303a5ec88da52bf60...,mq,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...
201357,0b5114df15919237534872000ef4d4,81da45152cce70f229b6fe9abdf8e303a5ec88da52bf60...,mq,afd45c934f07a26cb051f0dcaba4934fed1612f9c8a02d...
201358,0b5114df15919237534872000ef4d4,81da45152cce70f229b6fe9abdf8e303a5ec88da52bf60...,mq,cdc39ad528339be2ad716f4870ce77a0bc2fe2e02ddbdb...


In [16]:
train_triplets_df = total_train_traces_df.drop(['traceid'],axis = 1).drop_duplicates()
# train_triplets_df = baseline_traces_df
train_triplets_df

Unnamed: 0,um,rpctype,dm
0,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...
1,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,f0facca9841a7832c38299f8448958df4dceeadb444631...
2,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
3,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,9f8edf2b85a2974dafdae8d961043bfa2f2c6e27485a00...
4,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,mc,ad418f5c26ee81d1276f05bb9dd3ca6fad69efd73afd2a...
...,...,...,...
201144,90c357ca77801f0d38c1f4cffa2e3d51ef7b1fca91ae35...,db,88c8713304cae7388c1703e83e1b0d3ec62dbdf33dd465...
201146,0674d4c8b48f44a836019adff9dffe0cce2ee6e7b93ee0...,db,039ff41dc3636105ffd26b0ec675eec246a965e8102c77...
201148,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
201153,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...


In [17]:
# the traces in the training set may not be completed
# train_df = baseline_df
train_df = train_triplets_df
train_df

Unnamed: 0,um,rpctype,dm
0,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...
1,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,f0facca9841a7832c38299f8448958df4dceeadb444631...
2,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
3,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,9f8edf2b85a2974dafdae8d961043bfa2f2c6e27485a00...
4,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,mc,ad418f5c26ee81d1276f05bb9dd3ca6fad69efd73afd2a...
...,...,...,...
201144,90c357ca77801f0d38c1f4cffa2e3d51ef7b1fca91ae35...,db,88c8713304cae7388c1703e83e1b0d3ec62dbdf33dd465...
201146,0674d4c8b48f44a836019adff9dffe0cce2ee6e7b93ee0...,db,039ff41dc3636105ffd26b0ec675eec246a965e8102c77...
201148,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
201153,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...


In [18]:
triplets_not_present_in_training = pd.concat([df,train_triplets_df]).drop_duplicates(keep = False)
traces_with_triplets_used_in_training_df = unique_calls_df.drop(triplets_not_present_in_training.index)
traces_with_triplets_used_in_training = set(traces_with_triplets_used_in_training_df['traceid'])
not_used_triplets_df = unique_calls_df[~unique_calls_df['traceid'].isin(traces_with_triplets_used_in_training)]
# not_used_triplets_df = unique_calls_df.drop(unique_calls_df.drop(pd.concat([unique_calls_df.drop(['traceid'],axis=1).drop_duplicates(),train_triplets_df]).drop_duplicates(keep = False).index).index)
not_used_triplets_df

Unnamed: 0,traceid,um,rpctype,dm
73977,0b51187a15918992051312000e0a8b,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [19]:
len(set(not_used_triplets_df['traceid']))

1

In [20]:
not_baseline_df = not_used_triplets_df.drop('traceid',axis=1).drop_duplicates()
not_baseline_df

Unnamed: 0,um,rpctype,dm
73977,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [21]:
# training_traces_df = df_copy.loc[train_df.index]
# training_traces_df

In [22]:
# training_traces = set(training_traces_df['traceid'])
# len(training_traces)

In [23]:
# # triplets that have not been used in training
# remaining_df_1 = df_copy[~ df_copy['traceid'].isin(training_traces)].drop(['traceid'],axis = 1).drop_duplicates()
# # triplets of relations that have not been used in training
# remaining_df_2 = df.drop(training_traces_df.index)
# remaining_df = remaining_df_2.merge(remaining_df_1,left_index= True,right_on=remaining_df_1.index, how = 'inner').drop(
#     ['key_0','um_y','rpctype_y','dm_y'],axis = 1).rename(columns={'um_x':'um','rpctype_x':'rpctype','dm_x':'dm'})
remaining_df = not_baseline_df
remaining_df

Unnamed: 0,um,rpctype,dm
73977,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [24]:
remaining_traces_df = df_copy.loc[remaining_df.index]
remaining_traces_df

Unnamed: 0,traceid,um,rpctype,dm
73977,0b51187a15918992051312000e0a8b,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [25]:
remaining_traces = list(set(remaining_traces_df['traceid']))
len(remaining_traces)

1

In [26]:
valid_testing_df = remaining_df
valid_testing_df

Unnamed: 0,um,rpctype,dm
73977,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [27]:
validation_df = valid_testing_df.head(0)
i = 0    # I want the test set to have at least 60% of the remaining traces
while i < len(remaining_traces) and len(valid_testing_df) > len(remaining_df) * 0.54:
    trace = remaining_traces[i]
    temp_df = remaining_traces_df[remaining_traces_df['traceid'] == trace].drop(['traceid'],axis=1).drop_duplicates()
    validation_df = pd.concat([validation_df, temp_df])
    valid_testing_df = valid_testing_df.drop(temp_df.index)
    remaining_traces_df = remaining_traces_df.drop(temp_df.index)
    i = i+1

In [28]:
validation_df

Unnamed: 0,um,rpctype,dm
73977,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [29]:
test_df = valid_testing_df
test_df

Unnamed: 0,um,rpctype,dm


trace leakage checks

In [30]:
train_traces = set(df_copy.loc[train_df.index]['traceid'])
validation_traces = set(df_copy.loc[validation_df.index]['traceid'])
test_traces = set(df_copy.loc[test_df.index]['traceid'])

In [31]:
assert len(train_traces & validation_traces) == 0

In [32]:
assert len(test_traces & validation_traces) == 0

In [33]:
assert len(train_traces & test_traces) == 0

check to see if the traces are completed (entirely contained in the sets)

In [34]:
for trace in train_traces:
    all_triplets_trace_df = unique_calls_df[unique_calls_df['traceid']==trace].drop(['traceid'],axis = 1).drop_duplicates()
    len_total_triplets = len(all_triplets_trace_df)
    len_contained = len(train_df.merge(all_triplets_trace_df))
    assert (len_total_triplets == len_contained),f"{len_total_triplets}  {len_contained}" 

In [35]:
for trace in validation_traces:
    all_triplets_trace_df = unique_calls_df[unique_calls_df['traceid']==trace].drop(['traceid'],axis = 1).drop_duplicates()
    len_total_triplets = len(all_triplets_trace_df)
    len_contained = len(validation_df.merge(all_triplets_trace_df))
    assert (len_total_triplets == len_contained),f"{len_total_triplets}  {len_contained}" 

In [36]:
for trace in test_traces:
    all_triplets_trace_df = unique_calls_df[unique_calls_df['traceid']==trace].drop(['traceid'],axis = 1).drop_duplicates()
    len_total_triplets = len(all_triplets_trace_df)
    len_contained = len(test_df.merge(all_triplets_trace_df))
    assert (len_total_triplets == len_contained),f"{len_total_triplets}  {len_contained}" 

Transductive checks

In [37]:
train_entities = set(train_df['dm']) or set(train_df['um'])
valid_entities = set(validation_df['dm']) or set(validation_df['um'])
test_entities = set(test_df['dm']) or set(test_df['um'])

In [38]:
assert(valid_entities.issubset(train_entities))

In [39]:
assert(test_entities.issubset(train_entities))

In [40]:
train_relations = set(train_df['rpctype'])
valid_relations = set(validation_df['rpctype'])
test_relations = set(test_df['rpctype'])

In [41]:
assert(valid_relations.issubset(train_relations))

In [42]:
assert(test_relations.issubset(train_relations))

Check to see if there is test leakage

In [43]:
assert(0==len(pd.merge(train_df, validation_df)))

In [44]:
assert(0==len(pd.merge(train_df, test_df)))

In [45]:
assert(0==len(pd.merge(test_df, validation_df)))

In [43]:
def create_triplets(df):
    triplets = []
    for i in range(len(df)):
        head = df.iloc[i]['dm']
        tail = df.iloc[i]['um']
        rel = df.iloc[i]['rpctype']
        triplets.append([head,rel,tail])
    return triplets

In [44]:
import asposecells
import jpype
jpype.startJVM()

from asposecells.api import Workbook, FileFormatType

def create_tsv(triplets, file_name):


    # Create Workbook object.
    workbook = Workbook(FileFormatType.TSV)

    # Access the first worksheet of the workbook.
    worksheet = workbook.getWorksheets().get(0)

    # Get the desired cell(s) of the worksheet and input the value into the cell(s).


    i = 1
    for elem in triplets: 
        worksheet.getCells().get("A"+str(i)).putValue(elem[0])
        worksheet.getCells().get("B"+str(i)).putValue(elem[1])
        worksheet.getCells().get("C"+str(i)).putValue(elem[2])
        i+=1


    # Save the workbook as TSV file.
    workbook.save(file_name)


    file = open(file_name,'r')  
    lines = file.readlines()  
    file.close()
    file = open(file_name,'w')  
    lines = lines[:-1]
    file.writelines(lines)
    file.close()

In [45]:
datasets = [train_df, validation_df, test_df]
names = ["train", "validation","test"]

In [46]:
for elem in zip(datasets,names):
    create_tsv(create_triplets(elem[0]),f"MSCallGraph_0_{elem[1]}_transductive_traces.tsv")
jpype.shutdownJVM()