With this notebook I am creating the transductive train, validation, and test sets. At the beginning I find the trace ids that do not belong to duplicate calls (traces ids that can identify an unique path from an user request). Then I create a transductive train set that has all the entities in the graph. From the remaining dataframe, I remove all traces that have some relations already seen in the train set. As result, the training set has not completed traces (traces that have all their calls), while validation and test set have completed traces. However, the size of the test set is small.

In [1]:
import pandas as pd
import numpy as np

seed = 1234

In [2]:
df = pd.read_csv('MSCallGraph_0.csv')
df=df.drop(['Unnamed: 0','timestamp','rpcid','interface','rt',],axis=1)
df = df.replace('(?)', np.NaN)
df = df.replace('', np.NaN)
df = df.replace('NAN', np.NaN)
df = df.dropna()
df = df.drop_duplicates()
df.nunique()

traceid    130506
um           1474
rpctype         6
dm           7326
dtype: int64

In [3]:
df

Unnamed: 0,traceid,um,rpctype,dm
0,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
11,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
20,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
26,0b133c1915919238193454000e5d37,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
42,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...,...
6088821,0b52069415919238409694000ea18b,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...,rpc,0d4e0aa3996cbc207424a87d8ce16610b7694e2fac37d7...
6088829,015101cf15919238987695000e,35114acfb54c54fb9618f23cd28bbc57c765f597df1409...,mc,9653f5baba69c9fb50bfb30a8571eb04dbceaae7c7f379...
6088839,0b52069415919238409694000ea18b,614c66b178d3cfe299b11254ecb4321f85beca4b2ccac8...,mc,4bba5c2371c3384862e69615979c2aa5b7f1b4fc3ea914...
6088842,0b52069415919238409694000ea18b,614c66b178d3cfe299b11254ecb4321f85beca4b2ccac8...,rpc,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...


In [4]:
traces = list(set(df['traceid']))
unique_trace_ids = []
triplets_df = df.drop(['traceid'], axis = 1)
triplets_df = triplets_df.drop_duplicates()
triplets_df

Unnamed: 0,um,rpctype,dm
0,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
11,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
20,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
26,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
42,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...
6081607,0674d4c8b48f44a836019adff9dffe0cce2ee6e7b93ee0...,db,039ff41dc3636105ffd26b0ec675eec246a965e8102c77...
6081615,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
6081625,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...
6081642,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,b0748973546ca26a9cbfe7802e52924291a76929a704a4...


In [5]:
filtered_df = df.loc[triplets_df.index]
filtered_df

Unnamed: 0,traceid,um,rpctype,dm
0,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
11,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
20,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
26,0b133c1915919238193454000e5d37,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
42,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...,...
6081607,0b5218f915919236934722000ec49b,0674d4c8b48f44a836019adff9dffe0cce2ee6e7b93ee0...,db,039ff41dc3636105ffd26b0ec675eec246a965e8102c77...
6081615,0b5218f915919236934722000ec49b,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
6081625,0b5218f915919236934722000ec49b,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...
6081642,0b5218f915919236934722000ec49b,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,b0748973546ca26a9cbfe7802e52924291a76929a704a4...


In [6]:
filtered_traces =list(set(filtered_df['traceid']))
len(filtered_traces) # theoretical maximum nuber of unique traces

4362

In [7]:
# i = 0
# import time
# import IPython
# tot_n_traces = len(filtered_traces)
# tot_n_triplets = len(triplets_df)
# start = time.time()
# for trace in filtered_traces:
#     trace_calls_df = df[df['traceid'] == trace]
#     trace_calls_df = trace_calls_df.drop(['traceid'],axis = 1)
#     trace_calls_df = trace_calls_df.drop_duplicates()
    
#     intersection_df = pd.merge(triplets_df, trace_calls_df)
#     if len(intersection_df) > 0:
#         unique_trace_ids.append(trace)
#         # intersection is a subset of triplets
#         triplets_df = pd.concat([triplets_df,intersection_df]).drop_duplicates(keep=False)
    
#     remaining_triplets = len(triplets_df)
#     if remaining_triplets == 0:
#         break
    
#     i = i+1
#     duration = time.time() - start
#     remaining_time = duration/(i/tot_n_traces) - duration
#     print("% traces completed: ", (i/tot_n_traces) * 100)
#     print("Traces remaining time (s): ", remaining_time)
#     print("Traces remaining time (m): ", remaining_time/60)
#     print("Traces remaining time (h): ", remaining_time/60/60)
#     print("Traces remaining time (d): ", remaining_time/60/60/24)
    
#     remaining_time_triplets = duration/(1-(remaining_triplets/tot_n_triplets)) - duration
#     print("Remaining triplets: ", remaining_triplets)
#     print("% triplets completed: ", (1-(remaining_triplets/tot_n_triplets)) *100)
#     print("Triplets remaining time (s): ", remaining_time_triplets)
#     print("Triplets remaining time (m): ", remaining_time_triplets/60)
#     print("Triplets remaining time (h): ", remaining_time_triplets/60/60)
#     print("Triplets remaining time (d): ", remaining_time_triplets/60/60/24)
#     IPython.display.clear_output(wait = True)

In [8]:
# duration = time.time() - start
# print("Time required (s): ",duration)

In [9]:
# print("Number of unique microservice calls: ",len(unique_trace_ids))

In [10]:
# unique_calls_df = df[df['traceid'].isin(unique_trace_ids)]
# unique_calls_df = unique_calls_df.drop_duplicates()

In [11]:
# unique_calls_df.to_csv('unique_calls_df_0.csv',index=False)


In [12]:
unique_calls_df = pd.read_csv('unique_calls_df_0.csv')
unique_calls_triplets_df = unique_calls_df.drop(['traceid'],axis = 1)
unique_calls_triplets_df = unique_calls_triplets_df.drop_duplicates()

In [13]:
unique_calls_df

Unnamed: 0,traceid,um,rpctype,dm
0,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...
1,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,f0facca9841a7832c38299f8448958df4dceeadb444631...
2,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
3,0b11355e15919238739898000e4315,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,9f8edf2b85a2974dafdae8d961043bfa2f2c6e27485a00...
4,0b11355e15919238739898000e4315,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,mc,ad418f5c26ee81d1276f05bb9dd3ca6fad69efd73afd2a...
...,...,...,...,...
201355,0b5114df15919237534872000ef4d4,d2c895bd5c7de55f381310fd1b94ce9d518bf4ac22e22f...,userDefined,d2c895bd5c7de55f381310fd1b94ce9d518bf4ac22e22f...
201356,0b5114df15919237534872000ef4d4,81da45152cce70f229b6fe9abdf8e303a5ec88da52bf60...,mq,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...
201357,0b5114df15919237534872000ef4d4,81da45152cce70f229b6fe9abdf8e303a5ec88da52bf60...,mq,afd45c934f07a26cb051f0dcaba4934fed1612f9c8a02d...
201358,0b5114df15919237534872000ef4d4,81da45152cce70f229b6fe9abdf8e303a5ec88da52bf60...,mq,cdc39ad528339be2ad716f4870ce77a0bc2fe2e02ddbdb...


In [14]:
unique_calls_triplets_df

Unnamed: 0,um,rpctype,dm
0,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,9a9e8613b6d7d1b573ba38dba19eec9c3cc460ab00cb49...
1,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,f0facca9841a7832c38299f8448958df4dceeadb444631...
2,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
3,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,9f8edf2b85a2974dafdae8d961043bfa2f2c6e27485a00...
4,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,mc,ad418f5c26ee81d1276f05bb9dd3ca6fad69efd73afd2a...
...,...,...,...
201148,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
201153,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...
201160,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,b0748973546ca26a9cbfe7802e52924291a76929a704a4...
201318,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,db,31a97907aa00a2d2d1445a5104bbcc47d70fa52a2370b8...


In [15]:
df_copy = unique_calls_df
df = df.drop(['traceid'],axis = 1)
df = df.drop_duplicates()
df

Unnamed: 0,um,rpctype,dm
0,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
11,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
20,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
26,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
42,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...
6081607,0674d4c8b48f44a836019adff9dffe0cce2ee6e7b93ee0...,db,039ff41dc3636105ffd26b0ec675eec246a965e8102c77...
6081615,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,bb61ac88608568ac2017c8559c1da973f4f94c7d1a958e...
6081625,2c52c868f36f3920d201fddda80c8062998ebc07ee2b85...,db,abb42d7303a4376befc681f9b4a765c6b6575de0df98ed...
6081642,1cf86b19c487cc10ea3bb9623a319454029e040d0f4e9d...,db,b0748973546ca26a9cbfe7802e52924291a76929a704a4...


Checking that all unique triplets are present

In [16]:
assert len(unique_calls_triplets_df) == len(df)

In [17]:
df = unique_calls_triplets_df

In [18]:
entities = set(df['dm']) | set(df['um'])
len(entities)

7386

In [19]:
baseline = []
for entity in entities:
    baseline.append(df[df['dm'] == entity].head(1))
    baseline.append(df[df['um'] == entity].head(1))

In [20]:
baseline_df = pd.concat(baseline)
baseline_df = baseline_df.drop_duplicates()
baseline_df

Unnamed: 0,um,rpctype,dm
1111,6b3de38ce727d7a097829c79b86ae6446afe984f9f45d5...,mq,de225ac0ca268758ae03fa9da1bf044ac360f369177540...
2285,de225ac0ca268758ae03fa9da1bf044ac360f369177540...,db,e704cc0eb4f84e19073e47b66eca4afdbf1860714c8e5e...
1893,7b3b9cb2faed94c69ae18e8275ec186a4ffe6b2e258878...,rpc,93002d43042065592e282b13cb2f89a6fe261616a6fa2a...
1857,93002d43042065592e282b13cb2f89a6fe261616a6fa2a...,mc,893e06c4df69596691717127a26b31aafa9dd9af9196ee...
47622,0754c211dd9fb69fcda626915a5224ec20406cfdad2378...,db,d45b907650050d72216bc2ec5450a5f88a91df39de6954...
...,...,...,...
14367,c133d5f7eb91a127d94757ce7915ee3bceba0a25e95146...,mq,349b79ab56dbc35abec2e2df88330f7dfb23c128c13eb6...
74828,61907af770acd8c9bd15f09fa6836818d41fd772482066...,db,3a742419d05d56d26d9ff2a7658fc557876353622b7f89...
6881,3322acdce65a8353e2f8a32a2bf0474526cfc62c3d1328...,db,e494ccd2d53d6358e820e4e26f960bb6820b074ab82c46...
159406,59683ac93dc15b414023345f9236f23d181b3e6f59261e...,db,10d4a6a251cd9e80c225e8b11d608b589b951e33e29feb...


In [21]:
baseline_traces_df = df_copy.loc[baseline_df.index]
baseline_traces_df

Unnamed: 0,traceid,um,rpctype,dm
1111,0b1398a715919238284947000e31a3,6b3de38ce727d7a097829c79b86ae6446afe984f9f45d5...,mq,de225ac0ca268758ae03fa9da1bf044ac360f369177540...
2285,0b11348615919237072014000d0a26,de225ac0ca268758ae03fa9da1bf044ac360f369177540...,db,e704cc0eb4f84e19073e47b66eca4afdbf1860714c8e5e...
1893,0b139ac515919237895465000e26ab,7b3b9cb2faed94c69ae18e8275ec186a4ffe6b2e258878...,rpc,93002d43042065592e282b13cb2f89a6fe261616a6fa2a...
1857,0b139ac515919237895465000e26ab,93002d43042065592e282b13cb2f89a6fe261616a6fa2a...,mc,893e06c4df69596691717127a26b31aafa9dd9af9196ee...
47622,0b8dcd8e15919236341799000d0cd0,0754c211dd9fb69fcda626915a5224ec20406cfdad2378...,db,d45b907650050d72216bc2ec5450a5f88a91df39de6954...
...,...,...,...,...
14367,0b08334815919237085027000e4675,c133d5f7eb91a127d94757ce7915ee3bceba0a25e95146...,mq,349b79ab56dbc35abec2e2df88330f7dfb23c128c13eb6...
74828,0b51142115919238506652000e6fe6,61907af770acd8c9bd15f09fa6836818d41fd772482066...,db,3a742419d05d56d26d9ff2a7658fc557876353622b7f89...
6881,0b51054815919236393501000e3ec8,3322acdce65a8353e2f8a32a2bf0474526cfc62c3d1328...,db,e494ccd2d53d6358e820e4e26f960bb6820b074ab82c46...
159406,0b08441615919236543425000d05b5,59683ac93dc15b414023345f9236f23d181b3e6f59261e...,db,10d4a6a251cd9e80c225e8b11d608b589b951e33e29feb...


In [22]:
# the traces in the training set may not be completed
train_df = baseline_df
train_df

Unnamed: 0,um,rpctype,dm
1111,6b3de38ce727d7a097829c79b86ae6446afe984f9f45d5...,mq,de225ac0ca268758ae03fa9da1bf044ac360f369177540...
2285,de225ac0ca268758ae03fa9da1bf044ac360f369177540...,db,e704cc0eb4f84e19073e47b66eca4afdbf1860714c8e5e...
1893,7b3b9cb2faed94c69ae18e8275ec186a4ffe6b2e258878...,rpc,93002d43042065592e282b13cb2f89a6fe261616a6fa2a...
1857,93002d43042065592e282b13cb2f89a6fe261616a6fa2a...,mc,893e06c4df69596691717127a26b31aafa9dd9af9196ee...
47622,0754c211dd9fb69fcda626915a5224ec20406cfdad2378...,db,d45b907650050d72216bc2ec5450a5f88a91df39de6954...
...,...,...,...
14367,c133d5f7eb91a127d94757ce7915ee3bceba0a25e95146...,mq,349b79ab56dbc35abec2e2df88330f7dfb23c128c13eb6...
74828,61907af770acd8c9bd15f09fa6836818d41fd772482066...,db,3a742419d05d56d26d9ff2a7658fc557876353622b7f89...
6881,3322acdce65a8353e2f8a32a2bf0474526cfc62c3d1328...,db,e494ccd2d53d6358e820e4e26f960bb6820b074ab82c46...
159406,59683ac93dc15b414023345f9236f23d181b3e6f59261e...,db,10d4a6a251cd9e80c225e8b11d608b589b951e33e29feb...


In [23]:
triplets_not_present_in_training = pd.concat([df,train_df]).drop_duplicates(keep = False)
traces_with_triplets_used_in_training_df = unique_calls_df.drop(triplets_not_present_in_training.index)
traces_with_triplets_used_in_training = set(traces_with_triplets_used_in_training_df['traceid'])
not_used_triplets_df = unique_calls_df[~unique_calls_df['traceid'].isin(traces_with_triplets_used_in_training)]
not_used_triplets_df

Unnamed: 0,traceid,um,rpctype,dm
56054,0b119e4315918931503093000e5ccc,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...,rpc,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...
56055,0b119e4315918931503093000e5ccc,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...
56056,0b119e4315918931503093000e5ccc,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...,db,e8f0700426dbb43bc525942ce80a96706eee51df6f46a8...
73977,0b51187a15918992051312000e0a8b,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [24]:
len(set(not_used_triplets_df['traceid']))

2

In [25]:
remaining_df = not_used_triplets_df.drop('traceid',axis=1).drop_duplicates()
remaining_df

Unnamed: 0,um,rpctype,dm
56054,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...,rpc,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...
56055,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...
56056,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...,db,e8f0700426dbb43bc525942ce80a96706eee51df6f46a8...
73977,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [26]:
remaining_traces_df = df_copy.loc[remaining_df.index]
remaining_traces_df

Unnamed: 0,traceid,um,rpctype,dm
56054,0b119e4315918931503093000e5ccc,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...,rpc,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...
56055,0b119e4315918931503093000e5ccc,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...
56056,0b119e4315918931503093000e5ccc,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...,db,e8f0700426dbb43bc525942ce80a96706eee51df6f46a8...
73977,0b51187a15918992051312000e0a8b,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [27]:
remaining_traces = list(set(remaining_traces_df['traceid']))
len(remaining_traces)

2

In [28]:
valid_testing_df = remaining_df
valid_testing_df

Unnamed: 0,um,rpctype,dm
56054,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...,rpc,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...
56055,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...
56056,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...,db,e8f0700426dbb43bc525942ce80a96706eee51df6f46a8...
73977,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


In [29]:
validation_df = valid_testing_df.head(0)
i = 0    # I want the test set to have at least 60% of the remaining traces
while i < len(remaining_traces) and len(valid_testing_df) > len(remaining_df) * 0.54:
    trace = remaining_traces[i]
    temp_df = remaining_traces_df[remaining_traces_df['traceid'] == trace].drop(['traceid'],axis=1).drop_duplicates()
    validation_df = pd.concat([validation_df, temp_df])
    valid_testing_df = valid_testing_df.drop(temp_df.index)
    remaining_traces_df = remaining_traces_df.drop(temp_df.index)
    i = i+1

In [30]:
validation_df

Unnamed: 0,um,rpctype,dm
56054,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...,rpc,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...
56055,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,7372a65ec1f85c7387a1ecb90c49664b9cb976a0b73bd3...
56056,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...,db,e8f0700426dbb43bc525942ce80a96706eee51df6f46a8...


In [31]:
test_df = valid_testing_df
test_df

Unnamed: 0,um,rpctype,dm
73977,b39889af09ab89f4a7918a534348e405b52260bf198a43...,rpc,1efeaa8c6ee7899fecd33c8b0ed024154adcc03c7899c1...


trace leakage checks

In [32]:
train_traces = set(df_copy.loc[train_df.index]['traceid'])
validation_traces = set(df_copy.loc[validation_df.index]['traceid'])
test_traces = set(df_copy.loc[test_df.index]['traceid'])

In [33]:
assert len(train_traces & validation_traces) == 0

In [34]:
assert len(test_traces & validation_traces) == 0

In [35]:
assert len(train_traces & test_traces) == 0

check to see if the traces are completed (entirely contained in the sets)

In [36]:
# for trace in train_traces:
#     all_triplets_trace_df = unique_calls_df[unique_calls_df['traceid']==trace].drop(['traceid'],axis = 1).drop_duplicates()
#     len_total_triplets = len(all_triplets_trace_df)
#     len_contained = len(train_df.merge(all_triplets_trace_df))
#     assert (len_total_triplets == len_contained),f"{len_total_triplets}  {len_contained}" 

In [37]:
for trace in validation_traces:
    all_triplets_trace_df = unique_calls_df[unique_calls_df['traceid']==trace].drop(['traceid'],axis = 1).drop_duplicates()
    len_total_triplets = len(all_triplets_trace_df)
    len_contained = len(validation_df.merge(all_triplets_trace_df))
    assert (len_total_triplets == len_contained),f"{len_total_triplets}  {len_contained}" 

In [38]:
for trace in test_traces:
    all_triplets_trace_df = unique_calls_df[unique_calls_df['traceid']==trace].drop(['traceid'],axis = 1).drop_duplicates()
    len_total_triplets = len(all_triplets_trace_df)
    len_contained = len(test_df.merge(all_triplets_trace_df))
    assert (len_total_triplets == len_contained),f"{len_total_triplets}  {len_contained}" 

Transductive checks

In [39]:
train_entities = set(train_df['dm']) or set(train_df['um'])
valid_entities = set(validation_df['dm']) or set(validation_df['um'])
test_entities = set(test_df['dm']) or set(test_df['um'])

In [40]:
assert(valid_entities.issubset(train_entities))

In [41]:
assert(test_entities.issubset(train_entities))

In [42]:
train_relations = set(train_df['rpctype'])
valid_relations = set(validation_df['rpctype'])
test_relations = set(test_df['rpctype'])

In [43]:
assert(valid_relations.issubset(train_relations))

In [44]:
assert(test_relations.issubset(train_relations))

Check to see if there is test leakage

In [45]:
assert(0==len(pd.merge(train_df, validation_df)))

In [46]:
assert(0==len(pd.merge(train_df, test_df)))

In [47]:
assert(0==len(pd.merge(test_df, validation_df)))

In [48]:
def create_triplets(df):
    triplets = []
    for i in range(len(df)):
        head = df.iloc[i]['dm']
        tail = df.iloc[i]['um']
        rel = df.iloc[i]['rpctype']
        triplets.append([head,rel,tail])
    return triplets

In [49]:
import asposecells
import jpype
jpype.startJVM()

from asposecells.api import Workbook, FileFormatType

def create_tsv(triplets, file_name):


    # Create Workbook object.
    workbook = Workbook(FileFormatType.TSV)

    # Access the first worksheet of the workbook.
    worksheet = workbook.getWorksheets().get(0)

    # Get the desired cell(s) of the worksheet and input the value into the cell(s).


    i = 1
    for elem in triplets: 
        worksheet.getCells().get("A"+str(i)).putValue(elem[0])
        worksheet.getCells().get("B"+str(i)).putValue(elem[1])
        worksheet.getCells().get("C"+str(i)).putValue(elem[2])
        i+=1


    # Save the workbook as TSV file.
    workbook.save(file_name)


    file = open(file_name,'r')  
    lines = file.readlines()  
    file.close()
    file = open(file_name,'w')  
    lines = lines[:-1]
    file.writelines(lines)
    file.close()

In [50]:
datasets = [train_df, validation_df, test_df]
names = ["train", "validation","test"]

In [51]:
for elem in zip(datasets,names):
    create_tsv(create_triplets(elem[0]),f"MSCallGraph_0_{elem[1]}_transductive_traces_vt_completed.tsv")
jpype.shutdownJVM()