In [1]:
import tarfile
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm
import contextlib
import joblib

seed = 123
random.seed(seed)

In [2]:


@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()

In [3]:
to_skip = tuple([2,57,60,64,66,75,98,101,125,129,130,144]) # corrupted files
def extract_data(i):
    if to_skip.count(i) > 0:
            return

    # open file
    file = tarfile.open(f'MSCallGraph_{i}.tar.gz')

    # extracting file
    file.extractall('./')

    file.close()

In [4]:
with tqdm_joblib(tqdm(desc="extracting data", total=145)) as progress_bar:
    joblib.Parallel(n_jobs=-1)(joblib.delayed(extract_data)(i) for i in range(0, 145))


extracting data: 100%|██████████| 145/145 [02:49<00:00,  1.17s/it]


In [5]:
def get_entities(i):
    if to_skip.count(i) > 0:
            return set()
    df = pd.read_csv(f'MSCallGraph_{i}.csv')
    df=df.drop(['Unnamed: 0','timestamp','rpcid','interface','rt','traceid'],axis=1)
    df = df.replace('(?)', np.NaN)
    df = df.replace('', np.NaN)
    df = df.replace('NAN', np.NaN)
    df = df.dropna()
    entities = set(df['dm']) | set(df['um'])
    return entities

In [6]:
with tqdm_joblib(tqdm(desc="finding entities", total=145)) as progress_bar:
    results = joblib.Parallel(n_jobs=-1)(joblib.delayed(get_entities)(i) for i in range(0, 145))


finding entities: 100%|██████████| 145/145 [02:15<00:00,  1.07it/s]


In [7]:
entities = set()
for elem in results:
    entities = entities |elem 
len(entities)

16657

In [8]:
probability_dict = {}
for entity in entities:
    probability_dict[entity]= random.uniform(0.01,0.1) #probability between 1% and 10%

In [9]:
probability_dict

{'76dd56a9f2e9c563a1cf719314e2ba5257bd2eb99a623db0c6b49fe12c1c34a2': 0.01471272389658499,
 'e887321342cfcab81228e67f8fcf836e45313dc45b1811ec2e78345d8355bc92': 0.01784680097703691,
 'fd4d92f4328a8714b7100aa132bfc4d40271b5120113ea4deba27737dd97de1b': 0.046651758730335856,
 '7b3d2d6c8a8760081d2124b7a3fa1f795661105c79593c3f760e35e9b86ba017': 0.01969302114445952,
 '420a07bf26e59047ab9cc0aa4c1d96574d93014dcb7443fc51285e63337dba66': 0.09110789901565253,
 '8f6d3848b72971854fc906cd437e714c6c636c34e544b0c5586ae70b122364bd': 0.013433829949209017,
 'dbe3d9ec6a1bdfd907fdeca3a754ce01dfb868558edb9a2f39534ce4ca5133ba': 0.05825818360305343,
 'a59dd247264a6681713892a708b9ee154e6c705dbe123e7e9ba8413050e12d50': 0.03989779286587119,
 '4efaa3209161f9072b41e38822b00281f08eb85ec28ecdc7ac2f2bcb434ce43d': 0.08668779570364318,
 '9912937fb3d5c750d72e73cb45a35b20c085de276c45b44f07a7163a6d1d5922': 0.024369615704977296,
 '7a2d4f269b3636fb8fa459526646e32ecc5feb9e52ebd92fcce2f174a81fa1e7': 0.0403494991398348,
 'e37d22

In [10]:
df = pd.read_csv(f'MSCallGraph_0.csv')
df = df.drop(['Unnamed: 0','timestamp','rpcid','interface','rt','traceid'],axis=1)
df = df.replace('(?)', np.NaN)
df = df.replace('', np.NaN)
df = df.replace('NAN', np.NaN)
df = df.dropna() 
df = df.drop_duplicates()
with tqdm(desc="concatenating dataframes", total=144) as progress_bar:
    for i in range(1,145):
        progress_bar.update(1)
        if to_skip.count(i) > 0:
                continue
        temp_df = pd.read_csv(f'MSCallGraph_{i}.csv')
        temp_df = temp_df.drop(['Unnamed: 0','timestamp','rpcid','interface','rt','traceid'],axis=1)
        temp_df = temp_df.replace('(?)', np.NaN)
        temp_df = temp_df.replace('', np.NaN)
        temp_df = temp_df.replace('NAN', np.NaN)
        temp_df = temp_df.dropna()
        temp_df = temp_df.drop_duplicates()
        df = pd.concat([df,temp_df])
        df = df.drop_duplicates()
df

concatenating dataframes: 100%|██████████| 144/144 [30:53<00:00, 12.87s/it]


Unnamed: 0,um,rpctype,dm
0,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
11,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
20,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
26,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
42,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...
6777201,eb7ce54148779f812f2c7f07f18ef4ed269043b9f6b959...,db,db984ae0fed93d22fbcca65eb1ec1b25b4faa71167c580...
7012357,24601dd8b36f856eb0a4d759866f475bc62cf843ffe092...,rpc,1224ab4a5244545b7333a290aaccc95e49f2a793457d21...
7012358,1224ab4a5244545b7333a290aaccc95e49f2a793457d21...,db,1ac258b5230c8d9dfb28dcad1ced301a9b475b58b37025...
7273972,5d887bdb9fd24cea365fb7ce018526df1d39b11630081b...,db,8ee7c139f721a23915ca012d2518eddbdc7ad1ced8b3da...


In [11]:
baseline = []
with tqdm(desc="collecting all entities",total=len(entities)) as progress_bar:
    for entity in entities:
        baseline.append(df[df['dm'] == entity].head(1))
        baseline.append(df[df['um'] == entity].head(1))
        progress_bar.update(1)

collecting all entities: 100%|██████████| 16657/16657 [02:13<00:00, 125.03it/s]


In [12]:
baseline_df = pd.concat(baseline)
baseline_df = baseline_df.drop_duplicates()
baseline_df

Unnamed: 0,um,rpctype,dm
3839214,94dce029795f9357501c763b763bf951c8290ad1744ec1...,mc,76dd56a9f2e9c563a1cf719314e2ba5257bd2eb99a623d...
2734028,5c17bb2b4e1bfb38ccbe6cffde6c49b854800d62a80343...,db,e887321342cfcab81228e67f8fcf836e45313dc45b1811...
5092129,1db622d2ef805ba3c30f129fc21afb8cf145109380bffe...,rpc,fd4d92f4328a8714b7100aa132bfc4d40271b5120113ea...
5092127,fd4d92f4328a8714b7100aa132bfc4d40271b5120113ea...,db,e3b1ec3da44f91aad3e564d2291d723b6afb4d5ac7ccb4...
1319,9f14b8cd62aa14111695d8d170020805e508ca2168604a...,mc,7b3d2d6c8a8760081d2124b7a3fa1f795661105c79593c...
...,...,...,...
155075,6f639e16aba857462d1326154890989e23b4d326a5b0dd...,mq,515691d0421dedd130a5d03dae7fe44b2167841d679502...
6033099,c5d26caa5424ce9de273434ab8e6b8aa5fc1a9db0e288b...,mq,e63598fd52bf90ae1a10886dde21523ffff887ac43f5d7...
5951070,81104ebf2456207e8b5b096c7909a676386e4eb6654be0...,db,af669bd74f6e6ea7c20122d29ea62c05d60e82bdb9507f...
3173089,01e4b5c4e46c834de68d7afc1002c152e951f8f0d1058f...,db,ce5d5c3498c564bf990be08da02878e949960a8813d1dd...


In [13]:
def sample_data(i):
    if to_skip.count(i) > 0:
            return None
    df = pd.read_csv(f'MSCallGraph_{i}.csv')
    df=df.drop(['Unnamed: 0','timestamp','rpcid','interface','rt','traceid'],axis=1)
    df = df.replace('(?)', np.NaN)
    df = df.replace('', np.NaN)
    df = df.replace('NAN', np.NaN)
    df = df.dropna()
    
    sampled_list = []
    for j in range(len(df)):
        entity = df.iloc[j]['um']
        if random.random() < probability_dict.get(entity):
            sampled_list.append(j)
    if len(sampled_list) == 0:
        return None
    
    sampled_df = df.iloc[sampled_list]
    sampled_df = sampled_df.drop_duplicates()
    return sampled_df

In [14]:
with tqdm_joblib(tqdm(desc="sampling data", total=145)) as progress_bar:
    results = joblib.Parallel(n_jobs=-1)(joblib.delayed(sample_data)(i) for i in range(0, 145))


sampling data: 100%|██████████| 145/145 [37:30<00:00, 15.52s/it]  


In [15]:
sampled_df_list = []
for df in results:
    if df is not None:
        sampled_df_list.append(df)

In [16]:
sampled_df = pd.concat(sampled_df_list)
sampled_df

Unnamed: 0,um,rpctype,dm
26,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
55,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,676ca482c56de84046ce1d0076abd6cdb0f25778d036db...
127,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,d3a0dbd99cef0ff255b05ad66d6c56af5ed7b6b507baaf...
168,4ab265f54516248ee8873be7d6441912456ce17e84f399...,rpc,e6c95bef37d936ebb375bf135ded88c96eb9257a1d3d9d...
211,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,mc,6655de1e1f8b90d9f49e942d595b112742780854f994bc...
...,...,...,...
7412947,377f1c6e2b6c3315132d0d8dbd721dc01630e5919f39b0...,db,4803a63d3824d588dc7eef4ae08540db75e42e0311f794...
7415357,73257ee66fbea0ea8b789e1027396c9292d7140b6d5937...,db,ca3c5e5ecac17b967d5d2bd3406efb38d65b7258717f02...
7417968,664b8e8494b6153cc7ce4335d051b96bdaff8f0985b868...,db,00733a86e87eb7709d9ed30677de5bb8789bcf41a3c113...
7422795,2a793a43ad08bd77e7da24019d2d06547c7b7803fd1944...,db,373e3d34569d49d12dfb8dcc70cbcef170dc18b749ef8a...


In [17]:
sampled_df = sampled_df.drop_duplicates()
sampled_df

Unnamed: 0,um,rpctype,dm
26,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
55,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,676ca482c56de84046ce1d0076abd6cdb0f25778d036db...
127,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,d3a0dbd99cef0ff255b05ad66d6c56af5ed7b6b507baaf...
168,4ab265f54516248ee8873be7d6441912456ce17e84f399...,rpc,e6c95bef37d936ebb375bf135ded88c96eb9257a1d3d9d...
211,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,mc,6655de1e1f8b90d9f49e942d595b112742780854f994bc...
...,...,...,...
6097787,ae4eeac16ff793350aae53b6009d05f81fa3ddc195825d...,rpc,a4b2c5b055d175c0c9f25efd20ab12879a2313e2a35140...
6098214,22393715cef3a7b093fd27da0ab4707cebc2591cfe8bde...,db,f43b4a0cf1c5549e3ec1d32ae4b00137b9d4d5e90c6129...
6101308,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...,mq,5aefad73e20ae83cd0bf473107dd290e74f9a8b12aa6bc...
6270408,150f30cbcef004e2e3964d5327c7cb25e3f1dea47199dc...,db,4346785c51657ac12c3b03236666aec286c40f6eafafd1...


In [18]:
sampled_entities = set(sampled_df['dm']) | set(sampled_df['um'])
len(sampled_entities)

13322

In [19]:
sampled_df = pd.concat([baseline_df,sampled_df]).drop_duplicates()
sampled_df

Unnamed: 0,um,rpctype,dm
3839214,94dce029795f9357501c763b763bf951c8290ad1744ec1...,mc,76dd56a9f2e9c563a1cf719314e2ba5257bd2eb99a623d...
2734028,5c17bb2b4e1bfb38ccbe6cffde6c49b854800d62a80343...,db,e887321342cfcab81228e67f8fcf836e45313dc45b1811...
5092129,1db622d2ef805ba3c30f129fc21afb8cf145109380bffe...,rpc,fd4d92f4328a8714b7100aa132bfc4d40271b5120113ea...
5092127,fd4d92f4328a8714b7100aa132bfc4d40271b5120113ea...,db,e3b1ec3da44f91aad3e564d2291d723b6afb4d5ac7ccb4...
1319,9f14b8cd62aa14111695d8d170020805e508ca2168604a...,mc,7b3d2d6c8a8760081d2124b7a3fa1f795661105c79593c...
...,...,...,...
5097614,c20e26ec49d020ebe2b44aa6e3119993049eff078ff576...,db,8cce245e0d489e9ecf271400b101b92be1d371fe7895e5...
5744665,59b1463de2fbe73df7cdba3fb71cde40e2bfddb9efec55...,db,bf88d94e7daa4eaf9401ff71ba09616ec59b990ba38861...
6101308,12bc3fe2791d9f2946a74114ff349795461e695a099ae3...,mq,5aefad73e20ae83cd0bf473107dd290e74f9a8b12aa6bc...
6270408,150f30cbcef004e2e3964d5327c7cb25e3f1dea47199dc...,db,4346785c51657ac12c3b03236666aec286c40f6eafafd1...


In [20]:
sampled_entities = set(sampled_df['dm']) | set(sampled_df['um'])
len(sampled_entities)

16657

In [21]:
#check to see that there are all the entities
assert len(entities) == len(sampled_entities), [len(entities), len(sampled_entities)]

In [22]:
sampled_df.to_csv(f'MSCallGraph_sampled.csv',index=False)