In this notebook I am joining the previously processed reduced dataset parts. It will be used for creating the trace test set.

In [1]:
import pandas as pd
import numpy as np
import zipfile
import joblib
from tqdm import tqdm
import contextlib

In [2]:

to_skip = tuple([2,57,60,64,66,75,98,101,125,129,130,144]) # corrupted files
df_list = []

# define a function to extract and process a single file
def extract_and_process_file(i):  
    if to_skip.count(i) > 0:
        return None
        
    # specify the name of the zip file and the directory to extract the contents
    zip_file_name = f'MSCallGraph_{i}_io.zip'
    extract_dir = ""

    # create a ZipFile object and extract the contents to the specified directory
    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    
    # read the CSV file and process it
    df = pd.read_csv(f'MSCallGraph_{i}_io.csv').drop(['Unnamed: 0'],axis = 1).drop_duplicates()
    return df


In [3]:


@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()

In [4]:
# use joblib.Parallel to extract and process the files in parallel
n_jobs = -1 # use all available CPUs
print("cpu ",n_jobs)
with tqdm_joblib(tqdm(desc="aggregating io", total=145)) as progress_bar:
    results = joblib.Parallel(n_jobs=n_jobs)(joblib.delayed(extract_and_process_file)(i) for i in range(0, 145))

# filter out the None values and append the processed dataframes to df_list
for df in results:
    if df is not None:
        df_list.append(df)

cpu  -1


aggregating io: 100%|█████████████████████| 145/145 [00:55<00:00,  2.60it/s]


In [5]:
df = pd.concat(df_list).drop_duplicates()
df

Unnamed: 0,traceid,um,rpctype,dm
0,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc_out,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
1,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc_out,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
2,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined_out,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
3,0b133c1915919238193454000e5d37,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc_in,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
4,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc_out,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...,...
1878117,0b14339415919667189434000e499f,24601dd8b36f856eb0a4d759866f475bc62cf843ffe092...,rpc_out,2abd05990aa9eb81a3eaa333829ec355eaee57fbecc9e8...
1878118,0b14339415919667189434000e499f,24601dd8b36f856eb0a4d759866f475bc62cf843ffe092...,rpc_in,24601dd8b36f856eb0a4d759866f475bc62cf843ffe092...
1878119,0b14339415919667189434000e499f,24601dd8b36f856eb0a4d759866f475bc62cf843ffe092...,rpc_in,643bce6326e327bb36361c31d1ad0cf9ee8d5767175482...
1878120,0b14339415919667189434000e499f,643bce6326e327bb36361c31d1ad0cf9ee8d5767175482...,rpc_out,e9916393b398199ef017f67a1904dd9c30323405c6f77c...


In [6]:
df.to_csv('MSCallGraph_joined_io.csv',index=False)

In [7]:
df = pd.read_csv('MSCallGraph_joined_io.csv')
df

Unnamed: 0,traceid,um,rpctype,dm
0,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc_out,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
1,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc_out,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
2,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined_out,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
3,0b133c1915919238193454000e5d37,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc_in,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
4,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc_out,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...,...
222509712,0b14339415919667189434000e499f,24601dd8b36f856eb0a4d759866f475bc62cf843ffe092...,rpc_out,2abd05990aa9eb81a3eaa333829ec355eaee57fbecc9e8...
222509713,0b14339415919667189434000e499f,24601dd8b36f856eb0a4d759866f475bc62cf843ffe092...,rpc_in,24601dd8b36f856eb0a4d759866f475bc62cf843ffe092...
222509714,0b14339415919667189434000e499f,24601dd8b36f856eb0a4d759866f475bc62cf843ffe092...,rpc_in,643bce6326e327bb36361c31d1ad0cf9ee8d5767175482...
222509715,0b14339415919667189434000e499f,643bce6326e327bb36361c31d1ad0cf9ee8d5767175482...,rpc_out,e9916393b398199ef017f67a1904dd9c30323405c6f77c...


In [8]:
print(df)

                                  traceid  \
0          0b133c1915919238193454000e5d37   
1          0b133c1915919238193454000e5d37   
2          0b133c1915919238193454000e5d37   
3          0b133c1915919238193454000e5d37   
4          0b133c1915919238193454000e5d37   
...                                   ...   
222509712  0b14339415919667189434000e499f   
222509713  0b14339415919667189434000e499f   
222509714  0b14339415919667189434000e499f   
222509715  0b14339415919667189434000e499f   
222509716  0b5218dd15919665521188000eff6c   

                                                          um          rpctype  \
0          5cca70246befb1f4c9546d2912b9419dee54439218efa5...           mc_out   
1          4ab265f54516248ee8873be7d6441912456ce17e84f399...           mc_out   
2          5cca70246befb1f4c9546d2912b9419dee54439218efa5...  userDefined_out   
3          75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...           rpc_in   
4          4ab265f54516248ee8873be7d6441912456ce17e84f

In [9]:
import zipfile
filename = 'MSCallGraph_joined_io.csv'
zipfilename = 'MSCallGraph_joined_io.zip'

# Create a ZipFile object and add the file to it
with zipfile.ZipFile(zipfilename, 'w', compresslevel=9, compression=zipfile.ZIP_LZMA) as zip:
    zip.write(filename)