In this notebook I am joining the previously processed reduced dataset parts. It will be used for creating the trace test set.

In [1]:
import pandas as pd
import numpy as np
import zipfile
import joblib
from tqdm import tqdm
import contextlib

In [3]:

to_skip = tuple([2,57,60,64,66,75,98,101,125,129,130,144]) # corrupted files
df_list = []

# define a function to extract and process a single file
def extract_and_process_file(i):  
    if to_skip.count(i) > 0:
        return None
        
    # specify the name of the zip file and the directory to extract the contents
    zip_file_name = f'MSCallGraph_{i}_reduced.zip'
    extract_dir = ""

    # create a ZipFile object and extract the contents to the specified directory
    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    
    # read the CSV file and process it
    df = pd.read_csv(f'MSCallGraph_{i}_reduced.csv').drop(['Unnamed: 0'],axis = 1).drop_duplicates()
    return df


In [4]:


@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()

In [5]:
# use joblib.Parallel to extract and process the files in parallel
n_jobs = -1 # use all available CPUs
print("cpu ",n_jobs)
with tqdm_joblib(tqdm(desc="aggregating reducedd", total=145)) as progress_bar:
    results = joblib.Parallel(n_jobs=n_jobs)(joblib.delayed(extract_and_process_file)(i) for i in range(0, 145))

# filter out the None values and append the processed dataframes to df_list
for df in results:
    if df is not None:
        df_list.append(df)

aggregating reducedd: 100%|██████████| 145/145 [00:25<00:00,  5.72it/s]


In [6]:
df = pd.concat(df_list).drop_duplicates()
df

Unnamed: 0,traceid,um,rpctype,dm
0,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
1,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
2,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
3,0b133c1915919238193454000e5d37,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
4,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...,...
1708154,0b5106e415919252519982000e6709,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,0b0c981da8042a436f30f7d9f7cce74360abe630bd5478...
1708155,0b5106e415919252519982000e6709,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,5c5200228afeec02308f5fdd4bdfa256b2245a163b3c81...
1708156,0b5106e715919253897478000e53d1,63b8a25a4a9f80c7508c0db0d163af6d9e6c0746c9b7e2...,mc,9653f5baba69c9fb50bfb30a8571eb04dbceaae7c7f379...
1708157,0b5106e715919253897478000e53d1,63b8a25a4a9f80c7508c0db0d163af6d9e6c0746c9b7e2...,mc,04940a16b54c3d25fe069d5fa3b209e6897040996c3715...


In [7]:
df.to_csv('MSCallGraph_joined_reduced.csv',index=False)

In [8]:
df = pd.read_csv('MSCallGraph_joined_reduced.csv')
df

Unnamed: 0,traceid,um,rpctype,dm
0,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,mc,b1dbd3a649a3cc790fa12573c9c1aa00988e07a8818a22...
1,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,fd6d86bd0fd550e717c1fdb82a33190a9fef216d87d535...
2,0b133c1915919238193454000e5d37,5cca70246befb1f4c9546d2912b9419dee54439218efa5...,userDefined,5cca70246befb1f4c9546d2912b9419dee54439218efa5...
3,0b133c1915919238193454000e5d37,75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...,rpc,84f9f68ef003a21288fffe8f9a09a5a29b05f4cc4229b8...
4,0b133c1915919238193454000e5d37,4ab265f54516248ee8873be7d6441912456ce17e84f399...,mc,01d660afcfadafd587e20ec4c04ddbc7eb0de95643ba0e...
...,...,...,...,...
8348252,0b5106e415919252519982000e6709,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,0b0c981da8042a436f30f7d9f7cce74360abe630bd5478...
8348253,0b5106e415919252519982000e6709,95a6f7f8345e2eca31ee74ddc19d547e7fc0f5c8e65772...,mc,5c5200228afeec02308f5fdd4bdfa256b2245a163b3c81...
8348254,0b5106e715919253897478000e53d1,63b8a25a4a9f80c7508c0db0d163af6d9e6c0746c9b7e2...,mc,9653f5baba69c9fb50bfb30a8571eb04dbceaae7c7f379...
8348255,0b5106e715919253897478000e53d1,63b8a25a4a9f80c7508c0db0d163af6d9e6c0746c9b7e2...,mc,04940a16b54c3d25fe069d5fa3b209e6897040996c3715...


In [10]:
print(df)

                                traceid  \
0        0b133c1915919238193454000e5d37   
1        0b133c1915919238193454000e5d37   
2        0b133c1915919238193454000e5d37   
3        0b133c1915919238193454000e5d37   
4        0b133c1915919238193454000e5d37   
...                                 ...   
8348252  0b5106e415919252519982000e6709   
8348253  0b5106e415919252519982000e6709   
8348254  0b5106e715919253897478000e53d1   
8348255  0b5106e715919253897478000e53d1   
8348256  0b5106e715919253897478000e53d1   

                                                        um      rpctype  \
0        5cca70246befb1f4c9546d2912b9419dee54439218efa5...           mc   
1        4ab265f54516248ee8873be7d6441912456ce17e84f399...           mc   
2        5cca70246befb1f4c9546d2912b9419dee54439218efa5...  userDefined   
3        75e56c8fbb9336eb4dd40f5f609d5344203d374d73fd0b...          rpc   
4        4ab265f54516248ee8873be7d6441912456ce17e84f399...           mc   
...                              

In [9]:
import zipfile
filename = 'MSCallGraph_joined_reduced.csv'
zipfilename = 'MSCallGraph_joined_reduced.zip'

# Create a ZipFile object and add the file to it
with zipfile.ZipFile(zipfilename, 'w', compresslevel=9, compression=zipfile.ZIP_LZMA) as zip:
    zip.write(filename)