# 2. Sketching task

We use the **CTU-13 dataset, malware capture 43**.


* [x] Build code for computing a COUNT-MIN sketch, and experiment with different heights (depths) and widths for the Count-Min sketch matrix.
* [ ] Compare it to the RESERVOIR sampling strategy. Is it more space-efficient / accurate? What about run-time? Use the theory to explain any differences you observe.

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [42]:
import pandas as pd
import os
import sys
# Make the parent of the current working directory importable, so modules that
# live one level above the notebook can be imported below.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.append(nb_dir)
from global_helper import *
from helper import *
from itertools import product

## Load data

In [3]:
%%time

# Read file and preprocess columns (takes ~1.5min)
# Only loads the traffic associated with the provided IP address.
df = load_data("../data/capture20110811.pcap.netflow.labeled", keep_ip='147.32.84.165')

Wall time: 57.3 s


In [53]:
# Baseline for the comparison: the exact relative frequency of every
# destination IP, i.e. its occurrence count divided by the number of rows.
comparison = pd.DataFrame()
comparison['all'] = df['dest_ip'].value_counts().div(len(df))

## Count-Min Sketching

In [54]:
# Sketch dimensions to try out.
widths = [2, 5, 10, 15, 20, 50]
depths = [2, 5, 8, 10, 20, 50]

# Every (width, depth) pair; the width varies slowest, the depth fastest.
grid = [(width, depth) for width in widths for depth in depths]

In [55]:
# Run the Count-Min sketch once per (width, depth) configuration and store the
# result as a column named "<width>_<depth>" alongside the exact frequencies.
for width, depth in grid:
    column_name = '%d_%d' % (width, depth)
    comparison[column_name] = count_min_sketch(
        df['dest_ip'], width, depth, normalize=True
    )

In [59]:
# Show the first ten rows of the comparison table. The column limit is raised
# only inside this block so that every sketch configuration stays visible.
with pd.option_context('display.max_columns', 1000):
    display(comparison.iloc[:10])

Unnamed: 0,all,2_2,2_5,2_8,2_10,2_20,2_50,5_2,5_5,5_8,5_10,5_20,5_50,10_2,10_5,10_8,10_10,10_20,10_50,15_2,15_5,15_8,15_10,15_20,15_50,20_2,20_5,20_8,20_10,20_20,20_50,50_2,50_5,50_8,50_10,50_20,50_50
193.23.181.44,0.136254,0.54685,0.506683,0.506683,0.444842,0.37054,0.37054,0.208237,0.208237,0.208237,0.208237,0.207618,0.207618,0.182851,0.182851,0.182851,0.180221,0.161353,0.161353,0.149025,0.149025,0.149025,0.149025,0.149025,0.149025,0.151676,0.146594,0.146594,0.146594,0.145534,0.145534,0.139833,0.13895,0.138353,0.138353,0.138353,0.137867
174.128.246.102,0.075716,0.54685,0.479154,0.315923,0.315923,0.315923,0.315923,0.177835,0.17134,0.168887,0.168887,0.163673,0.138419,0.125119,0.116568,0.116568,0.107311,0.107311,0.103356,0.112879,0.112879,0.101191,0.096131,0.094386,0.093392,0.092552,0.088818,0.088818,0.087227,0.087227,0.087227,0.077683,0.077683,0.077683,0.077683,0.077683,0.077307
174.37.196.55,0.07417,0.45315,0.431254,0.429354,0.429354,0.308323,0.308323,0.242902,0.174123,0.174123,0.174123,0.148097,0.1302,0.185811,0.12313,0.12313,0.12313,0.101102,0.101102,0.131349,0.088597,0.088597,0.088597,0.088597,0.083405,0.093259,0.083383,0.083383,0.083383,0.082278,0.082278,0.082057,0.079517,0.076269,0.076269,0.076269,0.076269
67.19.72.206,0.069265,0.45315,0.45315,0.350567,0.350567,0.350567,0.310974,0.214467,0.214467,0.191556,0.191556,0.146727,0.130532,0.105676,0.105676,0.098407,0.098407,0.09686,0.092883,0.091425,0.091425,0.085482,0.085482,0.085482,0.081196,0.08789,0.084974,0.081395,0.081395,0.076004,0.076004,0.073374,0.070878,0.070878,0.070878,0.070878,0.070348
72.20.15.61,0.065531,0.412055,0.412055,0.412055,0.412055,0.314686,0.313073,0.214467,0.149555,0.149555,0.147147,0.140828,0.140828,0.108791,0.100219,0.086697,0.086697,0.086697,0.086454,0.126268,0.077594,0.077594,0.077594,0.077594,0.077594,0.089039,0.078036,0.078036,0.078036,0.075694,0.073131,0.069795,0.067807,0.067807,0.067807,0.067807,0.067453
173.236.31.226,0.037737,0.412055,0.412055,0.350567,0.350567,0.350567,0.313073,0.237666,0.143037,0.143037,0.132741,0.114823,0.11416,0.097479,0.076247,0.076247,0.076247,0.068757,0.064934,0.082212,0.077219,0.067299,0.067299,0.060781,0.051568,0.079097,0.045889,0.045889,0.045889,0.045889,0.04505,0.051833,0.039769,0.039769,0.039769,0.039305,0.039283
184.154.89.154,0.037074,0.412055,0.412055,0.412055,0.412055,0.242505,0.242505,0.13336,0.105079,0.105079,0.105079,0.105079,0.100594,0.065686,0.05972,0.05972,0.05972,0.05972,0.057268,0.056914,0.056561,0.055456,0.055456,0.051523,0.051523,0.052672,0.050883,0.050883,0.050883,0.048386,0.044851,0.040498,0.040498,0.039416,0.039416,0.038687,0.038687
46.4.36.120,0.035947,0.54685,0.431254,0.429354,0.414595,0.314686,0.314686,0.237666,0.112481,0.112481,0.112481,0.112481,0.112039,0.122401,0.064051,0.064051,0.064051,0.056715,0.056715,0.060626,0.047127,0.047127,0.047127,0.046707,0.046707,0.043304,0.043304,0.043304,0.043304,0.043304,0.04273,0.041316,0.039018,0.039018,0.03851,0.03851,0.037538
147.32.80.9,0.017388,0.412055,0.412055,0.350567,0.350567,0.242505,0.242505,0.140452,0.140452,0.140452,0.140452,0.083273,0.075385,0.095756,0.060869,0.042421,0.042421,0.042421,0.042421,0.049734,0.031815,0.031815,0.031815,0.031528,0.030401,0.063344,0.046972,0.025055,0.025055,0.023795,0.023795,0.019377,0.019377,0.019377,0.019377,0.019244,0.0192
217.163.21.37,0.015488,0.412055,0.412055,0.350567,0.350567,0.242505,0.242505,0.140452,0.140452,0.115022,0.115022,0.098451,0.084775,0.095756,0.049623,0.049623,0.049623,0.03767,0.034842,0.047834,0.031064,0.031064,0.031064,0.031064,0.030247,0.032412,0.025099,0.025099,0.025099,0.022801,0.022801,0.018714,0.017808,0.017808,0.017808,0.017101,0.017101
