# Sampling Task & Sketching task

In [1]:
import pandas as pd
import numpy as np
from time import time
from random import randint

### Cleaning & Processing dataset

In [2]:
df = pd.read_table('dataset/capture20110818.pcap.netflow.labeled',delim_whitespace=True) 
print(type(df))
df = df.iloc[:,:-5]
df.columns = ['Date','Time','Durat','Prot','Src IP Addr:Port','Dir','Dst IP Addr:Port','Flags','Tos','Packets','Bytes','Flows','Label']
LEN_DF = len(df)
print(LEN_DF)
print(df.head())

<class 'pandas.core.frame.DataFrame'>
5180851
         Date          Time  Durat Prot      Src IP Addr:Port Dir  \
0  2011-08-18  10:19:13.328  0.002  TCP   147.32.86.166:33426  ->   
1  2011-08-18  10:19:13.328  4.995  UDP     82.39.2.249:41915  ->   
2  2011-08-18  10:19:13.329  4.996  UDP    147.32.84.59:43087  ->   
3  2011-08-18  10:19:13.330  0.000  TCP   147.32.86.166:42020  ->   
4  2011-08-18  10:19:13.330  0.000  TCP  212.24.150.110:25443  ->   

       Dst IP Addr:Port  Flags  Tos  Packets    Bytes  Flows       Label  
0  212.24.150.110:25443  FRPA_    0        4      321      1  Background  
1    147.32.84.59:43087    INT    0      617    40095      1  Background  
2     82.39.2.249:41915    INT    0     1290  1909200      1  Background  
3     147.32.192.34:993     A_    0        1       66      1  Background  
4   147.32.86.166:33426   FPA_    0        2      169      1  Background  


In [3]:
#Seperate address and port
addr = []
port = []
for i in df['Dst IP Addr:Port']:
    try:
        s = i.split(':', 1)
        addr.append(s[0])
        port.append(s[1])
    except:
        port.append('Null')

df['Des_address'] = addr
df['Port'] = port
print(df.head())
df.to_csv('dataset/ctu_13_52_netflow.csv')
# Check Dataset Ip, protocol, port and label
label = np.unique(df['Label'].values)
print('Label:')
print(len(label))
print(label)

print('Protocol:')
protocol = np.unique(df['Prot'].values)
print(len(protocol))
print(protocol)

min_pac = min(df['Packets'])
max_pac = max(df['Packets'])
med_pac = np.median(df['Packets'].values)
print(min_pac,med_pac,max_pac)
s = df['Packets'].sort_values()

         Date          Time  Durat Prot      Src IP Addr:Port Dir  \
0  2011-08-18  10:19:13.328  0.002  TCP   147.32.86.166:33426  ->   
1  2011-08-18  10:19:13.328  4.995  UDP     82.39.2.249:41915  ->   
2  2011-08-18  10:19:13.329  4.996  UDP    147.32.84.59:43087  ->   
3  2011-08-18  10:19:13.330  0.000  TCP   147.32.86.166:42020  ->   
4  2011-08-18  10:19:13.330  0.000  TCP  212.24.150.110:25443  ->   

       Dst IP Addr:Port  Flags  Tos  Packets    Bytes  Flows       Label  \
0  212.24.150.110:25443  FRPA_    0        4      321      1  Background   
1    147.32.84.59:43087    INT    0      617    40095      1  Background   
2     82.39.2.249:41915    INT    0     1290  1909200      1  Background   
3     147.32.192.34:993     A_    0        1       66      1  Background   
4   147.32.86.166:33426   FPA_    0        2      169      1  Background   

      Des_address   Port  
0  212.24.150.110  25443  
1    147.32.84.59  43087  
2     82.39.2.249  41915  
3   147.32.192.34   

### 10 most frequent IP - Before Sampling

In [4]:
des_ip = np.unique(df['Des_address'])
des_ip_sort = df['Des_address'].value_counts()
print(len(des_ip))
print(des_ip_sort[:10])
ori_ip = des_ip_sort[:10].index.values

print("Length of Stream - " + str(len(df)))

191244
147.32.84.229    785569
147.32.80.9      483657
147.32.84.59     361143
147.32.84.138    224358
147.32.96.69     216892
147.32.80.13      67986
147.32.86.116     45606
147.32.85.25      43424
147.32.85.26      42873
147.32.84.118     30570
Name: Des_address, dtype: int64
Length of Stream - 5180851


## Sampling Task - Using Resovoir Sampling

In [5]:
import random
from random import randint
from time import time

# select k elements 
select = [100000,10000,1000,500,100,60]

#remove duplicate
sub_df = df.iloc[:,2:12]
sub_df = sub_df.drop_duplicates()
print(len(sub_df))
inx_sub = sub_df.index.values
df = df.loc[inx_sub,:]
LEN_DF = len(df)

rand = np.random.random((LEN_DF,1))

df['random'] = rand

ip_list = list(ori_ip)
time_list = []
for k in select:
    print("Select Value - " + str(k))
    t = time()
    # first k items
    samp_df = df[:k]
    # select random sample with a probability smaller than k/i
    index = df.index.values # Just indexes
    proba = k/index
    df['proba'] = proba
#     print("Random Indices --- " + str(len(df['proba']!=0)))
    samples = df.loc[df['proba']<=df['random'],:]

    all_sample = pd.concat([samp_df,samples])
#     print("Length of All Samples - " + str(len(all_sample)))
    
    sample = all_sample.sample(n=k)
    t = time()-t
    time_list.append(t)
    
    samp_des_ip_sort = sample['Des_address'].value_counts()
    top = samp_des_ip_sort[:10]
    ip = top.index.values
    ip_list.extend(ip)
    
factors, uniques = pd.factorize(ip_list)
print(factors.reshape(7,10))
print(type(factors))

3097079
Select Value - 100000


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Select Value - 10000
Select Value - 1000
Select Value - 500
Select Value - 100
Select Value - 60
[[ 0  1  2  3  4  5  6  7  8  9]
 [ 0  1  2  5  3  6  8  7  4  9]
 [ 0  1  2  5  3  7  4  6  8 10]
 [ 0  1  2  5  3  6  9  4 11 12]
 [ 0  2  1  3  4  7 10  9  5 13]
 [ 0  2  1  5 11  7 14 15 16 17]
 [ 0 18  7  2 19 20 21 22  8 23]]
<type 'numpy.ndarray'>


In [6]:
# IP rank comparison
samp_ip = samp_des_ip_sort[:10].index.values

comp = pd.DataFrame({'Origin':factors[:10]})

for i,k in enumerate(select):
    comp[str(k)] = factors[(i+1)*10:(i+2)*10]
print(comp)

accuracy = [sum(comp.iloc[:,i]==comp.iloc[:,0])/10. for i,k in enumerate(comp.columns)]
print("Time consumed:")
print(np.asarray(time_list).reshape(6,1))
print(np.mean(time_list))
print("Accuracy:")
print(accuracy)

   Origin  100000  10000  1000  500  100  60
0       0       0      0     0    0    0   0
1       1       1      1     1    2    2  18
2       2       2      2     2    1    1   7
3       3       5      5     5    3    5   2
4       4       3      3     3    4   11  19
5       5       6      7     6    7    7  20
6       6       8      4     9   10   14  21
7       7       7      6     4    9   15  22
8       8       4      8    11    5   16   8
9       9       9     10    12   13   17  23
Time consumed:
[[5.54867291]
 [5.29997706]
 [4.51962399]
 [4.3726871 ]
 [4.54535604]
 [4.27662802]]
4.760490854581197
Accuracy:
[1.0, 0.5, 0.4, 0.3, 0.3, 0.1, 0.2]


### Count Min Sketch

In [23]:
from array import array
from random import randint
from math import log, e, ceil
from itertools import izip
from heapq import nlargest

class CountMinSketch(object):
    def __init__(self, w=None, d=None, delta=None, epsilon=None):

        if w is not None and d is not None:
            self.w = w
            self.d = d
        elif delta is not None and epsilon is not None:
            self.w = int(ceil(e/epsilon))
            self.d = int(ceil(log(1./delta)))
            print self.w, self.d
        else:
            raise Exception("You must either supply both w and d or delta and epsilon.")

        self.counts = [array('L', (0 for _ in xrange(self.w))) for _ in xrange(self.d)]
        upper_bound = 2147483647
        step = upper_bound / (self.d-1)
        ranges = [(i*step, step*(i+1)-1) for i in xrange(self.d-1)]
        self.mask = array('L', (randint(low, high) for low, high in ranges))

    def get_columns(self, a):
        h = hash(a)
        w = self.w

        yield h % w
        for m in self.mask:
            yield (h ^ m) % w


    def update(self, a, val=1):
        for row, col in izip(self.counts, self.get_columns(a)):
          row[col] += val

    def query(self, a):
        return min(row[col] for row, col in izip(self.counts, self.get_columns(a)))

    def __getitem__(self, a):
        return self.query(a)

    def __setitem__(self, a, val):
        for row, col in izip(self.counts, self.get_columns(a)):
          row[col] = val
        
# Testing Count Min Sketch
def test_cms(ip_list,freq_list,w,d):
    mytime = time()
    
    
# To Check for e = [0.001,0.001,0.005,0.005,0.01,0.01]
#              d = [0.01, 0.1, 0.01, 0.1, 0.2, 0.1]
    e = [0.001,0.001,0.005,0.005,0.005,0.01]
    d = [0.01, 0.1, 0.01, 0.1, 0.2, 0.1]
    ind = 5
    mine = CountMinSketch(delta=d[ind], epsilon=e[ind])
    for ip,freq in zip(ip_list,freq_list):
#         t = time()
        mine.update(ip, freq)
    
    mytime = time() - mytime
    loss= 0
    pre_freq={}
    for ip, freq in zip(ip_list,freq_list):
        
        loss += (mine.query(ip) - freq)**2
        pre_freq[ip]=int(mine[ip])

    print 'loss:', loss**0.5 / len(ip_list)
    print 'time:', mytime
    ips=[]
    topNum = 10
    nlargestList = nlargest(topNum, pre_freq.values())        #get top 10  
    for value in nlargestList:                                #print
        for key in pre_freq:  
            if pre_freq[key] == value:  
                ips.append(key)
                print key, pre_freq[key]
    return np.array(ips)

In [24]:
# get the value and freq as input
ip_list = des_ip_sort.index.values
freq_list = np.array(des_ip_sort).tolist()
#get top 10 ips
true_10 = np.array(des_ip_sort[:10].keys())


# Get Accuracy - Count Min Sketch 
# Change w, d (hgt and width) for checking 
ips = test_cms(ip_list,freq_list,1000,1000)
acc = sum(ips==true_10)/10.
print 'the top10 accuracy is', acc

272 3
loss: 18.0809215628
time: 0.981522083282
151.74.5.14 791267
108.95.172.239 791267
147.32.84.229 791267
151.74.5.14 791267
108.95.172.239 791267
147.32.84.229 791267
151.74.5.14 791267
108.95.172.239 791267
147.32.84.229 791267
80.109.27.144 490528
124.193.125.226 490528
147.32.80.9 490528
80.109.27.144 490528
124.193.125.226 490528
147.32.80.9 490528
80.109.27.144 490528
124.193.125.226 490528
147.32.80.9 490528
147.32.84.59 370099
91.205.120.71 370099
85.71.48.103 370099
147.32.84.59 370099
91.205.120.71 370099
85.71.48.103 370099
147.32.84.59 370099
91.205.120.71 370099
85.71.48.103 370099
147.32.84.138 229151
88.247.238.136 229151
186.112.114.18 229151


  # This is added back by InteractiveShellApp.init_path()


TypeError: 'bool' object is not iterable