<h1>Import all required packages</h1>

In [1]:
import sys
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

sys.path.insert(0, '..//..//flink')

from db_utils import CassandraDb 

<h1>Select all metadata features. Exclude HTTP-, TLS-, and DNS-related features</h1>

In [2]:
# Columns requested from the database for every flow. The order matters: it
# determines the column order of the data frame built from these rows.
# 'id' and 'label' are fetched alongside the actual attributes.
features = [
    'pr', 'rev_intervals_ccnt', 'rev_pld_max', 'dst_port',
    'num_pkts_in', 'pld_ccnt', 'rev_hdr_ccnt', 'rev_hdr_bin_40',
    'bytes_in', 'rev_ack_psh_rst_syn_fin_cnt', 'time_length', 'rev_pld_var',
    'pld_mean', 'rev_pld_mean', 'pld_median', 'src_port',
    'rev_pld_bin_128', 'hdr_distinct', 'rev_pld_distinct',
    'ack_psh_rst_syn_fin_cnt', 'pld_distinct', 'hdr_bin_40', 'rev_pld_ccnt',
    'intervals_ccnt', 'hdr_ccnt', 'pld_bin_inf', 'hdr_mean',
    'num_pkts_out', 'rev_hdr_distinct', 'pld_max', 'id',
    'bytes_out', 'label',
]

<h1>
    Read all instances from the Cassandra database
</h1>

In [19]:
# Keyspace and table that hold the flows to load.
KEYSPACE, TABLE_NAME = 'ost_sm_2020', 'non_vpn'

# NOTE(review): the two positional arguments look like the Cassandra contact
# points and the port -- confirm against db_utils.CassandraDb.
cassandra_hosts = ['127.0.0.1']
cassandra_port = 9042
db = CassandraDb(cassandra_hosts, cassandra_port)

# Fetch the selected feature columns; the result is iterated row by row in the
# preprocessing step, where each row is indexed by feature name.
data = db.read_table(KEYSPACE, TABLE_NAME, features)

<h1>Data Preprocessing</h1>
<ul>
    <li>
        Treat each element of a feature list as a dimension
    </li>
    <li>
        Store the data into a data frame of shape ( n_instances, n_features )
    </li>
</ul>

In [21]:
def flatten_row(row, feature_names):
    """Flatten one database row into a ``{column_name: value}`` dict.

    A list-valued feature is expanded into one entry per element, keyed
    ``<feature>_<position>`` (position starts at 0). Any other value is copied
    unchanged under the feature's own name.

    Parameters
    ----------
    row
        Object supporting ``row[name]`` lookup for every requested feature.
    feature_names
        Iterable of feature names to read from ``row``.

    Returns
    -------
    dict
        Flattened entries, in the order the features (and, within a feature,
        the list elements) were visited.
    """
    flat = {}
    for name in feature_names:
        value = row[name]
        if isinstance(value, list):
            for position, element in enumerate(value):
                flat[name + '_' + str(position)] = element
        else:
            flat[name] = value
    return flat


# Column-oriented accumulator: column name -> list of values, one per row.
data_preprocessed = {}

for row in data:
    for column, value in flatten_row(row, features).items():
        # setdefault creates the column on first sight, then appends in place.
        data_preprocessed.setdefault(column, []).append(value)

# Every column must end up with the same number of values; pandas raises a
# ValueError here if rows carried lists of different lengths for a feature.
df = pd.DataFrame.from_dict(data_preprocessed)

print(df.shape)

(131065, 123)

<h1>Change Data Frame index to the flow's id</h1>
<h3>Print the first 5 rows</h3>

In [22]:
# Index the frame by flow id. The guard keeps this cell safe to re-run: once
# 'id' has been moved into the index it is no longer a column, so calling
# set_index('id') a second time would raise a KeyError.
if df.index.name != 'id':
    df = df.set_index('id')

df.head(n=5)

Unnamed: 0_level_0,pr,rev_intervals_ccnt_0,rev_intervals_ccnt_1,rev_intervals_ccnt_2,rev_intervals_ccnt_3,rev_intervals_ccnt_4,rev_intervals_ccnt_5,rev_intervals_ccnt_6,rev_intervals_ccnt_7,rev_intervals_ccnt_8,...,hdr_ccnt_9,hdr_ccnt_10,hdr_ccnt_11,pld_bin_inf,hdr_mean,num_pkts_out,rev_hdr_distinct,pld_max,bytes_out,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7935030,17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8.0,2,0,22,44,audio
7022607,17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8.0,1,0,89,89,audio
4449798,17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8.0,1,1,37,37,audio
2570029,17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8.0,2,0,22,44,file_transfer
6533998,17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8.0,1,1,37,37,audio


<ul>
    <li>
        Separate attributes from labels
    </li>
    <li>
        Save the attributes and labels to .npy files for later use
    </li>
</ul>

In [23]:
# Split the frame into the target column (Y) and the remaining attribute
# columns (X).
label_column = 'label'
Y = df[label_column]
X = df.drop(columns=[label_column])

# Persist both pieces for later use, attributes first.
# NOTE(review): the labels shown by df.head() are strings, so Y is presumably
# written as an object array; loading it back would then need
# np.load(..., allow_pickle=True) -- confirm with the consumers of these files.
for target_path, payload in (
    ('non_vpn_ds.npy', X),
    ('non_vpn_top_annotations.npy', Y),
):
    np.save(target_path, payload)