In [1]:
import pandas as pd
import numpy as np
from scapy.all import rdpcap, raw

from sklearn.preprocessing import LabelEncoder
import torch

## Labels

In [3]:
labels = pd.read_csv('../data/tow-ids-dataset/raw/y_train.csv', header=None, names=["index", "Class", "Description"])

In [35]:
labels.shape

(1203737, 3)

In [36]:
labels.head()

Unnamed: 0,index,Class,Description
0,1,Normal,Normal
1,2,Normal,Normal
2,3,Normal,Normal
3,4,Normal,Normal
4,5,Normal,Normal


In [38]:
labels['Class'].value_counts()

Class
Normal      954912
Abnormal    248825
Name: count, dtype: int64

In [39]:
labels['Description'].value_counts()

Description
Normal    954912
C_D        85466
P_I        64635
F_I        35112
M_F        33765
C_R        29847
Name: count, dtype: int64

In [4]:
mapping = {
    'Normal': 'Normal',
    'C_D': 'CAN DoS',
    'P_I': 'PTP Sync',
    'M_F': 'Switch MAC Flooding',
    'F_I': 'Frame Injection',
    'C_R': 'CAN Replay',
}

In [5]:
labels['Description'] = labels['Description'].replace(mapping)
labels['Description'].value_counts()

Description
Normal                 954912
CAN DoS                 85466
PTP Sync                64635
Frame Injection         35112
Switch MAC Flooding     33765
CAN Replay              29847
Name: count, dtype: int64

## Values

In [5]:
raw_packets = rdpcap('../data/raw/tow-ids-dataset/Automotive_Ethernet_with_Attack_original_10_17_19_50_training.pcap', count=140319)

In [6]:
type(raw_packets)

scapy.plist.PacketList

In [7]:
labels.shape[0]

1203737

In [8]:
len(raw_packets), len(raw_packets) == labels.shape[0]

(5, False)

In [9]:
for raw_packet in raw_packets:
    converted_packet = np.frombuffer(raw(raw_packet), dtype='uint8')

    print(len(converted_packet))
    print(converted_packet)
    print(np.array([converted_packet[:52]]))
    break

434
[145 239   0   0 254   0   0 252 112   0   0   3 129   0   0   2  34 240
   0 136  22   0   1  35  69 103 137 171 205 239   0   0   0   0   0   0
   0   0   1 136  95 160  63   6 196  96 160   0   0   0  45 145 232 183
  71  64  65  16   0   0   1 224  15 174 129 128   5  33  97 137 159  65
   0   0   0   1   6   1   4   0  24   8  16 128   0   0   0   1  97 225
 130 111  51 239 135 226  86 238 222 188 102  12  24  83   9 106 141 246
  35  27 162 252  88 240   8 114 129 110 154  95  47 121 164 254 165  25
  78 195 183  85 185  84 157 215  51  12 213 107 113  24  54 217 114  57
  35  58  22 232 235 134 210 121 206  53  98 222 244 246  16 173  94  59
 239 108 110 248  57 245  48 239 246 222  14 235 151  44  81   8 127  60
 122  53  85  47  61 207   2 140 137 118 187   8 242  52  44 115 124  59
 225 123 208  63 144 228 149 144 132 180  94 205 183 188 163 186 239 137
  72 232 172 125 174  30 160  33  60  90 236  83  31 217  26  57  93 214
 183  33 123 104 165 208  49 230  45 145 232 18

In [7]:
converted_packets_list = []
for raw_packet in raw_packets[-10:-1]:
    converted_packet = np.frombuffer(raw(raw_packet), dtype='uint8')
    converted_packets_list.append(converted_packet[:52])
values = np.array(converted_packets_list, dtype='uint8')

In [8]:
values.shape

(9, 52)

In [9]:
values[0:2]

array([[220, 166,  50,  94,  72,  71, 220, 166,  50,  93, 206, 210,   8,
          0,  69,   0,   0,  37, 237, 192,  64,   0,  64,  17, 183, 177,
        192, 168,  10,   2, 192, 168,  10,   3, 237, 164, 234, 104,   0,
         17, 236, 105,   8, 254,  97,   0,   0,   0,  60,   0,   0,   0],
       [220, 166,  50,  94,  72,  71, 220, 166,  50,  93, 206, 210,   8,
          0,  69,   0,   0,  37, 237, 193,  64,   0,  64,  17, 183, 176,
        192, 168,  10,   2, 192, 168,  10,   3, 162,   8, 234,  96,   0,
         17, 214,  12,   8,   0,   0,   0,   0,   0,   0,   0,   0,   0]],
      dtype=uint8)

In [10]:
X, y = list(), list()

# Loop of the entire data set
for i in range(values.shape[0]):
    # Compute a new (sliding window) index
    start_ix = i*(1)
    end_ix = start_ix + 2 - 1 + 1
    print(start_ix, end_ix)

    # If index is larger than the size of the dataset, we stop
    if end_ix >= values.shape[0]:
        break

    # Get a sequence of data for x
    seq_X = values[start_ix:end_ix]

    # Append the list with sequences
    X.append(seq_X)

0 2
1 3
2 4
3 5
4 6
5 7
6 8
7 9


In [11]:
np.array(X).shape

(7, 2, 52)

In [26]:
def get_label_mapping():
    return {
        'Normal': 'Normal',
        'C_D': 'CAN DoS',
        'P_I': 'PTP Sync',
        'M_F': 'Switch MAC Flooding',
        'F_I': 'Frame Injection',
        'C_R': 'CAN Replay',
    }
    
def labeling_schema(sequence: pd.DataFrame) -> bool:    
    seq_y = 'Abnormal'
    # Remove Abnormal label
    labels = get_label_mapping().values()
    labels = [label for label in labels if label != 'Normal']

    indexes = sequence['Description'].value_counts().sort_values(ascending=False).reset_index()
    indexes_list = list(indexes['Description'].values)

    set_attacks = set(labels)
    set_sequence_indexes = set(indexes_list)

    intersect = any(set_atk in set_sequence_indexes for set_atk in set_attacks)

    if intersect is True:
        attacks_mask = indexes['Description'].isin(labels)
        indexes_attacks = indexes[attacks_mask]
        seq_y = indexes_attacks['Description'].values[0]

    return seq_y

In [16]:
'Abnormal' in labels[0:2]['Class'].values

False

In [27]:
X, y = list(), list()

# Loop of the entire data set
for i in range(values.shape[0]):
    # Compute a new (sliding window) index
    start_ix = i*(1) + 140319 - 10
    end_ix = start_ix + 2 - 1 + 1 + 140319 - 1
    print(start_ix, end_ix)

    # If index is larger than the size of the dataset, we stop
    # if end_ix >= values.shape[0]:
        # break

    # Get a sequence of data for x
    seq_X = values[start_ix:end_ix]

    # Get a squence of data for y
    tmp_seq_y = labels[start_ix : end_ix]

    # Labeling schema
    seq_y = labeling_schema(tmp_seq_y)

    # Append the list with sequences
    X.append(seq_X)
    y.append(seq_y)

140309 280629
140310 280630
140311 280631
140312 280632
140313 280633
140314 280634
140315 280635
140316 280636
140317 280637


## Loading features

In [6]:
X = np.load('../data/tow-ids-dataset/processed/X_train_fg_sliding-window-generator_Wsize_44_Wstride_1_nb_52.npz')
X = X.f.arr_0

In [7]:
X.shape

(1203693, 44, 52)

In [47]:
y = pd.read_csv('../data/tow-ids-dataset/processed/y_train_fg_sliding-window-generator_Wsize_44_Wstride_1_nb_52.csv')
y = y['label'].values

In [48]:
y.shape

(1203693,)

In [50]:
X.shape

(1203693, 44, 52)

In [54]:
X.reshape(X.shape[0], X.shape[1] * X.shape[2]).shape

(1203693, 2288)

In [8]:
idx = np.arange(len(X))

In [9]:
from sklearn.model_selection import train_test_split


train_val_idx, test_idx = train_test_split(idx, train_size=0.8, random_state=10, shuffle=True)

In [10]:
# TODO: Check if it is unsupervised
benign_idx = labels[labels['Description'] == 'Normal'].index

In [11]:
# Filter train_idx with benign_idx
train_idx = np.array([i for i in train_val_idx if i in benign_idx])

In [12]:
rest_idx = np.array([i for i in train_val_idx if i not in benign_idx])

In [13]:
train_idx, val_idx = train_test_split(train_idx, train_size=0.8, random_state=10, shuffle=True)

In [14]:
(len(train_idx) + len(val_idx) + len(test_idx) + len(rest_idx)) == len(idx)

True

In [15]:
for i in range(len(X)):
    count = 0
    if i in train_idx:
        count += 1
    if i in val_idx:
        count += 1
    if i in test_idx:
        count += 1
    if i in rest_idx:
        count += 1

    if count > 1:
        print(f"Index {i} is in more than one set: {count}")
        break

In [3]:
a = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])

In [8]:
# create float tensor
a_tensor = torch.tensor(a, dtype=torch.float32)

  a_tensor = torch.tensor(a, dtype=torch.float32)


In [14]:
b = a_tensor.mean(dim=1)

In [16]:
b.mean()

tensor(5.5000)

In [None]:
def load_processed(subset: int = None) -> tuple[np.ndarray, pd.DataFrame]:
        X = np.load('../data/tow-ids-dataset/processed/X_train_fg_sliding-window-generator_Wsize_44_Wstride_1_nb_52.npz')
        X = X.f.arr_0

        y = pd.read_csv('../data/tow-ids-dataset/processed/y_train_fg_sliding-window-generator_Wsize_44_Wstride_1_nb_52.csv', index_col=0)

        if subset is not None:
            indices = np.random.choice(len(X), size=int(subset*len(X)), replace=False)
            X = X[indices]
            y = y.iloc[indices].reset_index(drop=True)

        return X, y

In [23]:
X, y = load_processed(subset=0.1)

In [24]:
from sklearn.model_selection import train_test_split

def __divide_data(X, labels: pd.DataFrame):
        # Shuffle the data
        idx = np.arange(X.shape[0])

        train_val_idx, test_idx = train_test_split(idx, train_size=0.8, random_state=10, shuffle=True)

        # TODO: Check if it is unsupervised
        benign_idx = labels[labels['label'] == 'Normal'].index

        # Filter train_idx with benign_idx
        train_idx = np.array([i for i in train_val_idx if i in benign_idx])
        train_idx, val_idx = train_test_split(train_idx, train_size=0.8, random_state=10, shuffle=True)

        return train_idx, val_idx, test_idx

In [25]:
X[:1]

array([[[0.00392157, 0.5019608 , 0.7607843 , ..., 0.        ,
         0.        , 0.        ],
        [0.8627451 , 0.6509804 , 0.19607843, ..., 0.        ,
         0.        , 0.        ],
        [0.00392157, 0.5019608 , 0.7607843 , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.8627451 , 0.6509804 , 0.19607843, ..., 0.        ,
         0.        , 0.        ],
        [0.8627451 , 0.6509804 , 0.19607843, ..., 0.        ,
         0.05490196, 0.        ],
        [0.8627451 , 0.6509804 , 0.19607843, ..., 0.69803923,
         0.12941177, 0.        ]]], shape=(1, 44, 52), dtype=float32)

In [26]:
y.head()

Unnamed: 0,index,label
0,1011791,PTP Sync
1,93226,Normal
2,182234,CAN DoS
3,851856,Frame Injection
4,1110611,PTP Sync


In [30]:
train_idx, val_idx, test_idx = __divide_data(X, y)

In [31]:
train_df = pd.DataFrame([y.iloc[i]['label'] for i in train_idx], columns=['label'])
val_df = pd.DataFrame([y.iloc[i]['label'] for i in val_idx], columns=['label'])
test_df = pd.DataFrame([y.iloc[i]['label'] for i in test_idx], columns=['label'])

In [32]:
train_df.value_counts()

label 
Normal    36120
Name: count, dtype: int64

In [34]:
val_df.value_counts()

label 
Normal    9030
Name: count, dtype: int64

In [None]:
test_df.value_counts()

label              
Normal                 11262
PTP Sync                3876
CAN DoS                 3448
Switch MAC Flooding     2168
CAN Replay              2058
Frame Injection         1262
Name: count, dtype: int64