# Load some libraries and the `read_pcap` function.

In [1]:
from tqdm import tqdm
import scapy
from scapy.utils import RawPcapReader, RawPcapNgReader
import pandas as pd
from pathlib import Path

assert scapy.__version__ == '2.4.4'  # pip install scapy==2.4.4


def read_pcap(path: str or Path) -> pd.DataFrame:
    path = Path(path)
    assert path.exists(), 'File not found.'

    signature = path.read_bytes()[:4]
    if signature == b'\x4D\x3C\xB2\xA1':
        Reader = RawPcapReader
        unit = 'ns'  # the second timestamp field represents a nanosecond.
        columns = ['sec', unit, 'wirelen', 'caplen', 'payload']
    elif signature == b'\xD4\xC3\xB2\xA1':
        Reader = RawPcapReader
        unit = 'us'  # the second timestamp field represents a microsecond.
        columns = ['sec', unit, 'wirelen', 'caplen', 'payload']
    elif signature == b'\n\r\r\n':
        Reader = RawPcapNgReader
        columns = ['linktype', 'tsresol', 'tshigh', 'tslow', 'wirelen', 'payload']
    else:
        raise ValueError(f'The {signature=} is an unknown signature.')

    with path.open('rb') as fp:
        reader = Reader(fp)
        iterator = tqdm(reader, desc=f'Parsing {path}')
        list_output = list()
        for payload, metadata in iterator:
            list_output.append(tuple(metadata) + (payload,))

    df = pd.DataFrame(list_output, columns=columns)
    if Reader == RawPcapReader:
        df['timestamp'] = pd.to_datetime(df['sec'], unit='s')
        df['timestamp'] += pd.to_timedelta(df[unit], unit=unit)  # absolute time
    elif Reader == RawPcapNgReader:
        df['timestamp'] = pd.to_datetime(((df['tshigh'].values << 32) + df['tslow']) * 1e-9, unit='s') # absolute time
    df['time'] = (df['timestamp'] - df['timestamp'].min()).dt.total_seconds()  # relative time
    return df[['time', 'timestamp', 'payload']].copy()

# the 36 packets that the adversary have injected repeatedly.

In [2]:
df_injected = read_pcap('single-MPEG-frame.pcap')
df_injected

Parsing single-MPEG-frame.pcap: 36it [00:00, 191132.84it/s]


Unnamed: 0,time,timestamp,payload
0,0.0,2019-06-11 10:51:55.538378650,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...
1,3.7e-05,2019-06-11 10:51:55.538415200,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...
2,0.00013,2019-06-11 10:51:55.538508650,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...
3,0.000246,2019-06-11 10:51:55.538624900,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...
4,0.000378,2019-06-11 10:51:55.538756950,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...
5,0.000515,2019-06-11 10:51:55.538893300,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...
6,0.001003,2019-06-11 10:51:55.539381700,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...
7,0.00104,2019-06-11 10:51:55.539418400,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...
8,0.001133,2019-06-11 10:51:55.539511450,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...
9,0.003998,2019-06-11 10:51:55.542376300,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...


# load all packet dumps

In [3]:
files = [
    'driving_01_injected.pcap',
    'driving_01_original.pcap',
    'driving_02_injected.pcap',
    'driving_02_original.pcap',
    'indoors_01_injected.pcap',
    'indoors_01_original.pcap',
    'indoors_02_injected.pcap',
    'indoors_02_original.pcap'
]

dict_df = dict()
for file in files:
    dict_df[file] = read_pcap(file)

Parsing driving_01_injected.pcap: 1007624it [00:01, 566664.13it/s]
Parsing driving_01_original.pcap: 808135it [00:00, 943437.11it/s]
Parsing driving_02_injected.pcap: 934084it [00:01, 575352.74it/s]
Parsing driving_02_original.pcap: 757688it [00:00, 954913.99it/s]
Parsing indoors_01_injected.pcap: 219780it [00:00, 585257.29it/s]
Parsing indoors_01_original.pcap: 153799it [00:00, 941767.65it/s]
Parsing indoors_02_injected.pcap: 463320it [00:00, 571997.85it/s]
Parsing indoors_02_original.pcap: 332419it [00:00, 942834.40it/s]


# label determination.

check whether each packet is one of the 36 packets. If so, the packet was injected (label `1`). If not, the packet was benign (label `0`) and the packet must be in a corresponding packet dump (*_original.pcap)

In [4]:
for file in files:
    df = dict_df[file]
    df['label'] = df['payload'].isin(df_injected['payload']).astype('int')
    print('file: {} / Label count: {}'.format(file, df['label'].value_counts().to_dict()))

file: driving_01_injected.pcap / Label count: {0: 807968, 1: 199656}
file: driving_01_original.pcap / Label count: {0: 808135}
file: driving_02_injected.pcap / Label count: {0: 757504, 1: 176580}
file: driving_02_original.pcap / Label count: {0: 757688}
file: indoors_01_injected.pcap / Label count: {0: 153792, 1: 65988}
file: indoors_01_original.pcap / Label count: {0: 153799}
file: indoors_02_injected.pcap / Label count: {0: 332414, 1: 130906}
file: indoors_02_original.pcap / Label count: {0: 332419}


print out some benign and injected packets.

In [5]:
dict_df['indoors_01_injected.pcap'].loc[25010: 25030]

Unnamed: 0,time,timestamp,payload,label
25010,120.313818,2020-03-19 03:15:50.741228032,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,0
25011,120.313845,2020-03-19 03:15:50.741254400,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,0
25012,120.313935,2020-03-19 03:15:50.741344768,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,0
25013,120.314049,2020-03-19 03:15:50.741458176,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,0
25014,120.314173,2020-03-19 03:15:50.741583104,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,0
25015,120.314298,2020-03-19 03:15:50.741707264,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,0
25016,120.314424,2020-03-19 03:15:50.741833728,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,0
25017,120.314548,2020-03-19 03:15:50.741958144,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,0
25018,120.314678,2020-03-19 03:15:50.742087168,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,0
25019,120.333835,2020-03-19 03:15:50.761244416,b'\x91\xe0\xf0\x00\xfe\x82\x00\xfcp\x00\x00\x0...,1
