In [1]:
from scapy.utils import RawPcapReader
import pandas as pd
import struct
import ipaddress
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
from matplotlib import pyplot as plt
from sklearn.tree import export_text
import pickle

endian = "little"
endian_prefix = "<" if endian == "little" else ">"

In [2]:
def read_file(fname):
    return [pkt[14:54] for (pkt, _meta) in RawPcapReader(fname)]


def parse_ipv4_header(ipv4_header):
    version_ihl, tos, ip_total_length = struct.unpack(
        f"{endian_prefix}BBH", ipv4_header[:4]
    )
    ip_version = version_ihl >> 4
    ip_ihl = version_ihl & 0xF
    ip_preference = tos >> 5  # 上位3ビット
    ip_dscp = tos >> 2  # 上位6ビット
    ip_ecn = tos & 0x3  # 最後の2bitはECN(Explicit Congestion Notification)

    ip_identification, flags_fragoffset = struct.unpack(
        f"{endian_prefix}HH", ipv4_header[4:8]
    )
    ip_flags = flags_fragoffset >> 13  # 上位3ビット
    ip_df_bit = (ip_flags & 0x2) >> 1
    ip_mf_bit = ip_flags & 0x1
    ip_frag_offset = flags_fragoffset & 0x1FFF

    ip_ttl, ip_protocol, ip_header_checksum = struct.unpack(
        f"{endian_prefix}BBH", ipv4_header[8:12]
    )
    # src_ip = struct.unpack('!4B', ipv4_header[12:16])
    # dst_ip = struct.unpack('!4B', ipv4_header[16:20])

    return (
        ip_version,
        ip_total_length,
        ip_ihl,
        ip_preference,
        ip_dscp,
        ip_ecn,
        ip_identification,
        ip_flags,
        ip_df_bit,
        ip_mf_bit,
        ip_frag_offset,
        ip_ttl,
        ip_protocol,
        ip_header_checksum,
    )


def parse_tcp_header(tcp_header):
    tcp_source_port, tcp_dest_port = struct.unpack(f"{endian_prefix}HH", tcp_header[:4])
    tcp_sequence_num, tcp_ack_num = struct.unpack(
        f"{endian_prefix}LL", tcp_header[4:12]
    )
    data_offset_reserved_flags, tcp_window_size = struct.unpack(
        "!HH", tcp_header[12:16]
    )
    tcp_checksum, tcp_urgent_pointer = struct.unpack(
        f"{endian_prefix}HH", tcp_header[16:20]
    )

    tcp_data_offset = data_offset_reserved_flags >> 12
    tcp_flags = data_offset_reserved_flags & 0x1FF
    tcp_ns_flag = (tcp_flags >> 8) & 1
    tcp_cwr_flag = (tcp_flags >> 7) & 1
    tcp_ece_flag = (tcp_flags >> 6) & 1
    tcp_urg_flag = (tcp_flags >> 5) & 1
    tcp_ack_flag = (tcp_flags >> 4) & 1
    tcp_psh_flag = (tcp_flags >> 3) & 1
    tcp_rst_flag = (tcp_flags >> 2) & 1
    tcp_syn_flag = (tcp_flags >> 1) & 1
    tcp_fin_flag = tcp_flags & 1

    return (
        tcp_source_port,
        tcp_dest_port,
        tcp_sequence_num,
        tcp_ack_num,
        tcp_data_offset,
        tcp_ns_flag,
        tcp_cwr_flag,
        tcp_ece_flag,
        tcp_urg_flag,
        tcp_ack_flag,
        tcp_psh_flag,
        tcp_rst_flag,
        tcp_syn_flag,
        tcp_fin_flag,
        tcp_flags,
        tcp_window_size,
        tcp_urgent_pointer,
    )

In [3]:
columns = [
    "y",
    "ip_version",
    "ip_total_length",
    "ip_ihl",
    "ip_preference",
    "ip_dscp",
    "ip_ecn",
    "ip_identification",
    "ip_flags",
    "ip_df_bit",
    "ip_mf_bit",
    "ip_frag_offset",
    "ip_ttl",
    "ip_protocol",
    "ip_header_checksum",
    "tcp_source_port",
    "tcp_dest_port",
    "tcp_sequence_num",
    "tcp_ack_num",
    "tcp_data_offset",
    "tcp_ns_flag",
    "tcp_cwr_flag",
    "tcp_ece_flag",
    "tcp_urg_flag",
    "tcp_ack_flag",
    "tcp_psh_flag",
    "tcp_rst_flag",
    "tcp_syn_flag",
    "tcp_fin_flag",
    "tcp_flags",
    "tcp_window_size",
    "tcp_urgent_pointer",
]

features = [
    "ip_ihl",
    "ip_version",
    "ip_preference",
    "ip_dscp",
    "ip_total_length",
    "ip_frag_offset",
    "ip_ttl",
    "ip_protocol",
    "tcp_window_size",
    "tcp_cwr_flag",
    "tcp_ece_flag",
    "tcp_urg_flag",
    "tcp_ack_flag",
    "tcp_psh_flag",
    "tcp_rst_flag",
    "tcp_syn_flag",
    "tcp_fin_flag",
]
data = []

pkts = read_file("nmap.pcap")
for pkt in pkts:
    ipv4_header = pkt[:20]
    tcp_header = pkt[20:]
    data.append(
        [1] + list(parse_ipv4_header(ipv4_header)) + list(parse_tcp_header(tcp_header))
    )
pkts = read_file("scp.pcap")
for pkt in pkts:
    ipv4_header = pkt[:20]
    tcp_header = pkt[20:]
    data.append(
        [0] + list(parse_ipv4_header(ipv4_header)) + list(parse_tcp_header(tcp_header))
    )

In [4]:
df = pd.DataFrame(data, columns=columns)
df = df.drop(["ip_identification", "tcp_urgent_pointer", "ip_header_checksum"], axis=1)

In [5]:
X, y = df[features], df["y"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [6]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

0.9960782238826228

In [7]:
with open("tree.pkl", "wb") as f:
    pickle.dump((clf, X.columns.tolist()), f)

In [8]:
print(export_text(clf, feature_names=X.columns.tolist()))

|--- ip_dscp <= 1.00
|   |--- tcp_psh_flag <= 0.50
|   |   |--- tcp_window_size <= 114.00
|   |   |   |--- class: 1
|   |   |--- tcp_window_size >  114.00
|   |   |   |--- tcp_window_size <= 46600.00
|   |   |   |   |--- tcp_fin_flag <= 0.50
|   |   |   |   |   |--- tcp_window_size <= 500.50
|   |   |   |   |   |   |--- tcp_window_size <= 246.50
|   |   |   |   |   |   |   |--- tcp_window_size <= 231.00
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |--- tcp_window_size >  231.00
|   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- tcp_window_size >  246.50
|   |   |   |   |   |   |   |--- tcp_window_size <= 372.00
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- tcp_window_size >  372.00
|   |   |   |   |   |   |   |   |--- tcp_window_size <= 499.50
|   |   |   |   |   |   |   |   |   |--- tcp_window_size <= 498.50
|   |   |   |   |   |   |   |   |   |   |--- tcp_window_size <= 494.50
|   |   |   |   |   | 

In [9]:
endl = "\n"
par_left = "{"
par_right = "}"


def dump_tree(clf, feature_names, node_idx=0, indent_cnt=2, indent_char=" "):
    code = ""
    if clf.tree_.threshold[node_idx] != -2:
        code += f"{indent_cnt*indent_char}if ({feature_names[clf.tree_.feature[node_idx]]} <= {int(clf.tree_.threshold[node_idx])}) {par_left+endl}"
        indent_cnt += 1

        if clf.tree_.children_left[node_idx] != -1:
            code += dump_tree(
                clf,
                feature_names,
                clf.tree_.children_left[node_idx],
                indent_cnt,
                indent_char,
            )
        indent_cnt -= 1
        code += f"{indent_cnt*indent_char+par_right+endl+indent_cnt*indent_char}else {par_left+endl}"

        indent_cnt += 1
        if clf.tree_.children_right[node_idx] != -1:
            code += dump_tree(
                clf,
                feature_names,
                clf.tree_.children_right[node_idx],
                indent_cnt,
                indent_char,
            )
        indent_cnt -= 1
        code += indent_cnt * indent_char + par_right + endl

    else:
        code += f"{indent_cnt*indent_char}return {clf.tree_.value[node_idx].argmax()};{endl}"

    return code


include_def = "#include <stdint.h>\n"
func_def = (
    "inline int filter_func(unsigned int ip_ihl, unsigned int ip_version,\n"
    "                        int ip_preference, int ip_dscp, uint16_t ip_total_length,\n"
    "                        uint16_t ip_frag_offset, uint8_t ip_ttl, uint8_t ip_protocol,\n"
    "                        uint16_t tcp_source_port, uint16_t tcp_dest_port,\n"
    "                        unsigned int tcp_sequence_num, unsigned int tcp_ack_num,\n"
    "                        uint16_t tcp_windows_size, uint16_t tcp_urgent_pointer, uint16_t tcp_cwr_flag,\n"
    "                        uint16_t tcp_ece_flag, uint16_t tcp_urg_flag, uint16_t tcp_ack_flag,\n"
    "                        uint16_t tcp_psh_flag, uint16_t tcp_rst_flag) {\n"
)
dumped_tree = dump_tree(clf, X.columns.tolist(), indent_char=" ")

In [10]:
print(include_def + "\n" + func_def + dumped_tree + "}")

#include <stdint.h>

inline int filter_func(unsigned int ip_ihl, unsigned int ip_version,
                        int ip_preference, int ip_dscp, uint16_t ip_total_length,
                        uint16_t ip_frag_offset, uint8_t ip_ttl, uint8_t ip_protocol,
                        uint16_t tcp_source_port, uint16_t tcp_dest_port,
                        unsigned int tcp_sequence_num, unsigned int tcp_ack_num,
                        uint16_t tcp_windows_size, uint16_t tcp_urgent_pointer, uint16_t tcp_cwr_flag,
                        uint16_t tcp_ece_flag, uint16_t tcp_urg_flag, uint16_t tcp_ack_flag,
                        uint16_t tcp_psh_flag, uint16_t tcp_rst_flag) {
  if (ip_dscp <= 1) {
   if (tcp_psh_flag <= 0) {
    if (tcp_window_size <= 114) {
     return 1;
    }
    else {
     if (tcp_window_size <= 46600) {
      if (tcp_fin_flag <= 0) {
       if (tcp_window_size <= 500) {
        if (tcp_window_size <= 246) {
         if (tcp_window_size <= 231) {
          return 1;
 

In [11]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

0.9974299576030423


In [12]:
with open("lr.pkl", "wb") as f:
    pickle.dump((clf, X.columns.tolist()), f)

In [13]:
def dump_logisticregression(clf, feature_names, threshold=0, indent_char=" "):
    code = indent_char
    code += f" return ({clf.intercept_[0]}"
    for c, n in zip(clf.coef_[0], feature_names):
        code += f" + ({c} * (float){n})"
    code += f") > {threshold};\n"
    return code


include_def = "#include <stdint.h>\n"
func_def = (
    "inline int filter_func(unsigned int ip_ihl, unsigned int ip_version,\n"
    "                        int ip_preference, int ip_dscp, uint16_t ip_total_length,\n"
    "                        uint16_t ip_frag_offset, uint8_t ip_ttl, uint8_t ip_protocol,\n"
    "                        uint16_t tcp_source_port, uint16_t tcp_dest_port,\n"
    "                        unsigned int tcp_sequence_num, unsigned int tcp_ack_num,\n"
    "                        uint16_t tcp_windows_size, uint16_t tcp_urgent_pointer, uint16_t tcp_cwr_flag,\n"
    "                        uint16_t tcp_ece_flag, uint16_t tcp_urg_flag, uint16_t tcp_ack_flag,\n"
    "                        uint16_t tcp_psh_flag, uint16_t tcp_rst_flag) {\n"
)
dumped_lm = dump_logisticregression(clf, X.columns.tolist(), indent_char=" ")

In [14]:
print(include_def + "\n" + func_def + dumped_lm + "}")

#include <stdint.h>

inline int filter_func(unsigned int ip_ihl, unsigned int ip_version,
                        int ip_preference, int ip_dscp, uint16_t ip_total_length,
                        uint16_t ip_frag_offset, uint8_t ip_ttl, uint8_t ip_protocol,
                        uint16_t tcp_source_port, uint16_t tcp_dest_port,
                        unsigned int tcp_sequence_num, unsigned int tcp_ack_num,
                        uint16_t tcp_windows_size, uint16_t tcp_urgent_pointer, uint16_t tcp_cwr_flag,
                        uint16_t tcp_ece_flag, uint16_t tcp_urg_flag, uint16_t tcp_ack_flag,
                        uint16_t tcp_psh_flag, uint16_t tcp_rst_flag) {
  return (0.0023098538848292474 + (0.01154640999695451 * (float)ip_ihl) + (0.009237127993530991 * (float)ip_version) + (0.02387782900311908 * (float)ip_preference) + (-2.708327486274549 * (float)ip_dscp) + (-6.738141921301666e-05 * (float)ip_total_length) + (-0.1069027947436406 * (float)ip_frag_offset) + (0.1477940478

In [41]:
clf = MLPClassifier(hidden_layer_sizes=(10, 10), random_state=1, max_iter=50)
clf.fit(X_train, y_train)
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

0.9969013183774594


In [42]:
with open("mlp.pkl", "wb") as f:
    pickle.dump((clf, X.columns.tolist()), f)

In [43]:
def dump_mlp(clf, feature_names, threshold=0, indent_char=" "):
    code = ""
    for c, n in enumerate(feature_names):
        code += f"{indent_char}float h_0_{c} = (float){n};\n"

    len_layers = len(clf.coefs_)

    for layer_id in range(len_layers):
        code += "\n"
        for j in range(clf.coefs_[layer_id].shape[1]):
            code += f"{indent_char}float h_{layer_id + 1}_{j} = {clf.intercepts_[layer_id][j]}"
            for c in range(len(clf.coefs_[layer_id][:, j])):
                code += f" + ({clf.coefs_[layer_id][c, j]} * h_{layer_id}_{c})"
            code += ";\n"
            if layer_id < len_layers - 1:
                if clf.activation == "relu":
                    code += f"{indent_char}h_{layer_id + 1}_{j} = max(0, h_{layer_id}_{j});\n"
            else:
                code += f"{indent_char}return h_{layer_id + 1}_{j} > {threshold};\n"
    return code


include_def = "#include <stdint.h>\n"
func_def = (
    "inline int filter_func(unsigned int ip_ihl, unsigned int ip_version,\n"
    "                        int ip_preference, int ip_dscp, uint16_t ip_total_length,\n"
    "                        uint16_t ip_frag_offset, uint8_t ip_ttl, uint8_t ip_protocol,\n"
    "                        uint16_t tcp_source_port, uint16_t tcp_dest_port,\n"
    "                        unsigned int tcp_sequence_num, unsigned int tcp_ack_num,\n"
    "                        uint16_t tcp_windows_size, uint16_t tcp_urgent_pointer, uint16_t tcp_cwr_flag,\n"
    "                        uint16_t tcp_ece_flag, uint16_t tcp_urg_flag, uint16_t tcp_ack_flag,\n"
    "                        uint16_t tcp_psh_flag, uint16_t tcp_rst_flag) {\n"
)
dumped_mlp = dump_mlp(clf, X.columns.tolist(), indent_char=" ")

In [44]:
print(include_def + "\n" + func_def + dumped_mlp + "}")

#include <stdint.h>

inline int filter_func(unsigned int ip_ihl, unsigned int ip_version,
                        int ip_preference, int ip_dscp, uint16_t ip_total_length,
                        uint16_t ip_frag_offset, uint8_t ip_ttl, uint8_t ip_protocol,
                        uint16_t tcp_source_port, uint16_t tcp_dest_port,
                        unsigned int tcp_sequence_num, unsigned int tcp_ack_num,
                        uint16_t tcp_windows_size, uint16_t tcp_urgent_pointer, uint16_t tcp_cwr_flag,
                        uint16_t tcp_ece_flag, uint16_t tcp_urg_flag, uint16_t tcp_ack_flag,
                        uint16_t tcp_psh_flag, uint16_t tcp_rst_flag) {
 float h_0_0 = (float)ip_ihl;
 float h_0_1 = (float)ip_version;
 float h_0_2 = (float)ip_preference;
 float h_0_3 = (float)ip_dscp;
 float h_0_4 = (float)ip_total_length;
 float h_0_5 = (float)ip_frag_offset;
 float h_0_6 = (float)ip_ttl;
 float h_0_7 = (float)ip_protocol;
 float h_0_8 = (float)tcp_window_size;
 float