<a href="https://colab.research.google.com/github/Mc4minta/AIB5-PcapAttackClassifier/blob/main/Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only: attach Google Drive to the runtime under /content/drive.
# Note: the capture files read later in this notebook live directly under
# /content/, not under the mounted drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Pinned to the version the recorded run installed so re-runs stay reproducible;
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install scapy==2.6.1

Collecting scapy
  Downloading scapy-2.6.1-py3-none-any.whl.metadata (5.6 kB)
Downloading scapy-2.6.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scapy
Successfully installed scapy-2.6.1


# TPs-ESIR-S9 Original

In [5]:
# Necessary imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from scapy.all import *
import pandas as pd # Add pandas here as it's used in display functions
import os # Import the os module

# Model and Tokenizer Loading
# One constant for the Hugging Face checkpoint so the tokenizer and the
# classifier are always loaded from the same repository.
MODEL_NAME = "rdpahalavan/bert-network-packet-flow-header-payload"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Classes
# Position i in this list is the label reported for model output index i
# (predictingRowsCategory looks labels up with classes[predicted_class]),
# so the order must not be changed.
classes = [
    'Analysis',
    'Backdoor',
    'Bot',
    'DDoS',
    'DoS',
    'DoS GoldenEye',
    'DoS Hulk',
    'DoS SlowHTTPTest',
    'DoS Slowloris',
    'Exploits',
    'FTP Patator',
    'Fuzzers',
    'Generic',
    'Heartbleed',
    'Infiltration',
    'Normal',
    'Port Scan',
    'Reconnaissance',
    'SSH Patator',
    'Shellcode',
    'Web Attack - Brute Force',
    'Web Attack - SQL Injection',
    'Web Attack - XSS',
    'Worms',
]

# Device setup
# Use a GPU when the runtime provides one and fall back to the CPU otherwise;
# the prediction function moves its input tensors to this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Initialize dictionaries and lists for packet analysis.
packets_brief = {}      # predicted non-"Normal" label -> packet count; filled by predictingRowsCategory
forward_packets = {}    # not referenced by the code visible in this notebook
backward_packets = {}   # not referenced by the code visible in this notebook
protocols = []          # name of every layer visited, in visiting order
protocol_counts = {}    # layer name -> number of times that layer has been visited

def processing_packet_conversion(packet):
    """Turn one IPv4/TCP packet into the space-separated feature string fed to the tokenizer.

    Side effects: appends the name of every layer of ``packet`` to the
    module-level ``protocols`` list and increments that layer's tally in
    ``protocol_counts``.

    Args:
        packet: A scapy packet containing both an ``IP`` and a ``TCP`` layer
            (the caller checks both before calling).

    Returns:
        str: ``"0 0 195 -1 <sport> <dport> <ip_len> <payload_len> <ttl> <tos>
        <tcp_dataofs> <tcp_flags> -1 <payload bytes as decimals>"``. The
        leading ``0 0 195 -1`` and the later ``-1`` are fixed placeholder
        fields whose meaning is not documented in this notebook.
    """
    # Walk the layer chain from the outermost layer inwards, tallying each one.
    # (This only reads the packet; nothing is copied or modified.)
    current = packet
    while current:
        layer = current[0]
        # The first sighting must count as 1: starting the tally at 0
        # under-reported every protocol by one.
        protocol_counts[layer.name] = protocol_counts.get(layer.name, 0) + 1
        protocols.append(layer.name)

        # Stop once the innermost layer has been reached.
        if not layer.payload:
            break
        current = layer.payload

    # Header fields used as features.
    ip_layer = packet[IP]
    tcp_layer = packet[TCP]
    src_port = packet.sport
    dst_port = packet.dport
    ip_length = len(ip_layer)
    ip_ttl = ip_layer.ttl
    ip_tos = ip_layer.tos
    tcp_data_offset = tcp_layer.dataofs
    tcp_flags = int(tcp_layer.flags)

    # Payload features are taken from `packet.payload`, i.e. everything below
    # the packet's outermost layer.
    # NOTE(review): for an Ethernet frame that includes the IP and TCP headers;
    # presumably this matches how the model's training data was built - confirm.
    payload_bytes = bytes(packet.payload)
    payload_length = len(payload_bytes)
    payload_decimal = ' '.join(str(byte) for byte in payload_bytes)

    fields = [
        "0", "0", "195", "-1",
        src_port, dst_port, ip_length, payload_length,
        ip_ttl, ip_tos, tcp_data_offset, tcp_flags,
        "-1", payload_decimal,
    ]
    return " ".join(str(field) for field in fields)


# Prediction function
text_data = []  # not referenced by the code visible in this notebook

def predictingRowsCategory(file_path, filter):
  """Classify every IPv4/TCP packet of a capture file and tally the non-"Normal" predictions.

  The module-level ``packets_brief`` dict is cleared and then filled with
  ``{predicted label: packet count}`` for every prediction other than
  "Normal". The prediction details of each packet are printed.

  Args:
      file_path: Path of the pcap file to read with ``PcapReader``.
      filter: Never read by this function; kept so existing calls keep working.

  Returns:
      int: Number of packets that contained both an IP and a TCP layer.
  """
  packets_brief.clear()  # Start every run from an empty tally.

  # The 1024-character cut below does not bound the number of tokens, so the
  # tokenizer is also told to truncate to what the model can accept.
  # NOTE(review): assumes the checkpoint's config exposes
  # `max_position_embeddings`; 512 is only a fallback - confirm.
  max_tokens = getattr(model.config, "max_position_embeddings", 512)

  packets_nbr = 0  # Count of IPv4/TCP packets seen.
  with PcapReader(file_path) as pcap:
    for pkt in pcap:
      # Only IPv4 packets that carry TCP are classified; the rest are skipped.
      if IP not in pkt or TCP not in pkt:
        continue

      input_line = processing_packet_conversion(pkt)  # Feature string for this packet
      if input_line is not None:

        truncated_line = input_line[:1024]  # Limit input length (characters)
        tokens = tokenizer(
            truncated_line,
            return_tensors="pt",
            truncation=True,
            max_length=max_tokens,
        )
        tokens = {key: value.to(device) for key, value in tokens.items()}  # Same device as the model

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
          outputs = model(**tokens)
        logits = outputs.logits
        probabilities = logits.softmax(dim=1)  # Class probabilities
        predicted_class = torch.argmax(probabilities, dim=1).item()  # Index of the most likely class

        predictedAttack = classes[predicted_class]  # Index -> label

        if predictedAttack != "Normal":
          # Only non-normal predictions are tallied.
          packets_brief[predictedAttack] = packets_brief.get(predictedAttack, 0) + 1

        # Print prediction details
        print("Predicted class:", predicted_class)
        print("predicted class is : ", classes[predicted_class])
        print("Class probabilities:", probabilities.tolist())

      packets_nbr += 1  # Counted whether or not a prediction was made
  return packets_nbr

In [6]:
import pandas as pd

def display_detected_attack_summary(packets_brief, total_packets):
    """
    Show a table of predicted classes with their share of all processed packets.

    One row is produced per entry of ``packets_brief``, ordered by descending
    percentage, followed by a closing "Total Packets Processed" row. The table
    is rendered with the notebook's ``display`` helper.

    Args:
        packets_brief (dict): Maps a predicted class name to its packet count.
        total_packets (int): Number of packets processed; the denominator of
            every percentage.
    """
    # Guard clauses: nothing to tabulate, or nothing to divide by.
    if not packets_brief:
        print("No packet data to display.")
        return
    if total_packets == 0:
        print("No packets were processed.")
        return

    rows = [
        {
            'Attack Type': label,
            'Packet Count': hits,
            'Percentage (%)': (hits / total_packets) * 100,
        }
        for label, hits in packets_brief.items()
    ]
    ranked = pd.DataFrame(rows).sort_values(by='Percentage (%)', ascending=False)

    # The closing row always reports the full packet count as 100 %.
    footer = pd.DataFrame([{
        'Attack Type': 'Total Packets Processed',
        'Packet Count': total_packets,
        'Percentage (%)': 100.0,
    }])
    summary = pd.concat([ranked, footer], ignore_index=True)

    print("Summary of Predicted Packet Classes:")
    display(summary)

In [7]:
# Classify the packets of the capture; the return value is the number of
# IPv4/TCP packets encountered. The second argument (`filter`) is accepted
# but never read by predictingRowsCategory.
total_packets = predictingRowsCategory("/content/ftpbrute-ubuntu.pcap", b"")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
predicted class is :  Normal
Class probabilities: [[1.0251880724551632e-12, 1.3259894050821064e-11, 9.166759795675716e-09, 1.276132053362744e-07, 9.730351990810959e-08, 1.4621384103463697e-09, 3.376834101231907e-08, 3.168377915585552e-08, 5.576389905392887e-10, 2.5807612047401562e-08, 6.463586714744451e-07, 9.728592931246283e-11, 6.121830153915653e-09, 7.723751416222058e-09, 6.279362030348068e-10, 0.999998927116394, 5.981068085247898e-09, 1.1803816257760502e-10, 1.7372070715282462e-07, 1.4523864749760307e-11, 2.4447850410780347e-09, 1.0063404970450662e-10, 1.0078705869887017e-08, 7.343653463109945e-11]]
Predicted class: 15
predicted class is :  Normal
Class probabilities: [[1.9160532535589248e-12, 1.57989558935423e-11, 9.016129176586674e-09, 3.925426597106707e-07, 1.244881531192732e-07, 3.5087404093303576e-09, 4.349526605551546e-08, 5.9058574919390594e-08, 1.3769020368314955e-09, 2.9528429479341867e-08, 1.7059229548976873

In [8]:
# Tabulate the non-"Normal" predictions that the classification run stored in
# `packets_brief`, as percentages of `total_packets`.
display_detected_attack_summary(packets_brief, total_packets)

Summary of Predicted Packet Classes:


Unnamed: 0,Attack Type,Packet Count,Percentage (%)
0,DoS GoldenEye,571,2.823239
1,DoS SlowHTTPTest,286,1.414091
2,FTP Patator,248,1.226205
3,Bot,40,0.197775
4,Web Attack - Brute Force,20,0.098888
5,Fuzzers,5,0.024722
6,Total Packets Processed,20225,100.0


In [9]:
# Number of IPv4/TCP packets counted by predictingRowsCategory.
total_packets

20225

# TPs-ESIR-S9 Modified

In [3]:
# Necessary imports
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from scapy.all import *
import pandas as pd # Add pandas here as it's used in display functions
import os # Import the os module

# Model and Tokenizer Loading
# One constant for the Hugging Face checkpoint so the tokenizer and the
# classifier are always loaded from the same repository.
MODEL_NAME = "rdpahalavan/bert-network-packet-flow-header-payload"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Classes
# Position i in this list is the label reported for model output index i
# (predictingRowsCategory looks labels up with classes[predicted_class]),
# so the order must not be changed.
classes = [
    'Analysis',
    'Backdoor',
    'Bot',
    'DDoS',
    'DoS',
    'DoS GoldenEye',
    'DoS Hulk',
    'DoS SlowHTTPTest',
    'DoS Slowloris',
    'Exploits',
    'FTP Patator',
    'Fuzzers',
    'Generic',
    'Heartbleed',
    'Infiltration',
    'Normal',
    'Port Scan',
    'Reconnaissance',
    'SSH Patator',
    'Shellcode',
    'Web Attack - Brute Force',
    'Web Attack - SQL Injection',
    'Web Attack - XSS',
    'Worms',
]

# Device setup
# Use a GPU when the runtime provides one and fall back to the CPU otherwise;
# the prediction function moves its input tensors to this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

In [20]:
# Initialize dictionaries and lists for packet analysis.
packets_brief = {}      # predicted non-"Normal" label -> packet count; filled by predictingRowsCategory
forward_packets = {}    # not referenced by the code visible in this notebook
backward_packets = {}   # not referenced by the code visible in this notebook
protocols = []          # name of every layer visited, in visiting order
protocol_counts = {}    # layer name -> number of times that layer has been visited

def processing_packet_conversion(packet):
    """Turn one IPv4/TCP packet into the space-separated feature string fed to the tokenizer.

    Side effects: appends the name of every layer of ``packet`` to the
    module-level ``protocols`` list and increments that layer's tally in
    ``protocol_counts``.

    Args:
        packet: A scapy packet containing both an ``IP`` and a ``TCP`` layer
            (the caller checks both before calling).

    Returns:
        str: ``"0 0 195 -1 <sport> <dport> <ip_len> <payload_len> <ttl> <tos>
        <tcp_dataofs> <tcp_flags> -1 <payload bytes as decimals>"``. The
        leading ``0 0 195 -1`` and the later ``-1`` are fixed placeholder
        fields whose meaning is not documented in this notebook.
    """
    # Walk the layer chain from the outermost layer inwards, tallying each one.
    # (This only reads the packet; nothing is copied or modified.)
    current = packet
    while current:
        layer = current[0]
        # The first sighting must count as 1: starting the tally at 0
        # under-reported every protocol by one.
        protocol_counts[layer.name] = protocol_counts.get(layer.name, 0) + 1
        protocols.append(layer.name)

        # Stop once the innermost layer has been reached.
        if not layer.payload:
            break
        current = layer.payload

    # Header fields used as features.
    ip_layer = packet[IP]
    tcp_layer = packet[TCP]
    src_port = packet.sport
    dst_port = packet.dport
    ip_length = len(ip_layer)
    ip_ttl = ip_layer.ttl
    ip_tos = ip_layer.tos
    tcp_data_offset = tcp_layer.dataofs
    tcp_flags = int(tcp_layer.flags)

    # Payload features are taken from `packet.payload`, i.e. everything below
    # the packet's outermost layer.
    # NOTE(review): for an Ethernet frame that includes the IP and TCP headers;
    # presumably this matches how the model's training data was built - confirm.
    payload_bytes = bytes(packet.payload)
    payload_length = len(payload_bytes)
    payload_decimal = ' '.join(str(byte) for byte in payload_bytes)

    fields = [
        "0", "0", "195", "-1",
        src_port, dst_port, ip_length, payload_length,
        ip_ttl, ip_tos, tcp_data_offset, tcp_flags,
        "-1", payload_decimal,
    ]
    return " ".join(str(field) for field in fields)


# Prediction function
text_data = []  # not referenced by the code visible in this notebook

def predictingRowsCategory(file_path, filter):
  """Count the packets of a capture, then classify each IPv4/TCP packet and tally the non-"Normal" predictions.

  The file is read twice: a first pass counts and prints the total number of
  packets, a second pass runs the model. The module-level ``packets_brief``
  dict is cleared and then filled with ``{predicted label: packet count}``
  for every prediction other than "Normal". The predicted label of each
  packet is printed.

  Args:
      file_path: Path of the pcap file to read with ``PcapReader``.
      filter: Never read by this function; kept so existing calls keep working.

  Returns:
      int: Number of packets that contained both an IP and a TCP layer.
  """
  packets_brief.clear()  # Start every run from an empty tally.

  # Counting pass. The reader is opened in a `with` block so the file handle
  # is released; it was previously opened here and never closed.
  print("Loading pcap file...")
  with PcapReader(file_path) as counting_reader:
    print("Pcap file loaded.")

    print("Couting packets file")
    total_pcap = sum(1 for _ in counting_reader)
  print("total_pcap (counted): ", total_pcap)

  # The 1024-character cut below does not bound the number of tokens, so the
  # tokenizer is also told to truncate to what the model can accept.
  # NOTE(review): assumes the checkpoint's config exposes
  # `max_position_embeddings`; 512 is only a fallback - confirm.
  max_tokens = getattr(model.config, "max_position_embeddings", 512)

  packets_nbr = 0  # Count of IPv4/TCP packets seen.
  with PcapReader(file_path) as pcap:
    for pkt in pcap:
      # Only IPv4 packets that carry TCP are classified; the rest are skipped.
      if IP not in pkt or TCP not in pkt:
        continue

      input_line = processing_packet_conversion(pkt)  # Feature string for this packet
      if input_line is not None:

        truncated_line = input_line[:1024]  # Limit input length (characters)
        tokens = tokenizer(
            truncated_line,
            return_tensors="pt",
            truncation=True,
            max_length=max_tokens,
        )
        tokens = {key: value.to(device) for key, value in tokens.items()}  # Same device as the model

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
          outputs = model(**tokens)
        logits = outputs.logits
        probabilities = logits.softmax(dim=1)  # Class probabilities
        predicted_class = torch.argmax(probabilities, dim=1).item()  # Index of the most likely class

        predictedAttack = classes[predicted_class]  # Index -> label

        if predictedAttack != "Normal":
          # Only non-normal predictions are tallied.
          packets_brief[predictedAttack] = packets_brief.get(predictedAttack, 0) + 1

        # Print prediction details
        print("predicted class is : ", classes[predicted_class])

      packets_nbr += 1  # Counted whether or not a prediction was made
  return packets_nbr

In [21]:
# Run the modified classifier on ftpbrute-kali.pcap; `result` receives the
# number of IPv4/TCP packets processed. The second argument (`filter`) is
# never read by predictingRowsCategory.
# NOTE(review): the recorded run below ended in a KeyboardInterrupt, so
# `result` was not assigned in that session.
result = predictingRowsCategory("/content/ftpbrute-kali.pcap", b"")

Loading pcap file...
Pcap file loaded.
Couting packets file
total_pcap (counted):  13910
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal
predicted class is :  Normal


KeyboardInterrupt: 