In [24]:
!pip install dpkt
import dpkt      
from scapy.all import *
import pandas as pd           
import numpy as np             
import binascii                 
import seaborn as sns          
import os




In [25]:
def readPCAP(directory):    
    def list_pcap_files(folder):
        pcap_files = []
        for root, dirs, files in os.walk(folder):
            for file in files:
                if file.endswith(".pcap.gz"):
                    pcap_files.append(os.path.join(root, file))
        return pcap_files

    return list_pcap_files(directory)
    
        
    

In [26]:
listPCAP = readPCAP("../sample_data/20240301T000205.745148Z-pcap-mlab2-gru02-ndt/")

In [27]:
samplePCAP = listPCAP[0]

In [28]:
pcap = rdpcap(samplePCAP)

In [29]:
# Define lists for IP, TCP, and UDP fields
ip_fields = ['src_ip', 'dst_ip', 'len', 'proto']
tcp_fields = ['sport', 'dport', 'seq', 'ack', 'off', 'flags', 'win', 'sum', 'urp', 'opts', "ulen"]
udp_fields = ['sport', 'dport', 'ulen', 'sum']

# Function to extract IPv4 fields from data
def get_ip4_fields(data):
    src = socket.inet_ntoa(data.src)
    dst = socket.inet_ntoa(data.dst)
    plen = data.len
    proto = data.p
    return [src, dst, plen, proto]

# Function to extract IPv6 fields from data
def get_ip6_fields(data):
    src = socket.inet_ntop(socket.AF_INET6, data.src)
    dst = socket.inet_ntop(socket.AF_INET6, data.dst)
    plen = data.plen
    proto = data.p
    return [src, dst, plen, proto]

# Function to extract TCP fields
def get_tcp_fields(tcp):
    fields = []
    for field in tcp_fields:
        if field == "ulen":
            fields.append(len(tcp.data))
        else:
            fields.append(getattr(tcp, field))
    return fields

# Function to extract UDP fields
def get_udp_fields(udp):
    fields = []
    for field in udp_fields:
        fields.append(getattr(udp, field))
    return fields

# Function to convert pcap file to DataFrame
def pcap_to_df(file_path):
    # Open the pcap file
    f = open(file_path, 'rb')
    # Create a pcap reader
    pcap = dpkt.pcap.Reader(f)
    # List to store extracted fields
    fields_list = []
    # Iterate through each packet in the pcap file
    for ts, buf in pcap:
        
        fields = [ts]
        # Parse the Ethernet frame
        eth = dpkt.ethernet.Ethernet(buf)
        # Get the IP packet
        ip = eth.data
        ip_type = "ipv4"
        
        
        # Check if it is an IPv4 or IPv6 packet
        if isinstance(ip, dpkt.ip.IP):
            fields += get_ip4_fields(ip)
        elif isinstance(ip, dpkt.ip6.IP6):
            fields += get_ip6_fields(ip)
            ip_type = "ipv6"
        else:
            continue
            
        # Create dummy lists for TCP and UDP fields
        dummy_tcp_fields = [None for x in tcp_fields]
        dummy_udp_fields = [None for x in udp_fields]
        # Check the transport protocol and extract fields accordingly
        if fields[-1] == 17:  # UDP
            transport_fields = get_udp_fields(ip.data) + dummy_tcp_fields
        elif fields[-1] == 6:  # TCP
            transport_fields = dummy_udp_fields + get_tcp_fields(ip.data)
        else:
            continue
        
        # Combine IP, UDP, and TCP fields
        fields += transport_fields
        fields_list.append(fields)
    
    udp_fields_name = [f"udp_{x}" for x in udp_fields]
    tcp_fields_name = [f"tcp_{x}" for x in tcp_fields]
    field_names = ['ts'] + ip_fields + udp_fields_name + tcp_fields_name
    # Create a DataFrame from the list of fields
    df = pd.DataFrame(fields_list, columns=field_names)
    return df


In [30]:
file_path = "../sample_data/20240301T000205.745148Z-pcap-mlab2-gru02-ndt/"

In [31]:
pcap_to_df(samplePCAP)

ValueError: invalid tcpdump header

In [33]:
from scapy.all import rdpcap
import pandas as pd

def pcap_to_df(pcap_file):
    # Read pcap file
    packets = rdpcap(pcap_file)
    
    # Extract fields from each packet
    fields_list = []
    for pkt in packets:
        if pkt.haslayer('IP'):
            ip = pkt['IP']
            if pkt.haslayer('UDP'):
                udp = pkt['UDP']
                fields_list.append([pkt.time, ip.src, ip.dst, ip.len, ip.proto, udp.sport, udp.dport, len(udp.payload), udp.chksum])
            elif pkt.haslayer('TCP'):
                tcp = pkt['TCP']
                fields_list.append([pkt.time, ip.src, ip.dst, ip.len, ip.proto, tcp.sport, tcp.dport, len(tcp.payload), tcp.chksum])
    
    # Create DataFrame
    df = pd.DataFrame(fields_list, columns=['Timestamp', 'Source IP', 'Destination IP', 'Length', 'Protocol', 'Source Port', 'Destination Port', 'Payload Length', 'Checksum'])
    
    return df

# Path to the pcap file
pcap_file_path = samplePCAP

# Convert pcap to DataFrame
df = pcap_to_df(pcap_file_path)
print(df)


               Timestamp       Source IP  Destination IP  Length  Protocol  \
0   1709251267.970841000    34.68.160.32  177.136.80.216      60         6   
1   1709251267.970859000  177.136.80.216    34.68.160.32      60         6   
2   1709251268.107604000    34.68.160.32  177.136.80.216      52         6   
3   1709251268.107801000    34.68.160.32  177.136.80.216     354         6   
4   1709251268.107814000  177.136.80.216    34.68.160.32      52         6   
5   1709251268.114273000  177.136.80.216    34.68.160.32    2868         6   
6   1709251268.114287000  177.136.80.216    34.68.160.32    1779         6   
7   1709251268.250929000    34.68.160.32  177.136.80.216      52         6   
8   1709251268.251003000    34.68.160.32  177.136.80.216      52         6   
9   1709251268.251730000    34.68.160.32  177.136.80.216     116         6   
10  1709251268.251803000    34.68.160.32  177.136.80.216      76         6   
11  1709251268.251834000  177.136.80.216    34.68.160.32      52