In [6]:

from scapy.utils import rdpcap
from scapy.layers.inet import IP, TCP, UDP, ICMP
# Load the Wireshark capture from the project's data directory.
# The path is assembled with pathlib instead of a backslash string literal:
# '..\data\\...' relied on the invalid escape sequence '\d' and only resolved
# on Windows. rdpcap is handed a plain string.
from pathlib import Path

packets = rdpcap(str(Path('..') / 'data' / 'final_capture.pcap'))


In [7]:
# Import necessary libraries
import pandas as pd
from scapy.all import *



# One accumulator list per output column. Each name is bound to its own fresh
# empty list; the extraction loop below appends one value per IP packet.

# Addressing and basic per-packet features
protocol_types, services, flags = [], [], []
durations, src_bytes, dst_bytes = [], [], []
src_ips, dst_ips = [], []
src_ports, dst_ports = [], []
land, wrong_fragments, urgent = [], [], []

# Payload-derived ("content") features
hot, num_failed_logins, logged_in = [], [], []
num_compromised, root_shell, su_attempted = [], [], []
num_root, num_file_creations, num_shells = [], [], []
num_access_files, num_outbound_cmds = [], []
is_host_login, is_guest_login = [], []

# Traffic features
count, srv_count = [], []
serror_rate, srv_serror_rate = [], []
rerror_rate, srv_rerror_rate = [], []
same_srv_rate, diff_srv_rate, srv_diff_host_rate = [], [], []

# Destination-host traffic features
dst_host_count, dst_host_srv_count = [], []
dst_host_same_srv_rate, dst_host_diff_srv_rate = [], []
dst_host_same_src_port_rate, dst_host_srv_diff_host_rate = [], []
dst_host_serror_rate, dst_host_srv_serror_rate = [], []
dst_host_rerror_rate, dst_host_srv_rerror_rate = [], []

# Extract features from each packet.
# Every feature list receives exactly one value per IP packet, so the lists
# stay aligned row-for-row; packets without an IP layer contribute no row.

# Features set to '1' when the given byte string occurs in the packet's Raw
# payload and to '0' otherwise (also '0' when there is no Raw layer).
# NOTE(review): these are keyword placeholders. In particular the traffic
# features (count, *_rate, dst_host_*) only test whether the feature's own
# name appears in the payload -- confirm whether real windowed statistics
# are intended here.
payload_keyword_features = [
    (hot, b'hot'),
    (num_failed_logins, b'failed login'),
    (logged_in, b'successful login'),
    (num_compromised, b'compromised'),
    (root_shell, b'root shell'),
    (su_attempted, b'su root'),
    (num_root, b'root access'),
    (num_file_creations, b'file creation'),
    (num_shells, b'shell prompt'),
    (num_access_files, b'access file'),
    (num_outbound_cmds, b'outbound command'),
    (is_host_login, b'host login'),
    (is_guest_login, b'guest login'),
    (count, b'count'),
    (srv_count, b'srv_count'),
    (serror_rate, b'serror_rate'),
    (srv_serror_rate, b'srv_serror_rate'),
    (rerror_rate, b'rerror_rate'),
    (srv_rerror_rate, b'srv_rerror_rate'),
    (same_srv_rate, b'same_srv_rate'),
    (diff_srv_rate, b'diff_srv_rate'),
    (srv_diff_host_rate, b'srv_diff_host_rate'),
    (dst_host_count, b'dst_host_count'),
    (dst_host_srv_count, b'dst_host_srv_count'),
    (dst_host_same_srv_rate, b'dst_host_same_srv_rate'),
    (dst_host_diff_srv_rate, b'dst_host_diff_srv_rate'),
    (dst_host_same_src_port_rate, b'dst_host_same_src_port_rate'),
    (dst_host_srv_diff_host_rate, b'dst_host_srv_diff_host_rate'),
    (dst_host_serror_rate, b'dst_host_serror_rate'),
    (dst_host_srv_serror_rate, b'dst_host_srv_serror_rate'),
    (dst_host_rerror_rate, b'dst_host_rerror_rate'),
    (dst_host_srv_rerror_rate, b'dst_host_srv_rerror_rate'),
]

# Timestamp of the first captured packet; 'duration' is measured from it.
# Read once instead of on every iteration (and only if the capture is non-empty).
capture_start = packets[0].time if len(packets) > 0 else None

for packet in packets:
    if IP not in packet:
        continue

    ip_layer = packet[IP]

    # The transport layer (TCP preferred, then UDP) supplies ports and sizes.
    if TCP in packet:
        transport = packet[TCP]
    elif UDP in packet:
        transport = packet[UDP]
    else:
        transport = None

    # Protocol number taken from the IP header
    protocol_types.append(ip_layer.proto)

    # Service is identified by the destination port (None when there is no port)
    services.append(transport.dport if transport is not None else None)

    # TCP flags as rendered by scapy, 'NA' for non-TCP packets
    flags.append(packet.sprintf("%TCP.flags%") if TCP in packet else 'NA')

    # Source and destination IP addresses
    src_ips.append(ip_layer.src)
    dst_ips.append(ip_layer.dst)

    # Source and destination ports. UDP ports are recorded as well so these
    # columns agree with 'service', which already used the UDP port.
    src_ports.append(transport.sport if transport is not None else 'NA')
    dst_ports.append(transport.dport if transport is not None else 'NA')

    # Duration (time since the first packet), source and destination bytes
    if transport is not None:
        durations.append(packet.time - capture_start)
        src_bytes.append(len(transport.payload))
        # NOTE(review): IP payload length minus transport payload length looks
        # like the transport header size rather than bytes sent by the
        # destination -- confirm this is the intended 'dst_bytes'.
        dst_bytes.append(len(ip_layer.payload) - len(transport.payload))
    elif ICMP in packet:
        durations.append(packet.time - capture_start)
        icmp_layer = packet[ICMP]
        # NOTE(review): this stores a descriptive string in an otherwise
        # numeric column; kept as-is because saved CSVs may rely on it.
        src_bytes.append(f'Type: {icmp_layer.type}, Code: {icmp_layer.code}')
        dst_bytes.append('NA')
    else:
        durations.append('NA')
        src_bytes.append('NA')
        dst_bytes.append('NA')

    # 'land': source and destination share both address and port. The port
    # comparison is guarded so port-less packets whose addresses happen to
    # match no longer raise when the missing layer is indexed.
    is_land = (
        transport is not None
        and ip_layer.src == ip_layer.dst
        and transport.sport == transport.dport
    )
    land.append('1' if is_land else '0')

    # 'wrong_fragment': fragment offset field of the IP header
    wrong_fragments.append(ip_layer.frag)

    # 'urgent': TCP urgent pointer, 'NA' for non-TCP packets
    urgent.append(packet[TCP].urgptr if TCP in packet else 'NA')

    # Keyword-based features: read the raw payload once, then test each keyword.
    payload = packet[Raw].load if Raw in packet else b''
    for feature_values, keyword in payload_keyword_features:
        feature_values.append('1' if keyword in payload else '0')




# Assemble the per-packet feature lists into a column-name -> values mapping
# (insertion order defines the column order) and wrap it in a DataFrame.
data = dict(
    # addressing
    src_ip=src_ips,
    src_port=src_ports,
    dst_ip=dst_ips,
    dst_port=dst_ports,
    # basic features
    duration=durations,
    protocol_type=protocol_types,
    service=services,
    flag=flags,
    src_bytes=src_bytes,
    dst_bytes=dst_bytes,
    land=land,
    wrong_fragment=wrong_fragments,
    urgent=urgent,
    # payload-derived features
    hot=hot,
    num_failed_logins=num_failed_logins,
    logged_in=logged_in,
    num_compromised=num_compromised,
    root_shell=root_shell,
    su_attempted=su_attempted,
    num_root=num_root,
    num_file_creations=num_file_creations,
    num_shells=num_shells,
    num_access_files=num_access_files,
    num_outbound_cmds=num_outbound_cmds,
    is_host_login=is_host_login,
    is_guest_login=is_guest_login,
    # traffic features
    count=count,
    srv_count=srv_count,
    serror_rate=serror_rate,
    srv_serror_rate=srv_serror_rate,
    rerror_rate=rerror_rate,
    srv_rerror_rate=srv_rerror_rate,
    same_srv_rate=same_srv_rate,
    diff_srv_rate=diff_srv_rate,
    srv_diff_host_rate=srv_diff_host_rate,
    # destination-host traffic features
    dst_host_count=dst_host_count,
    dst_host_srv_count=dst_host_srv_count,
    dst_host_same_srv_rate=dst_host_same_srv_rate,
    dst_host_diff_srv_rate=dst_host_diff_srv_rate,
    dst_host_same_src_port_rate=dst_host_same_src_port_rate,
    dst_host_srv_diff_host_rate=dst_host_srv_diff_host_rate,
    dst_host_serror_rate=dst_host_serror_rate,
    dst_host_srv_serror_rate=dst_host_srv_serror_rate,
    dst_host_rerror_rate=dst_host_rerror_rate,
    dst_host_srv_rerror_rate=dst_host_srv_rerror_rate,
)

df = pd.DataFrame(data)



In [11]:
# Distinct values of the raw 'service' column (destination ports at this stage)
df.loc[:, 'service'].unique()

array([1.3700e+02, 5.2364e+04, 8.0090e+03, 5.1596e+04, 6.5370e+03,
       4.4300e+02, 5.2312e+04, 5.2273e+04, 5.2307e+04, 5.2330e+04,
       1.5600e+04, 5.2315e+04, 5.2331e+04, 5.2316e+04, 2.7024e+04,
       5.1552e+04, 5.2180e+04, 5.2318e+04, 5.2350e+04, 5.2363e+04,
       5.2283e+04, 8.0080e+03, 5.1614e+04, 5.2280e+04, 5.2032e+04,
       5.2333e+04, 5.2319e+04, 5.2151e+04, 7.9890e+03, 5.7621e+04,
       5.2323e+04, 5.2322e+04, 5.2329e+04, 5.1566e+04, 5.3000e+01,
       5.6013e+04, 1.5130e+03, 5.2339e+04, 5.2365e+04, 5.0137e+04,
       5.2327e+04, 5.1443e+04, 5.1441e+04, 5.2301e+04, 5.2337e+04,
       6.2338e+04, 5.2366e+04, 5.2280e+03, 5.2272e+04, 5.2271e+04,
       5.2278e+04, 5.2281e+04, 5.9508e+04, 5.2335e+04, 5.2334e+04,
       5.2288e+04, 5.2277e+04, 5.2294e+04, 5.2302e+04, 5.2291e+04,
       5.2298e+04, 5.6700e+04, 5.2287e+04, 5.2293e+04, 1.9000e+03,
       6.1975e+04, 5.0986e+04, 5.2367e+04, 6.4385e+04, 6.2312e+04,
       5.2368e+04, 5.0281e+04, 5.4848e+04, 5.2357e+04, 6.1229e

In [None]:
# Save the extracted features to CSV (without the index column).
# The target path is built with pathlib instead of a backslash string literal:
# '..\data\\...' relied on the invalid escape sequence '\d' and only resolved
# on Windows.
from pathlib import Path

df.to_csv(Path('..') / 'data' / 'benchmark.csv', index=False)

In [211]:
# Start reshaping the data towards the NSL-KDD layout: work on an independent
# copy of df that omits the addressing columns (src/dst IP and port).
addressing_columns = ['src_ip', 'dst_ip', 'src_port', 'dst_port']
df2 = df.copy().drop(columns=addressing_columns)


In [212]:
# Map destination ports to predefined service names.
service_mapping = {
    8009: 'ajp',
    61225: 'other',
    6537: 'tve-file',
    53: 'dns',
    62455: 'other',
    15600: 'other',
    61229: 'other',
    27023: 'flexlm',
    61130: 'other',
    64186: 'other',
    56700: 'other',
    1900: 'ssdp',
    64188: 'other',
    64187: 'other',
    5228: 'gtalk',
    61155: 'other',
    8008: 'http-alt',
    61122: 'other',
    137: 'netbios-ns',
    138: 'netbios-dgm',
    55269: 'other',
    51838: 'other',
    7989: 'other',
    58711: 'other',
    58712: 'other',
    67: 'dhcp-server',
    68: 'dhcp-client',
    1: 'private',
    20: 'ftp_data',
    21: 'ftp',
    22: 'ssh',
    23: 'telnet',
    25: 'smtp',
    42: 'name',
    79: 'finger',
    80: 'http',
    110: 'pop_3',
    111: 'sunrpc',
    113: 'auth',
    119: 'nntp',
    123: 'ntp',
    139: 'netbios_ssn',
    143: 'imap4',
    161: 'snmp',
    194: 'irc',
    389: 'ldap',
    443: 'http_443',
    445: 'microsoft-ds',
    514: 'shell',
    636: 'ldaps',
    993: 'imaps',
    995: 'pop3s',
    1080: 'socks',
    1433: 'sql_net',
    1521: 'oracle',
    1720: 'h323q931',
    3306: 'mysql',
    3389: 'ms-wbt-server',
    5900: 'vnc',
    6660: 'irc',
    6661: 'irc',
    6662: 'irc',
    6663: 'irc',
    6664: 'irc',
    6665: 'irc',
    6666: 'irc',
    6667: 'irc',
    6668: 'irc',
    6669: 'irc',
    8001: 'http_8001',
    8080: 'http_8080',
}

# Hard-coded list of destination ports; any of them that has no entry above
# is registered as 'other'.
all_destination_ports = [
    61225, 6537, 62455, 15600, 61229, 27023, 61130, 64186, 56700, 1900,
    64188, 64187, 5228, 61155, 61222, 137, 138, 55269, 51838, 7989,
    58711, 58712, 67, 68, 53209, 64879, 50687, 61699, 55222, 51480,
    62015, 55593, 56337, 65082, 55278, 58283, 59256, 60506, 53441, 53442,
    49302, 65049, 53862, 53861, 57983, 50259, 64137, 51599, 60829, 58425,
    9999, 63234, 50421, 65341, 65342, 63803, 50608, 59121, 52469, 58868,
    49677, 60881, 57183, 63518, 60720, 50685, 53684, 63872, 60453, 64253,
    56593, 57395, 51385, 55844, 60064, 65356, 54169, 65292, 65423, 51535,
    50348, 65112, 64936, 58792, 60496, 52515, 62117, 59554, 54847, 51538,
    51086, 56863, 56723, 56724,
]

# Service name for every listed port, defaulting to 'other'
mapped_services = [service_mapping.get(port, 'other') for port in all_destination_ports]

# Add the listed ports to service_mapping
service_mapping.update(dict(zip(all_destination_ports, mapped_services)))

# Map destination ports to service names.
# * Ports that appear in neither the mapping nor the list above fall back to
#   'other' instead of becoming NaN, so their rows are not silently removed by
#   a later dropna().
# * Missing ports (packets without TCP/UDP) stay missing (na_action='ignore').
# * The values are read from the untouched df['service'] column, so re-running
#   this cell gives the same result instead of mapping already-mapped names to
#   NaN. The assignment aligns on df2's index.
df2['service'] = df['service'].map(
    lambda port: service_mapping.get(port, 'other'),
    na_action='ignore',
)


In [213]:
# Map IP protocol numbers to protocol names.
protocol_type_mapping = {
    -1: 'Unknown',
    2: 'IGMP',
    1: 'ICMP',
    6: 'TCP',
    17: 'UDP',
}

# Read the numbers from the untouched df['protocol_type'] column rather than
# from df2: mapping df2 onto itself made the cell non-repeatable, because a
# second run would look up the already-mapped names and turn every value into
# NaN. The assignment aligns on df2's index, so the first run is unchanged.
# Protocol numbers without an entry above still map to NaN.
df2['protocol_type'] = df['protocol_type'].map(protocol_type_mapping)


In [214]:
# Map scapy's TCP flag strings to readable flag names.
# Same entries as before, with the duplicated keys removed (each duplicate
# carried an identical value, so the resulting dict is unchanged).
flag_mapping = {
    'NA': 'NA',
    '': 'No Flag',
    # single flags
    'F': 'FIN',
    'S': 'SYN',
    'R': 'RST',
    'P': 'PSH',
    'A': 'ACK',
    'U': 'URG',
    'E': 'ECE',
    'C': 'CWR',
    # explicit combinations
    'FPU': 'FIN, PSH, URG',
    'FSRPAU': 'FIN, SYN, RST, PSH, ACK, URG',
    'FSPA': 'FIN, SYN, PSH, ACK',
    'FSA': 'FIN, SYN, ACK',
    'FSR': 'FIN, SYN, RST',
    'FS': 'FIN, SYN',
    'FPA': 'FIN, PSH, ACK',
    'SRA': 'SYN, RST, ACK',
    'SRPA': 'SYN, RST, PSH, ACK',
    'SR': 'SYN, RST',
    'SPA': 'SYN, PSH, ACK',
    'RPA': 'RST, PSH, ACK',
    'PA': 'PSH, ACK',
    'PU': 'PSH, URG',
    'UA': 'URG, ACK',
}


def describe_tcp_flags(flag):
    """Return the readable name for a flag string such as 'PA' or 'SA'.

    Strings listed in flag_mapping use their listed value. Any other string is
    translated letter by letter and joined with ', ' (for example 'SA' becomes
    'SYN, ACK'); letters without an entry are kept as they are. Combinations
    missing from the table therefore no longer map to NaN.
    """
    if flag in flag_mapping:
        return flag_mapping[flag]
    return ', '.join(flag_mapping.get(letter, letter) for letter in flag)


# Read the raw flag strings from the untouched df['flag'] column so that
# re-running this cell does not translate already-translated names again.
# The assignment aligns on df2's index.
df2['flag'] = df['flag'].map(describe_tcp_flags, na_action='ignore')

In [216]:
import numpy as np

# Handle missing values: turn the 'NA' placeholder and empty strings into NaN,
# then keep only the rows that have no missing value in any column.
df2 = (
    df2
    .replace('NA', np.nan)
    .replace('', np.nan)
    .dropna()
)



In [217]:
# Show df2 as this cell's output.
df2

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.000000,TCP,ajp,"PSH, ACK",110,20,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.008705,TCP,other,"PSH, ACK",110,20,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.062685,TCP,ajp,ACK,0,20,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.091396,TCP,ajp,"PSH, ACK",110,20,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.104046,TCP,other,"PSH, ACK",110,20,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,659.668482,TCP,other,"PSH, ACK",110,20,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
977,659.723065,TCP,ajp,ACK,0,20,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
978,659.723066,TCP,ajp,ACK,0,20,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
979,661.652531,TCP,http-alt,ACK,1,20,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
