In [1]:
import pandas as pd
from collections import Counter
import os

In [2]:
df = pd.read_csv('merged_Friday.csv')

In [3]:
# Divide each value in the 'Fwd Packet Length Mean' column by the corresponding value in the 'Total Fwd Packets' column
df['Fwd Packets Normalized'] = (df[' Fwd Packet Length Mean'] / df[' Total Fwd Packets']).round(3)

df['Bwd Packets Normalized'] = (df[' Bwd Packet Length Mean'] / df[' Total Backward Packets']).round(3)

In [4]:
# Define ranges for categorization
category_ranges = {
    'fwd_low': (0, 1000.00),
    'fwd_medium': (1001.00, 2000.00),
    'fwd_high': (2001.00, 3000.00)
}

# Function to categorize values based on ranges
def categorize_value(value, ranges):
    if pd.isnull(value):
        return 'fwd_low'  # Treat NaN values as 'low'
    for category, (lower, upper) in ranges.items():
        if lower <= value <= upper:
            return category
    return 'unknown'

# Apply categorization to 'Fwd Packets Normalized' column
df['Fwd Packets Category'] = df['Fwd Packets Normalized'].apply(lambda x: categorize_value(x, category_ranges))

In [5]:
# Define ranges for categorization
category_ranges = {
    'Bwd_low': (0, 1000.00),
    'Bwd_medium': (1001.00, 2000.00),
    'Bwd_high': (2001.00, 3000.00)
}

# Function to categorize values based on ranges
def categorize_value(value, ranges):
    if pd.isnull(value):
        return 'Bwd_low'  # Treat NaN values as 'low'
    for category, (lower, upper) in ranges.items():
        if lower <= value <= upper:
            return category
    return 'unknown'

# Apply categorization to 'Bwd Packets Normalized' column
df['Bwd Packets Category'] = df['Bwd Packets Normalized'].apply(lambda x: categorize_value(x, category_ranges))

In [6]:
essential_columns = [' Source IP', ' Destination IP',' Destination Port', ' Protocol', 'Fwd Packets Category', 'Bwd Packets Category', ' Flow Duration',]

In [7]:
# Drop non-essential columns
filtered_df = df[essential_columns]
print("\nFiltered dataset Summery")
# Function to convert IP addresses to first subnet
def convert_to_first_subnet(ip_address):
    octets = ip_address.split('.')
    octets[2] = '0'
    octets[3] = '0'
    return '.'.join(octets)


# Strip leading and trailing spaces from 'Source IP' and 'Destination IP' columns
filtered_df[' Source IP'] = filtered_df[' Source IP'].astype(str).str.strip()
filtered_df[' Destination IP'] = filtered_df[' Destination IP'].astype(str).str.strip()

# Apply the function to 'Source IP' and 'Destination IP' columns
filtered_df[' Source IP'] = filtered_df[' Source IP'].apply(convert_to_first_subnet)
filtered_df[' Destination IP'] = filtered_df[' Destination IP'].apply(convert_to_first_subnet)


Filtered dataset Summery


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[' Source IP'] = filtered_df[' Source IP'].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[' Destination IP'] = filtered_df[' Destination IP'].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[' Source IP'] = filtered_df

In [8]:
with_ID = filtered_df.copy()
with_ID.insert(0, 'alertID', range(len(with_ID)))
print("\nAdding a ID number for the records")
print(with_ID.head())

dataset_with_ID = with_ID.values.tolist() 


Adding a ID number for the records
   alertID   Source IP  Destination IP   Destination Port   Protocol  \
0        0  104.16.0.0     192.168.0.0              54865          6   
1        1  104.16.0.0     192.168.0.0              55054          6   
2        2  104.16.0.0     192.168.0.0              55055          6   
3        3  104.17.0.0     192.168.0.0              46236          6   
4        4  104.19.0.0     192.168.0.0              54863          6   

  Fwd Packets Category Bwd Packets Category   Flow Duration  
0              fwd_low              Bwd_low               3  
1              fwd_low              Bwd_low             109  
2              fwd_low              Bwd_low              52  
3              fwd_low              Bwd_low              34  
4              fwd_low              Bwd_low               3  


In [9]:
print("\nconverting the dataset in to a list")
dataset = [alert[1:] for alert in dataset_with_ID]


converting the dataset in to a list


In [10]:
# Convert each sublist to a tuple for hashability
data_tuples = [tuple(sublist) for sublist in dataset]

# Count occurrences of each unique record
record_counts = Counter(data_tuples)

print("\nnumber occurrences of each unique record")
print(f"{len(record_counts) }")


number occurrences of each unique record
470366


In [11]:
# Assuming 'filtered_df' is your DataFrame with categorical values

# Create a copy of the DataFrame
new_df = filtered_df.copy()

# Initialize a global mapping to keep track of numerical values
global_mapping = Counter()

# Initialize dictionaries to store both forward and reverse mappings
forward_mapping = {}
reverse_mapping = {}

# Replace categorical values with numerical values using global mapping
for column in new_df.columns:
    # Get unique values in the current column
    unique_values = new_df[column].unique()

    # Map each unique value to a numerical value based on global mapping
    forward_mapping[column] = {value: global_mapping.setdefault(value, len(global_mapping)) for value in unique_values}

    # Create a reverse mapping dictionary
    reverse_mapping[column] = {num: val for val, num in forward_mapping[column].items()}

    # Replace categorical values in the column with numerical values
    new_df[column] = new_df[column].map(forward_mapping[column])



In [12]:
# Calculate the count of unique values in the 'type' field
type_counts = df[' Label'].value_counts()

print("\nDisplay the count of unique values in Label")
print(type_counts)


Display the count of unique values in Label
 Label
BENIGN      414322
PortScan    158930
DDoS        128027
Bot           1966
Name: count, dtype: int64


In [13]:
combined_dict = {k: v for inner_dict in reverse_mapping.values() for k, v in inner_dict.items()}

item_dataset = [tuple(x) for x in new_df.to_records(index=False)]

# Add ID field to the start of each tuple
item_dataset_withID = [(i,) + record for i, record in enumerate(item_dataset)]

new_df.to_csv('data.txt', index=False, sep=' ', header=False)

In [15]:
# Run the algorithm

os.system("java -jar spmf.jar run FPMax data.txt output.txt 0.05%")

itemset_records_object = []
itemset_records_numbers = []

# Read the output file line by line
outFile = open("output.txt", 'r', encoding='utf-8')
for string in outFile:
    itemset = []
    parts = string.split('#SUP:')
    numbers = list(map(int, parts[0].split()))
    support_count = int(parts[1].strip())

    itemset_using_numbers = [numbers, support_count]
    itemset_records_numbers.append(itemset_using_numbers)

    # Translate numerical values to attribute names using reverse mapping
    attribute_names = [str(combined_dict[num]) for num in numbers]
    itemset = [attribute_names, support_count]
    # Output the result
    #print(f"Pattern: {' '.join(attribute_names)}, Support Count: {str(support_count)}")
    itemset_records_object.append(itemset)

outFile.close()

def return_unique_labels(alertID_List):
    # Filter DataFrame based on selected IDs
    selected_records = df.iloc[alertID_List]
    # Count unique values in a certain field (e.g., Field1) in the selected records
    unique_value_counts = selected_records[' Label'].value_counts()
    return(unique_value_counts)

for index,record in enumerate(itemset_records_numbers):
    itemset = record[0]
    containing_alerts = []
    for alert in item_dataset_withID:
        alert_items = set(alert[1:])  # Exclude the ID field
        if set(itemset).issubset(alert_items):
            containing_alerts.append(alert[0])  # Append the ID

    print(f"Pattern {index}: {itemset_records_object[index][0]}, \n{return_unique_labels(containing_alerts)}\n===============================================================================================================")


Pattern 0: ['192.168.0.0', 'Bwd_low', 'fwd_low', '229'], 
 Label
BENIGN    353
Name: count, dtype: int64
Pattern 1: ['192.168.0.0', 'Bwd_low', 'fwd_low', '142'], 
 Label
BENIGN      346
PortScan      8
Name: count, dtype: int64
Pattern 2: ['192.168.0.0', 'Bwd_low', 'fwd_low', '6', '136.243.0.0'], 
 Label
BENIGN    354
Name: count, dtype: int64
Pattern 3: ['192.168.0.0', 'Bwd_low', 'fwd_low', '6', '185.13.0.0'], 
 Label
BENIGN    357
Name: count, dtype: int64
Pattern 4: ['192.168.0.0', 'Bwd_low', 'fwd_low', '6', '38.69.0.0'], 
 Label
BENIGN    359
Name: count, dtype: int64
Pattern 5: ['192.168.0.0', 'Bwd_low', 'fwd_low', '6', '192.0.0.0'], 
 Label
BENIGN    361
Name: count, dtype: int64
Pattern 6: ['192.168.0.0', 'Bwd_low', 'fwd_low', '242'], 
 Label
BENIGN    362
Name: count, dtype: int64
Pattern 7: ['192.168.0.0', 'Bwd_low', 'fwd_low', '6', '54.243.0.0'], 
 Label
BENIGN    363
Name: count, dtype: int64
Pattern 8: ['192.168.0.0', 'Bwd_low', 'fwd_low', '244'], 
 Label
BENIGN    363
Name