In [1]:
import pandas as pd
from collections import Counter
import os

In [2]:
df = pd.read_csv('merged_Friday.csv')

In [3]:
# Separate normal and non-normal instances
normal_traffic = df[df[' Label'] == 'BENIGN']  # Assuming 'benign' corresponds to normal traffic
non_normal_traffic = df[df[' Label'] != 'BENIGN']

# Randomly select 20000 instances from normal traffic
downsampled_normal_traffic = normal_traffic.sample(n=10000, random_state=42)

# Combine downsampled normal traffic with non-normal traffic
downsampled_df = pd.concat([downsampled_normal_traffic, non_normal_traffic])

# Shuffle the combined DataFrame
downsampled_df = downsampled_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [4]:
# Divide each value in the 'Fwd Packet Length Mean' column by the corresponding value in the 'Total Fwd Packets' column
downsampled_df['Fwd Packets Normalized'] = (downsampled_df[' Fwd Packet Length Mean'] / downsampled_df[' Total Fwd Packets']).round(3)

downsampled_df['Bwd Packets Normalized'] = (downsampled_df[' Bwd Packet Length Mean'] / downsampled_df[' Total Backward Packets']).round(3)

In [5]:
# Define ranges for categorization
category_ranges = {
    'fwd_low': (0, 1000.00),
    'fwd_medium': (1001.00, 2000.00),
    'fwd_high': (2001.00, 3000.00)
}

# Function to categorize values based on ranges
def categorize_value(value, ranges):
    if pd.isnull(value):
        return 'fwd_low'  # Treat NaN values as 'low'
    for category, (lower, upper) in ranges.items():
        if lower <= value <= upper:
            return category
    return 'unknown'

# Apply categorization to 'Fwd Packets Normalized' column
downsampled_df['Fwd Packets Category'] = downsampled_df['Fwd Packets Normalized'].apply(lambda x: categorize_value(x, category_ranges))

In [6]:
# Define ranges for categorization
category_ranges = {
    'Bwd_low': (0, 1000.00),
    'Bwd_medium': (1001.00, 2000.00),
    'Bwd_high': (2001.00, 3000.00)
}

# Function to categorize values based on ranges
def categorize_value(value, ranges):
    if pd.isnull(value):
        return 'Bwd_low'  # Treat NaN values as 'low'
    for category, (lower, upper) in ranges.items():
        if lower <= value <= upper:
            return category
    return 'unknown'

# Apply categorization to 'Bwd Packets Normalized' column
downsampled_df['Bwd Packets Category'] = downsampled_df['Bwd Packets Normalized'].apply(lambda x: categorize_value(x, category_ranges))

In [7]:
essential_columns = [' Source IP', ' Destination IP',' Destination Port', ' Protocol', 'Fwd Packets Category', 'Bwd Packets Category', ' Flow Duration', ' Down/Up Ratio']

In [8]:
# Drop non-essential columns
filtered_df = downsampled_df[essential_columns]

In [9]:
with_ID = filtered_df.copy()
with_ID.insert(0, 'alertID', range(len(with_ID)))
print("\nAdding a ID number for the records")
print(with_ID.head())

dataset_with_ID = with_ID.values.tolist() 


Adding a ID number for the records
   alertID   Source IP  Destination IP   Destination Port   Protocol  \
0        0  172.16.0.1   192.168.10.50                 80          6   
1        1  172.16.0.1   192.168.10.50               6788          6   
2        2  172.16.0.1   192.168.10.50                 80          6   
3        3  172.16.0.1   192.168.10.50                800          6   
4        4  172.16.0.1   192.168.10.50               5102          6   

  Fwd Packets Category Bwd Packets Category   Flow Duration   Down/Up Ratio  
0              fwd_low              Bwd_low        72158382               0  
1              fwd_low              Bwd_low              79               1  
2              fwd_low              Bwd_low        10652759               0  
3              fwd_low              Bwd_low              97               1  
4              fwd_low              Bwd_low              31               1  


In [10]:
print("\nconverting the dataset in to a list")
dataset = [alert[1:] for alert in dataset_with_ID]


converting the dataset in to a list


In [11]:
# Convert each sublist to a tuple for hashability
data_tuples = [tuple(sublist) for sublist in dataset]

# Count occurrences of each unique record
record_counts = Counter(data_tuples)

print("\nnumber occurrences of each unique record")
print(f"{len(record_counts) }")


number occurrences of each unique record
203978


In [12]:
# Assuming 'filtered_df' is your DataFrame with categorical values

# Create a copy of the DataFrame
new_df = filtered_df.copy()

# Initialize a global mapping to keep track of numerical values
global_mapping = Counter()

# Initialize dictionaries to store both forward and reverse mappings
forward_mapping = {}
reverse_mapping = {}

# Replace categorical values with numerical values using global mapping
for column in new_df.columns:
    # Get unique values in the current column
    unique_values = new_df[column].unique()

    # Map each unique value to a numerical value based on global mapping
    forward_mapping[column] = {value: global_mapping.setdefault(value, len(global_mapping)) for value in unique_values}

    # Create a reverse mapping dictionary
    reverse_mapping[column] = {num: val for val, num in forward_mapping[column].items()}

    # Replace categorical values in the column with numerical values
    new_df[column] = new_df[column].map(forward_mapping[column])



In [13]:
# Calculate the count of unique values in the 'type' field
type_counts = downsampled_df[' Label'].value_counts()

print("\nDisplay the count of unique values in Label")
print(type_counts)


Display the count of unique values in Label
 Label
PortScan    158930
DDoS        128027
BENIGN       10000
Bot           1966
Name: count, dtype: int64


In [14]:
combined_dict = {k: v for inner_dict in reverse_mapping.values() for k, v in inner_dict.items()}

item_dataset = [tuple(x) for x in new_df.to_records(index=False)]

# Add ID field to the start of each tuple
item_dataset_withID = [(i,) + record for i, record in enumerate(item_dataset)]

new_df.to_csv('data.txt', index=False, sep=' ', header=False)

In [15]:
# Run the algorithm

os.system("java -jar spmf.jar run FPMax data.txt output.txt 0.1%")

itemset_records_object = []
itemset_records_numbers = []

# Read the output file line by line
outFile = open("output.txt", 'r', encoding='utf-8')
for string in outFile:
    itemset = []
    parts = string.split('#SUP:')
    numbers = list(map(int, parts[0].split()))
    support_count = int(parts[1].strip())

    itemset_using_numbers = [numbers, support_count]
    itemset_records_numbers.append(itemset_using_numbers)

    # Translate numerical values to attribute names using reverse mapping
    attribute_names = [str(combined_dict[num]) for num in numbers]
    itemset = [attribute_names, support_count]
    # Output the result
    #print(f"Pattern: {' '.join(attribute_names)}, Support Count: {str(support_count)}")
    itemset_records_object.append(itemset)

outFile.close()

def return_unique_labels(alertID_List):
    # Filter DataFrame based on selected IDs
    selected_records = downsampled_df.iloc[alertID_List]
    # Count unique values in a certain field (e.g., Field1) in the selected records
    unique_value_counts = selected_records[' Label'].value_counts()
    return(unique_value_counts)

for index,record in enumerate(itemset_records_numbers):
    itemset = record[0]
    containing_alerts = []
    for alert in item_dataset_withID:
        alert_items = set(alert[1:])  # Exclude the ID field
        if set(itemset).issubset(alert_items):
            containing_alerts.append(alert[0])  # Append the ID

    print(f"Pattern {index}: {itemset_records_object[index][0]}, \n{return_unique_labels(containing_alerts)}\n===============================================================================================================")


Pattern 0: ['fwd_low', 'Bwd_low', '6', '192.168.10.50', '172.16.0.1', '1', '96'], 
 Label
PortScan    309
Name: count, dtype: int64
Pattern 1: ['fwd_low', 'Bwd_low', '6', '192.168.10.50', '172.16.0.1', '1', '100'], 
 Label
PortScan    332
Name: count, dtype: int64
Pattern 2: ['fwd_low', 'Bwd_low', '6', '192.168.10.50', '172.16.0.1', '1', '95'], 
 Label
PortScan    330
BENIGN        1
Name: count, dtype: int64
Pattern 3: ['fwd_low', 'Bwd_low', '6', '192.168.10.50', '172.16.0.1', '1', '99'], 
 Label
PortScan    350
Name: count, dtype: int64
Pattern 4: ['fwd_low', 'Bwd_low', '192.168.10.25'], 
 Label
BENIGN    356
Name: count, dtype: int64
Pattern 5: ['fwd_low', '6', '192.168.10.50', '172.16.0.1', '80', 'Bwd_medium'], 
 Label
DDoS    371
Name: count, dtype: int64
Pattern 6: ['fwd_low', 'Bwd_low', '6', '192.168.10.50', '172.16.0.1', '1', '8'], 
 Label
PortScan    411
Name: count, dtype: int64
Pattern 7: ['fwd_low', 'Bwd_low', '6', '192.168.10.50', '172.16.0.1', '1', '94'], 
 Label
PortScan