In [1]:
import pandas as pd
from collections import Counter
import os

In [2]:
df = pd.read_csv('merged_Friday.csv')

In [3]:
# Divide each value in the 'Fwd Packet Length Mean' column by the corresponding value in the 'Total Fwd Packets' column
df['Fwd Packets Normalized'] = (df[' Fwd Packet Length Mean'] / df[' Total Fwd Packets']).round(3)

df['Bwd Packets Normalized'] = (df[' Bwd Packet Length Mean'] / df[' Total Backward Packets']).round(3)

In [4]:
import numpy as np
# Define ranges for categorization
category_ranges = {
    'fwd_low': (0, 1000.00),
    'fwd_medium': (1001.00, 2000.00),
    'fwd_high': (2001.00, 3000.00),
    'Bwd_very_high': (3001.00, np.inf)
}

# Function to categorize values based on ranges
def categorize_value(value, ranges):
    if pd.isnull(value):
        return 'fwd_low'  # Treat NaN values as 'low'
    for category, (lower, upper) in ranges.items():
        if lower <= value <= upper:
            return category
    return 'unknown'

# Apply categorization to 'Fwd Packets Normalized' column
df['Fwd Packets Category'] = df['Fwd Packets Normalized'].apply(lambda x: categorize_value(x, category_ranges))

In [5]:
import numpy as np
# Define ranges for categorization
category_ranges = {
    'Bwd_low': (0, 1000.00),
    'Bwd_medium': (1001.00, 2000.00),
    'Bwd_high': (2001.00, 3000.00),
    'Bwd_very_high': (3001.00, np.inf)
}

# Function to categorize values based on ranges
def categorize_value(value, ranges):
    if pd.isnull(value):
        return 'Bwd_low'  # Treat NaN values as 'low'
    for category, (lower, upper) in ranges.items():
        if lower <= value <= upper:
            return category
    return 'unknown'

# Apply categorization to 'Bwd Packets Normalized' column
df['Bwd Packets Category'] = df['Bwd Packets Normalized'].apply(lambda x: categorize_value(x, category_ranges))

In [6]:
import numpy as np
# Define ranges for categorization
ranges = {
    '0_750': (0.0, 750.0),
    '751_1500': (750.0, 1500.0),
    '1501_15000': (1500.0, 15000.0),
    '15001_120000': (15000.0, 120000.0),
    '120001_300000': (120000.0, 300000.0),
    '300001_6000000': (300000.0, 6000000.0),
    '6000001_65999946': (6000000.0, 65999946.0),
    '66000000_119999902': (65999946.0, 119999902.0),
    '119999902_infinity': (119999902.0, np.inf)
}

# Function to categorize values based on ranges
def categorize_value(value, ranges):
    if pd.isnull(value):
        return 'unknown'  # Treat NaN values as 'unknown'
    for category, (lower, upper) in ranges.items():
        if lower <= value <= upper:
            return category
    return 'unknown'

# Apply categorization to 'Flow Duration' column
df['Flow Duration Category'] = df[' Flow Duration'].apply(lambda x: categorize_value(x, ranges))



In [7]:
essential_columns = [' Source IP', ' Destination IP',' Destination Port', ' Protocol', 'Fwd Packets Category', 'Bwd Packets Category', 'Flow Duration Category',' SYN Flag Count', ' RST Flag Count',' ACK Flag Count']

In [8]:
# Drop non-essential columns
filtered_df = df[essential_columns]
print("\nFiltered dataset Summery")
# Function to convert IP addresses to first subnet
def convert_to_first_subnet(ip_address):
    octets = ip_address.split('.')
    octets[2] = '0'
    octets[3] = '0'
    return '.'.join(octets)


# Strip leading and trailing spaces from 'Source IP' and 'Destination IP' columns
filtered_df[' Source IP'] = filtered_df[' Source IP'].astype(str).str.strip()
filtered_df[' Destination IP'] = filtered_df[' Destination IP'].astype(str).str.strip()

# Apply the function to 'Source IP' and 'Destination IP' columns
filtered_df[' Source IP'] = filtered_df[' Source IP'].apply(convert_to_first_subnet)
filtered_df[' Destination IP'] = filtered_df[' Destination IP'].apply(convert_to_first_subnet)


Filtered dataset Summery


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[' Source IP'] = filtered_df[' Source IP'].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[' Destination IP'] = filtered_df[' Destination IP'].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[' Source IP'] = filtered_df

In [9]:
with_ID = filtered_df.copy()
with_ID.insert(0, 'alertID', range(len(with_ID)))
print("\nAdding a ID number for the records")
print(with_ID.head())

dataset_with_ID = with_ID.values.tolist() 


Adding a ID number for the records
   alertID   Source IP  Destination IP   Destination Port   Protocol  \
0        0  104.16.0.0     192.168.0.0              54865          6   
1        1  104.16.0.0     192.168.0.0              55054          6   
2        2  104.16.0.0     192.168.0.0              55055          6   
3        3  104.17.0.0     192.168.0.0              46236          6   
4        4  104.19.0.0     192.168.0.0              54863          6   

  Fwd Packets Category Bwd Packets Category Flow Duration Category  \
0              fwd_low              Bwd_low                  0_750   
1              fwd_low              Bwd_low                  0_750   
2              fwd_low              Bwd_low                  0_750   
3              fwd_low              Bwd_low                  0_750   
4              fwd_low              Bwd_low                  0_750   

    SYN Flag Count   RST Flag Count   ACK Flag Count  
0                0                0                1  


In [10]:
print("\nconverting the dataset in to a list")
dataset = [alert[1:] for alert in dataset_with_ID]


converting the dataset in to a list


In [11]:
# Convert each sublist to a tuple for hashability
data_tuples = [tuple(sublist) for sublist in dataset]

# Count occurrences of each unique record
record_counts = Counter(data_tuples)

print("\nnumber occurrences of each unique record")
print(f"{len(record_counts) }")


number occurrences of each unique record
98886


In [12]:
import pandas as pd

# Assuming 'filtered_df' is your DataFrame with categorical values

# Initialize dictionaries to store both forward and reverse mappings
forward_mapping = {}
reverse_mapping = {}

# Initialize the global counter to keep track of numerical values
global_counter = 0

# Iterate over each column in the DataFrame
for column in filtered_df.columns:
    # Initialize a local counter for each column
    local_counter = 0
    
    # Initialize dictionaries for forward and reverse mappings for the current column
    forward_mapping[column] = {}
    reverse_mapping[column] = {}
    
    # Iterate over each unique value in the current column
    for value in filtered_df[column].unique():
        # Map each unique value to a numerical value based on the global counter
        forward_mapping[column][value] = global_counter + local_counter
        reverse_mapping[column][global_counter + local_counter] = value
        
        # Increment the local counter
        local_counter += 1
        
    # Update the global counter to continue numbering from the last value of the previous dictionary
    global_counter += local_counter

# Create a new DataFrame with mapped values
new_df = pd.DataFrame()

# Iterate over each column in the original DataFrame and fill in values in the new DataFrame
for column in filtered_df.columns:
    new_df[column] = filtered_df[column].map(forward_mapping[column])

# Display the DataFrame with numerical values
print(new_df)

# Display forward and reverse mappings for each column
for column in new_df.columns:
    print(f"For {column}:")
    print("Forward Mapping:", forward_mapping[column])
    print("Reverse Mapping:", reverse_mapping[column])
    print()


         Source IP   Destination IP   Destination Port   Protocol  \
0                0             1248               2686      40899   
1                0             1248               2687      40899   
2                0             1248               2688      40899   
3                1             1248               2689      40899   
4                2             1248               2690      40899   
...            ...              ...                ...        ...   
703240          18             1248               2750      40900   
703241          18             1248               2750      40900   
703242          18             1248               2750      40900   
703243          18             1458               2760      40900   
703244          18             1248               2750      40900   

        Fwd Packets Category  Bwd Packets Category  Flow Duration Category  \
0                      40902                 40905                   40908   
1              

In [13]:
# Calculate the count of unique values in the 'type' field
type_counts = df[' Label'].value_counts()

print("\nDisplay the count of unique values in Label")
print(type_counts)


Display the count of unique values in Label
 Label
BENIGN      414322
PortScan    158930
DDoS        128027
Bot           1966
Name: count, dtype: int64


In [14]:
combined_dict = {k: v for inner_dict in reverse_mapping.values() for k, v in inner_dict.items()}

item_dataset = [tuple(x) for x in new_df.to_records(index=False)]

# Add ID field to the start of each tuple
item_dataset_withID = [(i,) + record for i, record in enumerate(item_dataset)]

new_df.to_csv('data.txt', index=False, sep=' ', header=False)

In [19]:
# Run the algorithm

os.system("java -jar spmf.jar run FPMax data.txt output.txt 0.1%")

itemset_records_object = []
itemset_records_numbers = []

# Read the output file line by line
outFile = open("output.txt", 'r', encoding='utf-8')
for string in outFile:
    itemset = []
    parts = string.split('#SUP:')
    numbers = list(map(int, parts[0].split()))
    support_count = int(parts[1].strip())

    itemset_using_numbers = [numbers, support_count]
    itemset_records_numbers.append(itemset_using_numbers)

    # Translate numerical values to attribute names using reverse mapping
    attribute_names = [str(combined_dict[num]) for num in numbers]
    itemset = [attribute_names, support_count]
    # Output the result
    #print(f"Pattern: {' '.join(attribute_names)}, Support Count: {str(support_count)}")
    itemset_records_object.append(itemset)

outFile.close()

def return_unique_labels(alertID_List):
    # Filter DataFrame based on selected IDs
    selected_records = df.iloc[alertID_List]
    # Count unique values in a certain field (e.g., Field1) in the selected records
    unique_value_counts = selected_records[' Label'].value_counts().to_dict()
    return(unique_value_counts)

patterns_ID_list = []
pattern_label_list = []

for index,record in enumerate(itemset_records_numbers):
    itemset = record[0]
    containing_alerts = []
    for alert in item_dataset_withID:
        alert_items = set(alert[1:])  # Exclude the ID field
        if set(itemset).issubset(alert_items):
            containing_alerts.append(alert[0])  # Append the ID

    pattern_label_record = list(return_unique_labels(containing_alerts).items())
    patterns_ID_list.append((itemset_records_object[index][0],containing_alerts))
    pattern_label_list.append((itemset_records_object[index][0],pattern_label_record))
    print(f"Pattern {index}: {itemset_records_object[index][0]}, \n{pattern_label_record}\n===============================================================================================================")


Pattern 0: ['0', 'Bwd_low', 'fwd_low', '0', '192.168.0.0', '6', '0_750', '1', '205.174.0.0'], 
[('Bot', 705)]
Pattern 1: ['0', 'Bwd_low', 'fwd_low', '0', '192.168.0.0', '6', '1', '23.61.0.0'], 
[('BENIGN', 712)]
Pattern 2: ['0', 'Bwd_low', 'fwd_low', '0', '6', '192.168.0.0', '104.88.0.0'], 
[('BENIGN', 707)]
Pattern 3: ['0', 'Bwd_low', 'fwd_low', '0', '192.168.0.0', '6', '1', '74.119.0.0'], 
[('BENIGN', 735)]
Pattern 4: ['0', 'Bwd_low', 'fwd_low', '6', '192.168.0.0', '173.194.0.0'], 
[('BENIGN', 759)]
Pattern 5: ['0', 'Bwd_low', 'fwd_low', '6', '192.168.0.0', '199.59.0.0'], 
[('BENIGN', 762)]
Pattern 6: ['0', 'Bwd_low', 'fwd_low', '0', '6', '192.168.0.0', '80', '23.50.0.0'], 
[('BENIGN', 761)]
Pattern 7: ['0', 'Bwd_low', 'fwd_low', '0', '6', '192.168.0.0', '80', '72.167.0.0'], 
[('BENIGN', 780)]
Pattern 8: ['0', 'Bwd_low', 'fwd_low', '0', '6', '192.168.0.0', '80', '23.52.0.0'], 
[('BENIGN', 813)]
Pattern 9: ['0', 'Bwd_low', 'fwd_low', '0', '6', '192.168.0.0', '443', '178.172.0.0'], 
[(

In [20]:
# Function to get the original field and value using the reverse mapping dictionaries
def get_field_and_value(item):
    for column, mapping in reverse_mapping.items():
        if int(item) in mapping:
            return str(column), mapping[int(item)]
    return None, None

In [21]:
# Initialize a list to store the patterns with original field and value

pattern_record = pd.DataFrame(columns=['Support Count', 'Label'])
patterns_with_fields = []

for index, record in enumerate(itemset_records_numbers):
    # Split the line into items and support count
    items = record[0]
    support_count = int(record[1])

    # Initialize a list to store the original field and value of each item
    record_with_field = []

    # Map each item in the pattern to its original field and value
    for item in items:
        # Get the field and value using the reverse mapping dictionaries
        field, value = get_field_and_value(item)
        record_with_field.append({"field": field, "value": value})

        # Check if the field already exists in the DataFrame
        if field not in pattern_record.columns:
            # If not, add a new column with the field name and fill with NaN
            pattern_record[field] = pd.NA
        
        # Add the value to the corresponding field
        pattern_record.at[index, field] = value
    
        # Add the support_count to the pattern_data dictionary
    pattern_record.at[index, 'Support Count'] = support_count

    if len(pattern_label_list[index][1]) == 1:
        pattern_record.at[index, 'Label'] = pattern_label_list[index][1][0][0]
    else:
        pattern_record.at[index, 'Label'] = 'Mixed Labels'
    # Store the pattern with original field and value along with the support count
    
    patterns_with_fields.append({"pattern": record_with_field, "support_count": support_count})

pattern_record = pattern_record.fillna('NaN').infer_objects(copy=False)

# Print the patterns with original field and value
for pattern_info in patterns_with_fields:
    print(pattern_info)
    print()


{'pattern': [{'field': ' RST Flag Count', 'value': 0}, {'field': 'Bwd Packets Category', 'value': 'Bwd_low'}, {'field': 'Fwd Packets Category', 'value': 'fwd_low'}, {'field': ' SYN Flag Count', 'value': 0}, {'field': ' Destination IP', 'value': '192.168.0.0'}, {'field': ' Protocol', 'value': 6}, {'field': 'Flow Duration Category', 'value': '0_750'}, {'field': ' ACK Flag Count', 'value': 1}, {'field': ' Source IP', 'value': '205.174.0.0'}], 'support_count': 705}

{'pattern': [{'field': ' RST Flag Count', 'value': 0}, {'field': 'Bwd Packets Category', 'value': 'Bwd_low'}, {'field': 'Fwd Packets Category', 'value': 'fwd_low'}, {'field': ' SYN Flag Count', 'value': 0}, {'field': ' Destination IP', 'value': '192.168.0.0'}, {'field': ' Protocol', 'value': 6}, {'field': ' ACK Flag Count', 'value': 1}, {'field': ' Source IP', 'value': '23.61.0.0'}], 'support_count': 712}

{'pattern': [{'field': ' RST Flag Count', 'value': 0}, {'field': 'Bwd Packets Category', 'value': 'Bwd_low'}, {'field': 'Fw

In [22]:
# Save the pattern record DataFrame to a CSV file
pattern_record.to_csv('pattern_record_0.1%_NotDownsampled_DifferentFeatures.csv', index=False)