# Detecting Network Intrusions

## Function for identifying categorical features

In [17]:
def find_limited_unique_features(df, threshold=10):
    """
    Collect the low-cardinality columns of a DataFrame together with their values.

    A column qualifies when it holds at most `threshold` distinct values. Such
    columns behave like categorical features; in network intrusion detection
    they typically describe a small set of expected behaviors or settings
    (e.g., the Modbus function codes or ports a PLC normally uses).

    Args:
        df (pd.DataFrame): The DataFrame whose columns are inspected.
        threshold (int): Largest number of distinct values a column may have
                         and still be reported. Defaults to 10.

    Returns:
        dict: Maps each qualifying column name to a list of the distinct values
              found in that column, as returned by `Series.unique()`.
    """
    low_cardinality = {}
    for column_name in df.columns:
        distinct = df[column_name].unique()
        # Skip columns with too many distinct values to be treated as categorical
        if len(distinct) > threshold:
            continue
        low_cardinality[column_name] = distinct.tolist()
    return low_cardinality

## Function for comparing normal data with potential attacks

In [18]:
# Shared stand-in key so that every float NaN is treated as the same category.
_NAN_CATEGORY = object()


def _category_key(value):
    """Return the key used to compare categories; all float NaNs share one key."""
    # NaN != NaN, so without this a NaN present in both inputs would look "new".
    if isinstance(value, float) and value != value:
        return _NAN_CATEGORY
    return value


def _index_categories(values):
    """Map comparison key -> first value seen, preserving the input order."""
    indexed = {}
    for value in values:
        indexed.setdefault(_category_key(value), value)
    return indexed


def _sorted_for_display(values):
    """Sort values for printing, falling back to a type-aware key for mixed types."""
    try:
        return sorted(values)
    except TypeError:
        # e.g. strings mixed with numbers or None cannot be ordered directly
        return sorted(values, key=lambda value: (type(value).__name__, repr(value)))


def compare_limited_unique_features(limited_unique_dict1, limited_unique_dict2):
    """
    Compares two dictionaries of limited unique features and reports categories
    that appear in the second one but not in the first.

    This function can be used to monitor categorical features over time.
    A category that was never seen during normal operation might indicate an anomaly,
    such as a new Modbus function code or a different port being used, possibly due to an attack.

    Only features present in BOTH dictionaries are compared; a feature that is
    missing from either one is ignored. Note that a feature which is absent from
    the second dictionary (for example because its number of unique values grew
    past the threshold used by `find_limited_unique_features`) is therefore not
    reported here.

    Categories that only disappear (present in the first dictionary, absent from
    the second) are not treated as discrepancies. Float NaN values are treated as
    one and the same category.

    Args:
        limited_unique_dict1 (dict): The first dictionary of limited unique features, created from normal network operation
        limited_unique_dict2 (dict): The second dictionary of limited unique features, created from network operation with potential attacks

    Returns:
        list: A list of discrepancies found between the two dictionaries, in the
              order the features appear in `limited_unique_dict1`.
              Each item in the list is a tuple (feature, categories1, categories2),
              where both category entries are de-duplicated lists in input order.
    """
    alerts = []

    # Walk the baseline dict (not a set of keys) so the report order is deterministic.
    for feature, baseline_values in limited_unique_dict1.items():
        if feature not in limited_unique_dict2:
            continue

        baseline = _index_categories(baseline_values)
        current = _index_categories(limited_unique_dict2[feature])

        new_categories = [value for key, value in current.items() if key not in baseline]
        if new_categories:
            alerts.append(
                (feature, list(baseline.values()), list(current.values()), new_categories)
            )

    if alerts:
        for feature, categories1, categories2, new_categories in alerts:
            print(f"Alert: New categories detected in feature '{feature}'.")
            print(f"Original categories: {_sorted_for_display(categories1)}")
            print(f"Current categories: {_sorted_for_display(categories2)}")
            print(f"Anomalous categories: {_sorted_for_display(new_categories)}")
    else:
        print("No new categories detected in the features.")

    return [(feature, categories1, categories2) for feature, categories1, categories2, _ in alerts]

## Example usage

### Function for generating example datasets

In [19]:
import pandas as pd
import random

# Seed the global `random` generator so the simulated data below is
# reproducible from run to run
random.seed(42)

# Create a function to create simulated PLC data with limited unique categories
def generate_plc_dataframe(num_plcs, num_records):
    """
    Build a DataFrame of simulated PLC traffic with low-cardinality columns.

    For every PLC numbered 1..num_plcs three columns are produced, in this
    order: `PLC{n}_inbound_port`, `PLC{n}_outbound_port` and
    `PLC{n}_Modbus_function_codes`. Each cell is drawn with `random.choice`
    from the fixed option list of its column, so the result depends on the
    state of the global `random` generator.

    Args:
        num_plcs (int): Number of PLCs to simulate.
        num_records (int): Number of rows (records) to generate.

    Returns:
        pd.DataFrame: One row per record and three columns per PLC.
    """
    # Column suffix -> values a record may take. The order matters: it fixes
    # both the column order and the sequence of random draws.
    column_options = (
        ('inbound_port', [502, 503, 504]),
        ('outbound_port', [8080, 8081, 8082]),
        ('Modbus_function_codes', [1, 2, 3, 4, 5]),
    )

    columns = {}
    for plc_number in range(1, num_plcs + 1):
        for suffix, options in column_options:
            samples = [random.choice(options) for _ in range(num_records)]
            columns[f'PLC{plc_number}_{suffix}'] = samples

    return pd.DataFrame(columns)

# Generate the baseline dataframe representing normal operation
df1 = generate_plc_dataframe(num_plcs=3, num_records=100)

# Start the "attack" dataframe as an exact copy of the baseline
df2 = df1.copy()

# Simulate a port scanning attack against the first PLC only: its inbound port
# column is redrawn from the normal ports plus two ports never seen in df1.
# The number of samples is taken from df1 so the new column always matches the
# frame's length, even if num_records above is changed.
attacked_plc = 1
scan_ports = [502, 503, 504, 6000 + attacked_plc, 7000 + attacked_plc]
df2[f'PLC{attacked_plc}_inbound_port'] = [random.choice(scan_ports) for _ in range(len(df1))]

# Display the first few rows of each dataframe
# NOTE(review): `display` is not imported here; presumably it is supplied by the
# IPython/Jupyter environment this cell runs in.
print("Simulated DataFrame 1 (Normal):")
display(df1.head(10))

print("\nSimulated DataFrame 2 (With Port Scanning Attack):")
display(df2.head(10))

Simulated DataFrame 1 (Normal):


Unnamed: 0,PLC1_inbound_port,PLC1_outbound_port,PLC1_Modbus_function_codes,PLC2_inbound_port,PLC2_outbound_port,PLC2_Modbus_function_codes,PLC3_inbound_port,PLC3_outbound_port,PLC3_Modbus_function_codes
0,504,8082,1,503,8080,3,503,8082,4
1,502,8082,2,502,8081,3,502,8080,2
2,502,8082,5,503,8080,5,502,8081,4
3,504,8080,1,503,8080,4,504,8082,2
4,503,8082,1,503,8082,5,504,8082,5
5,502,8081,4,503,8082,3,503,8082,3
6,502,8080,1,503,8080,1,502,8080,5
7,502,8080,5,504,8081,1,504,8080,5
8,504,8080,2,504,8081,3,502,8082,4
9,502,8081,2,504,8082,2,502,8081,4



Simulated DataFrame 2 (With Port Scanning Attack):


Unnamed: 0,PLC1_inbound_port,PLC1_outbound_port,PLC1_Modbus_function_codes,PLC2_inbound_port,PLC2_outbound_port,PLC2_Modbus_function_codes,PLC3_inbound_port,PLC3_outbound_port,PLC3_Modbus_function_codes
0,503,8082,1,503,8080,3,503,8082,4
1,6001,8082,2,502,8081,3,502,8080,2
2,502,8082,5,503,8080,5,502,8081,4
3,502,8080,1,503,8080,4,504,8082,2
4,7001,8082,1,503,8082,5,504,8082,5
5,504,8081,4,503,8082,3,503,8082,3
6,6001,8080,1,503,8080,1,502,8080,5
7,6001,8080,5,504,8081,1,504,8080,5
8,6001,8080,2,504,8081,3,502,8082,4
9,504,8081,2,504,8082,2,502,8081,4


### Checking for intrusions

In [20]:
# Port-scan detection, step 1: profile the low-cardinality features of the
# baseline (df1) and of the suspect capture (df2) with "find_limited_unique_features"
limited_unique_df1, limited_unique_df2 = (
    find_limited_unique_features(frame) for frame in (df1, df2)
)

# Step 2: "compare_limited_unique_features" reports every category that shows up
# in the suspect capture but not in the baseline
discrepancies = compare_limited_unique_features(
    limited_unique_dict1=limited_unique_df1,
    limited_unique_dict2=limited_unique_df2,
)


Alert: New categories detected in feature 'PLC1_inbound_port'.
Original categories: [502, 503, 504]
Current categories: [502, 503, 504, 6001, 7001]
Anomalous categories: [6001, 7001]
