In [1]:
import pandas as pd
import hashlib
import numpy as np
from collections import deque

# --- Configuration -------------------------------------------------------
# Columns the analysis depends on (bytes sent by source / by destination).
required_columns = ['src_bytes', 'dst_bytes']

# Number of most recent traffic records retained in the sliding window.
window_size = 100

# Bounded buffer: once full, appending a record evicts the oldest one.
traffic_stream = deque(maxlen=window_size)

# --- Load and validate the dataset ---------------------------------------
# Read the training CSV (edit the path to match your machine).
df = pd.read_csv("/Users/muskan/morgan/Train_data.csv")

# Show the raw header before any clean-up, as a debugging aid.
print("Columns in CSV:", df.columns)

# Trim surrounding whitespace from every column name.
df.columns = df.columns.str.strip()

# Fail fast when any column needed below is absent from the file.
missing_cols = [
    wanted for wanted in required_columns if wanted not in df.columns
]
if missing_cols:
    raise KeyError(f"Missing columns in CSV: {missing_cols}")

# Keep only the columns used by the rest of the script.
df_filtered = df[required_columns]

# Preview the first rows to confirm the selection looks right.
print(df_filtered.head())

class FlajoletMartin:
    """Distinct-count sketch based on trailing zero bits of salted MD5 hashes.

    One counter is kept per hash function. Each counter remembers the largest
    number of trailing zero bits observed in the hash of any processed value.
    The cardinality estimate is 2 raised to the mean of those counters.
    """

    def __init__(self, num_hashes=4):
        """Create a sketch with ``num_hashes`` counters, all starting at 0."""
        self.num_hashes = num_hashes
        self.max_zeros = np.zeros(num_hashes, dtype=int)

    def hash_function(self, value, seed):
        """Return the trailing-zero count of MD5(str(value) + str(seed)).

        The seed is appended to the value's string form so that different
        seeds act as different hash functions.
        """
        salted = "".join((str(value), str(seed))).encode()
        digest_as_int = int(hashlib.md5(salted).hexdigest(), 16)
        bits = format(digest_as_int, "b")  # binary digits, no "0b" prefix
        without_tail = bits.rstrip("0")
        return len(bits) - len(without_tail)

    def process_value(self, value):
        """Feed one value into the sketch, raising counters where needed."""
        for seed in range(self.num_hashes):
            tail_length = self.hash_function(value, seed)
            if tail_length > self.max_zeros[seed]:
                self.max_zeros[seed] = tail_length

    def estimate_count(self):
        """Return 2 ** (mean of the per-hash maximum trailing-zero counts)."""
        average_tail = np.mean(self.max_zeros)
        return 2 ** average_tail

# Build the sketch that summarises the whole dataset.
fm = FlajoletMartin()

# Exact record of every pattern fed to the sketch. It serves as ground truth
# when scoring the estimate. It must cover the same rows as the sketch:
# `traffic_stream` is bounded and only retains the most recent records, so it
# cannot be used for that purpose.
seen_patterns = set()

# Iterate the two columns directly instead of DataFrame.iterrows(), which
# builds a Series object for every row.
for src_bytes, dst_bytes in zip(df_filtered['src_bytes'], df_filtered['dst_bytes']):
    combined_value = f"{src_bytes}-{dst_bytes}"  # Combine source & destination traffic
    traffic_stream.append(combined_value)  # keep the recent-records window up to date
    seen_patterns.add(combined_value)
    fm.process_value(combined_value)

# Estimate distinct traffic patterns
distinct_count_estimate = fm.estimate_count()
print(f"Estimated Distinct Traffic Patterns: {distinct_count_estimate}")

# Define attack detection threshold
THRESHOLD = 500  # Adjust based on dataset analysis

if distinct_count_estimate > THRESHOLD:
    print("⚠️ Potential Attack Detected!")

# Score the estimate against the exact distinct count over the same rows.
true_distinct_count = len(seen_patterns)

if true_distinct_count:
    # Relative-error based score; it goes negative when the estimate is off
    # by more than 100% of the true count.
    accuracy = (1 - abs(true_distinct_count - distinct_count_estimate) / true_distinct_count) * 100
    print(f"Accuracy of Distinct Count: {accuracy:.2f}%")
else:
    # Empty dataset: relative error is undefined, so avoid dividing by zero.
    accuracy = float("nan")
    print("Accuracy of Distinct Count: n/a (no rows processed)")

Columns in CSV: Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class'],
      dtype='object')
   src_bytes  dst_bytes
0        491          0
1        146          0
2          0          0
3        232       8153
4        199 