# KDD Cup Dataset Complexity Analysis

**Performance Optimization:**
- Original dataset: ~4.9M rows → **900+ minutes** (infeasible)
- Using stratified sampling: 50k rows → **~5-10 minutes** ✓
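- Stratification preserves the class proportions of the full dataset, so the complexity metrics are computed on a representative subset. Caveat: the rarest classes keep only a handful of rows in a 50k sample (11 for R2L and 1 for U2R in the run below), so measures involving those classes should be interpreted with caution.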

In [1]:
import pandas as pd
import numpy as np 
import random

# Seed both RNGs (stdlib and NumPy) so any stochastic step is repeatable.
random.seed(42)
np.random.seed(42)

# The archive is read as-is: pandas decompresses the .gz file on the fly.
# header=None -> the file has no header row, so columns get integer names.
# on_bad_lines='skip' -> malformed rows are dropped instead of raising.
data_path = '../detection-of-network-intrusions/data/kddcup.data.gz'
df2 = pd.read_csv(data_path, header=None, on_bad_lines='skip')

print(f"Dataset loaded: {len(df2):,} rows")
df2.head()

Dataset loaded: 4,898,431 rows


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [2]:
# Encode the categorical columns of df2 (protocol, service, flag) and the label
# column as integers. Values that are not present in a mapping become -1.
#
# The cell is safe to re-run: columns that already hold integer codes are left
# untouched. Without that guard a second run would push the integer codes
# through the string-keyed dicts below and turn every value into -1.

# Label classes: 0=Normal, 1=DoS, 2=Probe, 3=R2L, 4=U2R
attack_map = {
    'normal': 0,

    # DOS (1)
    'back': 1, 'land': 1, 'neptune': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,
    'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1,

    # PROBE (2)
    'ipsweep': 2, 'nmap': 2, 'portsweep': 2, 'satan': 2,
    'mscan': 2, 'saint': 2,

    # R2L (3)
    'ftp_write': 3, 'guess_passwd': 3, 'imap': 3, 'multihop': 3, 'phf': 3,
    'spy': 3, 'warezclient': 3, 'warezmaster': 3,
    'sendmail': 3, 'named': 3, 'snmpgetattack': 3, 'snmpguess': 3, 'xlock': 3, 'xsnoop': 3, 'worm': 3,

    # U2R (4)
    # NOTE(review): some taxonomies list 'httptunnel' under R2L - confirm the intended class.
    'buffer_overflow': 4, 'loadmodule': 4, 'perl': 4, 'rootkit': 4,
    'httptunnel': 4, 'ps': 4, 'sqlattack': 4, 'xterm': 4
}

protocols_map = {
    'tcp': 0,
    'udp': 1,
    'icmp': 2
}

service_map = {
    'auth': 0, 'bgp': 1, 'courier': 2, 'csnet_ns': 3, 'ctf': 4,
    'daytime': 5, 'discard': 6, 'domain': 7, 'domain_u': 8, 'echo': 9,
    'eco_i': 10, 'ecr_i': 11, 'efs': 12, 'exec': 13, 'finger': 14,
    'ftp': 15, 'ftp_data': 16, 'gopher': 17, 'hostnames': 18, 'http': 19,
    'http_443': 20, 'imap4': 21, 'IRC': 22, 'iso_tsap': 23, 'klogin': 24,
    'kshell': 25, 'ldap': 26, 'link': 27, 'login': 28, 'mtp': 29,
    'name': 30, 'netbios_dgm': 31, 'netbios_ns': 32, 'netbios_ssn': 33,
    'netstat': 34, 'nnsp': 35, 'nntp': 36, 'ntp_u': 37, 'other': 38,
    'pm_dump': 39, 'pop_2': 40, 'pop_3': 41, 'printer': 42, 'private': 43,
    'red_i': 44, 'remote_job': 45, 'rje': 46, 'shell': 47, 'smtp': 48,
    'sql_net': 49, 'ssh': 50, 'sunrpc': 51, 'supdup': 52, 'systat': 53,
    'telnet': 54, 'tftp_u': 55, 'tim_i': 56, 'time': 57, 'urh_i': 58,
    'urp_i': 59, 'uucp': 60, 'uucp_path': 61, 'vmnet': 62, 'whois': 63,
    'X11': 64, 'Z39_50': 65
}

flag_map = {
    'SF': 0,        # Normal connection (SYN-ACK-FIN)
    'S0': 1,        # Connection attempt (SYN only, no reply)
    'S1': 2,        # Connection established, not closed
    'S2': 3,        # Established, close attempt by the originator
    'S3': 4,        # Established, close attempt by the responder
    'OTH': 5,       # Mid-connection traffic (no SYN seen)
    'REJ': 6,       # Connection rejected (RST)
    'RSTO': 7,      # Reset by the originator
    'RSTR': 8,      # Reset by the responder
    'RSTOS0': 9,    # Originator sent SYN, then RST
    'SH': 10        # Originator sent SYN, then FIN (scan?)
}


def _encode_column(column, mapping, label):
    """Return `column` encoded as integer codes according to `mapping`.

    Parameters
    ----------
    column : pandas.Series
        Column holding the raw (string) categories.
    mapping : dict
        Category -> integer code.
    label : str
        Human-readable column description used in the warning message.

    Returns
    -------
    pandas.Series
        Integer codes. Values absent from `mapping` (missing values included)
        are encoded as -1 and reported on stdout instead of being swallowed
        silently. A column that already has an integer dtype is returned
        unchanged, which makes re-running the cell a no-op.
    """
    if pd.api.types.is_integer_dtype(column):
        # Already encoded by a previous run of this cell.
        return column

    codes = column.map(mapping)  # categories missing from the dict become NaN
    unknown = codes.isna()
    if unknown.any():
        examples = sorted(column[unknown].astype(str).unique())[:10]
        print(f"Warning: {int(unknown.sum()):,} value(s) in {label} are not in the mapping "
              f"and were encoded as -1 (up to 10 shown): {examples}")
    return codes.fillna(-1).astype(int)


# The raw labels carry a trailing dot ("normal."); strip it before mapping.
# Skipped when the label column is already encoded (cell re-run).
if not pd.api.types.is_integer_dtype(df2[41]):
    df2[41] = df2[41].astype(str).str.rstrip('.')

# Map protocols, services, flags and labels to their integer codes.
df2[1] = _encode_column(df2[1], protocols_map, 'column 1 (protocol)')
df2[2] = _encode_column(df2[2], service_map, 'column 2 (service)')
df2[3] = _encode_column(df2[3], flag_map, 'column 3 (flag)')
df2[41] = _encode_column(df2[41], attack_map, 'column 41 (label)')


In [3]:
from sklearn.model_selection import train_test_split

LABEL_COLUMN = 41

# Split the frame into the feature matrix (everything but the label) and the label vector.
X = df2.drop(columns=[LABEL_COLUMN])
y = df2[LABEL_COLUMN]

full_counts = y.value_counts().sort_index()
print(f"Full dataset: {len(X):,} samples")
print(f"Class distribution:\n{full_counts}")

# Work on a class-stratified subset instead of the full data.
# Author's estimate: 50k samples finish in ~5-10 minutes instead of 900+ minutes.
# NOTE(review): proportional stratification leaves the rarest classes with very
# few rows (the recorded output shows 11 and 1 samples for classes 3 and 4).
SAMPLE_SIZE = 50000

split_options = dict(train_size=SAMPLE_SIZE, stratify=y, random_state=42)
# Only the "train" halves are kept; the remainder of the data is discarded.
X_sample, _, y_sample, _ = train_test_split(X, y, **split_options)

sampled_counts = y_sample.value_counts().sort_index()
print(f"\nSampled dataset: {len(X_sample):,} samples")
print(f"Sampled class distribution:\n{sampled_counts}")

Full dataset: 4,898,431 samples
Class distribution:
41
0     972781
1    3883370
2      41102
3       1126
4         52
Name: count, dtype: int64

Sampled dataset: 50,000 samples
Sampled class distribution:
41
0     9929
1    39639
2      420
3       11
4        1
Name: count, dtype: int64


In [4]:
import problexity as px
import time

# Fit the problexity complexity calculator on the sampled data and time the fit.
cc = px.ComplexityCalculator()

# Hand the library plain NumPy arrays rather than pandas objects. After
# train_test_split the sample keeps its original, shuffled integer index, so any
# integer indexing on a Series inside the library would be resolved by label
# instead of by position. Arrays remove that ambiguity.
# NOTE(review): whether this caused the failure recorded for this cell could not
# be confirmed from the truncated output.
X_values = X_sample.to_numpy(dtype=float)
y_values = y_sample.to_numpy()

print("Fitting complexity calculator on sampled data...")
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() can jump if the system clock is adjusted during a long fit.
start_time = time.perf_counter()

cc.fit(X_values, y_values)

elapsed = time.perf_counter() - start_time
print(f"✓ Completed in {elapsed:.1f} seconds ({elapsed/60:.1f} minutes)")

Fitting complexity calculator on sampled data...


  r_all = l/m
  return np.nanprod(f_overlap/f_range)


: 

In [None]:
# Display the `complexity` attribute of the calculator; requires cc.fit() to have run.
# Presumably holds the computed per-metric values - confirm against the problexity docs.
cc.complexity

In [None]:
# NOTE(review): `_metrics` is underscore-prefixed (private by Python convention).
# Presumably returns the names of the metrics in use - confirm against the problexity docs.
cc._metrics()

In [None]:
# Display the result of cc.score(); requires cc.fit() to have run.
# Presumably a single aggregate complexity score - confirm against the problexity docs.
cc.score()

In [None]:
# Display the result of cc.report(); requires cc.fit() to have run.
# Presumably a summary of the dataset and its metric values - confirm against the problexity docs.
cc.report()

In [None]:
# Import matplotlib
import matplotlib.pyplot as plt

# Create a square 7x7 canvas for the calculator to draw on.
figure_size = (7, 7)
fig = plt.figure(figsize=figure_size)

# Let the fitted calculator render its plot onto the figure.
# The tuple is forwarded as cc.plot's second argument
# (presumably a subplot spec: 1 row, 1 column, first axes - confirm).
subplot_spec = (1, 1, 1)
cc.plot(fig, subplot_spec)