In [2]:
import os

import pandas as pd
import requests

# Define the URL and the target path
url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B_20Percent.txt"
save_path = "../data/KDDTrain+_20Percent.txt"

# Create the folder that will hold the download. It is derived from save_path
# so the two cannot drift apart, and exist_ok=True replaces the separate
# "exists?" check (no failure if the folder appears in between).
save_dir = os.path.dirname(save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# Download the file. requests applies no timeout by default, so a stalled
# connection would block this cell forever; fail after 30 seconds instead.
print(f"Downloading dataset from {url}...")
response = requests.get(url, timeout=30)

# Only write the file on HTTP 200 so an error page is never saved as data.
if response.status_code == 200:
    with open(save_path, "wb") as f:
        f.write(response.content)
    print(f"✅ Success! Dataset saved to: {save_path}")
else:
    print(f"❌ Error downloading file. Status code: {response.status_code}")

# The raw NSL-KDD file has no header row, so the column names are supplied
# here and passed to the CSV reader. Names are listed in column order, one
# per whitespace-separated token; .split() turns them into a list of str.
COL_NAMES = """
    duration protocol_type service flag src_bytes
    dst_bytes land wrong_fragment urgent hot
    num_failed_logins logged_in num_compromised root_shell
    su_attempted num_root num_file_creations num_shells
    num_access_files num_outbound_cmds is_host_login
    is_guest_login count srv_count serror_rate
    srv_serror_rate rerror_rate srv_rerror_rate same_srv_rate
    diff_srv_rate srv_diff_host_rate dst_host_count
    dst_host_srv_count dst_host_same_srv_rate
    dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate
    dst_host_srv_rerror_rate class difficulty_level
""".split()

def load_and_process_data(data_path=None, output_path=None):
    """Turn the raw NSL-KDD file into categorical "transactions" and save them.

    Reads the headerless CSV using COL_NAMES, keeps four categorical columns
    (protocol_type, service, flag, class) and replaces four numeric columns
    (duration, src_bytes, dst_bytes, count) with labelled bins built from
    fixed edges via ``pd.cut``. Every value is converted to ``str`` and the
    result is written as a CSV without the index.

    Args:
        data_path: CSV file to read. Defaults to the module-level DATA_PATH.
        output_path: CSV file to write. Defaults to the module-level
            OUTPUT_PATH. Its parent folder is created if missing.

    Returns:
        The processed DataFrame that was written to ``output_path``, with
        columns Protocol, Service, Flag, Class, Duration, Src_Bytes,
        Dst_Bytes and Traffic_Count.
    """
    # Resolve defaults at call time so the module constants may be defined
    # (or changed) after this function is.
    if data_path is None:
        data_path = DATA_PATH
    if output_path is None:
        output_path = OUTPUT_PATH

    print("Loading dataset...")
    # Load data (CSV format, no header)
    df = pd.read_csv(data_path, names=COL_NAMES, index_col=False)

    # Drop 'difficulty_level' (last column, not needed for intrusion detection)
    df = df.drop(columns='difficulty_level')

    print(f"Initial shape: {df.shape}")

    # --- Step 1: Feature Selection ---
    # Categorical columns are kept as they are and only renamed so the mined
    # rules read nicely. Copying here (instead of assigning new columns onto
    # a slice of df) avoids pandas' SettingWithCopyWarning.
    df_clean = df[['protocol_type', 'service', 'flag', 'class']].copy()
    df_clean.columns = ['Protocol', 'Service', 'Flag', 'Class']

    # --- Step 2: Binning Continuous Variables ---
    # Apriori cannot handle "src_bytes = 491". It needs "src_bytes = Low".
    # Each entry is (source column, output column, bin edges, labels).
    # pd.cut intervals are right-inclusive by default, so the first edge of
    # -1 puts the value 0 into the first bin.
    inf = float('inf')
    volume_edges = [-1, 0, 1000, 10000, inf]
    volume_labels = ['Zero', 'Low', 'Medium', 'High']
    bin_specs = [
        # Most connections are 0 seconds, hence a dedicated 'Zero' bin.
        ('duration', 'Duration', [-1, 0, 60, inf], ['Zero', 'Short', 'Long']),
        # Traffic volume; 0 = no data sent.
        ('src_bytes', 'Src_Bytes', volume_edges, volume_labels),
        ('dst_bytes', 'Dst_Bytes', volume_edges, volume_labels),
        ('count', 'Traffic_Count', [-1, 10, 100, inf],
         ['Normal', 'High', 'Surge']),
    ]
    for source_col, target_col, edges, labels in bin_specs:
        df_clean[target_col] = pd.cut(df[source_col], bins=edges, labels=labels)

    # --- Step 3: Final Formatting ---
    # Convert all to string to ensure they are treated as categorical items
    df_clean = df_clean.astype(str)

    print("\nSample Processed Transaction:")
    print(df_clean.head())

    # Save to CSV. Create the folder the output actually goes to, rather than
    # a hard-coded one; skip when output_path has no directory component.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_clean.to_csv(output_path, index=False)
    print(f"\nProcessed data saved to: {output_path}")

    return df_clean

# Run the preprocessing only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    load_and_process_data()

Downloading dataset from https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B_20Percent.txt...
✅ Success! Dataset saved to: ../data/KDDTrain+_20Percent.txt
Loading dataset...
Initial shape: (25192, 42)

Sample Processed Transaction:
  Protocol   Service Flag    Class Duration Src_Bytes Dst_Bytes Traffic_Count
0      tcp  ftp_data   SF   normal     Zero       Low      Zero        Normal
1      udp     other   SF   normal     Zero       Low      Zero          High
2      tcp   private   S0  neptune     Zero      Zero      Zero         Surge
3      tcp      http   SF   normal     Zero       Low    Medium        Normal
4      tcp      http   SF   normal     Zero       Low       Low          High

Processed data saved to: ../data/processed_data.csv
