In [None]:
# Import necessary libraries
import pandas as pd
import csv
import os

In [None]:
# Load and preprocess individual attack datasets
# Load the DoS capture for a quick look at its contents.
# header=None: the merge step further down reads this same file as header-less,
# so the default header=0 would swallow the first record as column names and
# under-report the row count by one. Columns are therefore numbered 0..N-1 here.
df_dos = pd.read_csv('DoS_dataset.csv', header=None)
print("DoS Dataset:")
print(df_dos.head())
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
df_dos.info()
print(df_dos.shape)

In [None]:
# Load Gear dataset
# header=None: the merge step treats the attack CSVs as header-less, so the
# default header=0 would consume the first record as column names. Columns are
# numbered 0..N-1 in this preview.
df_gear = pd.read_csv('gear_dataset.csv', header=None)
print("Gear Dataset:")
print(df_gear.head())
print(df_gear.shape)

In [None]:
# Load RPM dataset
# header=None: the merge step treats the attack CSVs as header-less, so the
# default header=0 would consume the first record as column names. Columns are
# numbered 0..N-1 in this preview.
df_rpm = pd.read_csv('RPM_dataset.csv', header=None)
print("RPM Dataset:")
print(df_rpm.head())
print(df_rpm.shape)

In [None]:
# Load Fuzzy dataset
# header=None: the merge step treats the attack CSVs as header-less, so the
# default header=0 would consume the first record as column names. Columns are
# numbered 0..N-1 in this preview.
df_fuzzy = pd.read_csv('Fuzzy_dataset.csv', header=None)
print("Fuzzy Dataset:")
print(df_fuzzy.head())
print(df_fuzzy.shape)

In [None]:
# Process normal_run_data.txt file
# Reads the whitespace-separated normal-traffic log, keeps 11 positional
# columns, tags every row as 'Normal' and writes the result to CSV.
print("Reading normal_run_data.txt...")
# The separator is a regex, so it is written as a raw string: '\s' inside a
# plain literal is an invalid escape sequence and warns on recent Pythons.
# The first line of the file is skipped (skiprows=1) and no header is assumed.
df_normal = pd.read_csv('normal_run_data.txt', sep=r'\s+', header=None, skiprows=1)

# Select the correct columns using the compressed indices
# Index 1: Timestamp Value, Index 3: ID Value, Index 5: DLC Value, Index 6-13: Data Bytes
# NOTE(review): these positions depend on how many whitespace-separated tokens
# precede each value in the log. If a line carries an extra token between the
# ID and the DLC value, index 5 would pick up a text token instead of the DLC
# number and every later column would shift by one - confirm against the file.
selected_positions = [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13]
# .copy() detaches the selection from the parsed frame so that adding the
# Label column below does not raise SettingWithCopyWarning.
df_normal = df_normal.iloc[:, selected_positions].copy()

# Rename to standard 11 columns
df_normal.columns = ['Timestamp', 'ID', 'DLC', 'D0', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7']

# Add the Label column
df_normal['Label'] = 'Normal'

# Save to a new CSV file
output_filename = 'normal_dataset_clean_withR.csv'
df_normal.to_csv(output_filename, index=False)
print(f"Success! Saved {len(df_normal)} normal records to {output_filename}")

In [None]:
# Sanity check: read the cleaned normal-traffic CSV back from disk and show
# its dimensions, its first rows and the number of missing values per column.
normal_csv_path = "normal_dataset_clean_withR.csv"
df_normal_check = pd.read_csv(normal_csv_path)

normal_shape = df_normal_check.shape
print(f"Normal dataset shape: {normal_shape}")

normal_preview = df_normal_check.head()
print(normal_preview)

normal_null_counts = df_normal_check.isna().sum()
print(normal_null_counts)

In [None]:
# Merge all datasets into master dataset
# Each entry is (CSV path, label assigned to the non-'R' frames of that file).
files_to_merge = [
    ('normal_dataset_clean_withR.csv', 'Normal'),
    ('DoS_dataset.csv', 'DoS'),
    ('Fuzzy_dataset.csv', 'Fuzzy'),
    ('RPM_dataset.csv', 'RPM'),
    # Spelled exactly like the earlier Gear load cell ('gear_dataset.csv'): on a
    # case-sensitive filesystem a different capitalisation fails to open and the
    # whole Gear class would be dropped by the best-effort handler below.
    ('gear_dataset.csv', 'Gear')
]

all_data = []
failed_files = []  # files that could not be loaded, reported after the loop
final_columns = ['Timestamp', 'ID', 'DLC', 'D0', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'Label']
# Layout of the header-less attack CSVs: same as final_columns, but the last
# field is the R/T flag instead of a label.
raw_columns = final_columns[:-1] + ['Flag']
# Data columns scanned right-to-left when the flag is not in its own column.
payload_columns_reversed = ['D7', 'D6', 'D5', 'D4', 'D3', 'D2', 'D1', 'D0']

print("Starting robust merge...")

for file_name, attack_type in files_to_merge:
    print(f"\nProcessing {file_name}...")

    try:
        # If it's the CLEAN Normal file - it has headers
        if 'normal_dataset' in file_name:
            df = pd.read_csv(file_name, on_bad_lines='skip')

            # Ensure it has the right columns
            if len(df.columns) == 12:
                df.columns = final_columns

        # If it's a RAW Attack CSV - NO headers, assign them manually
        else:
            df = pd.read_csv(file_name, header=None, on_bad_lines='skip')

            # Temporary names to process the Flag
            df.columns = raw_columns

            # Rows with fewer fields than the first row are padded with NaN on
            # the right, which leaves 'Flag' empty and the real trailing field in
            # one of the data columns. Take the last non-null data field as the
            # flag for those rows so an 'R' frame is not mislabelled as an attack.
            # NOTE(review): presumes short rows end with the R/T marker - confirm.
            flag = df['Flag'].astype(object)
            for payload_col in payload_columns_reversed:
                unresolved = flag.isna()
                if not unresolved.any():
                    break
                flag.loc[unresolved] = df.loc[unresolved, payload_col]

            # Convert Flag ('R'/'T') to Label; anything that is not 'R' keeps the
            # file's attack label, as before.
            df['Label'] = flag.apply(lambda x: 'Normal' if x == 'R' else attack_type)

            # Drop Flag so we have exactly 12 columns
            df = df.drop(columns=['Flag'])

            # Align column names exactly
            df.columns = final_columns

        print(f" -> Successfully loaded {len(df)} rows.")
        all_data.append(df)

    except Exception as e:
        # Best effort: keep merging the remaining files, but remember the failure
        # so the final summary makes the missing class obvious.
        print(f"!!! Error loading {file_name}: {e}")
        failed_files.append(file_name)

if failed_files:
    print(f"\nWARNING: {len(failed_files)} file(s) were skipped and are missing from the merge: {failed_files}")

# Combine all loaded frames
if all_data:
    print("\nConcatenating...")
    final_df = pd.concat(all_data, ignore_index=True)

    # Save the Master Dataset
    output_file = 'master_dataset_final.csv'
    final_df.to_csv(output_file, index=False)
    print(f"DONE! Saved {len(final_df)} records to {output_file}")
    print("\nClass Distribution:")
    print(final_df['Label'].value_counts())
else:
    print("No data was loaded. Please check your file paths.")

In [None]:
# Sanity check: read the merged master CSV back from disk and print its
# dimensions, its first rows, the inferred column dtypes and the number of
# missing values per column.
master_csv_path = 'master_dataset_final.csv'
df_master = pd.read_csv(master_csv_path)

master_shape = df_master.shape
print(f"Master dataset shape: {master_shape}")
print(df_master.head())

# Each report is a heading line followed by a per-column Series.
master_reports = (
    ("\nData types:", df_master.dtypes),
    ("\nNull values:", df_master.isna().sum()),
)
for report_heading, report_body in master_reports:
    print(report_heading)
    print(report_body)

In [None]:
import pandas as pd
import numpy as np

# 1. Read the merged master CSV into `df`.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
print("Loading master dataset...")
master_dataset_path = 'master_dataset_final.csv'
df = pd.read_csv(master_dataset_path, low_memory=False)

# 2. Define a safer conversion function
def safe_hex_to_int(x):
    """Interpret a cell value as a base-16 number.

    The value is converted to text, surrounding whitespace is removed and
    everything from the first '.' onwards is discarded (so a float such as
    8.0, rendered as "8.0", is read as "8"). The remaining text is parsed
    with int(..., 16).

    Args:
        x: Any scalar cell value (string, number, None or NaN).

    Returns:
        The parsed integer, or None when the value is null, empty, the text
        'nan' (any case), or not valid base-16 text. None marks the row for
        removal by a later dropna().
    """
    parsed = None

    if not pd.isnull(x):
        # partition() yields the whole string when there is no '.', otherwise
        # only the part before the first one.
        token = str(x).strip().partition('.')[0]

        if token and token.lower() != 'nan':
            try:
                parsed = int(token, 16)
            except ValueError:
                # Not hexadecimal text: leave the result as None.
                parsed = None

    return parsed

# 3. Run the hex parser over the ID column and the eight D0-D7 columns.
#    Values that cannot be parsed come back as None (missing).
hex_columns = ['ID', 'D0', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7']
print("Converting Hex to Integers (Robust Mode)...")

for col in hex_columns:
    print(f"Processing {col}...")
    converted_values = df[col].apply(safe_hex_to_int)
    df[col] = converted_values

# 4. Remove every row that has a missing value in any column and report how
#    many rows were removed.
print("Dropping rows with corrupted/missing values...")
initial_count = len(df)
df.dropna(inplace=True)
rows_removed = initial_count - len(df)
print(f"Dropped {rows_removed} rows.")

# 5. Map the text labels to integer codes; the fitted encoder stays available
#    as `encoder`.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(df['Label'])
df['Label_Encoded'] = encoder.transform(df['Label'])

# 6. Write the numeric frame (including both label columns) to disk.
output_file = 'final_clean_numeric_dataset.csv'
df.to_csv(output_file, index=False)
print(f"Success! Cleaned numeric dataset saved to {output_file}")

In [None]:

# Reload the cleaned numeric dataset and show how many rows each label has.
df = pd.read_csv("final_clean_numeric_dataset.csv")
label_distribution = df['Label'].value_counts()
print(label_distribution)


In [None]:
import pandas as pd

# 1. Load the clean dataset
filename = 'final_clean_numeric_dataset.csv'
print(f"Loading {filename}...")
df = pd.read_csv(filename)

# 2. Check the ACTUAL counts after cleaning
class_counts = df['Label_Encoded'].value_counts()
print("\nActual Class Counts:")
print(class_counts)

# Without any rows there is no smallest class to balance against; fail with a
# clear message instead of writing an empty training file.
if class_counts.empty:
    raise ValueError(f"{filename} contains no rows; nothing to balance.")

# 3. Determine the limit automatically
# We take the count of the smallest class
min_class_size = class_counts.min()
print(f"\nSmallest class has {min_class_size} samples.")
print(f"Balancing all classes to {min_class_size}...")

# 4. Balanced Sampling
# Every class is down-sampled to 'min_class_size' rows, each with seed 42.
# The groups are sampled explicitly rather than through groupby().apply():
# apply() only hands the grouping column to the function on older pandas
# (deprecated in 2.2), and without it 'Label_Encoded' would disappear from the
# result. Iterating the groups keeps the column and picks the same rows.
per_class_samples = [
    class_rows.sample(n=min_class_size, random_state=42)
    for _, class_rows in df.groupby('Label_Encoded')
]

# 5. Reset index and Verify
# ignore_index=True gives the combined frame a fresh 0..N-1 index.
balanced_df = pd.concat(per_class_samples, ignore_index=True)
print("\nNew Balanced Distribution:")
print(balanced_df['Label_Encoded'].value_counts())

# 6. Save
output_file = 'balanced_training_data_final.csv'
balanced_df.to_csv(output_file, index=False)
print(f"\nSuccess! Balanced dataset saved to {output_file}")

In [None]:
import pandas as pd
import numpy as np

# 1. Read the balanced dataset produced by the previous step.
print("Loading balanced dataset...")
balanced_csv_path = 'balanced_training_data_final.csv'
df = pd.read_csv(balanced_csv_path, low_memory=False)

# 2. Columns that have to be numeric (everything except the label columns).
numeric_cols = ['Timestamp', 'ID', 'DLC', 'D0', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7']

print("Scrubbing non-numeric text (like 'DLC:', 'ID:', etc)...")
# errors='coerce' turns any value that is not a number into NaN, so leftover
# text tokens become missing values that the dropna() below removes.
coerced_columns = {
    column_name: pd.to_numeric(df[column_name], errors='coerce')
    for column_name in numeric_cols
}
df = df.assign(**coerced_columns)

# 3. Remove rows with a missing value in any column and count how many went.
initial_count = len(df)
df.dropna(inplace=True)
remaining_count = len(df)
dropped_count = initial_count - remaining_count

print(f"Found and removed {dropped_count} rows containing text errors.")
print(f"Final clean shape: {df.shape}")

# 4. Persist the scrubbed frame for the training step.
scrubbed_csv_path = 'final_train_ready_scrubbed.csv'
df.to_csv(scrubbed_csv_path, index=False)
print("Saved to 'final_train_ready_scrubbed.csv'. Ready for training!")