<a href="https://colab.research.google.com/github/Khalidaman9555/IDS-AI/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step II.A: Mount Google Drive
from google.colab import drive
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    # Consider adding error handling or exiting if Drive mount fails

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.


In [None]:
# Ensure kagglehub is installed (usually pre-installed in Colab)
!pip install kagglehub -q

In [None]:
# Step II.B: Download dataset using kagglehub
import kagglehub
import os # Import os for path operations later

try:
    # Download latest version
    dataset_path_runtime = kagglehub.dataset_download("mohamedamineferrag/edgeiiotset-cyber-security-dataset-of-iot-iiot")
    print(f"Dataset downloaded to temporary runtime path: {dataset_path_runtime}")
except Exception as e:
    print(f"Error downloading dataset via kagglehub: {e}")
    dataset_path_runtime = None # Ensure variable exists but indicates failure

Dataset downloaded to temporary runtime path: /kaggle/input/edgeiiotset-cyber-security-dataset-of-iot-iiot


In [None]:
# Step II.C: Inspect downloaded files and identify target CSV
# Ensure the 'os' module was imported in the previous step (II.B)
# Ensure 'dataset_path_runtime' variable holds the correct path from step II.B output

target_csv_filename = 'ML-EdgeIIoT-dataset.csv'
target_csv_path_runtime = None # Initialize path variable

# Use the dataset_path_runtime variable from the previous step's output
# Make sure it's defined correctly in your notebook session.
# If not, you might need to manually set it:
# dataset_path_runtime = '/kaggle/input/edgeiiotset-cyber-security-dataset-of-iot-iiot' # Example if needed

if dataset_path_runtime and os.path.exists(dataset_path_runtime):
  print("\nInspecting downloaded dataset contents...")

  # Check common potential subdirectories based on Kaggle examples and the PDF
  potential_base_dirs = [
      dataset_path_runtime,
      os.path.join(dataset_path_runtime, 'Edge-IIoTset dataset'),
      os.path.join(dataset_path_runtime, 'Edge-IIoTset dataset', 'Selected dataset for ML and DL')
  ]
  potential_paths = [os.path.join(base, target_csv_filename) for base in potential_base_dirs]

  for potential_path in potential_paths:
    if os.path.exists(potential_path):
      target_csv_path_runtime = potential_path
      print(f"Identified target CSV file at: {target_csv_path_runtime}")
      break # Stop searching once found

  if not target_csv_path_runtime:
    print(f"\nError: Could not automatically locate '{target_csv_filename}'.")
    print("Please manually inspect the contents of:")
    print(f"{dataset_path_runtime}")
    print("And update the 'target_csv_path_runtime' variable accordingly.")
    # List top-level contents to help user
    try:
        print("\nTop-level contents:")
        for item in os.listdir(dataset_path_runtime):
            print(os.path.join(dataset_path_runtime, item))
    except Exception as e:
        print(f"Could not list directory contents: {e}")

else:
  print("\nSkipping file inspection: Dataset download failed or path is invalid.")
  print(f"Please ensure 'dataset_path_runtime' is set correctly to: {dataset_path_runtime}")

# This variable will be used in the next step
print(f"\n'target_csv_path_runtime' is now set to: {target_csv_path_runtime}")


Inspecting downloaded dataset contents...
Identified target CSV file at: /kaggle/input/edgeiiotset-cyber-security-dataset-of-iot-iiot/Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.csv

'target_csv_path_runtime' is now set to: /kaggle/input/edgeiiotset-cyber-security-dataset-of-iot-iiot/Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.csv


In [None]:
# Step II.D: Copy Target CSV to Google Drive for Persistence
import shutil # Import shutil for file operations

# Ensure 'os' module was imported earlier
# Ensure 'target_csv_path_runtime' holds the correct path from step II.C

# Define the destination path in Google Drive
# You can change 'Colab Notebooks/datasets/' if you prefer a different location
destination_folder_drive = '/content/drive/MyDrive/Colab Notebooks/datasets/'
destination_path_drive = os.path.join(destination_folder_drive, target_csv_filename) # Reuse target_csv_filename

if target_csv_path_runtime and os.path.exists(target_csv_path_runtime):
  try:
    # Create the destination directory in Google Drive if it doesn't exist
    os.makedirs(destination_folder_drive, exist_ok=True)
    print(f"Destination folder '{destination_folder_drive}' ensured.")

    # Copy the file
    shutil.copyfile(target_csv_path_runtime, destination_path_drive)
    print(f"\nSuccessfully copied:")
    print(f"  FROM (runtime): {target_csv_path_runtime}")
    print(f"  TO (Drive):     {destination_path_drive}")

    # Verify the copy (optional but recommended)
    if os.path.exists(destination_path_drive):
      print("\nCopy verified successfully in Google Drive.")
      # Store the Drive path for the next step
      target_csv_path_drive = destination_path_drive
    else:
      print("\nError: Copy verification failed. File not found at destination.")
      target_csv_path_drive = None

  except Exception as e:
    print(f"\nError copying file to Google Drive: {e}")
    target_csv_path_drive = None

else:
  print("\nSkipping copy: Source CSV path is invalid or file does not exist.")
  print(f"Please ensure 'target_csv_path_runtime' ({target_csv_path_runtime}) is correct.")
  target_csv_path_drive = None

# This variable holds the path in Google Drive for future steps
print(f"\n'target_csv_path_drive' is now set to: {target_csv_path_drive}")

Destination folder '/content/drive/MyDrive/Colab Notebooks/datasets/' ensured.

Successfully copied:
  FROM (runtime): /kaggle/input/edgeiiotset-cyber-security-dataset-of-iot-iiot/Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.csv
  TO (Drive):     /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv

Copy verified successfully in Google Drive.

'target_csv_path_drive' is now set to: /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv


In [None]:
# Step III: Load the Dataset from Google Drive using Pandas
import pandas as pd

# Ensure 'target_csv_path_drive' holds the correct Google Drive path from step II.D
# Ensure 'os' module was imported earlier if you want to re-verify path existence

# Define the path explicitly if the variable is not available (replace if needed)
# target_csv_path_drive = '/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv'

df = None # Initialize DataFrame variable

if target_csv_path_drive and os.path.exists(target_csv_path_drive):
  try:
    print(f"Loading dataset from: {target_csv_path_drive}")
    # Load the CSV file into a pandas DataFrame
    # The PDF mentions potential "low_memory=False" usage if dtype warnings appear
    df = pd.read_csv(target_csv_path_drive, low_memory=False)

    print("\nDataset loaded successfully.")
    # Display the dimensions (shape) of the DataFrame
    print(f"DataFrame shape (rows, columns): {df.shape}")

    # Display the first few rows to verify loading
    print("\nFirst 5 rows of the DataFrame:")
    print(df.head())

  except FileNotFoundError:
    print(f"Error: File not found at the specified path: {target_csv_path_drive}")
    print("Please ensure the path is correct and the file exists in your Google Drive.")
  except pd.errors.EmptyDataError:
    print(f"Error: The file at {target_csv_path_drive} is empty.")
  except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
else:
  print("\nSkipping dataset loading: Invalid or non-existent path.")
  print(f"Please ensure 'target_csv_path_drive' ({target_csv_path_drive}) is correct.")

# The 'df' variable now holds the loaded data (or None if loading failed)

Loading dataset from: /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv

Dataset loaded successfully.
DataFrame shape (rows, columns): (157800, 63)

First 5 rows of the DataFrame:
  frame.time    ip.src_host ip.dst_host arp.dst.proto_ipv4  arp.opcode  \
0        6.0  192.168.0.152         0.0                0.0         0.0   
1        6.0  192.168.0.101         0.0                0.0         0.0   
2        6.0  192.168.0.152         0.0                0.0         0.0   
3        6.0  192.168.0.101         0.0                0.0         0.0   
4        6.0  192.168.0.152         0.0                0.0         0.0   

   arp.hw.size arp.src.proto_ipv4  icmp.checksum  icmp.seq_le  \
0          0.0                0.0            0.0          0.0   
1          0.0                0.0            0.0          0.0   
2          0.0                0.0            0.0          0.0   
3          0.0                0.0            0.0          0.0   
4          0.0              

In [None]:
# Step IV: Initial Data Exploration

# Ensure the DataFrame 'df' exists from Step III

if df is not None:
  print("--- IV.A: Displaying Basic Information ---")
  # Get concise summary: column names, non-null counts, data types, memory usage
  df.info()

  print("\n\n--- IV.B: Checking for Missing Values (Sum per Column) ---")
  # Calculate and display the total number of missing values for each column
  missing_values = df.isnull().sum()
  print(missing_values[missing_values > 0]) # Only display columns with missing values
  if missing_values.sum() == 0:
      print("No missing values found.")

  print("\n\n--- IV.C: Generating Descriptive Statistics (Numerical Columns) ---")
  # Generate statistics like count, mean, std, min, max, quartiles for numerical columns
  # Using .describe(include='all') might show stats for object types too if needed
  print(df.describe())

  # Optional: If you suspect object/categorical columns need separate stats
  # print("\n\n--- Descriptive Statistics (Object Columns) ---")
  # print(df.describe(include='object'))

else:
  print("Skipping Data Exploration: DataFrame 'df' is not loaded.")

--- IV.A: Displaying Basic Information ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157800 entries, 0 to 157799
Data columns (total 63 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   frame.time                 157800 non-null  object 
 1   ip.src_host                157800 non-null  object 
 2   ip.dst_host                157800 non-null  object 
 3   arp.dst.proto_ipv4         157800 non-null  object 
 4   arp.opcode                 157800 non-null  float64
 5   arp.hw.size                157800 non-null  float64
 6   arp.src.proto_ipv4         157800 non-null  object 
 7   icmp.checksum              157800 non-null  float64
 8   icmp.seq_le                157800 non-null  float64
 9   icmp.transmit_timestamp    157800 non-null  float64
 10  icmp.unused                157800 non-null  float64
 11  http.file_data             157800 non-null  object 
 12  http.content_length        157800 non-null 

In [None]:
# Step V.A: Handle Mixed Data Types - Convert Object Columns to Numeric

# Ensure the DataFrame 'df' exists from Step III

if df is not None:
  print("--- V.A: Converting relevant 'object' columns to numeric ---")

  # Columns identified as 'object' that likely should be numeric based on PDF/info()
  cols_to_convert = [
      'frame.time',
      'ip.src_host',
      'ip.dst_host',
      'arp.dst.proto_ipv4',
      'arp.src.proto_ipv4',
      'http.file_data', # Might contain non-numeric strings
      'http.request.uri.query', # Might contain non-numeric strings
      'http.request.method', # Treat as numeric for now, may revisit
      'http.referer', # Might contain non-numeric strings
      'http.request.full_uri', # Might contain non-numeric strings
      'http.request.version', # E.g., '1.1' - needs conversion
      'tcp.options', # Complex, likely non-numeric
      'tcp.payload', # Hex or byte strings, likely non-numeric
      'tcp.srcport', # Should be numeric port number
      'dns.qry.name.len', # Should be numeric length
      'mqtt.conack.flags', # Often hex strings
      'mqtt.msg', # Message content, can be varied
      'mqtt.protoname', # Protocol name string
      'mqtt.topic' # Topic string
      # Note: 'Attack_type' is object but is a label, handle later if needed
  ]

  # Keep track of NaNs introduced
  nan_counts_before = df.isnull().sum()

  for col in cols_to_convert:
    if col in df.columns and df[col].dtype == 'object':
      print(f"Converting column: {col}")
      original_non_numeric = pd.to_numeric(df[col], errors='coerce').isnull().sum()
      df[col] = pd.to_numeric(df[col], errors='coerce')
      new_nan_count = df[col].isnull().sum()
      introduced_nans = new_nan_count - nan_counts_before.get(col, 0)
      if introduced_nans > 0:
          print(f"  -> Introduced {introduced_nans} NaN values by coercing non-numeric entries.")
      else:
          print(f"  -> Conversion successful without introducing new NaNs (or column was already numeric).")
    elif col not in df.columns:
       print(f"Skipping column {col}: Not found in DataFrame.")
    # else: Column exists but is not object, skip conversion


  print("\n--- Data types after conversion attempt: ---")
  df.info() # Display info again to see changes in Dtypes and non-null counts

else:
  print("Skipping Data Cleaning: DataFrame 'df' is not loaded.")

--- V.A: Converting relevant 'object' columns to numeric ---
Converting column: frame.time
  -> Introduced 156586 NaN values by coercing non-numeric entries.
Converting column: ip.src_host
  -> Introduced 149809 NaN values by coercing non-numeric entries.
Converting column: ip.dst_host
  -> Introduced 136372 NaN values by coercing non-numeric entries.
Converting column: arp.dst.proto_ipv4
  -> Introduced 2976 NaN values by coercing non-numeric entries.
Converting column: arp.src.proto_ipv4
  -> Introduced 1574 NaN values by coercing non-numeric entries.
Converting column: http.file_data
  -> Introduced 9471 NaN values by coercing non-numeric entries.
Converting column: http.request.uri.query
  -> Introduced 2751 NaN values by coercing non-numeric entries.
Converting column: http.request.method
  -> Introduced 7196 NaN values by coercing non-numeric entries.
Converting column: http.referer
  -> Introduced 290 NaN values by coercing non-numeric entries.
Converting column: http.request.fu

In [None]:
# Step V.B: Handle Infinity Values

import numpy as np # Import numpy for inf representation

# Ensure the DataFrame 'df' exists and has been processed by Step V.A

if df is not None:
  print("--- V.B: Checking for and replacing infinite values ---")

  # Count infinite values before replacement
  infinite_values_count = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
  print(f"Found {infinite_values_count} infinite values (inf or -inf) before replacement.")

  if infinite_values_count > 0:
    # Replace infinite values with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Verify replacement (optional)
    infinite_values_count_after = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
    print(f"Replaced infinite values with NaN. Found {infinite_values_count_after} infinite values after replacement.")
  else:
    print("No infinite values found to replace.")

  # Display NaN counts again to see if any new NaNs were added from infinities
  print("\n--- Current NaN counts per column (including any from replaced infinities): ---")
  nan_counts = df.isnull().sum()
  print(nan_counts[nan_counts > 0])
  if nan_counts.sum() == 0:
      print("No missing (NaN) values found.")

else:
  print("Skipping Infinity Handling: DataFrame 'df' is not loaded.")

--- V.B: Checking for and replacing infinite values ---
Found 0 infinite values (inf or -inf) before replacement.
No infinite values found to replace.

--- Current NaN counts per column (including any from replaced infinities): ---
Series([], dtype: int64)
No missing (NaN) values found.


In [None]:
# Step V.C: Handle Missing Values (NaN Imputation using Mean)

# Ensure the DataFrame 'df' exists and has been processed by Steps V.A & V.B

if df is not None:
  print("--- V.C: Imputing NaN values using the mean ---")

  # Identify numeric columns with NaN values
  cols_with_nan = df.isnull().sum()
  cols_to_impute = cols_with_nan[cols_with_nan > 0].index.tolist()

  # Exclude non-numeric columns explicitly if necessary (Attack_type is object, so select_dtypes handles it)
  # numeric_cols_to_impute = df[cols_to_impute].select_dtypes(include=np.number).columns

  print("Columns identified for mean imputation:")
  imputed_cols_list = []

  for col in cols_to_impute:
      # Check if column is numeric before imputing
      if pd.api.types.is_numeric_dtype(df[col]):
          col_mean = df[col].mean()
          df[col].fillna(col_mean, inplace=True)
          imputed_cols_list.append(col)
          # print(f"  - Imputed NaNs in '{col}' with mean: {col_mean:.4f}") # Optional: print mean used
      else:
          print(f"  - Skipped non-numeric column: '{col}'")

  if imputed_cols_list:
      print(f"\nImputed NaNs in the following {len(imputed_cols_list)} numeric columns using their respective means:")
      print(imputed_cols_list)
  else:
      print("\nNo numeric columns required imputation.")


  # Verify that NaNs have been handled in numeric columns
  print("\n--- NaN counts per column after imputation: ---")
  nan_counts_after = df.isnull().sum()
  print(nan_counts_after[nan_counts_after > 0])
  if nan_counts_after.sum() == 0:
      print("No missing (NaN) values remaining in the dataset.")
  elif df[nan_counts_after > 0].select_dtypes(include=np.number).shape[1] == 0 :
       print("No missing (NaN) values remaining in numeric columns.")
       print("Remaining NaNs are likely in non-numeric columns (e.g., object types).")

  print("\n--- Data types after imputation: ---")
  df.info() # Check if any dtypes changed (shouldn't have)

else:
  print("Skipping NaN Imputation: DataFrame 'df' is not loaded.")

--- V.C: Imputing NaN values using the mean ---
Columns identified for mean imputation:

No numeric columns required imputation.

--- NaN counts per column after imputation: ---
Series([], dtype: int64)
No missing (NaN) values remaining in the dataset.

--- Data types after imputation: ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157800 entries, 0 to 157799
Data columns (total 63 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   frame.time                 157800 non-null  object 
 1   ip.src_host                157800 non-null  object 
 2   ip.dst_host                157800 non-null  object 
 3   arp.dst.proto_ipv4         157800 non-null  object 
 4   arp.opcode                 157800 non-null  float64
 5   arp.hw.size                157800 non-null  float64
 6   arp.src.proto_ipv4         157800 non-null  object 
 7   icmp.checksum              157800 non-null  float64
 8   icmp.seq_le            

In [None]:
# Re-apply COMPLETE Step V: Data Cleaning (on DataFrame 'df')

import pandas as pd
import numpy as np

# --- Ensure DataFrame 'df' is loaded and available ---
if 'df' in locals() and df is not None:
    print("--- Applying Step V.A: Converting relevant 'object' columns to numeric ---")
    # Columns identified as 'object' that likely should be numeric
    cols_to_convert = [
        'frame.time', 'ip.src_host', 'ip.dst_host', 'arp.dst.proto_ipv4',
        'arp.src.proto_ipv4', 'http.file_data', 'http.request.uri.query',
        'http.request.method', 'http.referer', 'http.request.full_uri',
        'http.request.version', 'tcp.options', 'tcp.payload', 'tcp.srcport',
        'dns.qry.name.len', 'mqtt.conack.flags', 'mqtt.msg',
        'mqtt.protoname', 'mqtt.topic'
    ]
    nan_counts_before = df.isnull().sum()
    for col in cols_to_convert:
        if col in df.columns and df[col].dtype == 'object':
            print(f"Converting column: {col}")
            df[col] = pd.to_numeric(df[col], errors='coerce')
            new_nan_count = df[col].isnull().sum()
            introduced_nans = new_nan_count - nan_counts_before.get(col, 0)
            if introduced_nans > 0:
                print(f"  -> Introduced {introduced_nans} NaN values.")
            else:
                print(f"  -> Conversion complete.")
        elif col not in df.columns:
            print(f"Skipping column {col}: Not found.")

    print("\n--- Applying Step V.B: Checking for and replacing infinite values ---")
    infinite_values_count = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
    if infinite_values_count > 0:
        print(f"Found {infinite_values_count} infinite values. Replacing with NaN.")
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
    else:
        print("No infinite values found.")

    print("\n--- Applying Step V.C: Imputing NaN values using the mean ---")
    cols_with_nan = df.isnull().sum()
    cols_to_impute = cols_with_nan[cols_with_nan > 0].index.tolist()
    imputed_cols_list = []
    for col in cols_to_impute:
        if pd.api.types.is_numeric_dtype(df[col]):
            col_mean = df[col].mean()
            df[col].fillna(col_mean, inplace=True)
            imputed_cols_list.append(col)
        else:
             print(f"  - Skipped non-numeric column during imputation: '{col}'") # Should only be Attack_type if present
    if imputed_cols_list:
         print(f"Imputed NaNs in {len(imputed_cols_list)} numeric columns using mean.")
    else:
         print("No numeric columns required imputation.")

    print("\n--- Verification after re-applying Step V ---")
    print("Checking final NaN counts in 'df':")
    final_nans = df.isnull().sum().sum()
    if final_nans == 0:
        print("  Success: No NaN values remaining in 'df'.")
    else:
        print(f"  Warning: {final_nans} NaN values still remain in 'df'.")
        print(df.isnull().sum()[df.isnull().sum() > 0])

    print("\nChecking final data types in 'df':")
    df.info()

else:
    print("Error: DataFrame 'df' not found. Please load the data using Step III code first.")

--- Applying Step V.A: Converting relevant 'object' columns to numeric ---
Converting column: frame.time
  -> Introduced 156586 NaN values.
Converting column: ip.src_host
  -> Introduced 149809 NaN values.
Converting column: ip.dst_host
  -> Introduced 136372 NaN values.
Converting column: arp.dst.proto_ipv4
  -> Introduced 2976 NaN values.
Converting column: arp.src.proto_ipv4
  -> Introduced 1574 NaN values.
Converting column: http.file_data
  -> Introduced 9471 NaN values.
Converting column: http.request.uri.query
  -> Introduced 2751 NaN values.
Converting column: http.request.method
  -> Introduced 7196 NaN values.
Converting column: http.referer
  -> Introduced 290 NaN values.
Converting column: http.request.full_uri
  -> Introduced 7174 NaN values.
Converting column: http.request.version
  -> Introduced 7196 NaN values.
Converting column: tcp.options
  -> Introduced 79280 NaN values.
Converting column: tcp.payload
  -> Introduced 34608 NaN values.
Converting column: tcp.srcport


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(col_mean, inplace=True)


In [None]:
# Step VI: Separate Features (X) and Target Variables (y)

# Ensure the DataFrame 'df' exists and is cleaned

if df is not None:
    print("--- VI: Separating Features (X) and Target Variables (y) ---")

    try:
        # Define target column names
        target_label_col = 'Attack_label' # Binary label
        target_type_col = 'Attack_type'  # Categorical type

        # Create the features DataFrame (X) by dropping the target columns
        X = df.drop(columns=[target_label_col, target_type_col])

        # Create the target Series (y)
        y_label = df[target_label_col] # Binary target
        y_type = df[target_type_col]   # Categorical target

        print("Features (X) and Target variables (y_label, y_type) created successfully.")
        print(f"\nShape of Features (X): {X.shape}")
        print(f"Shape of Target (y_label): {y_label.shape}")
        print(f"Shape of Target (y_type): {y_type.shape}")

        print("\nFirst 5 rows of Features (X):")
        print(X.head())

        print("\nFirst 5 rows of Target (y_label):")
        print(y_label.head())

        print("\nFirst 5 rows of Target (y_type):")
        print(y_type.head())

    except KeyError as e:
        print(f"Error separating features and target: Column not found - {e}")
        print("Please ensure 'Attack_label' and 'Attack_type' columns exist in the DataFrame.")
        X = None
        y_label = None
        y_type = None
    except Exception as e:
        print(f"An error occurred during feature/target separation: {e}")
        X = None
        y_label = None
        y_type = None
else:
    print("Skipping Feature/Target Separation: DataFrame 'df' is not loaded.")

--- VI: Separating Features (X) and Target Variables (y) ---
Features (X) and Target variables (y_label, y_type) created successfully.

Shape of Features (X): (157800, 61)
Shape of Target (y_label): (157800,)
Shape of Target (y_type): (157800,)

First 5 rows of Features (X):
   frame.time  ip.src_host  ip.dst_host  arp.dst.proto_ipv4  arp.opcode  \
0         6.0          0.0          0.0                 0.0         0.0   
1         6.0          0.0          0.0                 0.0         0.0   
2         6.0          0.0          0.0                 0.0         0.0   
3         6.0          0.0          0.0                 0.0         0.0   
4         6.0          0.0          0.0                 0.0         0.0   

   arp.hw.size  arp.src.proto_ipv4  icmp.checksum  icmp.seq_le  \
0          0.0                 0.0            0.0          0.0   
1          0.0                 0.0            0.0          0.0   
2          0.0                 0.0            0.0          0.0   
3        

In [None]:
# Step VII: Encode Categorical Target Variable (y_type)

from sklearn.preprocessing import LabelEncoder
import numpy as np # Import numpy for mapping display

# Ensure y_type exists from Step VI

if 'y_type' in locals() and y_type is not None:
    print("--- VII: Encoding Categorical Target Variable (y_type) using LabelEncoder ---")

    try:
        # Initialize the LabelEncoder
        label_encoder = LabelEncoder()

        # Fit the encoder on the target variable and transform it
        y_type_encoded = label_encoder.fit_transform(y_type)

        print("Categorical target 'y_type' encoded successfully.")

        # Display the mapping from original labels to encoded numbers
        print("\nMapping (Original Label -> Encoded Number):")
        # Use numpy unique on original y_type and corresponding encoded values
        original_labels = label_encoder.classes_
        encoded_labels = label_encoder.transform(original_labels)
        for label, encoded in zip(original_labels, encoded_labels):
             print(f"  '{label}' -> {encoded}")


        print("\nFirst 10 original 'y_type' values:")
        print(y_type.head(10))

        print("\nFirst 10 encoded 'y_type_encoded' values:")
        print(y_type_encoded[:10])

        print(f"\nShape of encoded target (y_type_encoded): {y_type_encoded.shape}")

    except Exception as e:
        print(f"An error occurred during Label Encoding: {e}")
        y_type_encoded = None

else:
    print("Skipping Label Encoding: Target variable 'y_type' is not defined.")

--- VII: Encoding Categorical Target Variable (y_type) using LabelEncoder ---
Categorical target 'y_type' encoded successfully.

Mapping (Original Label -> Encoded Number):
  'Backdoor' -> 0
  'DDoS_HTTP' -> 1
  'DDoS_ICMP' -> 2
  'DDoS_TCP' -> 3
  'DDoS_UDP' -> 4
  'Fingerprinting' -> 5
  'MITM' -> 6
  'Normal' -> 7
  'Password' -> 8
  'Port_Scanning' -> 9
  'Ransomware' -> 10
  'SQL_injection' -> 11
  'Uploading' -> 12
  'Vulnerability_scanner' -> 13
  'XSS' -> 14

First 10 original 'y_type' values:
0    MITM
1    MITM
2    MITM
3    MITM
4    MITM
5    MITM
6    MITM
7    MITM
8    MITM
9    MITM
Name: Attack_type, dtype: object

First 10 encoded 'y_type_encoded' values:
[6 6 6 6 6 6 6 6 6 6]

Shape of encoded target (y_type_encoded): (157800,)


In [None]:
# Step VIII: Split Data into Training and Testing Sets

from sklearn.model_selection import train_test_split

# Ensure X, y_label, and y_type_encoded exist from previous steps

if 'X' in locals() and X is not None and \
   'y_label' in locals() and y_label is not None and \
   'y_type_encoded' in locals() and y_type_encoded is not None:

    print("--- VIII: Splitting data into Training (80%) and Testing (20%) sets ---")

    try:
        # Define test size (e.g., 0.20 for 20%)
        test_set_size = 0.20
        # Use a fixed random_state for reproducibility
        random_seed = 42

        # Split the data, stratifying on the binary label (y_label)
        X_train, X_test, y_label_train, y_label_test, y_type_train, y_type_test = train_test_split(
            X,
            y_label,
            y_type_encoded,
            test_size=test_set_size,
            random_state=random_seed,
            stratify=y_label # Stratify based on the binary attack label
        )

        print("Data splitting completed successfully.")
        print("\nShapes of the resulting sets:")
        print(f"  X_train shape:      {X_train.shape}")
        print(f"  X_test shape:       {X_test.shape}")
        print(f"  y_label_train shape: {y_label_train.shape}")
        print(f"  y_label_test shape:  {y_label_test.shape}")
        print(f"  y_type_train shape:  {y_type_train.shape}")
        print(f"  y_type_test shape:   {y_type_test.shape}")

        # Optional: Verify stratification by checking class distribution (e.g., for y_label)
        print("\nVerifying stratification (distribution of Attack_label in train/test):")
        print(f"  y_label Original distribution:\n{y_label.value_counts(normalize=True)}")
        print(f"  y_label_train distribution:\n{y_label_train.value_counts(normalize=True)}")
        print(f"  y_label_test distribution:\n{y_label_test.value_counts(normalize=True)}")


    except Exception as e:
        print(f"An error occurred during data splitting: {e}")
        # Reset variables in case of error
        X_train, X_test, y_label_train, y_label_test, y_type_train, y_type_test = [None]*6

else:
    print("Skipping Data Splitting: Features (X) or Target variables (y_label, y_type_encoded) are not defined.")

--- VIII: Splitting data into Training (80%) and Testing (20%) sets ---
Data splitting completed successfully.

Shapes of the resulting sets:
  X_train shape:      (126240, 61)
  X_test shape:       (31560, 61)
  y_label_train shape: (126240,)
  y_label_test shape:  (31560,)
  y_type_train shape:  (126240,)
  y_type_test shape:   (31560,)

Verifying stratification (distribution of Attack_label in train/test):
  y_label Original distribution:
Attack_label
1    0.846001
0    0.153999
Name: proportion, dtype: float64
  y_label_train distribution:
Attack_label
1    0.846
0    0.154
Name: proportion, dtype: float64
  y_label_test distribution:
Attack_label
1    0.846008
0    0.153992
Name: proportion, dtype: float64


In [None]:
# Step IX: Feature Scaling using StandardScaler

from sklearn.preprocessing import StandardScaler
import pandas as pd # Import pandas to display scaled data as DataFrame

# Ensure X_train and X_test exist from Step VIII

if 'X_train' in locals() and X_train is not None and \
   'X_test' in locals() and X_test is not None:

    print("--- IX: Scaling features using StandardScaler ---")
    print("Fitting scaler on X_train and transforming X_train & X_test...")

    try:
        # Initialize the StandardScaler
        scaler = StandardScaler()

        # Fit the scaler ONLY on the training data features
        scaler.fit(X_train)

        # Transform both the training and testing data features
        # The output will be numpy arrays, retain column names for clarity if needed
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        print("Feature scaling completed successfully.")

        # Convert scaled arrays back to DataFrames to show head() with column names
        X_train_scaled_df = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
        # X_test_scaled_df = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns) # Optional: create for X_test too

        print("\nShapes of the scaled feature sets:")
        print(f"  X_train_scaled shape: {X_train_scaled.shape}")
        print(f"  X_test_scaled shape:  {X_test_scaled.shape}")

        print("\nFirst 5 rows of scaled training data (X_train_scaled):")
        # Displaying the DataFrame version for readability
        print(X_train_scaled_df.head())

        # Optional: Verify mean and std dev of scaled training data (should be close to 0 and 1)
        # print("\nMean of scaled training data (first 5 columns):")
        # print(X_train_scaled_df.mean()[:5])
        # print("\nStandard Deviation of scaled training data (first 5 columns):")
        # print(X_train_scaled_df.std()[:5])


    except Exception as e:
        print(f"An error occurred during feature scaling: {e}")
        X_train_scaled = None
        X_test_scaled = None

else:
    print("Skipping Feature Scaling: Training data (X_train) or Testing data (X_test) not defined.")

--- IX: Scaling features using StandardScaler ---
Fitting scaler on X_train and transforming X_train & X_test...
Feature scaling completed successfully.

Shapes of the scaled feature sets:
  X_train_scaled shape: (126240, 61)
  X_test_scaled shape:  (31560, 61)

First 5 rows of scaled training data (X_train_scaled):
        frame.time  ip.src_host  ip.dst_host  arp.dst.proto_ipv4  arp.opcode  \
115205    0.002788          0.0          0.0                 0.0   -0.095086   
120894    0.002788          0.0          0.0                 0.0   -0.095086   
100324    0.002788          0.0          0.0                 0.0   -0.095086   
144944    0.002788          0.0          0.0                 0.0   -0.095086   
21612     0.002788          0.0          0.0                 0.0   -0.095086   

        arp.hw.size  arp.src.proto_ipv4  icmp.checksum  icmp.seq_le  \
115205    -0.100729                 0.0      -0.274174    -0.284575   
120894    -0.100729                 0.0      -0.274174    -

  temp **= 2
  new_unnormalized_variance -= correction**2 / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2


In [None]:
# Diagnostic Step: Identify Non-Numeric Columns in X_train

import pandas as pd

# Ensure X_train exists from Step VIII

if 'X_train' in locals() and X_train is not None and isinstance(X_train, pd.DataFrame):
    print("--- Diagnosing non-numeric data in X_train columns ---")
    non_numeric_cols = []
    for col in X_train.columns:
        try:
            # Attempt to convert the column to numeric.
            # If it fails for any value, it will raise an error.
            pd.to_numeric(X_train[col])
        except (ValueError, TypeError):
            # ValueError occurs for strings that can't be floats
            # TypeError might occur for other non-numeric types
            non_numeric_cols.append(col)
            print(f"  - Found non-numeric data in column: {col}")
            # Optional: Show a sample of non-numeric values
            # try:
            #     is_numeric = pd.to_numeric(X_train[col], errors='coerce').isna()
            #     print(f"    Sample non-numeric values:\n{X_train[col][is_numeric].unique()[:5]}")
            # except Exception:
            #     print("    Could not retrieve non-numeric samples.")


    if not non_numeric_cols:
        print("\nDiagnosis complete: No columns found with strictly non-numeric data that prevents basic conversion.")
        print("The error might be more subtle (e.g., mixed types not caught easily, or error in scaler itself).")
        print("Let's re-check data types:")
        print(X_train.info())
    else:
        print(f"\nDiagnosis complete: Found {len(non_numeric_cols)} column(s) containing non-numeric data:")
        print(non_numeric_cols)
        print("\nPlease re-examine Step V (Data Cleaning) for these columns before attempting Step IX (Scaling) again.")

elif 'X_train' not in locals() or X_train is None:
     print("Skipping Diagnosis: DataFrame 'X_train' is not defined.")
elif not isinstance(X_train, pd.DataFrame):
     print(f"Skipping Diagnosis: 'X_train' is not a Pandas DataFrame (Type: {type(X_train)}). Scaling requires DataFrame or NumPy array.")

--- Diagnosing non-numeric data in X_train columns ---
  - Found non-numeric data in column: frame.time
  - Found non-numeric data in column: ip.src_host
  - Found non-numeric data in column: ip.dst_host
  - Found non-numeric data in column: arp.dst.proto_ipv4
  - Found non-numeric data in column: arp.src.proto_ipv4
  - Found non-numeric data in column: http.file_data
  - Found non-numeric data in column: http.request.uri.query
  - Found non-numeric data in column: http.request.method
  - Found non-numeric data in column: http.referer
  - Found non-numeric data in column: http.request.full_uri
  - Found non-numeric data in column: http.request.version
  - Found non-numeric data in column: tcp.options
  - Found non-numeric data in column: tcp.payload
  - Found non-numeric data in column: tcp.srcport
  - Found non-numeric data in column: dns.qry.name.len
  - Found non-numeric data in column: mqtt.conack.flags
  - Found non-numeric data in column: mqtt.msg
  - Found non-numeric data in co

In [None]:
# Step X: Save Processed Data to Google Drive

import numpy as np
import os
import pandas as pd # Needed if targets are still Pandas Series

# Define the folder path in Google Drive where you want to save the files
# You can change this path if you prefer a different location
save_folder_drive = '/content/drive/MyDrive/Colab Notebooks/processed_data/'

# Ensure all the necessary variables exist from previous steps
required_vars = ['X_train_scaled', 'X_test_scaled', 'y_label_train', 'y_label_test', 'y_type_train', 'y_type_test']
vars_exist = all(var in locals() and locals()[var] is not None for var in required_vars)

if vars_exist:
    print(f"--- Saving processed data to: {save_folder_drive} ---")

    try:
        # Create the destination directory in Google Drive if it doesn't exist
        os.makedirs(save_folder_drive, exist_ok=True)
        print(f"Destination folder '{save_folder_drive}' ensured.")

        # Define file paths
        paths = {
            'X_train': os.path.join(save_folder_drive, 'X_train_scaled.npy'),
            'X_test': os.path.join(save_folder_drive, 'X_test_scaled.npy'),
            'y_label_train': os.path.join(save_folder_drive, 'y_label_train.npy'),
            'y_label_test': os.path.join(save_folder_drive, 'y_label_test.npy'),
            'y_type_train': os.path.join(save_folder_drive, 'y_type_train.npy'),
            'y_type_test': os.path.join(save_folder_drive, 'y_type_test.npy')
        }

        # Data arrays map
        data_to_save = {
            'X_train': X_train_scaled,
            'X_test': X_test_scaled,
            # Convert pandas Series to numpy array before saving if needed
            'y_label_train': y_label_train.to_numpy() if isinstance(y_label_train, pd.Series) else y_label_train,
            'y_label_test': y_label_test.to_numpy() if isinstance(y_label_test, pd.Series) else y_label_test,
            'y_type_train': y_type_train, # Already numpy array from LabelEncoder/split
            'y_type_test': y_type_test   # Already numpy array from LabelEncoder/split
        }

        # Save each array
        for name, data in data_to_save.items():
            file_path = paths[name]
            np.save(file_path, data)
            print(f"  - Saved {name} data to {file_path} (shape: {data.shape})")

        print("\nAll processed data arrays saved successfully to Google Drive.")

        # Optional: Verify files exist
        print("\nVerifying saved files:")
        all_files_exist = True
        for name, file_path in paths.items():
            if os.path.exists(file_path):
                print(f"  - Verified: {file_path}")
            else:
                print(f"  - Verification FAILED for: {file_path}")
                all_files_exist = False
        if all_files_exist:
            print("All files verified successfully.")


    except Exception as e:
        print(f"An error occurred while saving data: {e}")

else:
    print("Skipping Save Data: One or more required data variables are not defined.")
    print("Please ensure X_train_scaled, X_test_scaled, y_label_train, y_label_test, y_type_train, y_type_test exist.")

Skipping Save Data: One or more required data variables are not defined.
Please ensure X_train_scaled, X_test_scaled, y_label_train, y_label_test, y_type_train, y_type_test exist.


In [None]:
# Diagnostic & Save Step: Verify variable existence then attempt saving

import numpy as np
import os
import pandas as pd

print("--- Verifying existence and type of required variables ---")

required_vars = {
    'X_train_scaled': (np.ndarray), # Expect NumPy array after scaling
    'X_test_scaled': (np.ndarray),  # Expect NumPy array after scaling
    'y_label_train': (pd.Series, np.ndarray), # Could be Series or Array
    'y_label_test': (pd.Series, np.ndarray),  # Could be Series or Array
    'y_type_train': (np.ndarray), # Expect NumPy array after encoding/split
    'y_type_test': (np.ndarray)   # Expect NumPy array after encoding/split
}

all_vars_found = True
vars_data = {}

for var_name, expected_type in required_vars.items():
    if var_name in locals():
        var_value = locals()[var_name]
        if var_value is not None:
            print(f"  - Found '{var_name}' of type: {type(var_value)}")
            # Simple check if it's one of the expected types
            if not isinstance(var_value, expected_type):
                 print(f"    - Warning: Type is not as expected ({expected_type}).")
            vars_data[var_name] = var_value # Store for saving
        else:
            print(f"  - Found '{var_name}' but it is None.")
            all_vars_found = False
    else:
        print(f"  - Did NOT find '{var_name}'.")
        all_vars_found = False

if not all_vars_found:
    print("\nError: One or more required variables were not found or are None. Cannot proceed with saving.")
else:
    print("\nVerification successful: All required variables appear to exist.")
    print("--- Attempting to save data (Step X) ---")

    # Define the folder path in Google Drive
    save_folder_drive = '/content/drive/MyDrive/Colab Notebooks/processed_data/'

    try:
        # Create the destination directory
        os.makedirs(save_folder_drive, exist_ok=True)
        print(f"Destination folder '{save_folder_drive}' ensured.")

        # Define file paths
        paths = {
            'X_train': os.path.join(save_folder_drive, 'X_train_scaled.npy'),
            'X_test': os.path.join(save_folder_drive, 'X_test_scaled.npy'),
            'y_label_train': os.path.join(save_folder_drive, 'y_label_train.npy'),
            'y_label_test': os.path.join(save_folder_drive, 'y_label_test.npy'),
            'y_type_train': os.path.join(save_folder_drive, 'y_type_train.npy'),
            'y_type_test': os.path.join(save_folder_drive, 'y_type_test.npy')
        }

        # Data arrays map (using verified vars_data)
        data_to_save = {
            'X_train': vars_data['X_train_scaled'],
            'X_test': vars_data['X_test_scaled'],
            'y_label_train': vars_data['y_label_train'].to_numpy() if isinstance(vars_data['y_label_train'], pd.Series) else vars_data['y_label_train'],
            'y_label_test': vars_data['y_label_test'].to_numpy() if isinstance(vars_data['y_label_test'], pd.Series) else vars_data['y_label_test'],
            'y_type_train': vars_data['y_type_train'],
            'y_type_test': vars_data['y_type_test']
        }

        # Save each array
        for name, data in data_to_save.items():
            file_path = paths[name]
            np.save(file_path, data)
            print(f"  - Saved {name} data to {file_path} (shape: {data.shape})")

        print("\nAll processed data arrays saved successfully to Google Drive.")

        # Optional: Verify files exist
        print("\nVerifying saved files:")
        all_files_exist_verify = True
        for name, file_path in paths.items():
            if os.path.exists(file_path):
                print(f"  - Verified: {file_path}")
            else:
                print(f"  - Verification FAILED for: {file_path}")
                all_files_exist_verify = False
        if all_files_exist_verify:
            print("All files verified successfully.")

    except Exception as e:
        print(f"\nAn error occurred while saving data: {e}")

--- Verifying existence and type of required variables ---
  - Found 'X_train_scaled' of type: <class 'numpy.ndarray'>
  - Found 'X_test_scaled' of type: <class 'numpy.ndarray'>
  - Found 'y_label_train' of type: <class 'pandas.core.series.Series'>
  - Found 'y_label_test' of type: <class 'pandas.core.series.Series'>
  - Found 'y_type_train' of type: <class 'numpy.ndarray'>
  - Found 'y_type_test' of type: <class 'numpy.ndarray'>

Verification successful: All required variables appear to exist.
--- Attempting to save data (Step X) ---
Destination folder '/content/drive/MyDrive/Colab Notebooks/processed_data/' ensured.
  - Saved X_train data to /content/drive/MyDrive/Colab Notebooks/processed_data/X_train_scaled.npy (shape: (126240, 61))
  - Saved X_test data to /content/drive/MyDrive/Colab Notebooks/processed_data/X_test_scaled.npy (shape: (31560, 61))
  - Saved y_label_train data to /content/drive/MyDrive/Colab Notebooks/processed_data/y_label_train.npy (shape: (126240,))
  - Saved y_

# Exploratory Data Analysis of the Edge-IIoTset Dataset

In [None]:
# Code Block 1: Imports and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
sns.set_style("whitegrid")

print("Libraries imported and plot style set.")

Libraries imported and plot style set.


In [None]:
# Code Block 2: Data Loading (Corrected - MODIFY FILE PATH)
import pandas as pd # Re-import pandas just in case the session was interrupted

# ====> MODIFY THIS LINE to the correct path to your CSV FILE <====

file_path = '/content/drive/MyDrive/Colab Notebooks/processed_data/ML-EdgeIIoT-dataset.csv' # Example if in that Drive folder

try:
    df = pd.read_csv(file_path)

    # Display the first 5 rows
    print("First 5 rows of the dataset:")
    print(df.head())

    # Display the shape of the dataset
    print(f"\nDataset dimensions (rows, columns): {df.shape}")

except FileNotFoundError:
    print(f"Error: The file {file_path} was not found.")
    print("Please ensure the path points directly to the CSV file and it exists.")
    raise

except Exception as e:
    print(f"An error occurred during file loading: {e}")
    raise

Error: The file /content/drive/MyDrive/Colab Notebooks/processed_data/ML-EdgeIIoT-dataset.csv was not found.
Please ensure the path points directly to the CSV file and it exists.


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/processed_data/ML-EdgeIIoT-dataset.csv'

In [None]:
# Load the dataset
# Updated file_path based on your output:
file_path = '/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv'

try:
    df = pd.read_csv(file_path)
    # Display the first 5 rows
    print("First 5 rows of the dataset:")
    print(df.head())

    # Display the shape of the dataset
    print(f"\nDataset dimensions (rows, columns): {df.shape}")

except FileNotFoundError:
    print(f"Error: The file {file_path} was not found. Please check the file path.")
    # Exit or handle the error appropriately
    # exit() # You might want to comment this out in a notebook
except Exception as e:
    print(f"An error occurred: {e}")

  df = pd.read_csv(file_path)


First 5 rows of the dataset:
  frame.time    ip.src_host ip.dst_host arp.dst.proto_ipv4  arp.opcode  \
0        6.0  192.168.0.152         0.0                0.0         0.0   
1        6.0  192.168.0.101         0.0                0.0         0.0   
2        6.0  192.168.0.152         0.0                0.0         0.0   
3        6.0  192.168.0.101         0.0                0.0         0.0   
4        6.0  192.168.0.152         0.0                0.0         0.0   

   arp.hw.size arp.src.proto_ipv4  icmp.checksum  icmp.seq_le  \
0          0.0                0.0            0.0          0.0   
1          0.0                0.0            0.0          0.0   
2          0.0                0.0            0.0          0.0   
3          0.0                0.0            0.0          0.0   
4          0.0                0.0            0.0          0.0   

   icmp.transmit_timestamp  ...  mqtt.proto_len mqtt.protoname  mqtt.topic  \
0                      0.0  ...             0.0          