<a href="https://colab.research.google.com/github/LavanyaShivamurthy/myCoLabLearing/blob/main/StructuredCode_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# network_preprocessor.py
import pandas as pd
import numpy as np
from collections import defaultdict
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from IPython import get_ipython
from IPython.display import display
# %%

warnings.filterwarnings('ignore')

class NetworkDataPreprocessor:
    def __init__(self, filepath, low_memory=False):
        self.filepath = filepath
        self.low_memory = low_memory
        self.df = None
        self.selected_features = []
        self.flow_metrics = {}

    def load_data(self):
        print("[INFO] Loading dataset...")
        self.df = pd.read_csv(
            self.filepath,
            low_memory=self.low_memory,
            dtype={
                'frame.time_delta': 'float64',
                'frame.len': 'float64',
                26: 'object', 28: 'object', 35: 'object'
            }
        )
        print(f"[INFO] Dataset shape: {self.df.shape}")


    def handle_missing_values(self):
        # get columns with missig vaulues
        missing_cols = self.df.columns[self.df.isnull().any()]
        missing_stats = {'columns': {}}# # Initialize missing_stats dictionary
        print(f"Total missing values: {self.df.isnull().sum().sum()}")
        print(f"Columns with missing values: {len(missing_cols)} out of {len(self.df.columns)}")
        self.df.drop_duplicates(inplace=True)
        # Detailed report for columns with missing values
        print("\nMissing value analysis by column:")
        for col in missing_cols:
          missing_count = self.df[col].isnull().sum()
          missing_percent = (missing_count / len(self.df)) * 100
          missing_stats['columns'][col] = {
            'count': int(missing_count),
            'percent': float(missing_percent)
        }


        # Row-level missing value analysis
        missing_counts_per_row =self.df.isnull().sum(axis=1)
        rows_with_missing = (missing_counts_per_row > 0).sum()
        missing_stats['rows'] = {
        'total_rows_with_missing': int(rows_with_missing),
        'percent_rows_with_missing': float((rows_with_missing / len(self.df)) * 100),
        'distribution': {}
        }

        # Distribution of missing values per row
        value_counts = missing_counts_per_row.value_counts().sort_index()
        print("\nMissing value analysis by row:")
        print(f"Rows with at least one missing value: {rows_with_missing} out of {len(self.df)} ({(rows_with_missing/len(self.df))*100:.2f}%)")
        print("\nDistribution of missing values per row:")
        missing_stats['rows']['distribution'] = {}
        percent=0.0
        for count, frequency in value_counts.items():
         if count > 0:  # Only show rows that have missing values
            percent = (frequency / len(self.df)) * 100
            missing_stats['rows']['distribution'][int(count)] = {
                'frequency': int(frequency),
                'percent': float(percent)
            }
         print(f"  - {count} missing values: {frequency} rows ({percent:.2f}%)")
         # Print column names
         print("Column Names:")
         print(self.df.columns)

         #2. Duplicate and Null  removal
         print("Number of duplciate rows:",self.df.duplicated().sum())
         #print("Null values in each column:")
         #print(df.isnull().sum())
         print("Data frame before the removal")
         print(self.df)
         if self.df.duplicated().sum()>0:
            #df.drop_duplicates().dropna()# Null valeus  have meaning in this data set, so connot remove null
            # for example tcp.connection.fin = 0 means that the FIN (Finish) flag in the TCP header is not set,
            self.df.drop_duplicates()


         print("Data frame after the removal")
         print(self.df)
         # Print column names
         print("Column Names:")
         print(self.df.columns)

         # Replace invalid frame time deltas
         # Ensure non-zero time deltas to prevent division by zero
         epsilon = 1e-6
         self.df['frame.time_delta'] = self.df['frame.time_delta'].apply(lambda x: epsilon if x <= 0 else x)
         #  Ensure non-zero time deltas to prevent division by zero
         print(f"Updated dataset shape: {self.df.shape}")

    def calculate_bandwidth(self):
       print("[INFO] Calculating bandwidth...")
       # Create flow groups

       flow_groups = self.df.groupby(['ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport'])
       # Calculate bandwidth
       bandwidth_data = []
       for name, group in flow_groups:
          if group['frame.time_delta'].sum() > 0:
              bw = (group['frame.len'].sum() * 8) / group['frame.time_delta'].sum()
          else:
              bw = 0
          # Create a record with the group keys and bandwidth value
          src_ip, dst_ip, src_port, dst_port = name
          bandwidth_data.append({
          'ip.src': src_ip,
          'ip.dst': dst_ip,
          'tcp.srcport': src_port,
          'tcp.dstport': dst_port,
          'bandwidth_bps': bw
          })

       # Create a fresh DataFrame with the bandwidth data
       flow_bandwidth = pd.DataFrame(bandwidth_data)

       # Create a temporary key for merging to avoid duplicate column issues
       self.df['temp_key'] = self.df['ip.src'].astype(str) + "_" + \
                      self.df['ip.dst'].astype(str) + "_" + \
                      self.df['tcp.srcport'].astype(str) + "_" + \
                      self.df['tcp.dstport'].astype(str)

       flow_bandwidth['temp_key'] = flow_bandwidth['ip.src'].astype(str) + "_" + \
                            flow_bandwidth['ip.dst'].astype(str) + "_" + \
                            flow_bandwidth['tcp.srcport'].astype(str) + "_" + \
                            flow_bandwidth['tcp.dstport'].astype(str)

       # Merge on the temporary key
       self.df = self.df.merge(
        flow_bandwidth[['temp_key', 'bandwidth_bps']],
        on='temp_key',
        how='left'
       )

       # Clean up by removing the temporary key
       self.df.drop('temp_key', axis=1, inplace=True)
       return flow_bandwidth


    def calculate_flow_metrics(self):
        print("[INFO] Calculating flow metrics...")
        # Use pd.notna instead of plt.plot_date.notna
        self.df['flow_key_forward'] = self.df.apply(
            lambda x: f"{x['ip.src']}:{x['tcp.srcport']}-{x['ip.dst']}:{x['tcp.dstport']}" if pd.notna(x['tcp.srcport']) else "", axis=1
        )
        # Use pd.notna instead of self.pd.notna (which is incorrect)
        self.df['flow_key_backward'] = self.df.apply(
            lambda x: f"{x['ip.dst']}:{x['tcp.dstport']}-{x['ip.src']}:{x['tcp.srcport']}" if pd.notna(x['tcp.dstport']) else "", axis=1
        )

        flow_stats = defaultdict(lambda: {'packet_count': 0, 'byte_count': 0, 'time_deltas': [], 'packet_sizes': []})

        for _, row in self.df.iterrows():
            key = row['flow_key_forward'] if row['flow_key_forward'] else row['flow_key_backward']
            if key:
                flow_stats[key]['packet_count'] += 1
                flow_stats[key]['byte_count'] += row['frame.len']
                flow_stats[key]['time_deltas'].append(row['frame.time_delta'])
                flow_stats[key]['packet_sizes'].append(row['frame.len'])

        self.flow_metrics = {k: {
            'packet_count': v['packet_count'],
            'byte_count': v['byte_count'],
            'avg_packet_size': np.mean(v['packet_sizes']),
            'std_packet_size': np.std(v['packet_sizes']),
            'avg_time_delta': np.mean(v['time_deltas']),
            'jitter': np.std(v['time_deltas']),
            'flow_duration': sum(v['time_deltas']),
            'avg_bandwidth': v['byte_count'] / sum(v['time_deltas']) if sum(v['time_deltas']) else 0
        } for k, v in flow_stats.items() if v['packet_count'] >= 3}
        print("[INFO] Flow metrics calculated.")
        # Add flow metrics to dataframe
        df['flow_id'] = df.apply(
            lambda x: x['flow_key_forward'] if x['flow_key_forward'] in flow_metrics
                     else (x['flow_key_backward'] if x['flow_key_backward'] in flow_metrics else ""),
            axis=1
        )

    def feature_engineering(self):
        print("[INFO] Starting feature engineering...")
        #self.calculate_bandwidth()
        # First calculate bandwidth
        bandwidth_df = self.calculate_bandwidth()
        outlier_df = self.detect_bandwidth_outliers(bandwidth_df, method='iqr', threshold=1.5)
        print(outlier_df)
        #Then detect outliers
        outlier_df = self.detect_bandwidth_outliers(bandwidth_df, method='iqr', threshold=1.5)
        # Handle outliers based on your needs
        cleaned_df = self.handle_bandwidth_outliers(outlier_df, method='cap')
        self.detect_mqtt()
        self.calculate_flow_metrics()
        self.assess_emergency_importance()
        self.assign_eisenhower_category()
        self.select_features()

    def detect_mqtt(self):
        print("[INFO] Detecting MQTT traffic...")
        self.df['is_mqtt'] = 0
        if 'mqtt.topic' in self.df.columns:
            self.df['mqtt.topic'] = self.df['mqtt.topic'].astype(str)
            self.df['is_mqtt'] = self.df['mqtt.topic'].apply(lambda x: 0 if x in ["0", "nan", None] else 1)

    def assess_importance(self, df, flow_metrics=None):
      """
      Assess importance level of network traffic based on multiple factors:
      - Protocol ports (critical databases, business apps)
      - IP address ranges (internal networks)
      - Application protocols (MQTT, business apps)
      - Data transfer sizes (large transfers)
      """
      print("[INFO] Assessing Importance...")

      # Initialize importance level column
      df['importance_level'] = 0  # 0: not important, 1: somewhat important, 2: highly important

      # Define port columns to check
      port_cols = ['tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport']
      port_cols = [col for col in port_cols if col in df.columns]

      # Debug: Initial state
      print("Before importance assignment:")
      print(df['importance_level'].value_counts())

      # 1. Protocol-based importance assessment
      importance_protocols = {
          # Business critical applications
          'critical_ports': [1433, 1521, 3306, 5432, 6379, 27017, 7000, 7001, 9042],  # Databases, key infrastructure
          'important_ports': [22, 23, 25, 110, 143, 465, 587, 993, 995, 389, 636],  # SSH, Email, LDAP
          'business_web_ports': [8080, 8443, 9000, 9090, 8008, 8888]  # Business web apps
      }

      if port_cols:
          # Create tiered importance based on protocol groups
          high_importance_ports = importance_protocols['critical_ports']
          medium_importance_ports = importance_protocols['important_ports'] + importance_protocols['business_web_ports']

          # Apply importance levels based on ports
          for col in port_cols:
              if col in df.columns:
                  # Mark high importance protocols (level 2)
                  high_imp_mask = df[col].isin(high_importance_ports)
                  df.loc[high_imp_mask & (df['importance_level'] < 2), 'importance_level'] = 2

                  # Mark medium importance protocols (level 1)
                  med_imp_mask = df[col].isin(medium_importance_ports)
                  df.loc[med_imp_mask & (df['importance_level'] < 1), 'importance_level'] = 1

      # Debug: After port-based assessment
      print("\nAfter port-based importance:")
      if any(df['importance_level'] > 0):
          for col in port_cols:
              if col in df.columns:
                  port_counts = df[df['importance_level'] > 0][col].value_counts()
                  if not port_counts.empty:
                      print(f"{col}: {port_counts.head()}")
      """
      # 2. IP-based importance assessment
      def is_important_ip(ip):
          #Check if IP belongs to important internal networks
          if pd.isna(ip) or not isinstance(ip, str):
              return False
          # Common private IP ranges
          return (ip.startswith("10.") or
                  ip.startswith("192.168.") or
                  ip.startswith("172.16.") or
                  ip.startswith("172.17.") or
                  ip.startswith("172.18.") or
                  ip.startswith("172.19.") or
                  ip.startswith("172.20.") or
                  ip.startswith("172.21.") or
                  ip.startswith("172.22.") or
                  ip.startswith("172.23.") or
                  ip.startswith("172.24.") or
                  ip.startswith("172.25.") or
                  ip.startswith("172.26.") or
                  ip.startswith("172.27.") or
                  ip.startswith("172.28.") or
                  ip.startswith("172.29.") or
                  ip.startswith("172.30.") or
                  ip.startswith("172.31."))
      """
      def is_important_ip(ip):
        if isinstance(ip, str):
          return ip.startswith("10.") or ip.startswith("192.168.") or ip.startswith("172.16.")
          return False
        df['is_important_ip'] = df['ip.src'].apply(is_important_ip) | df['ip.dst'].apply(is_important_ip)
        df.loc[df['is_important_ip'], 'importance_level'] = 1

      # Apply IP-based importance if columns exist
      if 'ip.src' in df.columns and 'ip.dst' in df.columns:
          df['is_important_ip'] = df['ip.src'].apply(is_important_ip) | df['ip.dst'].apply(is_important_ip)
          df.loc[df['is_important_ip'] & (df['importance_level'] < 1), 'importance_level'] = 1

          # Debug: After IP-based assessment
          print("\nAfter IP-based importance:")
          if 'is_important_ip' in df.columns and df['is_important_ip'].any():
              ip_counts = df[df['is_important_ip']]['ip.src'].value_counts()
              if not ip_counts.empty:
                  print(f"Important source IPs: {ip_counts.head()}")

      # 3. MQTT topic-based importance
      if 'mqtt.topic' in df.columns:
          # Ensure mqtt.topic is string and handle NaN values
          df['mqtt.topic'] = df['mqtt.topic'].fillna('').astype(str).str.lower()

          important_topics = ['temperature', 'humidity', 'solar', 'rad', 'motion', 'door', 'window',
                            'alarm', 'security', 'sensor', 'control', 'critical']

          # Create boolean mask for important MQTT topics
          mqtt_mask = df['mqtt.topic'].apply(
              lambda x: any(topic in x for topic in important_topics) if x else False
          )
          df.loc[mqtt_mask & (df['importance_level'] < 1), 'importance_level'] = 1

          # Debug: After MQTT assessment
          print("\nAfter MQTT-based importance:")
          if mqtt_mask.any():
              topic_counts = df[mqtt_mask]['mqtt.topic'].value_counts()
              if not topic_counts.empty:
                  print(f"Important MQTT topics: {topic_counts.head()}")

      # 4. Application-level importance (protocol detection)
      critical_protocols = ['ldap', 'kerberos', 'mssql', 'mysql', 'oracle', 'postgresql', 'mongodb']
      important_protocols = ['ssh', 'telnet', 'smtp', 'imap', 'pop', 'dns', 'ntp']

      # Check for critical protocols
      for protocol in critical_protocols:
          protocol_cols = [col for col in df.columns if protocol in col.lower()]
          if protocol_cols:
              # Mark as critical if protocol data is present
              for col in protocol_cols:
                  protocol_mask = df[col].notna() & (df[col] != 0) & (df[col] != '')
                  df.loc[protocol_mask & (df['importance_level'] < 2), 'importance_level'] = 2

      # Check for important protocols
      for protocol in important_protocols:
          protocol_cols = [col for col in df.columns if protocol in col.lower()]
          if protocol_cols:
              # Mark as important if protocol data is present
              for col in protocol_cols:
                  protocol_mask = df[col].notna() & (df[col] != 0) & (df[col] != '')
                  df.loc[protocol_mask & (df['importance_level'] < 1), 'importance_level'] = 1

      # 5. Data size-based importance (large data transfers)
      if 'flow_id' in df.columns and flow_metrics is not None:
          # Add flow byte count information
          if isinstance(flow_metrics, dict):
              df['flow_byte_count'] = df['flow_id'].map(
                  {k: v.get('byte_count', 0) if isinstance(v, dict) else 0
                  for k, v in flow_metrics.items()}
              ).fillna(0)
          else:
              # If flow_metrics is already a DataFrame
              try:
                  flow_df = pd.DataFrame(flow_metrics).T
                  if 'byte_count' in flow_df.columns:
                      df = df.merge(flow_df[['byte_count']].rename(columns={'byte_count': 'flow_byte_count'}),
                                  left_on='flow_id', right_index=True, how='left')
                      df['flow_byte_count'] = df['flow_byte_count'].fillna(0)
              except Exception as e:
                  print(f"[WARNING] Could not process flow_metrics: {e}")
                  df['flow_byte_count'] = 0

          # Mark large transfers as important (reduced threshold to 500KB)
          if 'flow_byte_count' in df.columns:
              large_transfer_mask = df['flow_byte_count'] > 500000
              df.loc[large_transfer_mask & (df['importance_level'] < 1), 'importance_level'] = 1

      # Final debug output
      print("\nFinal importance distribution:")
      importance_counts = df['importance_level'].value_counts().sort_index()
      print(f"  - Not important (level 0): {importance_counts.get(0, 0)} packets")
      print(f"  - Somewhat important (level 1): {importance_counts.get(1, 0)} packets")
      print(f"  - Highly important (level 2): {importance_counts.get(2, 0)} packets")

      # Clean up temporary columns
      temp_cols = ['is_important_ip']
      for col in temp_cols:
          if col in df.columns:
              df.drop(col, axis=1, inplace=True)

      print("[INFO] Importance assessment complete.")
      return df


    def assess_emergency(self, df, flow_metrics=None):

        """
        Assess emergency level of network traffic based on multiple factors:
        - Protocol ports (critical, urgent, standard)
        - TCP flags (URG flag)
        - QoS markings (DSCP values)
        - Flow characteristics (regularity, jitter)
        - Real-time traffic patterns
        """
        print("[INFO] Assessing Emergency...")
        df['emergency_level'] = 0  # 0: not emergency, 1: somewhat emergency, 2: high emergency

        # Protocol-based detection for emergency assessment
        emergency_sensitive_protocols = {
            # Real-time communications (VoIP, video conferencing)
            'critical_ports': [5060, 5061, 16384, 16394, 10000, 10001, 3478, 3479, 5004, 5005],  # SIP, RTP, STUN
            'urgent_ports': [1935, 8554, 554, 8000, 8080, 8443, 3074, 3075, 3076, 27015],  # Streaming, Gaming
            'standard_ports': [3389, 5900, 5800, 4172, 80, 443, 8443]  # RDP, VNC, HTTP(S)
        }

        # Check which port columns exist in the dataframe
        port_cols = ['tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport']
        port_cols = [col for col in port_cols if col in df.columns]

        if port_cols:
            # Create tiered emergency based on protocol groups
            high_emergency_ports = emergency_sensitive_protocols['critical_ports']
            # Fixed: Remove quotes around 'standard_ports' and combine lists properly
            medium_emergency_ports = emergency_sensitive_protocols['urgent_ports'] + emergency_sensitive_protocols['standard_ports']

            # Apply emergency levels based on ports
            for col in port_cols:
                if col in df.columns:
                    # Mark high emergency protocols (level 2)
                    high_emerg_mask = df[col].isin(high_emergency_ports)
                    df.loc[high_emerg_mask & (df['emergency_level'] < 2), 'emergency_level'] = 2

                    # Mark medium emergency protocols (level 1)
                    med_emerg_mask = df[col].isin(medium_emergency_ports)
                    df.loc[med_emerg_mask & (df['emergency_level'] < 1), 'emergency_level'] = 1

        # TCP flags for potential emergency
        if 'tcp.flags.urg' in df.columns:
            # URG flag often indicates emergency traffic
            df.loc[df['tcp.flags.urg'] == 1, 'emergency_level'] = 2

        # Check for QoS markings for emergency assessment
        if 'ip.dsfield' in df.columns:
            # Check for expedited forwarding (EF) or voice admit DSCP values
            ef_dscp_values = [46, 44, 45]  # EF (46), Voice-Admit (44,45)
            af_dscp_values = [26, 28, 30, 32, 34, 36, 38]  # Assured Forwarding values

            # High priority QoS markings indicate emergency
            df.loc[df['ip.dsfield'].isin(ef_dscp_values), 'emergency_level'] = 2

            # Medium priority QoS markings
            medium_qos_mask = df['ip.dsfield'].isin(af_dscp_values)
            df.loc[medium_qos_mask & (df['emergency_level'] < 1), 'emergency_level'] = 1

        # Flow-level analysis for emergency assessment
        if 'flow_regularity' in df.columns and 'flow_jitter' in df.columns:
            # Flows with high regularity and low jitter may indicate emergency traffic
            if 'flow_packet_count' in df.columns:
                high_regularity_flows = (
                    (df['flow_regularity'] > 0.8) &
                    (df['flow_jitter'] < 0.01) &
                    (df['flow_packet_count'] >= 10)
                )
                df.loc[high_regularity_flows & (df['emergency_level'] < 2), 'emergency_level'] = 2

                # Moderately regular flows
                med_regularity_flows = (
                    (df['flow_regularity'] > 0.6) &
                    (df['flow_jitter'] < 0.05) &
                    (df['flow_packet_count'] >= 5)
                )
                df.loc[med_regularity_flows & (df['emergency_level'] < 1), 'emergency_level'] = 1

        # Real-time traffic pattern detection
        if all(col in df.columns for col in ['frame.len', 'frame.time_delta', 'flow_id']):
            # Check if flow_metrics is provided and merge if needed
            if flow_metrics is not None and 'flow_packet_count' not in df.columns:
                # Convert flow_metrics to DataFrame if it's a dictionary
                if isinstance(flow_metrics, dict):
                    flow_df = pd.DataFrame(flow_metrics).T
                    flow_df = flow_df.rename(columns={'packet_count': 'flow_packet_count'})
                    df = df.merge(flow_df[['flow_packet_count']],
                                left_on='flow_id', right_index=True, how='left')

            # Check if we now have flow_packet_count column
            if 'flow_packet_count' in df.columns:
                # Very stringent pattern for small, regular, bidirectional packets (real-time traffic)
                realtime_traffic = (
                    (df['frame.len'] < 150) &        # Smaller packets
                    (df['frame.time_delta'] > 0.005) &  # Not too fast
                    (df['frame.time_delta'] < 0.04) &   # Not too slow (25+ packets per second)
                    (df['flow_packet_count'] > 10)      # Part of an established flow
                )
                df.loc[realtime_traffic & (df['emergency_level'] < 2), 'emergency_level'] = 2

        # Print summary statistics
        emergency_counts = df['emergency_level'].value_counts().sort_index()
        print(f"[INFO] Emergency Assessment Complete:")
        print(f"  - Normal traffic (level 0): {emergency_counts.get(0, 0)} packets")
        print(f"  - Medium emergency (level 1): {emergency_counts.get(1, 0)} packets")
        print(f"  - High emergency (level 2): {emergency_counts.get(2, 0)} packets")

        return df

    def assign_eisenhower_category(self, df=None):
      """
      Assign Eisenhower Matrix categories based on emergency and importance levels.
      Uses the results from assess_emergency() and assess_importance() functions.

      Eisenhower Matrix Categories:
      - Emergency and Important (Quadrant 1): Do First
      - Not Emergency but Important (Quadrant 2): Schedule
      - Emergency but Not Important (Quadrant 3): Delegate
      - Not Emergency and Not Important (Quadrant 4): Eliminate

      Args:
          df: DataFrame with emergency_level and importance_level columns

      Returns:
          DataFrame with eisenhower_category column added
      """
      print("[INFO] Assigning Eisenhower categories...")

      # Use provided df or self.df
      if df is None:
          if not hasattr(self, 'df'):
              raise ValueError("No DataFrame provided and self.df not found")
          df = self.df
      else:
          self.df = df

      # Verify required columns exist
      required_cols = ['emergency_level', 'importance_level']
      missing_cols = [col for col in required_cols if col not in df.columns]
      if missing_cols:
          raise ValueError(f"Missing required columns: {missing_cols}. "
                          "Please run assess_emergency() and assess_importance() first.")

      # Initialize with default category (Quadrant 4)
      df['eisenhower_category'] = 'Not Emergency and Not Important'

      # Quadrant 1: Emergency and Important (Do First)
      # High priority: immediate action required
      df.loc[
          (df['emergency_level'] >= 1) & (df['importance_level'] >= 1),
          'eisenhower_category'
      ] = 'Emergency and Important'

      # Quadrant 2: Not Emergency but Important (Schedule)
      # Important but not urgent: plan and schedule
      df.loc[
          (df['emergency_level'] == 0) & (df['importance_level'] >= 1),
          'eisenhower_category'
      ] = 'Not Emergency but Important'

      # Quadrant 3: Emergency but Not Important (Delegate)
      # Urgent but not important: can be delegated
      df.loc[
          (df['emergency_level'] >= 1) & (df['importance_level'] == 0),
          'eisenhower_category'
      ] = 'Emergency but Not Important'

      # Quadrant 4: Not Emergency and Not Important (Eliminate)
      # Neither urgent nor important: eliminate or minimize
      # Already set as default, no additional assignment needed

      # Generate summary statistics
      category_counts = df['eisenhower_category'].value_counts()
      total_packets = len(df)

      print("[INFO] Eisenhower Matrix Distribution:")
      print(f"  Quadrant 1 - Emergency and Important: {category_counts.get('Emergency and Important', 0)} packets ({category_counts.get('Emergency and Important', 0)/total_packets*100:.1f}%)")
      print(f"  Quadrant 2 - Not Emergency but Important: {category_counts.get('Not Emergency but Important', 0)} packets ({category_counts.get('Not Emergency but Important', 0)/total_packets*100:.1f}%)")
      print(f"   Quadrant 3 - Emergency but Not Important: {category_counts.get('Emergency but Not Important', 0)} packets ({category_counts.get('Emergency but Not Important', 0)/total_packets*100:.1f}%)")
      print(f"  Quadrant 4 - Not Emergency and Not Important: {category_counts.get('Not Emergency and Not Important', 0)} packets ({category_counts.get('Not Emergency and Not Important', 0)/total_packets*100:.1f}%)")

      # Additional analysis: Show breakdown by emergency and importance levels
      print("\n[INFO] Detailed Level Breakdown:")
      level_breakdown = df.groupby(['emergency_level', 'importance_level']).size().reset_index(name='count')
      for _, row in level_breakdown.iterrows():
          emergency = row['emergency_level']
          importance = row['importance_level']
          count = row['count']
          percentage = (count / total_packets) * 100
          print(f"  Emergency Level {emergency}, Importance Level {importance}: {count} packets ({percentage:.1f}%)")

      print("[INFO] Eisenhower categorization complete.")
      return df


    # Helper function to run complete analysis pipeline
    def run_complete_analysis(self, df, flow_metrics=None):
        """
        Run the complete analysis pipeline: emergency assessment, importance assessment,
        and Eisenhower categorization.

        Args:
            df: Input DataFrame with network traffic data
            flow_metrics: Flow metrics dictionary (optional)

        Returns:
            DataFrame with all analysis columns added
        """
        print("[INFO] Starting complete network traffic analysis...")

        # Step 1: Assess emergency level
        df = self.assess_emergency(df, flow_metrics)

        # Step 2: Assess importance level
        df = self.assess_importance(df, flow_metrics)

        # Step 3: Assign Eisenhower categories
        df = self.assign_eisenhower_category(df)

        print("[INFO] Complete analysis finished.")
        return df


    def select_features(self):
        print("[INFO] Selecting final features...")
        # Example feature selection
        print("[INFO] Available features:", self.df.columns)
        possible_features = ['frame.time_delta', 'frame.len', 'bandwidth_bps', 'is_mqtt']
        self.selected_features = [feat for feat in possible_features if feat in self.df.columns]
        print("[INFO] Selected features:", self.selected_features)

    def get_preprocessed_data(self):
        return self.df, self.selected_features

    def detect_bandwidth_outliers(self, bandwidth_df, method='iqr', threshold=1.5):
      """
      Detect outliers in bandwidth data.

      Parameters:
      -----------
      bandwidth_df : DataFrame
      DataFrame containing bandwidth values
      method : str, default='iqr'
      Method for outlier detection. Options: 'iqr', 'zscore', 'percentile'
      threshold : float, default=1.5
      Threshold for outlier detection (IQR multiplier or z-score)
      Returns:
      --------
      DataFrame with outlier flag
      """
      print("[INFO] Detecting outliers in bandwidth data...")
      print(f"Bandwidth data shape: {bandwidth_df.shape}")
      result = bandwidth_df.copy()

      if method == 'iqr':
        # IQR method
        Q1 = result['bandwidth_bps'].quantile(0.25)
        Q3 = result['bandwidth_bps'].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        # Flag outliers
        result['is_outlier'] = (result['bandwidth_bps'] < lower_bound) | (result['bandwidth_bps'] > upper_bound)

        # Optionally add bounds for reference
        result['lower_bound'] = lower_bound
        result['upper_bound'] = upper_bound

      elif method == 'zscore':
        # Z-score method
        from scipy import stats
        z_scores = stats.zscore(result['bandwidth_bps'], nan_policy='omit')
        result['is_outlier'] = abs(z_scores) > threshold

      elif method == 'percentile':
        # Percentile method
        lower_bound = result['bandwidth_bps'].quantile(0.01)  # Bottom 1%
        upper_bound = result['bandwidth_bps'].quantile(0.99)  # Top 1%

        result['is_outlier'] = (result['bandwidth_bps'] < lower_bound) | (result['bandwidth_bps'] > upper_bound)
        result['lower_bound'] = lower_bound
        result['upper_bound'] = upper_bound

       # Count outliers
      num_outliers = result['is_outlier'].sum()
      print(f"Detected {num_outliers} outliers out of {len(result)} flows ({num_outliers/len(result):.2%})")

      return result

    def handle_bandwidth_outliers(self, bandwidth_df, method='cap'):
      """
      Handle outliers in bandwidth data.

      Parameters:
      -----------
      bandwidth_df : DataFrame
      DataFrame containing bandwidth values and outlier flags
      method : str, default='cap'
      Method for handling outliers. Options: 'cap', 'remove', 'log'

      Returns:
      --------
      DataFrame with handled outliers
      """
      if 'is_outlier' not in bandwidth_df.columns:
        print("No outlier flags found. Run detect_bandwidth_outliers first.")
        return bandwidth_df

      result = bandwidth_df.copy()

      if method == 'cap':
        # Cap outliers at bounds
        if 'lower_bound' in result.columns and 'upper_bound' in result.columns:
            result.loc[result['is_outlier'], 'bandwidth_bps_handled'] = result.loc[result['is_outlier']].apply(
                lambda x: min(x['upper_bound'], max(x['lower_bound'], x['bandwidth_bps'])),
                axis=1
            )
            # Keep original values for non-outliers
            result.loc[~result['is_outlier'], 'bandwidth_bps_handled'] = result.loc[~result['is_outlier'], 'bandwidth_bps']
        else:
            print("Bounds not found. Recalculating...")
            # Recalculate bounds
            Q1 = result['bandwidth_bps'].quantile(0.25)
            Q3 = result['bandwidth_bps'].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            # Cap values
            result['bandwidth_bps_handled'] = result['bandwidth_bps'].clip(lower=lower_bound, upper=upper_bound)

      elif method == 'remove':
        # Remove outliers
        result = result[~result['is_outlier']].copy()

      elif method == 'log':
        # Log transform
        import numpy as np
        result['bandwidth_bps_handled'] = np.log1p(result['bandwidth_bps'])

      return result






In [None]:
# NEW: Importance assessment (separate from emergency)
df['importance_level'] = 0  # 0: not important, 1: somewhat important, 2: highly important  # This line creates the 'importance_level' column
#debuging
print("Before importance assignment:")
print(df['importance_level'].value_counts())
# Debugging each condition separately
print("\nPort-based importance:")
print(df[df['importance_level'] > 0]['tcp.srcport'].value_counts())

print("\nIP-based importance:")
print(df[df['importance_level'] > 0]['ip.src'].value_counts())

print("\nApplication-level importance:")
print(df[df['importance_level'] > 0]['mqtt.topic'].value_counts())

print("\nFinal importance distribution:")
print(df['importance_level'].value_counts())

# --- FIX 1: Broaden IP-based filtering ---
# Instead of strict prefixes, allow more ranges
def is_important_ip(ip):
  if isinstance(ip, str):
    return ip.startswith("10.") or ip.startswith("192.168.") or ip.startswith("172.16.")
  return False

df['is_important_ip'] = df['ip.src'].apply(is_important_ip) | df['ip.dst'].apply(is_important_ip)
df.loc[df['is_important_ip'], 'importance_level'] = 1

# --- FIX 2: Expand MQTT topic filtering ---
if 'mqtt.topic' in df.columns:
  df['mqtt.topic'] = df['mqtt.topic'].astype(str).str.lower()
  important_topics = ['temperature', 'humidity', 'solar rad', 'motion', 'door', 'window']  # Expanded topics
  df['is_important_mqtt'] = df['mqtt.topic'].apply(lambda x: any(topic in x for topic in important_topics))
  df.loc[df['is_important_mqtt'], 'importance_level'] = 1

# --- FIX 3: Adjust large data transfer detection ---
if 'flow_id' in df.columns:
  df = df.merge(pd.DataFrame(flow_metrics).T.rename(columns={'byte_count': 'flow_byte_count'}),
              left_on='flow_id', right_index=True, how='left')
  df['flow_byte_count'] = df['flow_byte_count'].fillna(0)
  df['is_large_transfer'] = df['flow_byte_count'] > 500000  # Reduced threshold from 1M to 500K bytes
  df.loc[df['is_large_transfer'], 'importance_level'] = 1

# Re-run importance distribution check
print("\nUpdated importance distribution:")
print(df['importance_level'].value_counts())
# NEW: Importance assessment (separate from emergency)
df['importance_level'] = 0  # 0: not important, 1: somewhat important, 2: highly important

# 1. Protocol-based importance assessment
importance_protocols = {
    # Business critical applications
    'critical_ports': [1433, 1521, 3306, 5432, 6379, 27017, 7000, 7001, 9042],  # Databases, key infrastructure
    'important_ports': [22, 23, 25, 110, 143, 465, 587, 993, 995, 389, 636],  # SSH, Email, LDAP
    'business_web_ports': [8080, 8443, 9000, 9090, 8008, 8888]  # Business web apps
}

if port_cols:
    # Create tiered importance based on protocol groups
    high_importance_ports = importance_protocols['critical_ports']
    medium_importance_ports = importance_protocols['important_ports'] + importance_protocols['business_web_ports']

    # Apply importance levels based on ports
    for col in port_cols:
        if col in df.columns:
            # Mark high importance protocols (level 2)
            high_imp_mask = df[col].isin(high_importance_ports)
            df.loc[high_imp_mask & (df['importance_level'] < 2), 'importance_level'] = 2

            # Mark medium importance protocols (level 1)
            med_imp_mask = df[col].isin(medium_importance_ports)
            df.loc[med_imp_mask & (df['importance_level'] < 1), 'importance_level'] = 1

Before importance assignment:
importance_level
0    108568
Name: count, dtype: int64

Port-based importance:
Series([], Name: count, dtype: int64)

IP-based importance:
Series([], Name: count, dtype: int64)

Application-level importance:
Series([], Name: count, dtype: int64)

Final importance distribution:
importance_level
0    108568
Name: count, dtype: int64

Updated importance distribution:
importance_level
1    108568
Name: count, dtype: int64


# New section

# New section

In [None]:

# network_trainer.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

class NetworkModelTrainer:
    def __init__(self, df, features):
        self.df = df
        self.features = features
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.emergency_model = None
        self.important_model = None

    def prepare_data(self):
        print("[INFO] Preparing data for modeling...")
        self.df['is_emergency'] = self.df['eisenhower_category'].apply(
            lambda x: 1 if 'Emergency' in x else 0
        )
        self.df['is_important'] = self.df['eisenhower_category'].apply(
            lambda x: 1 if 'Important' in x else 0
        )

        X = self.df[self.features]
        y = self.df[['is_emergency', 'is_important']]


        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )
         # Print class distributions
        print(f"[INFO] Original 'is_emergency' distribution: {self.df['is_emergency'].value_counts()}")
        print(f"[INFO] Original 'is_important' distribution: {self.df['is_important'].value_counts()}")
        print(f"[INFO] Training 'is_emergency' distribution: {self.y_train['is_emergency'].value_counts()}")
        print(f"[INFO] Training 'is_important' distribution: {self.y_train['is_important'].value_counts()}")
        print(f"[INFO] Testing 'is_emergency' distribution: {self.y_test['is_emergency'].value_counts()}")
        print(f"[INFO] Testing 'is_important' distribution: {self.y_test['is_important'].value_counts()}")
        print(f"[INFO] Train shape: {self.X_train.shape}")
        print(f"[INFO] Test shape: {self.X_test.shape}")
    """
    def balance_data_with_smote(self):
        print("[INFO] Applying SMOTE to balance classes...")
        smote = SMOTE(random_state=42)

        self.X_train_emergency, self.y_train_emergency = smote.fit_resample(
            self.X_train, self.y_train['is_emergency']
        )

        self.X_train_important, self.y_train_important = smote.fit_resample(
            self.X_train, self.y_train['is_important']
        )

        print(f"[INFO] Emergency class distribution after SMOTE: {np.bincount(self.y_train_emergency)}")
        print(f"[INFO] Important class distribution after SMOTE: {np.bincount(self.y_train_important)}")
    """


    def balance_data_with_smote(self):
        print("[INFO] Applying SMOTE to balance classes...")
        smote = SMOTE(random_state=42)
        # Check for emergency class
        emergency_classes = np.unique(self.y_train['is_emergency'])
        if len(emergency_classes) > 1:
            self.X_train_emergency, self.y_train_emergency = smote.fit_resample(
            self.X_train, self.y_train['is_emergency']
            )
            print(f"[INFO] After SMOTE - Emergency classes: {np.unique(self.y_train_emergency)}")
            print(f"[INFO] After SMOTE - Emergency shape: {self.X_train_emergency.shape}")
        else:
            print(f"[WARNING] Only one class found in is_emergency: {emergency_classes}")
            self.X_train_emergency, self.y_train_emergency = self.X_train, self.y_train['is_emergency']

        # Check for important class
        important_classes = np.unique(self.y_train['is_important'])
        if len(important_classes) > 1:
          self.X_train_important, self.y_train_important = smote.fit_resample(
            self.X_train, self.y_train['is_important']
          )
          print(f"[INFO] After SMOTE - Important classes: {np.unique(self.y_train_important)}")
          print(f"[INFO] After SMOTE - Important shape: {self.X_train_important.shape}")
        else:
          print(f"[WARNING] Only one class found in is_important: {important_classes}")
          self.X_train_important, self.y_train_important = self.X_train, self.y_train['is_important']
        # After SMOTE
        print(f"[INFO] After SMOTE - 'is_emergency' distribution: {np.bincount(self.y_train_emergency)}")
        print(f"[INFO] After SMOTE - 'is_important' distribution: {np.bincount(self.y_train_important)}")

    def train_emergency_model(self):
        print("[INFO] Training Emergency detection model...")
        preprocessor = ColumnTransformer([
            ('scale', StandardScaler(), self.features)
        ])

        self.emergency_model = Pipeline([
            ('preprocess', preprocessor),
            ('classifier', RandomForestClassifier(
                n_estimators=200,
                max_depth=10,
                class_weight={0:1, 1:10},
                random_state=42
            ))
        ])

        self.emergency_model.fit(self.X_train_emergency, self.y_train_emergency)
        print("[INFO] Emergency model trained successfully.")

    def train_important_model(self):
        print("[INFO] Training Importance detection model...")

        preprocessor = ColumnTransformer([
            ('scale', StandardScaler(), self.features)
        ])

        self.important_model = Pipeline([
            ('preprocess', preprocessor),
            ('classifier', XGBClassifier(
                n_estimators=200,
                learning_rate=0.1,
                max_depth=5,
                subsample=0.8,
                colsample_bytree=0.8,
                scale_pos_weight=5,
                random_state=42
            ))
        ])

        self.important_model.fit(self.X_train_important, self.y_train_important)
        print("[INFO] Important model trained successfully.")

    def evaluate_models(self):
        print("\n=== Evaluation Results ===")

        # Emergency Model Evaluation
        emergency_preds = self.emergency_model.predict(self.X_test)
        print("\n[Emergency Model Results]")
        print(classification_report(self.y_test['is_emergency'], emergency_preds))
        print("Confusion Matrix:")
        print(confusion_matrix(self.y_test['is_emergency'], emergency_preds))

        # Important Model Evaluation
        important_preds = self.important_model.predict(self.X_test)
        print("\n[Important Model Results]")
        print(classification_report(self.y_test['is_important'], important_preds))
        print("Confusion Matrix:")
        print(confusion_matrix(self.y_test['is_important'], important_preds))


In [None]:

# network_analyzer.py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

class NetworkAnalyzer:
    def __init__(self):
        pass

    def print_classification_report(self, y_true, y_pred, model_name="Model"):
        print(f"\n=== Classification Report: {model_name} ===")
        print(classification_report(y_true, y_pred))

    def plot_confusion_matrix(self, y_true, y_pred, model_name="Model"):
        print(f"[INFO] Plotting confusion matrix for {model_name}...")
        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
        plt.title(f"Confusion Matrix: {model_name}")
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

    def plot_feature_importance(self, model, feature_names, top_n=15, model_name="Model"):
        print(f"[INFO] Plotting feature importance for {model_name}...")

        if hasattr(model.named_steps['classifier'], 'feature_importances_'):
            importances = model.named_steps['classifier'].feature_importances_
        else:
            print("[WARNING] This model does not support feature_importances_. Skipping plot.")
            return

        indices = np.argsort(importances)[::-1]
        feature_names_sorted = [feature_names[i] for i in indices]

        plt.figure(figsize=(10, 6))
        sns.barplot(x=importances[indices][:top_n], y=feature_names_sorted[:top_n], palette="viridis")
        plt.title(f"Top {top_n} Important Features ({model_name})")
        plt.xlabel("Feature Importance Score")
        plt.ylabel("Feature Names")
        plt.tight_layout()
        plt.show()


In [None]:
# train_pipeline.py

#from network_preprocessor import NetworkDataPreprocessor
#%from network_trainer import NetworkModelTrainer
#%from network_analyzer import NetworkAnalyzer

def main():
    print("\n Starting Network Traffic Prioritization Pipeline...\n")
    from google.colab import drive
    drive.mount('/content/drive')
    # Step 1: Preprocess the Data
    print(" Step 1: Preprocessing Data...")
    preprocessor = NetworkDataPreprocessor(filepath="/content/drive/MyDrive/DataSets/ICUMonitoring.csv")
    preprocessor.load_data()
    preprocessor.handle_missing_values()
    preprocessor.feature_engineering()
    df, features = preprocessor.get_preprocessed_data()
    print(f"Feature names: {features}")
    df =preprocessor.run_complete_analysis(df, flow_metrics)






    # Step 2: Train Models
    print("\n🔹 Step 2: Training Models...")
    trainer = NetworkModelTrainer(df, features)
    trainer.prepare_data()
    trainer.balance_data_with_smote()
    trainer.train_emergency_model()
    trainer.train_important_model()

    # Step 3: Evaluate Models
    print("\n🔹 Step 3: Evaluating Models...")
    trainer.evaluate_models()

    # Step 4: Analyze Results
    print("\n🔹 Step 4: Detailed Analysis...")
    analyzer = NetworkAnalyzer()

    # Emergency Model Evaluation
    emergency_preds = trainer.emergency_model.predict(trainer.X_test)
    analyzer.print_classification_report(trainer.y_test['is_emergency'], emergency_preds, model_name="Emergency Model")
    analyzer.plot_confusion_matrix(trainer.y_test['is_emergency'], emergency_preds, model_name="Emergency Model")
    analyzer.plot_feature_importance(trainer.emergency_model, features, model_name="Emergency Model")

    # Important Model Evaluation
    important_preds = trainer.important_model.predict(trainer.X_test)
    analyzer.print_classification_report(trainer.y_test['is_important'], important_preds, model_name="Important Model")
    analyzer.plot_confusion_matrix(trainer.y_test['is_important'], important_preds, model_name="Important Model")
    analyzer.plot_feature_importance(trainer.important_model, features, model_name="Important Model")

    print("\n✅ Pipeline completed successfully!")


if __name__ == "__main__":
    main()



 Starting Network Traffic Prioritization Pipeline...

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Step 1: Preprocessing Data...
[INFO] Loading dataset...
[INFO] Dataset shape: (108568, 50)
Total missing values: 0
Columns with missing values: 0 out of 50

Missing value analysis by column:

Missing value analysis by row:
Rows with at least one missing value: 0 out of 108568 (0.00%)

Distribution of missing values per row:
  - 0 missing values: 108568 rows (0.00%)
Column Names:
Index(['frame.time_delta', 'frame.time_relative', 'frame.len', 'ip.src',
       'ip.dst', 'tcp.srcport', 'tcp.dstport', 'tcp.flags', 'tcp.time_delta',
       'tcp.len', 'tcp.ack', 'tcp.connection.fin', 'tcp.connection.rst',
       'tcp.connection.sack', 'tcp.connection.syn', 'tcp.flags.ack',
       'tcp.flags.fin', 'tcp.flags.push', 'tcp.flags.reset', 'tcp.flags.syn',
       'tcp.flags.urg', 'tcp.hdr_len', 'tcp.payload', 'tcp.pdu

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]