<a href="https://colab.research.google.com/github/Hmdo0711/IIoT-Implementations/blob/main/IIoT_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Clone and explore the Azure IoT Telemetry Simulator
!rm -rf iot-telemetry-simulator
!git clone -q https://github.com/azure-samples/iot-telemetry-simulator.git
!cd iot-telemetry-simulator && ls -la

# Step 2: Install required packages
!pip install -q pandas numpy scikit-learn matplotlib seaborn azure-iot-device

# Step 3: Import libraries
import pandas as pd
import numpy as np
import json
import random
from datetime import datetime, timedelta
import time
import asyncio
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import inspect

# Add the simulator to path
sys.path.append('/content/iot-telemetry-simulator')

# Let's check what's actually in the repository
print("Repository structure:")
!find /content/iot-telemetry-simulator -name "*.py" -type f | head -20

# Step 4: Create a compatible IIoT simulator based on Azure patterns
class IIoTSimulator:
    def __init__(self):
        self.output_dir = '/content/iiot_dataset'
        os.makedirs(self.output_dir, exist_ok=True)

        # Device configurations based on Azure simulator patterns
        self.devices = {
            'temp_sensor_001': {
                'type': 'temperature_sensor',
                'template': {
                    "temperature": 25.0,
                    "humidity": 45.0,
                    "pressure": 1013.25,
                    "timestamp": ""
                },
                'interval_range': (2000, 5000),  # ms
                'value_ranges': {
                    'temperature': (20.0, 80.0),
                    'humidity': (30.0, 60.0),
                    'pressure': (1000.0, 1020.0)
                }
            },
            'vibration_sensor_002': {
                'type': 'vibration_sensor',
                'template': {
                    "vibration_x": 0.1,
                    "vibration_y": 0.1,
                    "vibration_z": 0.1,
                    "rms": 0.1,
                    "frequency": 50,
                    "timestamp": ""
                },
                'interval_range': (100, 500),  # ms - more frequent
                'value_ranges': {
                    'vibration_x': (0.05, 2.0),
                    'vibration_y': (0.05, 2.0),
                    'vibration_z': (0.05, 2.0),
                    'rms': (0.1, 1.5),
                    'frequency': (45, 55)
                }
            },
            'pressure_sensor_003': {
                'type': 'pressure_sensor',
                'template': {
                    "pressure": 1.0,
                    "temperature": 25.0,
                    "flow_rate": 10.0,
                    "timestamp": ""
                },
                'interval_range': (1000, 3000),  # ms
                'value_ranges': {
                    'pressure': (0.5, 10.0),
                    'temperature': (15.0, 40.0),
                    'flow_rate': (5.0, 20.0)
                }
            },
            'hmi_panel_004': {
                'type': 'hmi_panel',
                'template': {
                    "setpoint": 50,
                    "actual_value": 50,
                    "mode": "auto",
                    "alarm": 0,
                    "operator_id": "OP001",
                    "timestamp": ""
                },
                'interval_range': (2000, 8000),  # ms - less frequent
                'value_ranges': {
                    'setpoint': (0, 100),
                    'actual_value': (0, 100)
                }
            }
        }

    def generate_device_message(self, device_id, base_timestamp):
        """Generate a realistic message for a specific device"""
        device_config = self.devices[device_id]
        message = device_config['template'].copy()

        # Add realistic variations based on device type
        if device_config['type'] == 'temperature_sensor':
            # Temperature has slow drift with some noise
            base_temp = random.uniform(25.0, 75.0)
            message['temperature'] = round(base_temp + random.gauss(0, 0.2), 2)
            message['humidity'] = round(random.uniform(35.0, 55.0), 2)
            message['pressure'] = round(random.uniform(1005.0, 1015.0), 2)

        elif device_config['type'] == 'vibration_sensor':
            # Vibration can have occasional spikes
            base_vib = random.uniform(0.1, 1.0)
            if random.random() < 0.02:  # 2% chance of spike
                base_vib *= random.uniform(2.0, 5.0)

            message['vibration_x'] = round(max(0.05, base_vib + random.gauss(0, 0.05)), 3)
            message['vibration_y'] = round(max(0.05, base_vib * 0.8 + random.gauss(0, 0.05)), 3)
            message['vibration_z'] = round(max(0.05, base_vib * 0.6 + random.gauss(0, 0.05)), 3)
            message['rms'] = round(np.sqrt(message['vibration_x']**2 + message['vibration_y']**2 + message['vibration_z']**2), 3)
            message['frequency'] = random.randint(48, 52)

        elif device_config['type'] == 'pressure_sensor':
            # Pressure changes slowly with correlation to temperature
            base_pressure = random.uniform(1.0, 8.0)
            message['pressure'] = round(max(0.5, base_pressure + random.gauss(0, 0.1)), 2)
            message['temperature'] = round(20.0 + (message['pressure'] / 10.0) + random.gauss(0, 0.5), 2)
            message['flow_rate'] = round(message['pressure'] * 2.0 + random.uniform(-1.0, 1.0), 2)

        else:  # HMI panel
            message['setpoint'] = random.randint(40, 60)
            # Actual value follows setpoint with some lag/variation
            message['actual_value'] = message['setpoint'] + random.randint(-5, 5)
            message['mode'] = random.choice(['auto', 'manual'])
            message['alarm'] = 1 if random.random() < 0.01 else 0  # 1% alarm probability
            message['operator_id'] = f"OP{random.randint(1, 5):03d}"

        # Add timestamp
        message['timestamp'] = base_timestamp.isoformat()

        return message

    async def simulate_normal_traffic(self, duration_minutes=30):
        """Simulate normal IIoT device traffic"""
        print("Simulating normal IIoT traffic...")

        events = []
        start_time = datetime.now() - timedelta(minutes=duration_minutes)
        end_time = datetime.now()

        # Track current time for each device
        device_times = {device_id: start_time for device_id in self.devices}

        while any(device_time < end_time for device_time in device_times.values()):
            for device_id, device_config in self.devices.items():
                current_time = device_times[device_id]

                if current_time >= end_time:
                    continue

                # Generate message for this device
                message = self.generate_device_message(device_id, current_time)

                # Create event in our standard format
                event = {
                    'timestamp_epoch': int(current_time.timestamp()),
                    'timestamp_iso': current_time.isoformat(),
                    'device_id': device_id,
                    'device_type': device_config['type'],
                    'topic': f"iiot/telemetry/{device_config['type']}",
                    'payload': message,
                    'properties': {
                        'firmware': f"v2.{random.randint(1, 3)}.{random.randint(0, 5)}",
                        'location': random.choice(['line_a', 'line_b', 'quality_station', 'packaging']),
                        'status': random.choice(['healthy', 'optimal', 'normal'])
                    }
                }

                events.append(event)

                # Calculate next message time for this device
                interval_ms = random.randint(*device_config['interval_range'])
                device_times[device_id] = current_time + timedelta(milliseconds=interval_ms)

            # Small sleep to prevent overwhelming the system
            await asyncio.sleep(0.001)

        print(f"Generated {len(events)} normal events")
        return events

    def inject_advanced_attacks(self, normal_events):
        """Inject sophisticated IIoT attacks"""
        print("Injecting advanced IIoT attacks...")

        attack_events = []
        labels = []

        df_normal = pd.DataFrame(normal_events)
        df_normal['timestamp'] = pd.to_datetime(df_normal['timestamp_iso'])

        # Attack 1: DDoS/Flood Attack
        flood_start = df_normal['timestamp'].min() + (df_normal['timestamp'].max() - df_normal['timestamp'].min()) * 0.25
        flood_duration = timedelta(seconds=120)
        flood_end = flood_start + flood_duration

        print(f"🌀 DDoS/Flood Attack: {flood_start} to {flood_end}")

        # Target specific vulnerable devices
        target_devices = ['vibration_sensor_002', 'temp_sensor_001']

        current_flood_time = flood_start
        flood_messages = 0

        while current_flood_time < flood_end:
            for device_id in target_devices:
                # Create rapid burst of messages
                burst_size = random.randint(50, 100)

                for i in range(burst_size):
                    # Get template message
                    device_events = [e for e in normal_events if e['device_id'] == device_id]
                    if device_events:
                        template = random.choice(device_events)

                        flood_event = template.copy()
                        flood_event['timestamp_epoch'] = int(current_flood_time.timestamp())
                        flood_event['timestamp_iso'] = current_flood_time.isoformat()
                        flood_event['payload']['flood_attack'] = True
                        flood_event['payload']['burst_id'] = flood_messages

                        attack_events.append(flood_event)
                        flood_messages += 1

                    current_flood_time += timedelta(milliseconds=random.randint(1, 5))

            current_flood_time += timedelta(milliseconds=random.randint(10, 50))

        labels.append({
            'attack_type': 'ddos_flood',
            'start_time': flood_start.isoformat(),
            'end_time': flood_end.isoformat(),
            'target_devices': target_devices,
            'messages_injected': flood_messages,
            'severity': 'high',
            'description': 'Distributed denial of service via rapid message flooding'
        })

        # Attack 2: Data Manipulation/Replay Attack
        replay_start = df_normal['timestamp'].min() + (df_normal['timestamp'].max() - df_normal['timestamp'].min()) * 0.55
        replay_duration = timedelta(seconds=90)
        replay_end = replay_start + replay_duration

        print(f"🎭 Data Manipulation/Replay Attack: {replay_start} to {replay_end}")

        # Select historical messages to replay with manipulation
        replay_messages = []
        for device_type in df_normal['device_type'].unique():
            type_events = [e for e in normal_events if e['device_type'] == device_type]
            if type_events:
                replay_messages.extend(random.sample(type_events, min(20, len(type_events))))

        current_replay_time = replay_start
        replayed_count = 0

        while current_replay_time < replay_end:
            original_msg = random.choice(replay_messages)
            replayed_msg = original_msg.copy()

            # Manipulate the payload values
            if 'temperature' in replayed_msg['payload']:
                replayed_msg['payload']['temperature'] += random.uniform(-10, 10)
            if 'pressure' in replayed_msg['payload']:
                replayed_msg['payload']['pressure'] += random.uniform(-2, 2)
            if 'vibration_x' in replayed_msg['payload']:
                replayed_msg['payload']['vibration_x'] *= random.uniform(0.5, 2.0)

            replayed_msg['timestamp_epoch'] = int(current_replay_time.timestamp())
            replayed_msg['timestamp_iso'] = current_replay_time.isoformat()
            replayed_msg['payload']['replay_attack'] = True
            replayed_msg['payload']['original_timestamp'] = original_msg['timestamp_iso']

            attack_events.append(replayed_msg)
            replayed_count += 1

            # Variable timing to avoid pattern detection
            interval = random.choice([
                timedelta(milliseconds=100),
                timedelta(seconds=2),
                timedelta(milliseconds=500),
                timedelta(seconds=5)
            ])
            current_replay_time += interval

        labels.append({
            'attack_type': 'data_manipulation_replay',
            'start_time': replay_start.isoformat(),
            'end_time': replay_end.isoformat(),
            'messages_replayed': replayed_count,
            'severity': 'medium',
            'description': 'Replay of historical messages with manipulated data values'
        })

        # Attack 3: Command Injection (HMI-specific)
        cmd_start = df_normal['timestamp'].min() + (df_normal['timestamp'].max() - df_normal['timestamp'].min()) * 0.75
        cmd_duration = timedelta(seconds=60)
        cmd_end = cmd_start + cmd_duration

        print(f"⚡ Command Injection Attack: {cmd_start} to {cmd_end}")

        current_cmd_time = cmd_start
        cmd_count = 0

        while current_cmd_time < cmd_end:
            # Create malicious HMI commands
            cmd_event = {
                'timestamp_epoch': int(current_cmd_time.timestamp()),
                'timestamp_iso': current_cmd_time.isoformat(),
                'device_id': 'hmi_panel_004',
                'device_type': 'hmi_panel',
                'topic': 'iiot/control/commands',
                'payload': {
                    'command': random.choice(['emergency_stop', 'override_setpoint', 'change_mode']),
                    'value': random.randint(0, 100),
                    'malicious': True,
                    'injection_type': random.choice(['sql', 'buffer_overflow', 'privilege_escalation'])
                },
                'properties': {
                    'firmware': 'v1.0.0',  # Outdated firmware
                    'location': 'control_room',
                    'status': 'compromised'
                }
            }

            attack_events.append(cmd_event)
            cmd_count += 1
            current_cmd_time += timedelta(seconds=random.uniform(1, 5))

        labels.append({
            'attack_type': 'command_injection',
            'start_time': cmd_start.isoformat(),
            'end_time': cmd_end.isoformat(),
            'target_device': 'hmi_panel_004',
            'messages_injected': cmd_count,
            'severity': 'critical',
            'description': 'Malicious command injection targeting HMI control system'
        })

        print(f"Total attack events injected: {len(attack_events)}")
        return attack_events, labels

    def save_datasets(self, normal_events, attack_events, labels):
        """Save all dataset files"""
        # Save normal events
        with open(f'{self.output_dir}/events_normal.jsonl', 'w') as f:
            for event in normal_events:
                f.write(json.dumps(event) + '\n')

        # Save combined events
        combined_events = normal_events + attack_events
        with open(f'{self.output_dir}/events_full.jsonl', 'w') as f:
            for event in combined_events:
                f.write(json.dumps(event) + '\n')

        # Save labels
        with open(f'{self.output_dir}/labels.json', 'w') as f:
            json.dump(labels, f, indent=2)

        print(f"📁 Datasets saved:")
        print(f"   - events_normal.jsonl: {len(normal_events)} events")
        print(f"   - events_full.jsonl: {len(combined_events)} events")
        print(f"   - labels.json: {len(labels)} attack definitions")

        return combined_events

# Step 5: Enhanced preprocessing for IIoT security
def enhanced_iiot_preprocessing(events):
    """Advanced preprocessing for IIoT security dataset"""
    print("🔄 Preprocessing IIoT data with security features...")

    records = []
    for event in events:
        try:
            record = {
                'timestamp': pd.to_datetime(event['timestamp_iso']),
                'device_id': event['device_id'],
                'device_type': event['device_type'],
                'topic': event['topic'],
                'message_size': len(json.dumps(event)),
                'is_attack': 0  # Default to normal
            }

            # Extract payload values
            payload = event.get('payload', {})
            for key, value in payload.items():
                if isinstance(value, (int, float)):
                    record[f'payload_{key}'] = value
                elif isinstance(value, str):
                    record[f'payload_{key}'] = value
                elif isinstance(value, bool):
                    record[f'payload_{key}'] = int(value)

            # Extract properties
            properties = event.get('properties', {})
            for key, value in properties.items():
                record[f'prop_{key}'] = value

            # Attack indicators
            record['is_flood'] = 1 if payload.get('flood_attack') else 0
            record['is_replay'] = 1 if payload.get('replay_attack') else 0
            record['is_malicious_cmd'] = 1 if payload.get('malicious') else 0

            # Mark as attack if any attack indicator is present
            if record['is_flood'] or record['is_replay'] or record['is_malicious_cmd']:
                record['is_attack'] = 1

            records.append(record)

        except Exception as e:
            continue

    df = pd.DataFrame(records)
    print(f"📊 Initial dataset: {df.shape[0]} records, {df.shape[1]} features")

    # Remove any null records
    df = df.dropna()

    # Sort by timestamp
    df = df.sort_values('timestamp').reset_index(drop=True)

    # Advanced feature engineering
    print("🔧 Engineering advanced features...")

    # Time-based features
    df['hour'] = df['timestamp'].dt.hour
    df['minute'] = df['timestamp'].dt.minute
    df['second'] = df['timestamp'].dt.second
    df['day_of_week'] = df['timestamp'].dt.dayofweek

    # Device behavior analytics
    for device_id in df['device_id'].unique():
        device_mask = df['device_id'] == device_id

        # Message frequency (1-minute rolling window)
        df.loc[device_mask, 'msg_freq_1min'] = (
            df[device_mask].rolling('1min', on='timestamp')['device_id'].count().values
        )

        # Value statistics for numeric telemetry
        numeric_cols = [col for col in df.columns if col.startswith('payload_') and df[col].dtype in ['float64', 'int64']]
        for col in numeric_cols:
            # Rolling statistics
            df.loc[device_mask, f'{col}_rolling_mean'] = (
                df[device_mask][col].rolling(window=20, min_periods=1).mean()
            )
            df.loc[device_mask, f'{col}_rolling_std'] = (
                df[device_mask][col].rolling(window=20, min_periods=1).std()
            )

            # Rate of change
            df.loc[device_mask, f'{col}_rate_change'] = df[device_mask][col].diff()

    # Network behavior features
    df['total_msg_rate'] = df.groupby(pd.Grouper(key='timestamp', freq='1min'))['device_id'].transform('count')

    # Encode categorical variables
    categorical_cols = ['device_id', 'device_type', 'topic', 'prop_status', 'prop_location']
    categorical_cols = [col for col in categorical_cols if col in df.columns]

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    # Normalize numerical features
    numerical_cols = [col for col in df.columns if df[col].dtype in ['float64', 'int64']
                     and not col.startswith('is_')
                     and col not in ['timestamp', 'hour', 'minute', 'second', 'day_of_week']
                     and not col.endswith('_encoded')]

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

    for col in numerical_cols:
        if col in df.columns and df[col].nunique() > 1:
            df[f'{col}_normalized'] = scaler.fit_transform(df[[col]].fillna(0))

    print(f"✅ Final dataset: {df.shape[0]} records, {df.shape[1]} features")
    print(f"📈 Attack distribution: {df['is_attack'].sum()} attack events ({df['is_attack'].mean()*100:.1f}%)")

    return df, label_encoders

# Step 6: Main execution
async def main():
    """Execute the complete IIoT dataset generation pipeline"""
    print("🚀 Starting Enhanced IIoT Security Dataset Generation")
    print("=" * 60)

    # Initialize simulator
    simulator = IIoTSimulator()

    # Generate normal traffic
    normal_events = await simulator.simulate_normal_traffic(duration_minutes=20)

    # Inject attacks
    attack_events, labels = simulator.inject_advanced_attacks(normal_events)

    # Save datasets
    combined_events = simulator.save_datasets(normal_events, attack_events, labels)

    # Preprocess data
    df, label_encoders = enhanced_iiot_preprocessing(combined_events)

    # Save processed dataset
    output_file = f'{simulator.output_dir}/dataset_flows.csv'
    df.to_csv(output_file, index=False)
    print(f"💾 Saved processed dataset to: {output_file}")

    # Train/test split (80/20)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=False)

    # Dataset summary
    print("\n📊 DATASET SUMMARY")
    print("=" * 40)
    print(f"Total records: {len(df):,}")
    print(f"Training set: {len(train_df):,} (80%)")
    print(f"Test set: {len(test_df):,} (20%)")
    print(f"Features generated: {len(df.columns)}")
    print(f"Time range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f"Devices: {df['device_id'].nunique()} types: {df['device_type'].unique()}")
    print(f"Attack events: {df['is_attack'].sum():,} ({df['is_attack'].mean()*100:.1f}%)")

    # Attack type breakdown
    if 'is_flood' in df.columns:
        print(f"  - Flood attacks: {df['is_flood'].sum():,}")
    if 'is_replay' in df.columns:
        print(f"  - Replay attacks: {df['is_replay'].sum():,}")
    if 'is_malicious_cmd' in df.columns:
        print(f"  - Command injection: {df['is_malicious_cmd'].sum():,}")

    # Show sample
    print("\n🔍 SAMPLE DATA")
    print("=" * 40)
    sample_cols = ['timestamp', 'device_id', 'device_type', 'is_attack', 'is_flood', 'is_replay']
    sample_cols = [col for col in sample_cols if col in df.columns]
    print(df[sample_cols].head(8))

    return df, train_df, test_df

# Run the complete pipeline
print("Initializing IIoT Dataset Generation...")
df, train_df, test_df = await main()

# Final confirmation
print("\n🎉 DATASET GENERATION COMPLETE!")
print("📁 Files created in /content/iiot_dataset/:")
print("   - events_normal.jsonl (Normal traffic only)")
print("   - events_full.jsonl (Normal + Attack traffic)")
print("   - labels.json (Attack definitions and timestamps)")
print("   - dataset_flows.csv (Preprocessed ML-ready dataset)")
print("\n✅ Ready for machine learning model training!")

total 128
drwxr-xr-x 8 root root  4096 Oct 17 16:50 .
drwxr-xr-x 1 root root  4096 Oct 17 16:50 ..
-rw-r--r-- 1 root root  8607 Oct 17 16:50 AUTOMATION.md
-rw-r--r-- 1 root root   674 Oct 17 16:50 CHANGELOG.md
drwxr-xr-x 3 root root  4096 Oct 17 16:50 charts
-rw-r--r-- 1 root root  4037 Oct 17 16:50 CONTRIBUTING.md
-rw-r--r-- 1 root root   806 Oct 17 16:50 dockerbuild.cmd
-rw-r--r-- 1 root root   762 Oct 17 16:50 dockerbuild.sh
-rw-r--r-- 1 root root   348 Oct 17 16:50 .dockerignore
drwxr-xr-x 3 root root  4096 Oct 17 16:50 docs
drwxr-xr-x 8 root root  4096 Oct 17 16:50 .git
drwxr-xr-x 3 root root  4096 Oct 17 16:50 .github
-rw-r--r-- 1 root root  6031 Oct 17 16:50 .gitignore
-rw-r--r-- 1 root root    43 Oct 17 16:50 global.json
-rw-r--r-- 1 root root  3339 Oct 17 16:50 IotTelemetrySimulator.sln
-rw-r--r-- 1 root root  1074 Oct 17 16:50 LICENSE
-rw-r--r-- 1 root root  1160 Oct 17 16:50 LICENSE.md
-rw-r--r-- 1 root root 17441 Oct 17 16:50 README.md
-rw-r--r-- 1 root root  2820 Oct 17 16

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.