In [1]:
import os

import numpy as np
import pandas as pd

def _print_step(title):
    """Print a step heading followed by a 40-character dashed rule."""
    print(f"\n{title}")
    print("-" * 40)


def _split_features_and_targets(df, candidate_targets):
    """Split *df* into (features, targets or None, target names found)."""
    found = [col for col in candidate_targets if col in df.columns]
    features = df.drop(columns=found)
    targets = df[found] if found else None
    return features, targets, found


def _print_top_variances(variances, n_features, limit=10):
    """Print at most *limit* of the first *n_features* entries of *variances*."""
    shown = variances.iloc[:min(n_features, limit)]
    for rank, (feat, var) in enumerate(shown.items(), 1):
        print(f"  {rank:2d}. {feat} (var: {var:.4f})")


def align_datasets_by_feature_characteristics(edgeiiot_path, ciciot_path, *,
                                              n_top_features=30, output_dir=""):
    """
    Align Edge-IIoTset and CICIoT2023 by keeping each dataset's highest-variance
    numeric columns and renaming them to a shared ``feature_NNN`` scheme.

    NOTE: columns are paired purely by variance rank (the i-th highest-variance
    column of one dataset shares a name with the i-th of the other). The pairing
    says nothing about whether the two original columns measure the same thing;
    consult the written mapping file before interpreting a feature.

    Parameters:
    -----------
    edgeiiot_path : str
        Path to Edge-IIoTset CSV file
    ciciot_path : str
        Path to CICIoT2023 CSV file
    n_top_features : int, optional
        Upper bound on the number of features kept per dataset (default 30).
        The effective count is also capped by the number of numeric columns
        available in each dataset.
    output_dir : str, optional
        Directory for the three CSV files written by this function. The
        default (empty string) writes to the current working directory. The
        directory is created if it does not exist.

    Returns:
    --------
    edgeiiot_aligned, ciciot_aligned : DataFrames
        Aligned datasets with the same number of features, each followed by
        its original target column(s). ``(None, None)`` is returned when a
        dataset cannot be loaded or when no numeric feature can be selected.
    """

    print("=" * 80)
    print("DATASET ALIGNMENT BY FEATURE CHARACTERISTICS")
    print("=" * 80)

    # Step 1: Load both CSV files
    _print_step("1. LOADING DATASETS...")

    try:
        print(f"Loading Edge-IIoTset from: {edgeiiot_path}")
        edgeiiot_df = pd.read_csv(edgeiiot_path, low_memory=False)
        print(f"Edge-IIoTset shape: {edgeiiot_df.shape}")

        print(f"Loading CICIoT2023 from: {ciciot_path}")
        ciciot_df = pd.read_csv(ciciot_path, low_memory=False)
        print(f"CICIoT2023 shape: {ciciot_df.shape}")
    except Exception as e:
        # Best-effort boundary: callers test for (None, None) instead of catching.
        print(f"Error loading datasets: {e}")
        return None, None

    # Step 2: Identify and separate target columns
    _print_step("2. SEPARATING TARGET COLUMNS...")

    edgeiiot_features, edgeiiot_targets, edgeiiot_target_cols = (
        _split_features_and_targets(edgeiiot_df, ('Attack_type', 'Attack_label'))
    )
    ciciot_features, ciciot_targets, ciciot_target_cols = (
        _split_features_and_targets(ciciot_df, ('label', 'Label'))
    )

    print(f"Edge-IIoTset target columns: {edgeiiot_target_cols}")
    print(f"CICIoT2023 target columns: {ciciot_target_cols}")
    print(f"Edge-IIoTset features shape: {edgeiiot_features.shape}")
    print(f"CICIoT2023 features shape: {ciciot_features.shape}")

    # Step 3: Identify numeric features in each dataset
    _print_step("3. IDENTIFYING NUMERIC FEATURES...")

    edgeiiot_numeric = edgeiiot_features.select_dtypes(include=[np.number])
    ciciot_numeric = ciciot_features.select_dtypes(include=[np.number])

    print(f"Edge-IIoTset numeric features: {edgeiiot_numeric.shape[1]}")
    print(f"CICIoT2023 numeric features: {ciciot_numeric.shape[1]}")

    # Step 4: Column names don't match across datasets, so keep the top N
    # columns of each dataset ranked by variance.
    _print_step("4. SELECTING TOP FEATURES BY VARIANCE...")

    edgeiiot_variances = edgeiiot_numeric.var().sort_values(ascending=False)
    ciciot_variances = ciciot_numeric.var().sort_values(ascending=False)

    n_features = min(n_top_features, len(edgeiiot_variances), len(ciciot_variances))
    if n_features < 1:
        # Nothing to align; bail out before writing files, since the later
        # statistics step cannot summarise a frame without columns.
        print("Error: no numeric features available to align "
              f"(requested {n_top_features}, Edge-IIoTset has "
              f"{len(edgeiiot_variances)}, CICIoT2023 has {len(ciciot_variances)})")
        return None, None
    print(f"Selecting top {n_features} features from each dataset")

    top_edgeiiot_features = edgeiiot_variances.index[:n_features].tolist()
    top_ciciot_features = ciciot_variances.index[:n_features].tolist()

    print(f"Top {n_features} Edge-IIoTset features (by variance):")
    _print_top_variances(edgeiiot_variances, n_features)

    print(f"\nTop {n_features} CICIoT2023 features (by variance):")
    _print_top_variances(ciciot_variances, n_features)

    # Step 5: Rename features to create a common naming scheme
    _print_step("5. CREATING COMMON FEATURE NAMES...")

    new_column_names = [f"feature_{i:03d}" for i in range(1, n_features + 1)]

    # Copy so that renaming does not touch the frames loaded above.
    edgeiiot_aligned_features = edgeiiot_numeric[top_edgeiiot_features].copy()
    ciciot_aligned_features = ciciot_numeric[top_ciciot_features].copy()
    edgeiiot_aligned_features.columns = new_column_names
    ciciot_aligned_features.columns = new_column_names

    print(f"Common feature names: {new_column_names[:10]}...")

    # Step 6: Combine features with targets
    _print_step("6. CREATING FINAL ALIGNED DATASETS...")

    if edgeiiot_targets is not None:
        edgeiiot_aligned = pd.concat([edgeiiot_aligned_features, edgeiiot_targets], axis=1)
    else:
        edgeiiot_aligned = edgeiiot_aligned_features

    if ciciot_targets is not None:
        ciciot_aligned = pd.concat([ciciot_aligned_features, ciciot_targets], axis=1)
    else:
        ciciot_aligned = ciciot_aligned_features

    print(f"Edge-IIoTset aligned shape: {edgeiiot_aligned.shape}")
    print(f"CICIoT2023 aligned shape: {ciciot_aligned.shape}")

    # Step 7: Save aligned datasets
    _print_step("7. SAVING ALIGNED DATASETS...")

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # os.path.join("", name) == name, so the default keeps the bare file names.
    edgeiiot_output_path = os.path.join(output_dir, "edgeiiot_aligned_by_features.csv")
    ciciot_output_path = os.path.join(output_dir, "ciciot_aligned_by_features.csv")
    mapping_output_path = os.path.join(output_dir, "feature_mapping.csv")

    edgeiiot_aligned.to_csv(edgeiiot_output_path, index=False)
    ciciot_aligned.to_csv(ciciot_output_path, index=False)

    print(f"Edge-IIoTset aligned saved to: {edgeiiot_output_path}")
    print(f"CICIoT2023 aligned saved to: {ciciot_output_path}")

    # Step 8: Display sample data
    _print_step("8. VERIFICATION - SAMPLE DATA...")

    print("\nEdge-IIoTset aligned (first 3 rows):")
    print(edgeiiot_aligned.head(3))

    print("\nCICIoT2023 aligned (first 3 rows):")
    print(ciciot_aligned.head(3))

    # Step 9: Statistical comparison
    _print_step("9. STATISTICAL COMPARISON OF ALIGNED FEATURES...")

    summary_rows = ['mean', 'std', 'min', 'max']

    print("\nEdge-IIoTset aligned features statistics:")
    print(edgeiiot_aligned_features.describe().loc[summary_rows].T.head(10))

    print("\nCICIoT2023 aligned features statistics:")
    print(ciciot_aligned_features.describe().loc[summary_rows].T.head(10))

    # Step 10: Dataset information
    _print_step("10. DATASET INFORMATION...")

    print(f"\nEdge-IIoTset original: {edgeiiot_df.shape}")
    print(f"Edge-IIoTset aligned: {edgeiiot_aligned.shape}")
    print(f"Features reduced: {edgeiiot_features.shape[1]} → {n_features}")

    print(f"\nCICIoT2023 original: {ciciot_df.shape}")
    print(f"CICIoT2023 aligned: {ciciot_aligned.shape}")
    print(f"Features reduced: {ciciot_features.shape[1]} → {n_features}")

    # Step 11: Record which original column sits behind each common name
    _print_step("11. CREATING FEATURE MAPPING FILE...")

    # The variance Series are already sorted, so their first n_features values
    # line up positionally with the top_* name lists.
    mapping_df = pd.DataFrame({
        'common_name': new_column_names,
        'edgeiiot_original': top_edgeiiot_features,
        'ciciot_original': top_ciciot_features,
        'edgeiiot_variance': edgeiiot_variances.iloc[:n_features].to_numpy(),
        'ciciot_variance': ciciot_variances.iloc[:n_features].to_numpy(),
    })
    mapping_df.to_csv(mapping_output_path, index=False)
    print(f"Feature mapping saved to: {mapping_output_path}")

    print("\n" + "=" * 80)
    print("ALIGNMENT COMPLETE!")
    print("=" * 80)
    print("\nSummary:")
    print(f"- Selected top {n_features} features from each dataset")
    print(f"- Created common feature names: feature_001 to feature_{n_features:03d}")
    print("- Both datasets now have identical feature names for ML compatibility")
    print("- Original feature names preserved in 'feature_mapping.csv'")

    return edgeiiot_aligned, ciciot_aligned


def _contains_flag(labels, needle, *, lowercase=False):
    """Return an int64 Series: 1 where the stringified label contains *needle*."""
    text = labels.astype(str)
    if lowercase:
        text = text.str.lower()
    # regex=False: plain substring test; na=False: missing labels count as 0.
    return text.str.contains(needle, regex=False, na=False).astype("int64")


def create_ddos_binary_labels(df, dataset_name):
    """
    Add a binary ``is_ddos`` column derived from the multi-class label column.

    Rows whose label text contains the DDoS marker get 1; every other row
    (benign traffic *and* any other attack class) gets 0.

    Parameters:
    -----------
    df : DataFrame
        Dataset with labels. It is not modified; a copy is returned.
    dataset_name : str
        Name of dataset ('edgeiiot' or 'ciciot').
        'edgeiiot' reads 'Attack_type' and matches the case-sensitive
        substring 'DDoS'. 'ciciot' reads 'label' (or 'Label' when 'label' is
        absent) and matches 'dos' case-insensitively, which covers both DDoS
        and DoS labels.

    Returns:
    --------
    DataFrame
        Copy of *df* with an added ``is_ddos`` column. When the dataset name
        is unknown or the label column is missing, a warning is printed and
        the copy is returned without ``is_ddos``.
    """
    df = df.copy()

    if dataset_name == 'edgeiiot':
        display_name, positive_name = 'Edge-IIoTset', 'DDoS'
        candidates, needle, lowercase = ('Attack_type',), 'DDoS', False
    elif dataset_name == 'ciciot':
        display_name, positive_name = 'CICIoT2023', 'DDoS/DoS'
        # Accept either capitalisation of the label column.
        candidates, needle, lowercase = ('label', 'Label'), 'dos', True
    else:
        print(f"Warning: unknown dataset_name {dataset_name!r}; "
              "no binary label created")
        return df

    label_col = next((col for col in candidates if col in df.columns), None)
    if label_col is None:
        print(f"Warning: {display_name} has none of the label columns "
              f"{list(candidates)}; no binary label created")
        return df

    df['is_ddos'] = _contains_flag(df[label_col], needle, lowercase=lowercase)

    print(f"{display_name}: Created binary label 'is_ddos'")
    print(f"  {positive_name} samples: {df['is_ddos'].sum()}")
    # The zero class holds everything that is not DDoS, not only benign traffic.
    print(f"  Non-{positive_name} samples: {(df['is_ddos'] == 0).sum()}")

    return df


def main(edgeiiot_path=None, ciciot_path=None):
    """
    Run the dataset alignment, add binary DDoS labels and save the results.

    Parameters:
    -----------
    edgeiiot_path : str, optional
        Path to the Edge-IIoTset CSV file. Defaults to the Kaggle input path.
    ciciot_path : str, optional
        Path to the CICIoT2023 CSV file. Defaults to the Kaggle input path.

    Writes 'edgeiiot_aligned_binary.csv' and 'ciciot_aligned_binary.csv' to the
    current working directory when alignment succeeds; prints a failure
    message and returns otherwise.
    """
    if edgeiiot_path is None:
        edgeiiot_path = "/kaggle/input/edgeiiotset-cyber-security-dataset-of-iot-iiot/Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.csv"
    if ciciot_path is None:
        ciciot_path = "/kaggle/input/ciciot2023/CICIOT23/train/train.csv"

    print("Dataset paths:")
    print(f"Edge-IIoTset: {edgeiiot_path}")
    print(f"CICIoT2023: {ciciot_path}")

    # Execute alignment
    edgeiiot_aligned, ciciot_aligned = align_datasets_by_feature_characteristics(
        edgeiiot_path, ciciot_path
    )

    # The alignment step signals failure by returning None for either frame.
    if edgeiiot_aligned is None or ciciot_aligned is None:
        print("Alignment failed. Please check the dataset paths.")
        return

    # Create binary labels for DDoS detection
    print("\n" + "=" * 80)
    print("CREATING BINARY LABELS FOR DDoS DETECTION")
    print("=" * 80)

    edgeiiot_binary = create_ddos_binary_labels(edgeiiot_aligned, 'edgeiiot')
    ciciot_binary = create_ddos_binary_labels(ciciot_aligned, 'ciciot')

    # Save datasets with binary labels
    edgeiiot_binary.to_csv("edgeiiot_aligned_binary.csv", index=False)
    ciciot_binary.to_csv("ciciot_aligned_binary.csv", index=False)

    print("\nBinary-labeled datasets saved:")
    print("Edge-IIoTset binary: edgeiiot_aligned_binary.csv")
    print("CICIoT2023 binary: ciciot_aligned_binary.csv")

    # Count the renamed feature columns directly instead of assuming how many
    # target columns trail them.
    edgeiiot_feature_cols = [col for col in edgeiiot_aligned.columns
                             if col.startswith('feature_')]
    ciciot_feature_cols = [col for col in ciciot_aligned.columns
                           if col.startswith('feature_')]

    # Final summary
    print("\n" + "=" * 80)
    print("FINAL SUMMARY")
    print("=" * 80)
    print(f"Edge-IIoTset aligned shape: {edgeiiot_aligned.shape}")
    print(f"CICIoT2023 aligned shape: {ciciot_aligned.shape}")
    print(f"\nCommon features: {len(edgeiiot_feature_cols)} numeric features")
    print("(Features renamed to common scheme for ML compatibility)")

    # Check feature compatibility
    if set(edgeiiot_feature_cols) == set(ciciot_feature_cols):
        print("\n✓ SUCCESS: Datasets now have identical feature sets!")
        print(f"  Common features: {len(edgeiiot_feature_cols)}")
    else:
        print("\n✗ WARNING: Feature sets are not identical!")


# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Dataset paths:
Edge-IIoTset: /kaggle/input/edgeiiotset-cyber-security-dataset-of-iot-iiot/Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.csv
CICIoT2023: /kaggle/input/ciciot2023/CICIOT23/train/train.csv
DATASET ALIGNMENT BY FEATURE CHARACTERISTICS

1. LOADING DATASETS...
----------------------------------------
Loading Edge-IIoTset from: /kaggle/input/edgeiiotset-cyber-security-dataset-of-iot-iiot/Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.csv
Edge-IIoTset shape: (2219201, 63)
Loading CICIoT2023 from: /kaggle/input/ciciot2023/CICIOT23/train/train.csv
CICIoT2023 shape: (5491971, 47)

2. SEPARATING TARGET COLUMNS...
----------------------------------------
Edge-IIoTset target columns: ['Attack_type', 'Attack_label']
CICIoT2023 target columns: ['label']
Edge-IIoTset features shape: (2219201, 61)
CICIoT2023 features shape: (5491971, 46)

3. IDENTIFYING NUMERIC FEATURES...
----------------------------------------
Edge-IIoTset numeric f