<a href="https://colab.research.google.com/github/MostafaTF/combine-traffic-datasets/blob/main/CombineDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive/')

# Locations of the three source CSV files on the mounted Google Drive.
iot_23_path = "/content/drive/My Drive/dataset/IoT-23.csv"
iotid20_path = "/content/drive/My Drive/dataset/IoTID20.csv"
unsw_nb15_path = "/content/drive/My Drive/dataset/UNSW-NB15.csv"

# Read every CSV into its own raw DataFrame (same read order as the paths above).
iot23_df, iotid20_df, unsw_nb15_df = (
    pd.read_csv(csv_path)
    for csv_path in (iot_23_path, iotid20_path, unsw_nb15_path)
)

In [None]:
# Bring the three raw frames onto one shared column vocabulary.
# Columns that are not listed in a mapping keep their original names.

# IoT-23: connection-log style field names -> shared names.
iot23 = iot23_df.rename(
    {
        'ts': 'Time',
        'id.orig_h': 'Src_IP',
        'id.orig_p': 'Src_Port',
        'id.resp_h': 'Dst_IP',
        'id.resp_p': 'Dst_Port',
        'proto': 'Protocol',
        'duration': 'Flow_Duration',
        'orig_pkts': 'Tot_Fwd_Pkts',
        'resp_pkts': 'Tot_Bwd_Pkts',
        'label': 'Alabel',
    },
    axis='columns',
)

# IoTID20: only the timestamp and the category column need new names;
# identity entries (e.g. 'Src_IP' -> 'Src_IP') would be no-ops, so they are omitted.
iotid20 = iotid20_df.rename(
    {
        'Timestamp': 'Time',
        'Cat': 'Alabel',
    },
    axis='columns',
)

# UNSW-NB15: short field names -> shared names.
unsw_nb15 = unsw_nb15_df.rename(
    {
        'Stime': 'Time',
        'srcip': 'Src_IP',
        'sport': 'Src_Port',
        'dstip': 'Dst_IP',
        'dsport': 'Dst_Port',
        'proto': 'Protocol',
        'dur': 'Flow_Duration',
        'Spkts': 'Tot_Fwd_Pkts',
        'Dpkts': 'Tot_Bwd_Pkts',
        'attack_cat': 'Alabel',
    },
    axis='columns',
)

In [None]:
import numpy as np
# Rows with no attack category are labelled "Normal". fillna is the direct way to
# substitute missing values; regex matching only applies to string patterns, not NaN.
unsw_nb15['Alabel'] = unsw_nb15['Alabel'].fillna("Normal")

In [None]:
# Map attack labels to a common format.
# Series.map with a dict turns every label that is not a key into NaN, so any
# category missing from a mapping is removed by the dropna calls below.
iot23['Alabel'] = iot23['Alabel'].map({
    'PartOfAHorizontalPortScan': 'Scan',
    'Okiru': 'Mirai',
    'Benign': 'Normal',
    'DDoS': 'DoS',
    'Normal': 'Normal',
})

unsw_nb15['Alabel'] = unsw_nb15['Alabel'].map({
    'Normal': 'Normal',
    'Generic': 'Generic',
    'Exploits': 'Exploit',
    'DoS': 'DoS',
})

# iotid20 labels are kept as they are; only rows without a label are removed.
# Drop on the label column only: a bare dropna() would also discard rows because of
# NaNs in columns that are thrown away when the common features are selected later.
# Rows incomplete in the common features are still removed after the datasets are combined.
iot23 = iot23.dropna(subset=['Alabel'])
iotid20 = iotid20.dropna(subset=['Alabel'])
unsw_nb15 = unsw_nb15.dropna(subset=['Alabel'])

In [None]:
def load_and_preprocess(df, downsample=False, target_size=600000):
    """Optionally downsample ``df`` to roughly ``target_size`` rows, stratified by ``Alabel``.

    Every ``Alabel`` class is sampled with the same fraction
    ``min(1, target_size / len(df))``, so class proportions are preserved.
    Sampling is seeded (``random_state=42``) per class and therefore repeatable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Alabel`` column.
    downsample : bool, default False
        When False the input frame is returned untouched.
    target_size : int, default 600000
        Approximate number of rows to keep across all classes.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no sampling is performed (``downsample`` is False or the
        frame is empty); otherwise a new frame with a fresh 0..n-1 index. Rows whose
        ``Alabel`` is NaN are not part of any group and are not returned.
    """
    # Nothing to sample; also avoids dividing by zero for an empty frame.
    if not downsample or df.empty:
        return df

    frac = min(1, target_size / len(df))

    # Sample each class explicitly instead of groupby(...).apply(...): apply operating on
    # the grouping column is deprecated in pandas 2.2 and the column is excluded from the
    # applied frames in later versions, which would silently remove 'Alabel' from the result.
    sampled_groups = [
        group.sample(frac=frac, random_state=42)
        for _, group in df.groupby('Alabel')
    ]

    # No groups at all (every label is NaN): return an empty frame with the same columns.
    if not sampled_groups:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_groups).reset_index(drop=True)

# Downsample each dataset (stratified by attack label) towards the same row budget.
unsw_nb15 = load_and_preprocess(
    df=unsw_nb15,
    downsample=True,
    target_size=600_000,
)
iot23 = load_and_preprocess(
    df=iot23,
    downsample=True,
    target_size=600_000,
)
iotid20 = load_and_preprocess(
    df=iotid20,
    downsample=True,
    target_size=600_000,
)

In [None]:
# Convert to datetime
def normalize_time(df, time_col, unit='s'):
    """Convert ``df[time_col]`` to pandas datetimes in place and return ``df``.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the time column; it is modified in place.
    time_col : str
        Name of the column to convert.
    unit : str or None, default 's'
        Epoch unit used when the column has a numeric dtype. ``pd.to_datetime``
        reads bare numbers as nanoseconds since the epoch unless a unit is given,
        which places epoch-second values in 1970. Pass ``None`` to keep that
        nanosecond interpretation. Ignored for non-numeric (e.g. string) columns.
        NOTE(review): assumes numeric time columns in these datasets
        (e.g. the columns renamed from 'ts' / 'Stime') are Unix epoch seconds - confirm.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``time_col`` converted.
    """
    raw_times = df[time_col]
    if unit is not None and pd.api.types.is_numeric_dtype(raw_times):
        # Numeric epoch values: interpret them in the requested unit.
        df[time_col] = pd.to_datetime(raw_times, unit=unit, errors='coerce')
    else:
        # Strings / existing datetimes: let pandas parse them.
        df[time_col] = pd.to_datetime(raw_times, errors='coerce')
    return df
# Parse the shared 'Time' column of every dataset.
iot23 = normalize_time(iot23, time_col='Time')
iotid20 = normalize_time(iotid20, time_col='Time')
unsw_nb15 = normalize_time(unsw_nb15, time_col='Time')

# Tag every row with the dataset it came from.
iot23.loc[:, 'Domain'] = "iot23"
iotid20.loc[:, 'Domain'] = "iotid20"
unsw_nb15.loc[:, 'Domain'] = "unsw_nb15"

In [None]:
# Display the column index of the UNSW-NB15 frame as the cell output.
unsw_nb15.columns

In [None]:
# Columns shared by all three sources; the list order is also the column order of the output.
common_features = [
    'Time',
    'Src_IP', 'Src_Port',
    'Dst_IP', 'Dst_Port',
    'Protocol',
    'Flow_Duration',
    'Tot_Fwd_Pkts', 'Tot_Bwd_Pkts',
    'Domain',
    'Alabel',
]

# Narrow every frame to the shared schema (raises KeyError if a column is missing).
iot23 = iot23.loc[:, common_features]
iotid20 = iotid20.loc[:, common_features]
unsw_nb15 = unsw_nb15.loc[:, common_features]

# Stack the three sources under a fresh index, then discard rows with any missing value.
stacked = pd.concat([iot23, iotid20, unsw_nb15], ignore_index=True)
composite_dataset = stacked.dropna()

# Persist the result next to the source files on Google Drive (no index column).
composite_dataset.to_csv('/content/drive/My Drive/dataset/cdataset.csv', index=False)

print("Composite dataset created and saved as 'cdataset.csv'.")
print(f"Final dataset size: {len(composite_dataset)} rows")