<a href="https://colab.research.google.com/github/Mc4minta/AIB5-PcapAttackClassifier/blob/main/ProcessTrainSetCIC_IDS2017.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive; the csv_dir path used later in this
# notebook lives under this mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# CICIDS2017

In [12]:
import pandas as pd
import numpy as np
import os

# Folder (on the mounted Drive) holding the labelled-flow CSV files
csv_dir = '/content/drive/MyDrive/Share to Mc4/AIBuilders5-MiN/GeneratedLabelledFlows/TrafficLabelling'

# Files to load, in the order they are concatenated.
# Trailing comments list the classes the author expects in each file.
csv_files = [
    'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',     # Benign, DDoS
    'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', # Benign, PortScan
    'Monday-WorkingHours.pcap_ISCX.csv',                    # Benign
    'Tuesday-WorkingHours.pcap_ISCX.csv',                   # Benign, FTP, SSH
    'Wednesday-workingHours.pcap_ISCX.csv',                 # Benign, DoS variants, Heartbleed

    # "-Fix" copy: "–" (dashes) were replaced with "-" (hyphens) to prevent a decoding error
    'Thursday-WorkingHours-Morning-WebAttacks-Fix.pcap_ISCX.csv' # Benign, Web attacks
]

# --- Resampling configuration -------------------------------------------------
BENIGN_KEEP_FRAC = 0.2    # fraction of 'BENIGN' rows kept from every file
HULK_KEEP_FRAC = 0.15     # fraction of 'DoS Hulk' rows kept (HULK_FILE only)
SAMPLE_SEED = 42          # fixed seed so the sampled subset is reproducible
HULK_FILE = 'Wednesday-workingHours.pcap_ISCX.csv'
DROPPED_LABELS = ['DDoS', 'Heartbleed']   # classes removed by exact label match
DROPPED_LABEL_SUBSTRING = 'Web'           # classes removed by substring match


def downsample_label(flows, label, frac, seed=SAMPLE_SEED):
    """Keep a random `frac` of the rows labelled `label`; keep every other row.

    Parameters
    ----------
    flows : pandas.DataFrame with a 'Label' column.
    label : exact label value to down-sample.
    frac  : fraction (0..1) of the matching rows to keep.
    seed  : random_state passed to DataFrame.sample.

    Returns
    -------
    A new DataFrame with the sampled rows first, followed by all rows of the
    other labels, re-indexed from 0.
    """
    is_target = flows['Label'] == label
    kept = flows[is_target].sample(frac=frac, random_state=seed)
    return pd.concat([kept, flows[~is_target]], ignore_index=True)


def process_flows(flows, downsample_hulk=False):
    """Apply the class re-balancing and class removal rules to one file's flows.

    Steps: drop rows without a label, keep BENIGN_KEEP_FRAC of 'BENIGN',
    optionally keep HULK_KEEP_FRAC of 'DoS Hulk', then remove the labels in
    DROPPED_LABELS and every label containing DROPPED_LABEL_SUBSTRING.

    Parameters
    ----------
    flows           : pandas.DataFrame with a (whitespace-stripped) 'Label' column.
    downsample_hulk : when True, also down-sample the 'DoS Hulk' class.

    Returns
    -------
    The filtered DataFrame (index is not reset after the final filter).
    """
    # Rows with a missing label cannot be used for training, and a NaN in the
    # column makes `~Series.str.contains(...)` raise a TypeError.
    flows = flows.dropna(subset=['Label'])

    flows = downsample_label(flows, 'BENIGN', BENIGN_KEEP_FRAC)
    if downsample_hulk:
        flows = downsample_label(flows, 'DoS Hulk', HULK_KEEP_FRAC)

    labels = flows['Label']
    unwanted = labels.isin(DROPPED_LABELS) | labels.str.contains(DROPPED_LABEL_SUBSTRING, na=False)
    return flows[~unwanted]


dfs = []            # one processed DataFrame per successfully loaded file
failed_files = []   # files that were skipped because of an error

for file in csv_files:
    file_path = os.path.join(csv_dir, file)
    try:
        # Column names are stripped of surrounding whitespace so that 'Label'
        # can be addressed by its plain name.
        raw_flows = pd.read_csv(file_path).rename(columns=str.strip)
        dfs.append(process_flows(raw_flows, downsample_hulk=(file == HULK_FILE)))

    except FileNotFoundError:
        print(f"File not found: {file}")
        failed_files.append(file)
    except Exception as e:
        # Best-effort loading: report the problem and continue with the next file
        print(f"Error with {file}: {e}")
        failed_files.append(file)

# Make partial loads obvious instead of silently producing a smaller dataset
if failed_files:
    print(f"WARNING: {len(failed_files)} of {len(csv_files)} files were skipped, "
          f"the combined dataset will be incomplete: {failed_files}")


# Stack the per-file frames into a single frame with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# Preview the first rows and the per-class row counts
label_counts = df['Label'].value_counts()
display(df.head())
display(label_counts)

# Benign rows versus all attack rows together
benign_count = int(label_counts.get('BENIGN', 0))
other_count = len(df) - benign_count

print(f"\nNumber of 'BENIGN' rows in the combined efficient DataFrame: {benign_count}")
print(f"Number of other class rows in the combined efficient DataFrame: {other_count}")

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,172.16.0.1-192.168.10.50-40474-80-6,192.168.10.50,80,172.16.0.1,40474,6,7/7/2017 4:14,7182757,1,5,...,20,45201.0,0.0,45201.0,45201.0,7137556.0,0.0,7137556.0,7137556.0,BENIGN
1,172.16.0.1-192.168.10.50-28907-80-6,192.168.10.50,80,172.16.0.1,28907,6,7/7/2017 4:07,938509,1,5,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0,8.0.6.4,0,0,7/7/2017 4:02,119995732,123,0,...,0,3753432.167,5183854.063,13200000.0,1.0,13100000.0,6507819.26,26300000.0,6484077.0,BENIGN
3,172.217.12.205-192.168.10.14-443-58168-6,192.168.10.14,58168,172.217.12.205,443,6,7/7/2017 4:09,142954,10,6,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,172.16.0.1-192.168.10.50-17541-80-6,192.168.10.50,80,172.16.0.1,17541,6,7/7/2017 3:58,2548725,1,6,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
BENIGN,359093
PortScan,158930
DoS Hulk,34661
DoS GoldenEye,10293
FTP-Patator,7938
SSH-Patator,5897
DoS slowloris,5796
DoS Slowhttptest,5499



Number of 'BENIGN' rows in the combined efficient DataFrame: 359093
Number of other class rows in the combined efficient DataFrame: 229014


In [13]:
# Write the combined frame to a CSV file in the current working directory.
# index=False leaves the pandas row index out of the file.
output_csv_path = 'TrainCICIDS2017.csv'
df.to_csv(path_or_buf=output_csv_path, index=False)
print(f"DataFrame exported to {output_csv_path}")

DataFrame exported to TrainCICIDS2017.csv


In [14]:
import zipfile
import os

# Name of the zip archive to create
zip_archive_name = 'TrainCICIDS2017.zip'

# Name of the CSV file to put into the archive
file_to_zip = 'TrainCICIDS2017.csv'

# ZipFile defaults to ZIP_STORED (no compression), which would produce an
# archive as large as the CSV itself; ZIP_DEFLATED actually compresses it.
with zipfile.ZipFile(zip_archive_name, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    # Store the file under its base name so the archive has no directory prefix
    zipf.write(file_to_zip, arcname=os.path.basename(file_to_zip))

print(f"Created zip archive: {zip_archive_name}")

Created zip archive: TrainCICIDS2017.zip
