<a href="https://colab.research.google.com/github/Mc4minta/AIB5-PcapAttackClassifier/blob/main/ProcessTrainSetCSE_CIC_IDS2018.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (uses the Colab-provided google.colab package).
# The input CSV directory read later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# CSE-CIC-IDS2018

In [3]:
import pandas as pd
import os

csv_dir = '/content/drive/MyDrive/Share to Mc4/AIBuilders5-MiN/cic2018'
csv_files = [
    '02-14-2018.csv',
    '02-15-2018.csv',
    '02-16-2018.csv'
]

# --- Tunable parameters -------------------------------------------------------
chunk_size = 100000          # Rows read per chunk; adjust based on available memory
SAMPLE_SEED = 42             # Seed passed to every DataFrame.sample call
BENIGN_KEEP_FRAC = 0.3       # Fraction of 'Benign' rows kept in every chunk
# Additional per-file down-sampling, applied after the 'Benign' step:
# {file name: {label: fraction of that label's rows kept in every chunk}}
EXTRA_KEEP_FRAC = {
    '02-16-2018.csv': {'DoS attacks-Hulk': 0.15},
}


def _downsample_label(frame, label, frac, seed=SAMPLE_SEED):
    """Keep a random fraction of the rows carrying `label`; keep all other rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Chunk to process; must contain a 'Label' column.
    label : str
        Value of 'Label' whose rows are down-sampled.
    frac : float
        Fraction of the `label` rows to keep (forwarded to DataFrame.sample).
    seed : int
        random_state forwarded to DataFrame.sample.

    Returns
    -------
    pandas.DataFrame
        The sampled `label` rows followed by every other row, with a fresh
        0..n-1 index.
    """
    is_label = frame['Label'] == label
    kept = frame[is_label].sample(frac=frac, random_state=seed)
    return pd.concat([kept, frame[~is_label]], ignore_index=True)


# Processed chunks are collected here and concatenated ONCE after the loop.
# Growing a DataFrame with pd.concat inside the loop re-copies every row already
# accumulated on each iteration, which is quadratic in the number of chunks.
processed_chunks = []

for file in csv_files:
    file_path = os.path.join(csv_dir, file)
    print(f"Processing file efficiently: {file}...")

    # Read the CSV in chunks so that a whole file never has to fit in memory
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, low_memory=False):

        # Drop rows whose 'Label' is the literal string 'Label' (header lines
        # repeated inside the data). The original code did this only for
        # 02-16-2018.csv; the filter is a no-op for files without such rows.
        chunk = chunk[chunk['Label'] != 'Label']

        # Process 'Benign' class: keep only BENIGN_KEEP_FRAC of it
        processed_chunk = _downsample_label(chunk, 'Benign', BENIGN_KEEP_FRAC)

        # File-specific down-sampling (e.g. 'DoS attacks-Hulk' in 02-16-2018.csv)
        for label, frac in EXTRA_KEEP_FRAC.get(file, {}).items():
            processed_chunk = _downsample_label(processed_chunk, label, frac)

        processed_chunks.append(processed_chunk)

# Single concatenation; fall back to an empty frame when nothing was read
# (pd.concat raises ValueError on an empty list).
if processed_chunks:
    df_combined_efficient = pd.concat(processed_chunks, ignore_index=True)
else:
    df_combined_efficient = pd.DataFrame()

print("Efficient data processing complete.")
display(df_combined_efficient.head())
display(df_combined_efficient['Label'].value_counts())

# Compare the number of benign and other class combined
benign_count = int((df_combined_efficient['Label'] == 'Benign').sum())
other_count = df_combined_efficient.shape[0] - benign_count

print(f"\nNumber of 'Benign' rows in the combined efficient DataFrame: {benign_count}")
print(f"Number of other class rows in the combined efficient DataFrame: {other_count}")

Processing file efficiently: 02-14-2018.csv...
Processing file efficiently: 02-15-2018.csv...
Processing file efficiently: 02-16-2018.csv...
Efficient data processing complete.


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 10:09:34,112641397,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320698.5,68.589358,56320747,56320650,Benign
1,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,80,6,14/02/2018 08:47:31,525965,5,3,207,459,207,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,0,0,14/02/2018 09:41:25,112641313,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320656.5,109.601551,56320734,56320579,Benign
4,80,6,14/02/2018 08:47:15,474926,5,3,220,472,220,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Benign,633142
FTP-BruteForce,193360
SSH-Bruteforce,187589
DoS attacks-SlowHTTPTest,139890
DoS attacks-Hulk,69289
DoS attacks-GoldenEye,41508
DoS attacks-Slowloris,10990



Number of 'Benign' rows in the combined efficient DataFrame: 633142
Number of other class rows in the combined efficient DataFrame: 642626


In [4]:
# Export the combined DataFrame to a CSV file. The path is relative, so the file is
# written to the current working directory; index=False leaves the row index out.
output_csv_path = 'TrainCSECICIDS2018.csv'
df_combined_efficient.to_csv(output_csv_path, index=False)
print(f"DataFrame exported to {output_csv_path}")

DataFrame exported to TrainCSECICIDS2018.csv


In [5]:
import zipfile
import os

# Define the name for the zip archive
zip_archive_name = 'TrainCSECICIDS2018.zip'

# Define the name of the file to be zipped
file_to_zip = 'TrainCSECICIDS2018.csv'

# Create the archive in write mode. ZipFile defaults to ZIP_STORED, which packs
# the file WITHOUT compressing it, so DEFLATE is requested explicitly to actually
# shrink the CSV.
with zipfile.ZipFile(zip_archive_name, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    # Store the CSV under its bare file name (no directory components)
    zipf.write(file_to_zip, arcname=os.path.basename(file_to_zip))

print(f"Created zip archive: {zip_archive_name}")

Created zip archive: TrainCSECICIDS2018.zip
