In [1]:
import pandas as pd
import os
import glob

#### dataset = https://www.unb.ca/cic/datasets/ids-2017.html (MachieneLearningCVE)


In [2]:
def read_all_csv_files(directory_path):
    """
    Read all CSV files from a directory into a single DataFrame.
    
    Args:
        directory_path: Path to the directory containing CSV files
        
    Returns:
        Combined DataFrame containing data from all CSV files
    """
    # Check if directory exists
    if not os.path.exists(directory_path):
        raise FileNotFoundError(f"Directory not found: {directory_path}")
    
    # Get all CSV files in the directory
    csv_files = glob.glob(os.path.join(directory_path, "*.csv"))
    
    if not csv_files:
        print(f"No CSV files found in {directory_path}")
        return None
    
    # Create an empty list to store individual DataFrames
    dfs = []
    
    # Read each CSV file and append to the list
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
            print(f"Successfully read: {os.path.basename(file)}")
        except Exception as e:
            print(f"Error reading {os.path.basename(file)}: {str(e)}")
    
    if not dfs:
        print("No valid CSV files were read.")
        return None
    
    # Combine all DataFrames into one
    combined_df = pd.concat(dfs, ignore_index=True)
    
    print(f"Combined {len(dfs)} CSV files. Total rows: {len(combined_df)}")
    return combined_df

# Usage
directory_path = "/Users/meiramzarypkanov/Desktop/University/4_Network_Security/NetworkSecurity/data/MachineLearningCVE"
mlcve = read_all_csv_files(directory_path)

Successfully read: Friday-WorkingHours-Morning.pcap_ISCX.csv
Successfully read: Wednesday-workingHours.pcap_ISCX.csv
Successfully read: Monday-WorkingHours.pcap_ISCX.csv
Successfully read: Tuesday-WorkingHours.pcap_ISCX.csv
Successfully read: Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Successfully read: Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Successfully read: Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Successfully read: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Combined 8 CSV files. Total rows: 1799249


In [9]:
mlcve = mlcve.dropna()
mlcve.columns = mlcve.columns.str.strip()

In [10]:
mlcve.columns

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [12]:
# Show all unique labels
print("Unique labels in the dataset:")
unique_labels = mlcve['Label'].unique()
print(unique_labels)
print(f"Total number of unique labels: {mlcve['Label'].nunique()}")

# Show distribution of labels (count of each label)
print("\nLabel distribution:")
label_counts = mlcve['Label'].value_counts()
print(label_counts)

# Show distribution as percentages
print("\nLabel distribution (%):")
label_percentage = mlcve['Label'].value_counts(normalize=True) * 100
print(label_percentage)

Unique labels in the dataset:
['BENIGN' 'Bot' 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk'
 'FTP-Patator' 'SSH-Patator' 'Infiltration' 'Web Attack � Brute Force'
 'Web Attack � XSS' 'Web Attack � Sql Injection' 'PortScan' 'DDoS']
Total number of unique labels: 13

Label distribution:
BENIGN                        1351952
PortScan                       158930
DoS Hulk                       133217
DDoS                           128027
FTP-Patator                      7938
DoS slowloris                    5796
DoS Slowhttptest                 5499
SSH-Patator                      2973
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Name: Label, dtype: int64

Label distribution (%):
BENIGN                        75.170502
PortScan                       8.836740
DoS Hulk                       7.407059
DDoS                           7.118488
FTP-Patat

##### BENIGN = legitimate network traffic

##### other labels are attack types
