In [1]:
!pip install pandas numpy scikit-learn matplotlib seaborn imbalanced-learn openpyxl



In [8]:
# Imported Libraries
from collections import Counter # For checking class distribution
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# For imbalanced data handling
from imblearn.over_sampling import SMOTE

# Configure visualizations
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style('darkgrid')

In [16]:
# Paths of the CSV files to load, one per capture session (Monday through Friday).
# All files share the same suffix, so only the distinguishing stems are listed.

_CSV_SUFFIX = '.pcap_ISCX.csv'

file_paths = [
    f'{stem}{_CSV_SUFFIX}'
    for stem in (
        'Monday-WorkingHours',
        'Tuesday-WorkingHours',
        'Wednesday-workingHours',  # lower-case 'w' kept exactly as in the file name
        'Thursday-WorkingHours-Morning-WebAttacks',
        'Thursday-WorkingHours-Afternoon-Infilteration',
        'Friday-WorkingHours-Morning',
        'Friday-WorkingHours-Afternoon-PortScan',
        'Friday-WorkingHours-Afternoon-DDos',
    )
]

In [18]:
# Loading and concatenating the CSV files
#
# Every later cell depends on `df`, so a loading problem must stop the notebook
# here instead of being printed and ignored. All paths are checked up front so
# that the error names every missing file before any large CSV is parsed.

missing_files = [str(file) for file in file_paths if not Path(file).is_file()]
if missing_files:
    raise FileNotFoundError(
        "CSV file(s) not found: " + ", ".join(missing_files)
        + ". Please check your file_paths."
    )

# Read each file into its own frame, then stack them with a fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in file_paths]
df = pd.concat(df_list, ignore_index=True)
print("Dataset loaded and merged successfully.")
print(f"Shape of the merged dataset: {df.shape}")

Dataset loaded and merged successfully.
Shape of the merged dataset: (2830743, 79)


In [20]:
# Basic DataFrame exploration
if 'df' not in locals(): # df only exists when the loading cell succeeded
    print("DataFrame 'df' not found. Please ensure data loading was successful.")
else:
    print("--- First 5 rows: ---")
    print(df.head())

    print("\n--- DataFrame Info: ---")
    df.info() # prints dtypes and non-null counts itself, so no print() wrapper

    print("\n--- Descriptive Statistics (Numerical Features): ---")
    print(df.describe())

    print("\n--- Value Counts for 'Label' column (Target Variable): ---")
    # The target header may or may not carry a leading space at this point,
    # so try both spellings (space-prefixed first) and use the first match.
    label_column = next(
        (name for name in (' Label', 'Label') if name in df.columns),
        None,
    )
    if label_column is None:
        print("Label column not found with common names (' Label' or 'Label'). Check column names.")
    else:
        print(df[label_column].value_counts())

--- First 5 rows: ---
    Destination Port   Flow Duration   Total Fwd Packets  \
0              49188               4                   2   
1              49188               1                   2   
2              49188               1                   2   
3              49188               1                   2   
4              49486               3                   2   

    Total Backward Packets  Total Length of Fwd Packets  \
0                        0                           12   
1                        0                           12   
2                        0                           12   
3                        0                           12   
4                        0                           12   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                             0                       6   
1                             0                       6   
2                             0                       6   
3                         

In [22]:
# Cleaning Column Names
if 'df' in locals():
    original_columns = df.columns.tolist()
    # Remove leading/trailing whitespace from every header (e.g. ' Label' -> 'Label').
    df.columns = df.columns.str.strip()
    new_columns = df.columns.tolist()
    if original_columns != new_columns:
        print("Column names stripped.")
    else:
        print("Column names did not require stripping or were already clean.")

    # Stripping already normalises ' Label' to 'Label', so no separate rename is
    # needed. Later cells are skipped when the target column is absent, so report
    # that case explicitly here.
    if 'Label' not in df.columns:
        print("Warning: 'Label' column not found after cleaning column names. Check column names.")


Column names stripped.


In [24]:
# Handling Missing Values (NaNs) and Infinite Values
#
# Strategy: treat +/-inf as missing and drop every row that contains a missing
# value. Because all incomplete rows are removed, no imputation step is needed
# afterwards (and numeric columns keep their original dtypes).
if 'df' in locals():

    # Turn infinities into NaN so a single dropna() removes both kinds of bad values.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    print(f"Number of NaN values before dropping: {df.isnull().sum().sum()}")

    rows_before = len(df)
    df.dropna(inplace=True)
    print(f"Rows with NaN values dropped: {rows_before - len(df)}")

    # Sanity check: nothing missing may remain, in features or in 'Label'.
    print(f"Number of NaN values after dropping: {df.isnull().sum().sum()}")
    print(f"Shape after dropping rows with NaN: {df.shape}")

Number of NaN values before imputation: 5734
Rows with NaN values dropped.
Number of NaN values after imputation: 0
Shape after dropping rows with NaN in 'Label': (2827876, 79)


In [26]:
# Removing Duplicate Rows
if 'df' in locals():
    # Count fully identical rows first so the number can be reported after removal.
    duplicates_before = df.duplicated().sum()
    if duplicates_before == 0:
        print("No duplicate rows found.")
    else:
        df.drop_duplicates(inplace=True)  # keeps the first occurrence of each row
        print(
            f"Removed {duplicates_before} duplicate rows.",
            f"Shape after removing duplicates: {df.shape}",
            sep="\n",
        )

Removed 307078 duplicate rows.
Shape after removing duplicates: (2520798, 79)


In [28]:
# Feature Selection
if 'df' in locals() and 'Label' in df.columns:

    # Columns earmarked for manual removal. Only those actually present in df are
    # dropped, so the cell also works for dataset variants that do not ship them.
    features_to_drop_manual = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'Source Port']
    dropped_features = [col for col in features_to_drop_manual if col in df.columns]
    if dropped_features:
        print(f"Dropping manually selected features: {dropped_features}")

    # X holds every remaining column except the target.
    X = df.drop(columns=['Label'] + dropped_features)
    y_categorical = df['Label'] # Keep the categorical labels for now for EDA, encode later for model

    print(f"Shape of X (features): {X.shape}")
    print(f"Shape of y (target): {y_categorical.shape}")
    print("\nFeatures for the model (X columns):")
    print(X.columns.tolist())

Shape of X (features): (2520798, 78)
Shape of y (target): (2520798,)

Features for the model (X columns):
['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count',

In [30]:
# Encoding Categorical Target Variable
if 'y_categorical' in locals():
    # Map each class name to an integer; label_encoder.classes_ keeps the reverse mapping.
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y_categorical)
    print("Target variable 'Label' encoded.")
    print("Mapping of encoded labels to original categories:")
    # NOTE(review): some class names print with a U+FFFD replacement character
    # (the "Web Attack" classes) - presumably an encoding issue in the source
    # CSVs; confirm before relying on the exact class-name strings.
    for i, class_name in enumerate(label_encoder.classes_):
        print(f"{i}: {class_name}")
    print("\nClass distribution in y (encoded):")
    print(Counter(y))

# Normalization with MinMaxScaler
#
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/validation/test split performed in a later cell. The validation and
# test rows therefore influence the min/max used for scaling (data leakage).
# Fitting on the training rows only requires moving this step after the split.

if 'X' in locals():
    # MinMaxScaler needs numeric input. Non-numeric columns are NOT carried over:
    # X is replaced by the scaled numeric frame at the end of this cell.
    non_numeric_columns = X.select_dtypes(exclude=np.number).columns.tolist()
    if non_numeric_columns:
        print(f"Warning: Non-numeric columns detected in X; they are dropped before scaling: {non_numeric_columns}")
        X_numeric = X.select_dtypes(include=np.number)
    else:
        X_numeric = X

    scaler = MinMaxScaler()

    X_scaled = scaler.fit_transform(X_numeric)

    # fit_transform returns a bare array; restore column names and the original row index.
    X_scaled_df = pd.DataFrame(X_scaled, columns=X_numeric.columns, index=X_numeric.index)

    print("Features scaled.")
    print("--- Scaled Features (first 5 rows): ---")
    print(X_scaled_df.head())

    X = X_scaled_df

Target variable 'Label' encoded.
Mapping of encoded labels to original categories:
0: BENIGN
1: Bot
2: DDoS
3: DoS GoldenEye
4: DoS Hulk
5: DoS Slowhttptest
6: DoS slowloris
7: FTP-Patator
8: Heartbleed
9: Infiltration
10: PortScan
11: SSH-Patator
12: Web Attack � Brute Force
13: Web Attack � Sql Injection
14: Web Attack � XSS

Class distribution in y (encoded):
Counter({0: 2095057, 4: 172846, 2: 128014, 10: 90694, 3: 10286, 7: 5931, 6: 5385, 5: 5228, 11: 3219, 1: 1948, 12: 1470, 14: 652, 9: 36, 13: 21, 8: 11})
Features scaled.
--- Scaled Features (first 5 rows): ---
   Destination Port  Flow Duration  Total Fwd Packets  Total Backward Packets  \
0          0.750561   1.416667e-07           0.000005                0.000000   
1          0.750561   1.166667e-07           0.000005                0.000000   
4          0.755108   1.333333e-07           0.000005                0.000000   
5          0.755108   1.166667e-07           0.000005                0.000000   
8          0.001343  

In [34]:
# Splitting Data into Training, Validation and Testing sets

if 'X' not in locals() or 'y' not in locals():
    print("X and/or y variables not found. Please ensure previous cells ran correctly and X (features) and y (encoded target) are defined.")
else:
    TRAIN_RATIO = 0.60
    VALIDATION_RATIO = 0.20
    TEST_RATIO = 0.20

    RANDOM_STATE = 42

    # Stage 1: carve off the combined validation+test share, stratified on the label.
    holdout_ratio = VALIDATION_RATIO + TEST_RATIO
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        test_size=holdout_ratio,
        random_state=RANDOM_STATE,
        stratify=y
    )

    # Stage 2: divide the held-out share into validation and test; the test
    # fraction is expressed relative to the held-out portion.
    relative_test_ratio = TEST_RATIO / holdout_ratio
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        test_size=relative_test_ratio,
        random_state=RANDOM_STATE,
        stratify=y_temp
    )

    print("Data split into training, validation, and testing sets.")
    for split_name, features, labels in (
        ("train", X_train, y_train),
        ("val", X_val, y_val),
        ("test", X_test, y_test),
    ):
        print(f"X_{split_name} shape: {features.shape}, y_{split_name} shape: {labels.shape}")

    # Per-class counts of each split, to confirm stratification kept the class mix.
    for split_title, labels in (
        ("Training", y_train),
        ("Validation", y_val),
        ("Test", y_test),
    ):
        print(f"\n{split_title} set label distribution:")
        print(Counter(labels))

    # Verify the overall proportions (approximate due to discrete sample counts)
    total_rows = len(X)
    print("\nApproximate proportions:")
    print(f"Train: {len(X_train)/total_rows:.2f}, Validation: {len(X_val)/total_rows:.2f}, Test: {len(X_test)/total_rows:.2f}")

Data split into training, validation, and testing sets.
X_train shape: (1512478, 78), y_train shape: (1512478,)
X_val shape: (504160, 78), y_val shape: (504160,)
X_test shape: (504160, 78), y_test shape: (504160,)

Training set label distribution:
Counter({0: 1257033, 4: 103707, 2: 76808, 10: 54416, 3: 6172, 7: 3559, 6: 3231, 5: 3137, 11: 1931, 1: 1169, 12: 882, 14: 391, 9: 22, 13: 13, 8: 7})

Validation set label distribution:
Counter({0: 419012, 4: 34570, 2: 25603, 10: 18139, 3: 2057, 7: 1186, 6: 1077, 5: 1045, 11: 644, 1: 389, 12: 294, 14: 131, 9: 7, 13: 4, 8: 2})

Test set label distribution:
Counter({0: 419012, 4: 34569, 2: 25603, 10: 18139, 3: 2057, 7: 1186, 6: 1077, 5: 1046, 11: 644, 1: 390, 12: 294, 14: 130, 9: 7, 13: 4, 8: 2})

Approximate proportions:
Train: 0.60, Validation: 0.20, Test: 0.20


In [36]:
# Handling Imbalanced Data on the Training Set using SMOTE
#
# Only the training split is resampled; validation and test data are left as-is.

if 'X_train' not in locals():
    print("X_train not defined. Ensure previous cells ran correctly.")
else:
    print("Original training set label distribution:", Counter(y_train))

    # SMOTE interpolates between samples, so every feature column must be numeric.
    numeric_column_count = X_train.select_dtypes(include=np.number).shape[1]
    if numeric_column_count != X_train.shape[1]:
        print("Error: SMOTE requires all features in X_train to be numeric.")
    else:
        smote = SMOTE(random_state=42) # random_state from proposal
        try:
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
            print("SMOTE applied to the training data.")
            print("Resampled training set label distribution:", Counter(y_train_resampled))

        except Exception as err:
            print(f"Error during SMOTE: {err}. Check data types and values in X_train.")
            print("Ensure no NaN/Inf values remain and all data is numeric.")

            # Fall back to the original (imbalanced) training data so later cells still run.
            X_train_resampled = X_train
            y_train_resampled = y_train

Original training set label distribution: Counter({0: 1257033, 4: 103707, 2: 76808, 10: 54416, 3: 6172, 7: 3559, 6: 3231, 5: 3137, 11: 1931, 1: 1169, 12: 882, 14: 391, 9: 22, 13: 13, 8: 7})


[WinError 2] The system cannot find the file specified
  File "C:\Users\Adeun\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\Adeun\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Adeun\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\Adeun\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


SMOTE applied to the training data.
Resampled training set label distribution: Counter({0: 1257033, 4: 1257033, 2: 1257033, 10: 1257033, 7: 1257033, 6: 1257033, 3: 1257033, 5: 1257033, 11: 1257033, 12: 1257033, 1: 1257033, 14: 1257033, 9: 1257033, 13: 1257033, 8: 1257033})


In [46]:
# Saving the preprocessed data
#
# Writes the resampled training split and the untouched validation/test splits to
# CSV, then persists the fitted label encoder and scaler with joblib (imported in
# the notebook's import cell). Nothing is written unless every input exists.

required_components = [
    'X_train_resampled', 'y_train_resampled',
    'X_val', 'y_val',
    'X_test', 'y_test',
    'X', 'label_encoder', 'scaler',
]
missing_components = [name for name in required_components if name not in globals()]
# X supplies the column names for the saved feature files, so it must expose .columns.
if 'X' in globals() and not hasattr(X, 'columns'):
    missing_components.append('X with columns')

if missing_components:
    # Name exactly what is missing so the failing upstream cell is easy to find.
    print(f"Required data components not found for saving: {', '.join(missing_components)}")
else:
    # Training Data
    X_train_save = pd.DataFrame(X_train_resampled, columns=X.columns)
    y_train_save = pd.DataFrame(y_train_resampled, columns=['Label'])

    # Validation Data
    X_val_save = pd.DataFrame(X_val, columns=X.columns)
    y_val_save = pd.DataFrame(y_val, columns=['Label'])

    # Test Data
    X_test_save = pd.DataFrame(X_test, columns=X.columns)
    y_test_save = pd.DataFrame(y_test, columns=['Label'])

    # Saving to CSV files
    try:
        X_train_save.to_csv('X_train_preprocessed.csv', index=False)
        y_train_save.to_csv('y_train_preprocessed.csv', index=False)

        X_val_save.to_csv('X_val_preprocessed.csv', index=False)
        y_val_save.to_csv('y_val_preprocessed.csv', index=False)

        X_test_save.to_csv('X_test_preprocessed.csv', index=False)
        y_test_save.to_csv('y_test_preprocessed.csv', index=False)

        print("Preprocessed training, validation, and test data saved.")

        # Saving the label encoder and scaler
        joblib.dump(label_encoder,'cicids2017_label_encoder.joblib')
        joblib.dump(scaler,'cicids2017_scaler.joblib')
        print("Label encoder and scaler saved.")

    except Exception as e:
        print(f"Error during saving: {e}")

Preprocessed training, validation, and test data saved.
Label encoder and scaler saved.
