<a href="https://colab.research.google.com/github/Hushpuppyzac/DLI-Assignment/blob/main/CleanedData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from IPython.display import display # Import display for nicer DataFrame output

# STEP 1: Load the raw dataset and report its initial state

# The CSV is fetched straight from GitHub via its raw-content URL
url = "https://raw.githubusercontent.com/Hushpuppyzac/DLI-Assignment/main/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
df = pd.read_csv(url)

print(f"\n{'=' * 70}")
print("             INITIAL DATASET INFORMATION             ")
print("=" * 70)
print("Total rows before any cleaning: {:,}".format(len(df)))
# The raw header has not been stripped yet, so the target is addressed
# by its padded name ' Label' (leading space) at this point.
print("Initial Label distribution (raw data):")
display(df[' Label'].value_counts())
print("\n--- DataFrame Info ---")
df.info()
print("\n--- DataFrame Description (Numerical Columns) ---")
display(df.describe())
print(f"{'=' * 70}\n")

# ------------------------------------------------------------------------------

# STEP 2: Data Cleaning (Applied to the full dataset before splitting)

# Strip leading/trailing whitespace in column names (the raw header is ' Label')
df.columns = df.columns.str.strip()

# After stripping, a padded ' Label' name cannot exist any more, so no rename
# is needed. Fail fast if the target column is missing instead of printing a
# warning and hitting an unexplained KeyError further down.
if 'Label' not in df.columns:
    raise KeyError("Expected a 'Label' column after stripping whitespace from column names.")

# Print column names after normalisation (kept for diagnosing header issues)
print("\n--- DataFrame Columns after stripping whitespace and renaming ---")
print(df.columns.tolist())

# Replace infinite values with NaN, then remove rows with any missing values.
# Done without inplace=True so the step is a plain expression on its input.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Drop columns with constant values (no variance)
constant_cols = [col for col in df.columns if df[col].nunique() == 1]
if constant_cols:
    print(f"\nDropping constant columns: {', '.join(constant_cols)}")
    df = df.drop(columns=constant_cols)
else:
    print("\nNo constant columns found to drop.")

# Remove duplicated rows (if any)
initial_rows = len(df)
df = df.drop_duplicates()
if len(df) < initial_rows:
    print(f"Removed {initial_rows - len(df):,} duplicate rows.")
else:
    print("No duplicate rows found.")

# Print column names just before encoding 'Label' (kept for diagnosing header issues)
print("\n--- DataFrame Columns before encoding 'Label' ---")
print(df.columns.tolist())

# Encode 'Label': 'BENIGN' -> 0, every other label -> 1 (only 'DDoS' in this capture).
# The column is replaced wholesale with an integer Series: assigning through
# df.loc[:, 'Label'] on pandas >= 2.0 writes into the existing object column,
# leaving an object-dtype target that downstream estimators may reject.
df = df.assign(Label=(df['Label'] != 'BENIGN').astype(int))

print("\n" + "="*70)
print("        AFTER INITIAL CLEANING (Before Train-Test Split)        ")
print("="*70)
print(f"Total rows after initial cleaning: {len(df):,}")
print("Label distribution after initial cleaning:")
display(df['Label'].value_counts())
print("\nFirst 5 rows of cleaned DataFrame:")
display(df.head())
print("="*70 + "\n")

# ------------------------------------------------------------------------------

# STEP 3: Hold out the test set FIRST, before any balancing or feature work

# Target is the 'Label' column; features are every other column
y = df['Label']
X = df.drop(columns='Label')

# Stratifying on y keeps the class proportions the same in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\n{'=' * 70}")
print("             AFTER INITIAL TRAIN-TEST SPLIT             ")
print("=" * 70)
print("Training data shape (X_train): {}".format(X_train.shape))
print("Testing data shape (X_test):   {}".format(X_test.shape))
print("Training label shape (y_train): {}".format(y_train.shape))
print("Testing label shape (y_test):   {}".format(y_test.shape))
print("\nTraining label distribution:")
display(y_train.value_counts())
print("\nTesting label distribution:")
display(y_test.value_counts())
print(f"{'=' * 70}\n")

# ------------------------------------------------------------------------------

# STEP 4: Class Balancing with Undersampling (Applied ONLY to Training Data)

# Recombine features and target so whole rows can be filtered by class
df_train = pd.concat([X_train, y_train], axis=1)

# Work out which class is the majority from the training counts instead of
# assuming it is always the attack class (1). If the balance were reversed,
# resample(replace=False) would be asked for more rows than exist and raise.
train_class_counts = df_train['Label'].value_counts()
majority_label = train_class_counts.idxmax()
df_train_majority = df_train[df_train['Label'] == majority_label]
df_train_minority = df_train[df_train['Label'] != majority_label]

print("\n--- Class Distribution Before Balancing (Training Set) ---")
display(y_train.value_counts())

# Downsample the majority class (without replacement) to the minority count
df_train_majority_downsampled = resample(df_train_majority,
                                         replace=False,
                                         n_samples=len(df_train_minority),
                                         random_state=42)

# Combine downsampled majority with minority class for the balanced training set
df_train_balanced = pd.concat([df_train_majority_downsampled, df_train_minority])

# Shuffle the combined training dataframe and discard the original row index
df_train_balanced = df_train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate X_train_balanced and y_train_balanced
X_train_balanced = df_train_balanced.drop('Label', axis=1)
y_train_balanced = df_train_balanced['Label']

print("\n" + "="*70)
print("         AFTER UNDERSAMPLING (Training Set Only)         ")
print("="*70)
print(f"Total rows in balanced training set: {len(df_train_balanced):,}")
print("Class balance (Training Set):")
display(y_train_balanced.value_counts())
print("="*70 + "\n")

# ------------------------------------------------------------------------------

# STEP 5: Show a sample of the balanced training data

# Only these columns are shown in the previews below
columns_to_show = ['Destination Port', 'Flow Duration', 'Label']

# Per-class and overall row counts of the balanced training frame
ddos_count_train = len(df_train_balanced[df_train_balanced['Label'] == 1])
benign_count_train = len(df_train_balanced[df_train_balanced['Label'] == 0])
total_rows_train = len(df_train_balanced)

print(f"\n{'=' * 70}")
print("        SAMPLE OF BALANCED TRAINING DATAFRAME        ")
print("=" * 70)
print("Total Rows       : {:,}".format(total_rows_train))
print("DDoS Attacks     : {:,}".format(ddos_count_train))
print("Benign Records   : {:,}".format(benign_count_train))
print("=" * 70)

# Preview: first rows overall, then first rows of each class
print("\nBalanced Training DataFrame (First 5 Rows):")
display(df_train_balanced[columns_to_show].head())

print("\nDDoS Samples (Balanced Training Set - First 5):")
display(df_train_balanced.loc[df_train_balanced['Label'] == 1, columns_to_show].head())

print("\nBenign Samples (Balanced Training Set - First 5):")
display(df_train_balanced.loc[df_train_balanced['Label'] == 0, columns_to_show].head())
print(f"{'=' * 70}\n")

# ------------------------------------------------------------------------------

# STEP 6: Extract Statistical Features (Applied Separately to Train and Test Sets)

def extract_features(df_input, eps=1e-5):
    """Return a copy of ``df_input`` with derived flow-statistics columns added.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns 'Destination Port', 'Flow Duration',
        'Max Packet Length', 'Min Packet Length', 'Packet Length Mean',
        'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
        'Total Fwd Packets', 'Total Backward Packets' and 'Average Packet Size'.
    eps : float, default 1e-5
        Small constant added to every denominator to avoid division by zero.
        Note that a zero denominator therefore yields ``numerator / eps``,
        which can be a very large value.

    Returns
    -------
    pandas.DataFrame
        The original columns plus, in this order: 'Flow Preview' (string,
        "<port> | <duration>"), 'pkt_length_diff', 'pkt_length_var_ratio',
        'byte_ratio', 'duration_per_packet' and 'avg_to_max_ratio'. The ratio
        columns are rounded to 3 decimals. Duplicated column names are dropped,
        keeping the first occurrence.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    max_len = df_input['Max Packet Length']
    total_packets = df_input['Total Fwd Packets'] + df_input['Total Backward Packets']

    new_columns = {
        # Human-readable preview; built with vectorised string concatenation
        # rather than a row-wise Python join, which is slow on large frames.
        'Flow Preview': (df_input['Destination Port'].astype(str)
                         + ' | '
                         + df_input['Flow Duration'].astype(str)),
        # Feature 1: Packet length difference
        'pkt_length_diff': max_len - df_input['Min Packet Length'],
        # Feature 2: Packet length variation (Max / Mean)
        'pkt_length_var_ratio': (max_len / (df_input['Packet Length Mean'] + eps)).round(3),
        # Feature 3: Byte ratio (Fwd vs Bwd)
        'byte_ratio': (df_input['Total Length of Fwd Packets']
                       / (df_input['Total Length of Bwd Packets'] + eps)).round(3),
        # Feature 4: Flow duration per packet
        'duration_per_packet': (df_input['Flow Duration'] / (total_packets + eps)).round(3),
        # Feature 5: Average packet size vs Max packet size
        'avg_to_max_ratio': (df_input['Average Packet Size'] / (max_len + eps)).round(3),
    }

    # assign() builds a new frame, so the caller's DataFrame is left untouched
    featured = df_input.assign(**new_columns)

    # Remove duplicated column names (defensive; keeps the first occurrence)
    return featured.loc[:, ~featured.columns.duplicated()]

# Run feature extraction on copies of the balanced training set and the
# untouched test set. 'Flow Preview' is a string column and not usable as a
# model input, so it is removed immediately after extraction.
X_train_featured = extract_features(X_train_balanced.copy()).drop(columns=['Flow Preview'])
X_test_featured = extract_features(X_test.copy()).drop(columns=['Flow Preview'])

# Engineered columns shown in the previews below
feature_cols_display = [
    'pkt_length_diff',
    'pkt_length_var_ratio',
    'byte_ratio',
    'duration_per_packet',
    'avg_to_max_ratio',
]

print(f"\n{'=' * 70}")
print("        AFTER FEATURE EXTRACTION (TRAIN & TEST)        ")
print("=" * 70)

print("\nSample Extracted Features (Training Set - First 5):")
display(X_train_featured[feature_cols_display].head())

print("\nFinal Columns (Training Set after Feature Extraction):")
print(list(X_train_featured.columns))

print("\nSample Extracted Features (Testing Set - First 5):")
display(X_test_featured[feature_cols_display].head())

print("\nFinal Columns (Testing Set after Feature Extraction):")
print(list(X_test_featured.columns))
print(f"{'=' * 70}\n")

# ------------------------------------------------------------------------------

# STEP 7: Standardize the features

# Columns to scale are the numeric columns of the training frame
numerical_cols = X_train_featured.select_dtypes(include=np.number).columns.tolist()

# The scaler is fitted on the training data ONLY; the test set is transformed
# with the statistics learned from the training set.
scaler = StandardScaler()
scaler.fit(X_train_featured[numerical_cols])

# Transform each split and wrap the result back into a DataFrame that keeps
# the column names and the row index of its source frame
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train_featured[numerical_cols]),
    columns=numerical_cols,
    index=X_train_featured.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_featured[numerical_cols]),
    columns=numerical_cols,
    index=X_test_featured.index,
)

print(f"\n{'=' * 70}")
print("               AFTER FEATURE SCALING               ")
print("=" * 70)
print("Training data shape (X_train_scaled): {}".format(X_train_scaled.shape))
print("Testing data shape (X_test_scaled):   {}".format(X_test_scaled.shape))
print("\nSample of Scaled Training Data (First 5 Rows):")
display(X_train_scaled.head())
print("\nSample of Scaled Testing Data (First 5 Rows):")
display(X_test_scaled.head())
print(f"{'=' * 70}\n")

# Ready for model training: X_train_scaled, y_train_balanced, X_test_scaled, y_test
# Note: the training target is y_train_balanced (not y_train)



             INITIAL DATASET INFORMATION             
Total rows before any cleaning: 225,745
Initial Label distribution (raw data):


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
DDoS,128027
BENIGN,97718



--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225745 entries, 0 to 225744
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             225745 non-null  int64  
 1    Flow Duration                225745 non-null  int64  
 2    Total Fwd Packets            225745 non-null  int64  
 3    Total Backward Packets       225745 non-null  int64  
 4   Total Length of Fwd Packets   225745 non-null  int64  
 5    Total Length of Bwd Packets  225745 non-null  int64  
 6    Fwd Packet Length Max        225745 non-null  int64  
 7    Fwd Packet Length Min        225745 non-null  int64  
 8    Fwd Packet Length Mean       225745 non-null  float64
 9    Fwd Packet Length Std        225745 non-null  float64
 10  Bwd Packet Length Max         225745 non-null  int64  
 11   Bwd Packet Length Min        225745 non-null  int64  
 12   Bwd Packet Length M

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,...,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0,225745.0
mean,8879.61946,16241650.0,4.874916,4.572775,939.463346,5960.477,538.535693,27.882221,164.826715,214.907242,...,3.311497,21.482753,184826.1,12934.36,208084.9,177620.1,10322140.0,3611943.0,12878130.0,7755355.0
std,19754.6474,31524370.0,15.422874,21.755356,3249.403484,39218.34,1864.128991,163.324159,504.892965,797.411073,...,12.270018,4.166799,797925.0,210273.7,900235.0,784260.2,21853030.0,12756890.0,26921260.0,19831090.0
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,71180.0,2.0,1.0,26.0,0.0,6.0,0.0,6.0,0.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,1452333.0,3.0,4.0,30.0,164.0,20.0,0.0,8.666667,5.301991,...,2.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,80.0,8805237.0,5.0,5.0,63.0,11601.0,34.0,6.0,32.0,10.263203,...,4.0,20.0,1878.0,0.0,1878.0,1862.0,8239725.0,0.0,8253838.0,7422849.0
max,65532.0,119999900.0,1932.0,2942.0,183012.0,5172346.0,11680.0,1472.0,3867.0,6692.644993,...,1931.0,52.0,100000000.0,39500000.0,100000000.0,100000000.0,120000000.0,65300000.0,120000000.0,120000000.0




--- DataFrame Columns after stripping whitespace and renaming ---
['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', '

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,128014
0,95068



First 5 rows of cleaned DataFrame:


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0




             AFTER INITIAL TRAIN-TEST SPLIT             
Training data shape (X_train): (178465, 68)
Testing data shape (X_test):   (44617, 68)
Training label shape (y_train): (178465,)
Testing label shape (y_test):   (44617,)

Training label distribution:


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,102411
0,76054



Testing label distribution:


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,25603
0,19014




--- Class Distribution Before Balancing (Training Set) ---


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,102411
0,76054



         AFTER UNDERSAMPLING (Training Set Only)         
Total rows in balanced training set: 152,108
Class balance (Training Set):


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,76054
1,76054




        SAMPLE OF BALANCED TRAINING DATAFRAME        
Total Rows       : 152,108
DDoS Attacks     : 76,054
Benign Records   : 76,054

Balanced Training DataFrame (First 5 Rows):


Unnamed: 0,Destination Port,Flow Duration,Label
0,53,108497,0
1,61761,48116,0
2,59618,65,0
3,55903,9318481,0
4,80,9402695,1



DDoS Samples (Balanced Training Set - First 5):


Unnamed: 0,Destination Port,Flow Duration,Label
4,80,9402695,1
6,80,7373820,1
8,80,2857734,1
9,80,10943778,1
13,80,1784554,1



Benign Samples (Balanced Training Set - First 5):


Unnamed: 0,Destination Port,Flow Duration,Label
0,53,108497,0
1,61761,48116,0
2,59618,65,0
3,55903,9318481,0
5,80,5860218,0




        AFTER FEATURE EXTRACTION (TRAIN & TEST)        

Sample Extracted Features (Training Set - First 5):


Unnamed: 0,pkt_length_diff,pkt_length_var_ratio,byte_ratio,duration_per_packet,avg_to_max_ratio
0,62,1.611,0.431,54248.229,0.931
1,11595,7.974,446.423,6873.704,0.143
2,0,0.0,0.0,32.5,0.0
3,0,1.0,0.2,1553077.578,1.167
4,0,1.0,3000000.0,1880535.239,1.2



Final Columns (Training Set after Feature Extraction):
['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'ECE Flag Count', 'Down/Up R

Unnamed: 0,pkt_length_diff,pkt_length_var_ratio,byte_ratio,duration_per_packet,avg_to_max_ratio
139654,0,1.0,0.2,819400.634,1.167
127815,5840,4.521,0.002,93517.633,0.249
111648,0,1.0,0.167,130544.814,1.143
76785,5840,4.521,0.002,7943.865,0.249
72328,0,1.0,3000000.0,61111.478,1.2



Final Columns (Testing Set after Feature Extraction):
['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'ECE Flag Count', 'Down/Up Ra

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,pkt_length_diff,pkt_length_var_ratio,byte_ratio,duration_per_packet,avg_to_max_ratio
0,-0.496462,-0.511479,-0.245222,-0.16853,-0.30382,-0.138477,-0.292419,0.082431,-0.269685,-0.29443,...,-0.220676,-0.469879,-0.279191,-0.475267,-0.390354,-0.788134,-0.663447,-0.042185,-0.558311,0.632065
1,2.424285,-0.513378,0.004044,-0.123929,3.008621,-0.140547,5.431752,-0.182254,3.902286,5.708969,...,-0.220676,-0.469879,-0.279191,-0.475267,-0.390354,2.285842,1.203662,-0.04217,-0.578708,-0.982426
2,2.322853,-0.51489,-0.245222,-0.16853,-0.317288,-0.141196,-0.315716,-0.182254,-0.355898,-0.29443,...,-0.220676,-0.469879,-0.279191,-0.475267,-0.390354,-0.804659,-1.136167,-0.042185,-0.581654,-1.275411
3,2.147016,-0.221781,-0.245222,0.009873,-0.315569,-0.140448,-0.312742,-0.148465,-0.344892,-0.29443,...,-0.20403,-0.04124,-0.279191,-0.126852,0.081679,-0.804659,-0.842734,-0.042185,0.087018,1.115593
4,-0.495185,-0.219132,0.004044,-0.213131,-0.308692,-0.141196,-0.312742,-0.148465,-0.344892,-0.29443,...,-0.21726,-0.036873,-0.279191,-0.123302,0.086489,-0.804659,-0.842734,0.058275,0.228007,1.183205



Sample of Scaled Testing Data (First 5 Rows):


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,pkt_length_diff,pkt_length_var_ratio,byte_ratio,duration_per_packet,avg_to_max_ratio
139654,2.095897,-0.360247,-0.245222,0.009873,-0.315569,-0.140448,-0.312742,-0.148465,-0.344892,-0.29443,...,-0.220676,-0.469879,-0.279191,-0.475267,-0.390354,-0.804659,-0.842734,-0.042185,-0.22887,1.115593
127815,-0.495185,-0.491359,-0.120589,0.009873,-0.309838,0.148192,-0.305803,-0.182254,-0.340001,-0.282545,...,-0.220676,-0.469879,-0.279191,-0.475267,-0.390354,0.751919,0.190441,-0.042185,-0.541403,-0.765248
111648,2.589378,-0.486148,-0.245222,0.054474,-0.315569,-0.140298,-0.312742,-0.148465,-0.344892,-0.29443,...,-0.220676,-0.469879,-0.279191,-0.475267,-0.390354,-0.804659,-0.842734,-0.042185,-0.525461,1.066421
76785,-0.495185,-0.512893,-0.120589,0.009873,-0.309838,0.148192,-0.305803,-0.182254,-0.340001,-0.282545,...,-0.220676,-0.469879,-0.279191,-0.475267,-0.390354,0.751919,0.190441,-0.042185,-0.578247,-0.765248
72328,-0.495185,-0.505281,0.004044,-0.213131,-0.308692,-0.141196,-0.312742,-0.148465,-0.344892,-0.29443,...,-0.220676,-0.469879,-0.279191,-0.475267,-0.390354,-0.804659,-0.842734,0.058275,-0.555356,1.183205



