In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [20]:
# Load the dataset
file_path = 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
data = pd.read_csv(file_path)

In [34]:
# Clean column names by removing extra spaces
data.columns = data.columns.str.replace(' ', '')


print(data.columns)

Index(['DestinationPort', 'FlowDuration', 'TotalFwdPackets',
       'TotalBackwardPackets', 'TotalLengthofFwdPackets',
       'TotalLengthofBwdPackets', 'FwdPacketLengthMax', 'FwdPacketLengthMin',
       'FwdPacketLengthMean', 'FwdPacketLengthStd', 'BwdPacketLengthMax',
       'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
       'FlowBytes/s', 'FlowPackets/s', 'FlowIATMean', 'FlowIATStd',
       'FlowIATMax', 'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd',
       'FwdIATMax', 'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd',
       'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags',
       'BwdURGFlags', 'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s',
       'BwdPackets/s', 'MinPacketLength', 'MaxPacketLength',
       'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
       'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
       'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
       '

In [22]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

# Display the first 10 rows of the dataset
print("First 10 rows of the dataset:")
print(data.head(10))


First 10 rows of the dataset:
   DestinationPort  FlowDuration  TotalFwdPackets  TotalBackwardPackets  \
0            54865             3                2                     0   
1            55054           109                1                     1   
2            55055            52                1                     1   
3            46236            34                1                     1   
4            54863             3                2                     0   
5            54871          1022                2                     0   
6            54925             4                2                     0   
7            54925            42                1                     1   
8             9282             4                2                     0   
9            55153             4                2                     0   

   TotalLengthofFwdPackets  TotalLengthofBwdPackets  FwdPacketLengthMax  \
0                       12                        0                  

Preprocessing steps

1. Handle Missing and Infinite Values
2. Action: Remove rows with missing values or fill them with appropriate statistics (mean, median).


In [23]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)


In [24]:
print(data.isnull().sum())


DestinationPort            0
FlowDuration               0
TotalFwdPackets            0
TotalBackwardPackets       0
TotalLengthofFwdPackets    0
                          ..
IdleMean                   0
IdleStd                    0
IdleMax                    0
IdleMin                    0
Label                      0
Length: 79, dtype: int64


2.Encode Labels
Action: Convert Label to numeric format, e.g., BENIGN = 0 and ATTACK = 1.

In [25]:
data['Label'] = data['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)


In [26]:
print(data['Label'].value_counts())


Label
1    128025
0     97686
Name: count, dtype: int64


3. Drop Irrelevant or Redundant Columns
Action: Remove columns that don’t contribute to the analysis (e.g., IDs or IP addresses).

In [27]:
irrelevant_columns = ['FlowID', 'SourceIP', 'DestinationIP']
data = data.drop(columns=irrelevant_columns, errors='ignore')


In [28]:
print(data.columns)


Index(['DestinationPort', 'FlowDuration', 'TotalFwdPackets',
       'TotalBackwardPackets', 'TotalLengthofFwdPackets',
       'TotalLengthofBwdPackets', 'FwdPacketLengthMax', 'FwdPacketLengthMin',
       'FwdPacketLengthMean', 'FwdPacketLengthStd', 'BwdPacketLengthMax',
       'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
       'FlowBytes/s', 'FlowPackets/s', 'FlowIATMean', 'FlowIATStd',
       'FlowIATMax', 'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd',
       'FwdIATMax', 'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd',
       'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags',
       'BwdURGFlags', 'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s',
       'BwdPackets/s', 'MinPacketLength', 'MaxPacketLength',
       'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
       'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
       'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
       '

4. Handle Correlated Features (Optional)
Why: Highly correlated features add redundancy and can reduce model performance.
How: Calculate correlations and remove one feature from pairs with high correlation (>0.9).

In [29]:
correlation_matrix = data.corr()
high_corr = correlation_matrix[correlation_matrix > 0.9]
print(high_corr)


                         DestinationPort  FlowDuration  TotalFwdPackets  \
DestinationPort                      1.0           NaN              NaN   
FlowDuration                         NaN      1.000000              NaN   
TotalFwdPackets                      NaN           NaN         1.000000   
TotalBackwardPackets                 NaN           NaN         0.956714   
TotalLengthofFwdPackets              NaN           NaN              NaN   
...                                  ...           ...              ...   
IdleMean                             NaN           NaN              NaN   
IdleStd                              NaN           NaN              NaN   
IdleMax                              NaN      0.919196              NaN   
IdleMin                              NaN           NaN              NaN   
Label                                NaN           NaN              NaN   

                         TotalBackwardPackets  TotalLengthofFwdPackets  \
DestinationPort          

5. Scale Features
 Why: Ensure all numerical features have the same scale for models sensitive to feature magnitudes (e.g., SVM, KNN).

In [30]:
from sklearn.preprocessing import StandardScaler
numeric_features = data.select_dtypes(include=[float, int])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_features)
data[numeric_features.columns] = scaled_features


In [31]:
print(data.describe())


       DestinationPort  FlowDuration  TotalFwdPackets  TotalBackwardPackets  \
count     2.257110e+05  2.257110e+05     2.257110e+05         225711.000000   
mean      5.036832e-18  2.014733e-17    -1.259208e-17              0.000000   
std       1.000002e+00  1.000002e+00     1.000002e+00              1.000002   
min      -4.494010e-01 -5.152595e-01    -2.512579e-01             -0.210206   
25%      -4.453507e-01 -5.129998e-01    -1.864236e-01             -0.164243   
50%      -4.453507e-01 -4.691654e-01    -1.215894e-01             -0.026356   
75%      -4.453507e-01 -2.359142e-01     8.079037e-03              0.019607   
max       2.868343e+00  3.291115e+00     1.249436e+02            135.011381   

       TotalLengthofFwdPackets  TotalLengthofBwdPackets  FwdPacketLengthMax  \
count             2.257110e+05             2.257110e+05        2.257110e+05   
mean             -2.115469e-17             7.681169e-18       -1.208840e-17   
std               1.000002e+00             1.000002

 6.Train-Test Split--?10% 90%-improor.? optmi.?

In [32]:
from sklearn.model_selection import train_test_split
X = data.drop('Label', axis=1)
y = data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [33]:
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")


Training set size: 180568, Test set size: 45143


In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 225711 entries, 0 to 225744
Data columns (total 79 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   DestinationPort          225711 non-null  float64
 1   FlowDuration             225711 non-null  float64
 2   TotalFwdPackets          225711 non-null  float64
 3   TotalBackwardPackets     225711 non-null  float64
 4   TotalLengthofFwdPackets  225711 non-null  float64
 5   TotalLengthofBwdPackets  225711 non-null  float64
 6   FwdPacketLengthMax       225711 non-null  float64
 7   FwdPacketLengthMin       225711 non-null  float64
 8   FwdPacketLengthMean      225711 non-null  float64
 9   FwdPacketLengthStd       225711 non-null  float64
 10  BwdPacketLengthMax       225711 non-null  float64
 11  BwdPacketLengthMin       225711 non-null  float64
 12  BwdPacketLengthMean      225711 non-null  float64
 13  BwdPacketLengthStd       225711 non-null  float64
 14  FlowBytes

In [36]:
data.Label.nunique()

2

In [37]:
data.Label.value_counts()

Label
 0.873512    128025
-1.144804     97686
Name: count, dtype: int64

In [38]:
# Separate features and labels
features = data.drop('Label', axis=1)
labels = data['Label']

In [39]:
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(inplace=True)

In [40]:
features = features.select_dtypes(include=[float, int])

In [41]:
len(features)

225711

In [42]:
features

Unnamed: 0,DestinationPort,FlowDuration,TotalFwdPackets,TotalBackwardPackets,TotalLengthofFwdPackets,TotalLengthofBwdPackets,FwdPacketLengthMax,FwdPacketLengthMin,FwdPacketLengthMean,FwdPacketLengthStd,...,act_data_pkt_fwd,min_seg_size_forward,ActiveMean,ActiveStd,ActiveMax,ActiveMin,IdleMean,IdleStd,IdleMax,IdleMin
0,2.328296,-0.515259,-0.186424,-0.210206,-0.285450,-0.151994,-0.285699,-0.133993,-0.314602,-0.269528,...,-0.188408,-0.355799,-0.231652,-0.061517,-0.231164,-0.226499,-0.472388,-0.28316,-0.478408,-0.391105
1,2.337865,-0.515256,-0.251258,-0.164243,-0.287296,-0.151841,-0.285699,-0.133993,-0.314602,-0.269528,...,-0.269902,-0.355799,-0.231652,-0.061517,-0.231164,-0.226499,-0.472388,-0.28316,-0.478408,-0.391105
2,2.337915,-0.515258,-0.251258,-0.164243,-0.287296,-0.151841,-0.285699,-0.133993,-0.314602,-0.269528,...,-0.269902,-0.355799,-0.231652,-0.061517,-0.231164,-0.226499,-0.472388,-0.28316,-0.478408,-0.391105
3,1.891428,-0.515258,-0.251258,-0.164243,-0.287296,-0.151841,-0.285699,-0.133993,-0.314602,-0.269528,...,-0.269902,-0.355799,-0.231652,-0.061517,-0.231164,-0.226499,-0.472388,-0.28316,-0.478408,-0.391105
4,2.328195,-0.515259,-0.186424,-0.210206,-0.285450,-0.151994,-0.285699,-0.133993,-0.314602,-0.269528,...,-0.188408,-0.355799,-0.231652,-0.061517,-0.231164,-0.226499,-0.472388,-0.28316,-0.478408,-0.391105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225740,2.657833,-0.515258,-0.251258,-0.164243,-0.287296,-0.151841,-0.285699,-0.133993,-0.314602,-0.269528,...,-0.269902,-0.355799,-0.231652,-0.061517,-0.231164,-0.226499,-0.472388,-0.28316,-0.478408,-0.391105
225741,2.658035,-0.515257,-0.251258,-0.164243,-0.287296,-0.151841,-0.285699,-0.133993,-0.314602,-0.269528,...,-0.269902,-0.355799,-0.231652,-0.061517,-0.231164,-0.226499,-0.472388,-0.28316,-0.478408,-0.391105
225742,2.657883,-0.515257,-0.251258,-0.164243,-0.287296,-0.151841,-0.285699,-0.133993,-0.314602,-0.269528,...,-0.269902,-0.355799,-0.231652,-0.061517,-0.231164,-0.226499,-0.472388,-0.28316,-0.478408,-0.391105
225743,2.655251,-0.515258,-0.186424,-0.210206,-0.285450,-0.151994,-0.285699,-0.133993,-0.314602,-0.269528,...,-0.188408,-0.355799,-0.231652,-0.061517,-0.231164,-0.226499,-0.472388,-0.28316,-0.478408,-0.391105


In [43]:
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [44]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

In [45]:
round(data.shape[0]*80/100)

180569

In [46]:
missing_labels = np.isnan(y_encoded).sum()
print(f"Number of missing values in y_encoded: {missing_labels}")

Number of missing values in y_encoded: 0


In [47]:
train_size = round(len(y_encoded) * 0.8)

X_train = features_scaled[:train_size]
X_test = features_scaled[train_size:]
y_train = y_encoded[:train_size]
y_test = y_encoded[train_size:]

print(f"Length of X_train: {len(X_train)}")
print(f"Length of y_train: {len(y_train)}")
print(f"Length of X_test: {len(X_test)}")
print(f"Length of y_test: {len(y_test)}")

min_len = min(len(X_test), len(y_test))
X_test = X_test[:min_len]
y_test = y_test[:min_len]

print(f"Final Length of X_test: {len(X_test)}")
print(f"Final Length of y_test: {len(y_test)}")

Length of X_train: 180569
Length of y_train: 180569
Length of X_test: 45142
Length of y_test: 45142
Final Length of X_test: 45142
Final Length of y_test: 45142


In [48]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report

ocsvm = OneClassSVM(kernel='rbf', nu=0.01, gamma='auto')
ocsvm.fit(X_train)

y_pred = ocsvm.predict(X_test)



print("Number of anomalies detected:", (y_pred == -1).sum())

Number of anomalies detected: 1477


In [49]:
from sklearn.metrics import accuracy_score

# Convert One-Class SVM predictions to 0 and 1: -1 for outliers and 1 for inliers
svm_predict = np.where(y_pred == -1, 1, 0)

# Convert true labels to 0 and 1 as well
y_test_mapped = np.where(y_test == 'Bot', 1, 0)

accuracy = accuracy_score(y_test_mapped, svm_predict)
print("Accuracy in separating Outliers:", accuracy)

Accuracy in separating Outliers: 0.9672810243232467


In [50]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

iso_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
iso_forest.fit(X_train)

y_pred = iso_forest.predict(X_test)

print("Number of anomalies detected:", (y_pred == -1).sum())

iso_forest_predict = np.where(y_pred == -1, 1, 0)

accuracy = accuracy_score(y_test, iso_forest_predict)
print("Accuracy in separating Outliers:", accuracy)

unique_values, counts = np.unique(y_pred, return_counts=True)

for value, count in zip(unique_values, counts):
    print(f'Value: {value}, Count: {count}')

Number of anomalies detected: 712
Accuracy in separating Outliers: 0.732466439236188
Value: -1, Count: 712
Value: 1, Count: 44430
