Isolation Forest

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the dataset
file_path = '/content/drive/MyDrive/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv'
data = pd.read_csv(file_path)


In [4]:
# Clean column names by removing extra spaces
data.columns = data.columns.str.replace(' ', '')
print(data.columns)

Index(['DestinationPort', 'FlowDuration', 'TotalFwdPackets',
       'TotalBackwardPackets', 'TotalLengthofFwdPackets',
       'TotalLengthofBwdPackets', 'FwdPacketLengthMax', 'FwdPacketLengthMin',
       'FwdPacketLengthMean', 'FwdPacketLengthStd', 'BwdPacketLengthMax',
       'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
       'FlowBytes/s', 'FlowPackets/s', 'FlowIATMean', 'FlowIATStd',
       'FlowIATMax', 'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd',
       'FwdIATMax', 'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd',
       'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags',
       'BwdURGFlags', 'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s',
       'BwdPackets/s', 'MinPacketLength', 'MaxPacketLength',
       'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
       'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
       'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
       '

In [5]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

# Display the first 10 rows of the dataset
print("First 10 rows of the dataset:")
print(data.head(10))


First 10 rows of the dataset:
   DestinationPort  FlowDuration  TotalFwdPackets  TotalBackwardPackets  \
0               22       1266342               41                    44   
1               22       1319353               41                    44   
2               22           160                1                     1   
3               22       1303488               41                    42   
4            35396            77                1                     2   
5               22           244                1                     1   
6               22       1307239               41                    40   
7            60058            82                1                     2   
8               22           171                1                     1   
9               22           210                1                     1   

   TotalLengthofFwdPackets  TotalLengthofBwdPackets  FwdPacketLengthMax  \
0                     2664                     6954                 4

In [6]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

In [7]:
print(data.isnull().sum())

DestinationPort            0
FlowDuration               0
TotalFwdPackets            0
TotalBackwardPackets       0
TotalLengthofFwdPackets    0
                          ..
IdleMean                   0
IdleStd                    0
IdleMax                    0
IdleMin                    0
Label                      0
Length: 79, dtype: int64


In [8]:
data['Label'] = data['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)


In [9]:
print(data['Label'].value_counts())


Label
1    158804
0    127292
Name: count, dtype: int64


In [10]:
irrelevant_columns = ['FlowID', 'SourceIP', 'DestinationIP']
data = data.drop(columns=irrelevant_columns, errors='ignore')
print(data.columns)

Index(['DestinationPort', 'FlowDuration', 'TotalFwdPackets',
       'TotalBackwardPackets', 'TotalLengthofFwdPackets',
       'TotalLengthofBwdPackets', 'FwdPacketLengthMax', 'FwdPacketLengthMin',
       'FwdPacketLengthMean', 'FwdPacketLengthStd', 'BwdPacketLengthMax',
       'BwdPacketLengthMin', 'BwdPacketLengthMean', 'BwdPacketLengthStd',
       'FlowBytes/s', 'FlowPackets/s', 'FlowIATMean', 'FlowIATStd',
       'FlowIATMax', 'FlowIATMin', 'FwdIATTotal', 'FwdIATMean', 'FwdIATStd',
       'FwdIATMax', 'FwdIATMin', 'BwdIATTotal', 'BwdIATMean', 'BwdIATStd',
       'BwdIATMax', 'BwdIATMin', 'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags',
       'BwdURGFlags', 'FwdHeaderLength', 'BwdHeaderLength', 'FwdPackets/s',
       'BwdPackets/s', 'MinPacketLength', 'MaxPacketLength',
       'PacketLengthMean', 'PacketLengthStd', 'PacketLengthVariance',
       'FINFlagCount', 'SYNFlagCount', 'RSTFlagCount', 'PSHFlagCount',
       'ACKFlagCount', 'URGFlagCount', 'CWEFlagCount', 'ECEFlagCount',
       '

In [11]:
correlation_matrix = data.corr()
high_corr = correlation_matrix[correlation_matrix > 0.9]
print(high_corr)

                         DestinationPort  FlowDuration  TotalFwdPackets  \
DestinationPort                      1.0           NaN              NaN   
FlowDuration                         NaN           1.0              NaN   
TotalFwdPackets                      NaN           NaN         1.000000   
TotalBackwardPackets                 NaN           NaN         0.962078   
TotalLengthofFwdPackets              NaN           NaN              NaN   
...                                  ...           ...              ...   
IdleMean                             NaN           NaN              NaN   
IdleStd                              NaN           NaN              NaN   
IdleMax                              NaN           NaN              NaN   
IdleMin                              NaN           NaN              NaN   
Label                                NaN           NaN              NaN   

                         TotalBackwardPackets  TotalLengthofFwdPackets  \
DestinationPort          

In [12]:
from sklearn.preprocessing import StandardScaler
numeric_features = data.select_dtypes(include=[float, int])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_features)
data[numeric_features.columns] = scaled_features
print(data.describe())

       DestinationPort  FlowDuration  TotalFwdPackets  TotalBackwardPackets  \
count     2.860960e+05  2.860960e+05     2.860960e+05          2.860960e+05   
mean      5.960596e-18 -1.589492e-18     3.655832e-17          7.947461e-18   
std       1.000002e+00  1.000002e+00     1.000002e+00          1.000002e+00   
min      -5.223313e-01 -2.455363e-01    -1.268108e-01         -1.244904e-01   
25%      -5.171183e-01 -2.455337e-01    -1.268108e-01         -8.916347e-02   
50%      -4.501311e-01 -2.455327e-01    -1.268108e-01         -8.916347e-02   
75%      -7.523445e-02 -2.444498e-01    -7.560110e-02         -5.383656e-02   
max       3.738585e+00  5.224682e+00     1.595449e+02          1.282888e+02   

       TotalLengthofFwdPackets  TotalLengthofBwdPackets  FwdPacketLengthMax  \
count             2.860960e+05             2.860960e+05       286096.000000   
mean              7.947461e-18            -7.947461e-18            0.000000   
std               1.000002e+00             1.000002

In [13]:
from sklearn.model_selection import train_test_split
X = data.drop('Label', axis=1)
y = data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")


Training set size: 228876, Test set size: 57220


In [14]:
# Separate features and labels
features = data.drop('Label', axis=1)
labels = data['Label']
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(inplace=True)
features = features.select_dtypes(include=[float, int])
len(features)

286096

In [15]:
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [16]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

In [17]:
round(data.shape[0]*80/100)

228877

In [18]:
missing_labels = np.isnan(y_encoded).sum()
print(f"Number of missing values in y_encoded: {missing_labels}")

Number of missing values in y_encoded: 0


In [19]:
train_size = round(len(y_encoded) * 0.8)

X_train = features_scaled[:train_size]
X_test = features_scaled[train_size:]
y_train = y_encoded[:train_size]
y_test = y_encoded[train_size:]

print(f"Length of X_train: {len(X_train)}")
print(f"Length of y_train: {len(y_train)}")
print(f"Length of X_test: {len(X_test)}")
print(f"Length of y_test: {len(y_test)}")

min_len = min(len(X_test), len(y_test))
X_test = X_test[:min_len]
y_test = y_test[:min_len]

print(f"Final Length of X_test: {len(X_test)}")
print(f"Final Length of y_test: {len(y_test)}")

Length of X_train: 228877
Length of y_train: 228877
Length of X_test: 57219
Length of y_test: 57219
Final Length of X_test: 57219
Final Length of y_test: 57219


In [21]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Train the Isolation Forest
iso_forest = IsolationForest(n_estimators=300, contamination=0.01, random_state=42)
iso_forest.fit(X_train)

# Predict on the test set
y_pred = iso_forest.predict(X_test)

# Calculate and display the number of anomalies detected
print("Number of anomalies detected:", (y_pred == -1).sum())

# Convert Isolation Forest output (-1 for anomalies, 1 for inliers) to binary classification (1 for anomalies, 0 for normal)
iso_forest_predict = np.where(y_pred == -1, 1, 0)

# Calculate accuracy
accuracy = accuracy_score(y_test, iso_forest_predict)
print("Accuracy in separating Outliers:", accuracy)

# Generate and display the classification report
print("\nClassification Report:")
print(classification_report(y_test, iso_forest_predict))

# Display unique values and their counts
unique_values, counts = np.unique(y_pred, return_counts=True)
for value, count in zip(unique_values, counts):
    print(f'Value: {value}, Count: {count}')


Number of anomalies detected: 985
Accuracy in separating Outliers: 0.6035407819081074

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.97      0.75     35519
           1       0.00      0.00      0.00     21700

    accuracy                           0.60     57219
   macro avg       0.31      0.49      0.38     57219
weighted avg       0.38      0.60      0.47     57219

Value: -1, Count: 985
Value: 1, Count: 56234
