In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Load data

In [2]:
monday = pd.read_csv('data/CIC_IDS_2017/TrafficLabelling/Monday-WorkingHours.pcap_ISCX.csv', sep=",", encoding='utf-8')
tuesday = pd.read_csv('data/CIC_IDS_2017/TrafficLabelling/Tuesday-WorkingHours.pcap_ISCX.csv', sep=",", encoding='utf-8')
wednesday = pd.read_csv('data/CIC_IDS_2017/TrafficLabelling/Wednesday-workingHours.pcap_ISCX.csv', sep=",", encoding='utf-8')
thursday_morning = pd.read_csv('data/CIC_IDS_2017/TrafficLabelling/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', sep=",", encoding='cp1252')
thursday_afternoon = pd.read_csv('data/CIC_IDS_2017/TrafficLabelling/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', sep=",", encoding='utf-8')
friday_ddos = pd.read_csv('data/CIC_IDS_2017/TrafficLabelling/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', sep=",", encoding='utf-8')
friday_pcap = pd.read_csv('data/CIC_IDS_2017/TrafficLabelling/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', sep=",", encoding='utf-8')
friday_morning = pd.read_csv('data/CIC_IDS_2017/TrafficLabelling/Friday-WorkingHours-Morning.pcap_ISCX.csv', sep=",", encoding='utf-8')
monday.name = 'Monday'
tuesday.name = 'Tuesday'
wednesday.name = 'Wednesday'
thursday_morning.name = 'Thursday_Morning'
thursday_afternoon.name = 'Thursday_Afternoon'
friday_ddos.name = 'Friday_DDoS'
friday_pcap.name = 'Friday_Pcap'
friday_morning.name = 'Friday_Morning'

dataframes = [monday, tuesday, wednesday, thursday_morning, thursday_afternoon, friday_ddos, friday_morning, friday_pcap]
data = pd.concat(dataframes)

for df in dataframes: del df

  thursday_morning = pd.read_csv('data/CIC_IDS_2017/TrafficLabelling/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', sep=",", encoding='cp1252')


In [3]:
data.columns

Index(['Flow ID', ' Source IP', ' Source Port', ' Destination IP',
       ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration',
       ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Pa

In [9]:
data['Class'] = np.where(data[' Label'] == 'BENIGN', 'Benign', 'Attack')

In [10]:
data['Class'].unique()

array(['Benign', 'Attack'], dtype=object)

In [13]:
attack_starts = data[data['Class'] == 'Attack'].groupby(' Label')[' Timestamp'].min()

In [14]:
attack_starts

 Label
Bot                           7/7/2017 10:04
DDoS                           7/7/2017 3:56
DoS GoldenEye                 5/7/2017 11:10
DoS Hulk                      5/7/2017 10:43
DoS Slowhttptest              5/7/2017 10:15
DoS slowloris                 5/7/2017 10:00
FTP-Patator                   4/7/2017 10:00
Heartbleed                     5/7/2017 3:12
Infiltration                   6/7/2017 2:19
PortScan                       7/7/2017 1:05
SSH-Patator                    4/7/2017 2:09
Web Attack – Brute Force      6/7/2017 10:00
Web Attack – Sql Injection    6/7/2017 10:40
Web Attack – XSS              6/7/2017 10:15
Name:  Timestamp, dtype: object

In [17]:
# Time-based cross-validation
tscv = TimeSeriesSplit(n_splits=5)
for train_idx, test_idx in tscv.split(data):
    train, test = data.iloc[train_idx], data.iloc[test_idx]

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2599455 entries, 0 to 183354
Data columns (total 86 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   Flow ID                       object 
 1    Source IP                    object 
 2    Source Port                  float64
 3    Destination IP               object 
 4    Destination Port             float64
 5    Protocol                     float64
 6    Timestamp                    object 
 7    Flow Duration                float64
 8    Total Fwd Packets            float64
 9    Total Backward Packets       float64
 10  Total Length of Fwd Packets   float64
 11   Total Length of Bwd Packets  float64
 12   Fwd Packet Length Max        float64
 13   Fwd Packet Length Min        float64
 14   Fwd Packet Length Mean       float64
 15   Fwd Packet Length Std        float64
 16  Bwd Packet Length Max         float64
 17   Bwd Packet Length Min        float64
 18   Bwd Packet Length Mean     

In [20]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 519890 entries, 183355 to 286466
Data columns (total 86 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Flow ID                       519890 non-null  object 
 1    Source IP                    519890 non-null  object 
 2    Source Port                  519890 non-null  float64
 3    Destination IP               519890 non-null  object 
 4    Destination Port             519890 non-null  float64
 5    Protocol                     519890 non-null  float64
 6    Timestamp                    519890 non-null  object 
 7    Flow Duration                519890 non-null  float64
 8    Total Fwd Packets            519890 non-null  float64
 9    Total Backward Packets       519890 non-null  float64
 10  Total Length of Fwd Packets   519890 non-null  float64
 11   Total Length of Bwd Packets  519890 non-null  float64
 12   Fwd Packet Length Max        519890 non-nul

In [21]:
def calculate_ttd(predictions, attack_intervals):
    detection_times = {}
    for attack_type, (start, end) in attack_intervals.items():
        first_detection = predictions[(predictions['timestamp'] >= start) & 
                                    (predictions['timestamp'] <= end) &
                                    (predictions['prediction'] == attack_type)]
        if not first_detection.empty:
            detection_times[attack_type] = (first_detection['timestamp'].iloc[0] - start).total_seconds()
    return detection_times

In [23]:
from sklearn.ensemble import IsolationForest
import time

model = IsolationForest(contamination=0.01)
X_train = train.drop([' Timestamp', ' Label'], axis=1)

# Training latency
start_train = time.time()
model.fit(X_train)
training_latency = time.time() - start_train

# Inference latency
test_batch = test.sample(1000)
start_infer = time.time()
preds = model.predict(test_batch.drop([' Timestamp', ' Label'], axis=1))
inference_latency = (time.time() - start_infer)/1000  # per-instance

ValueError: could not convert string to float: '192.168.10.5-8.254.250.126-49188-80-6'