In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
folder_path = r'C:\Users\Hussain Raza\Downloads\CIC_IDS_2017'

In [4]:
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

print("Found CSV files:", csv_files)

Found CSV files: ['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'Friday-WorkingHours-Morning.pcap_ISCX.csv', 'Monday-WorkingHours.pcap_ISCX.csv', 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'Tuesday-WorkingHours.pcap_ISCX.csv', 'Wednesday-workingHours.pcap_ISCX.csv']


In [5]:
df_list = []
for file in csv_files:
    full_path = os.path.join(folder_path, file)
    try:
        df = pd.read_csv(full_path, low_memory=False)
        df_list.append(df)
        print(f"Loaded {file} | Shape: {df.shape}")
    except Exception as e:
        print(f"Failed to read {file}: {e}")

Loaded Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | Shape: (225745, 79)
Loaded Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv | Shape: (286467, 79)
Loaded Friday-WorkingHours-Morning.pcap_ISCX.csv | Shape: (191033, 79)
Loaded Monday-WorkingHours.pcap_ISCX.csv | Shape: (529918, 79)
Loaded Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv | Shape: (288602, 79)
Loaded Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv | Shape: (170366, 79)
Loaded Tuesday-WorkingHours.pcap_ISCX.csv | Shape: (445909, 79)
Loaded Wednesday-workingHours.pcap_ISCX.csv | Shape: (692703, 79)


In [6]:
df_merged = pd.concat(df_list, ignore_index=True)
print("Final Merged Shape:", df_merged.shape)

Final Merged Shape: (2830743, 79)


In [7]:
# Clean column names
df_merged.columns = df_merged.columns.str.strip().str.replace(' ', '_').str.replace('[^A-Za-z0-9_]+', '', regex=True)

In [8]:
# Drop columns with >30% NaNs
df_merged = df_merged.loc[:, df_merged.isnull().mean() < 0.3]

In [9]:
# Drop constant columns
df_merged = df_merged.loc[:, df_merged.nunique() > 1]

In [10]:
# Drop rows with remaining NaNs
df_merged.dropna(inplace=True)

In [11]:
# Convert labels to binary
df_merged['Label'] = df_merged['Label'].apply(lambda x: 0 if str(x).upper() == 'BENIGN' else 1)

In [12]:
# Separate features and labels
X = df_merged.drop(['Label'], axis=1)
y = df_merged['Label']

In [13]:
# Keep only numeric columns to avoid dtype errors
X = X.select_dtypes(include=[np.number])

In [14]:
# Replace infinities with NaNs, then drop them
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(inplace=True)

In [None]:
# Align y with X
y = y.loc[X.index]

In [None]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

print("Final Shapes - X_train:", X_train.shape, "| X_test:", X_test.shape)

Final Shapes - X_train: (2262300, 70) | X_test: (565576, 70)


In [None]:
# Use only selected important features to make it compact
selected_features = ['ProtocolName', 'Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
                     'Fwd_Packet_Length_Mean', 'Bwd_Packet_Length_Mean', 'Flow_Bytes/s', 'Flow_Packets/s']

In [None]:
if 'Protocol' in df_merged.columns and 'ProtocolName' not in df_merged.columns:
    protocol_map = {6: 'TCP', 17: 'UDP', 1: 'ICMP'}
    df_merged['ProtocolName'] = df_merged['Protocol'].map(protocol_map).fillna('OTHER')

In [None]:
print(df_merged.columns.tolist())

['Destination_Port', 'Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets', 'Total_Length_of_Fwd_Packets', 'Total_Length_of_Bwd_Packets', 'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Min', 'Fwd_Packet_Length_Mean', 'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max', 'Bwd_Packet_Length_Min', 'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytess', 'Flow_Packetss', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_PSH_Flags', 'Fwd_URG_Flags', 'Fwd_Header_Length', 'Bwd_Header_Length', 'Fwd_Packetss', 'Bwd_Packetss', 'Min_Packet_Length', 'Max_Packet_Length', 'Packet_Length_Mean', 'Packet_Length_Std', 'Packet_Length_Variance', 'FIN_Flag_Count', 'SYN_Flag_Count', 'RST_Flag_Count', 'PSH_Flag_Count', 'ACK_Flag_Count', 'URG_Flag_Count', 'CWE_Flag_Count', 'ECE_Flag_Count', 'DownUp_Ratio', 'Average_Packet_Siz

In [None]:
selected_features = [
    'Destination_Port', 'Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
    'Fwd_Packet_Length_Mean', 'Bwd_Packet_Length_Mean',
    'Packet_Length_Mean', 'Flow_IAT_Mean', 'Fwd_IAT_Total', 'Bwd_IAT_Total',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward'
]

In [None]:
# Function to turn a row into a descriptive sentence
def row_to_text(row):
    return ", ".join([f"{col.replace('_', ' ')} is {row[col]}" for col in selected_features])

In [None]:
df_text = pd.DataFrame()
df_text['text'] = df_merged.apply(row_to_text, axis=1)
df_text['label'] = df_merged['Label']

In [None]:
print(df_text.head(2)['text'].tolist())
print(df_text['label'].value_counts())

['Destination Port is 54865.0, Flow Duration is 3.0, Total Fwd Packets is 2.0, Total Backward Packets is 0.0, Fwd Packet Length Mean is 6.0, Bwd Packet Length Mean is 0.0, Packet Length Mean is 6.0, Flow IAT Mean is 3.0, Fwd IAT Total is 3.0, Bwd IAT Total is 0.0, Init Win bytes forward is 33.0, Init Win bytes backward is -1.0', 'Destination Port is 55054.0, Flow Duration is 109.0, Total Fwd Packets is 1.0, Total Backward Packets is 1.0, Fwd Packet Length Mean is 6.0, Bwd Packet Length Mean is 6.0, Packet Length Mean is 6.0, Flow IAT Mean is 109.0, Fwd IAT Total is 0.0, Bwd IAT Total is 0.0, Init Win bytes forward is 29.0, Init Win bytes backward is 256.0']
label
0    2272688
1     556697
Name: count, dtype: int64


In [None]:
%pip install -q sentence-transformers lightgbm scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load Sentence-BERT
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

In [None]:
# Convert text to embeddings with batching for speed
X_embeddings = model.encode(
    df_text['text'].tolist(),
    batch_size=64,
    show_progress_bar=True
)

NameError: name 'model' is not defined

In [None]:
X_embeddings = joblib.load('X_embeddings.pkl')

In [None]:
# Labels
y = df_text['label']

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Train LightGBM
clf = LGBMClassifier()
clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 445358, number of negative: 1818150
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.660050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 2263508, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.196756 -> initscore=-1.406696
[LightGBM] [Info] Start training from score -1.406696


In [None]:
# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.98      0.99      0.98    454538
           1       0.95      0.92      0.93    111339

    accuracy                           0.97    565877
   macro avg       0.97      0.95      0.96    565877
weighted avg       0.97      0.97      0.97    565877



In [None]:
# Save model and data
joblib.dump(clf, 'lightgbm_llm_ids.pkl')
joblib.dump(df_text['text'].tolist(), 'text_inputs.pkl')
joblib.dump(y_test, 'y_test.pkl')
joblib.dump(y_pred, 'y_pred.pkl')


['y_pred.pkl']