In [1]:
# =========================================================
# 1. INSTALL DEPENDENCIES (Run this only once if needed)
# =========================================================
# !pip install pandas scikit-learn numpy

# =========================================================
# 2. IMPORT LIBRARIES
# =========================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# =========================================================
# 3. LOAD THE DATASET
# =========================================================
# IMPORTANT:
# Put the CSV in the SAME folder as the notebook
# Then set the filename here:
# Path of the log CSV, resolved relative to the notebook's working directory.
file_path = "cybersecurity_threat_detection_logs.csv"

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(file_path, low_memory=False)
print("Dataset loaded!")
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

# =========================================================
# 4. CLEAN COLUMN NAMES
# =========================================================
# Normalise the headers to snake_case: trim surrounding whitespace, lower-case,
# and turn inner spaces into underscores, so that later lookups such as
# df["action"] do not depend on the CSV's exact header formatting.
# regex=False makes the space replacement a literal substitution regardless of
# the pandas version's default for that parameter.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)

# =========================================================
# 5. DROP ROWS WHERE TARGET (action) IS EMPTY
# =========================================================
# Keep only the rows whose "action" value is present; rows without it are
# removed, mirroring a dropna restricted to that single column.
has_target = df["action"].notna()
df = df.loc[has_target]

# =========================================================
# 6. OPTIONAL: Limit rows for speed (remove later)
# =========================================================
# Maximum number of rows to keep (taken from the top of the frame, in file
# order). Set MAX_ROWS to None to use the full dataset.
MAX_ROWS = 130000

if MAX_ROWS is not None:
    df = df.head(MAX_ROWS)

print(f"Using {len(df)} rows")

# =========================================================
# 7. SPLIT FEATURES & LABEL
# =========================================================
# The "action" column is the prediction target; every remaining column is
# used as a model input.
y = df["action"]
X = df.drop(columns=["action"])

# =========================================================
# 8. TRAIN/TEST SPLIT
# =========================================================
# Split BEFORE encoding so the category vocabularies below are learned from
# the training rows alone and the held-out rows stay genuinely unseen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Explicit copies: both partitions are modified column by column below.
X_train = X_train.copy()
X_test = X_test.copy()

# =========================================================
# 9. ENCODE ALL TEXT COLUMNS (VOCABULARY FROM TRAINING ROWS ONLY)
# =========================================================
text_columns = X.select_dtypes(include=["object"]).columns

for col in text_columns:
    # Unique training values of this column (as strings); pandas sorts the
    # inferred categories when they are sortable, so each value's integer
    # code is its position in that ordered vocabulary.
    vocabulary = pd.Categorical(X_train[col].astype(str)).categories

    # Apply the same mapping everywhere. A value that never occurred in the
    # training rows is outside the vocabulary and receives the code -1.
    # X itself is encoded too, so it stays numeric and consistent with what
    # the model is trained on.
    for frame in (X, X_train, X_test):
        frame[col] = pd.Categorical(
            frame[col].astype(str), categories=vocabulary
        ).codes

# =========================================================
# 10. TRAIN A RANDOM FOREST MODEL
# =========================================================
# Hyper-parameters of the forest, gathered in one place so they are easy to
# tune. random_state is fixed so repeated runs build the same forest.
forest_settings = {
    "n_estimators": 150,
    "max_depth": 25,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestClassifier(**forest_settings)

print("\nTraining model…")
model.fit(X_train, y_train)

# =========================================================
# 11. EVALUATE MODEL
# =========================================================
# Predict on the held-out partition, then report overall accuracy followed
# by the per-class classification report.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", test_accuracy)

print("\nClassification Report:\n")
per_class_report = classification_report(y_test, y_pred)
print(per_class_report)


Dataset loaded!
             timestamp      source_ip        dest_ip protocol   action  \
0  2024-05-01T00:00:00  192.168.1.125  192.168.1.124      TCP  blocked   
1  2024-07-18T00:00:00  192.168.1.201  192.168.1.201     ICMP  blocked   
2  2024-04-07T00:00:00  192.168.1.248   192.168.1.15     HTTP  allowed   
3  2024-10-26T00:00:00  192.168.1.236  192.168.1.219     HTTP  allowed   
4  2024-10-31T00:00:00  192.168.1.221   192.168.1.61     ICMP  allowed   

  threat_label     log_type  bytes_transferred  \
0       benign     firewall              10889   
1       benign  application              36522   
2       benign  application              20652   
3       benign  application               5350   
4       benign  application              40691   

                                          user_agent request_path  
0                              Nmap Scripting Engine            /  
1                              Nmap Scripting Engine            /  
2  Mozilla/5.0 (Windows NT 10.0; W