<a href="https://colab.research.google.com/github/kareemullah123456789/cybersecurity_ML/blob/main/brute_force_unsupervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Unsupervised Brute-Force Detection from Logs (IsolationForest & DBSCAN)

This notebook treats attack discovery as **anomaly detection**—no labels required.
We extract per-IP features in short time windows and flag extremes.


In [1]:
# Mount Google Drive inside the Colab runtime; the log files used below are
# read from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
ls /content/drive/MyDrive/cybersecurity_data/

cyberfeddefender_dataset.csv  KDDTest+.txt              KDDTrain+.txt
KDDTest1.jpg                  KDDTrain1.jpg             synthetic_access.log
KDDTest-21.arff               KDDTrain+_20Percent.arff  unsup_access.log
KDDTest-21.txt                KDDTrain+_20Percent.txt   unsup_auth.log
KDDTest+.arff                 KDDTrain+.arff


In [4]:

from pathlib import Path
import re, pandas as pd, numpy as np, datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN

# Single place to point the notebook at a different dataset location.
DATA_DIR = Path("/content/drive/MyDrive/cybersecurity_data")
ACCESS_LOG = DATA_DIR / "unsup_access.log"  # input for parse_access
AUTH_LOG = DATA_DIR / "unsup_auth.log"      # input for parse_auth

print("Using:")
print(ACCESS_LOG)
print(AUTH_LOG)


Using:
/content/drive/MyDrive/cybersecurity_data/unsup_access.log
/content/drive/MyDrive/cybersecurity_data/unsup_auth.log


In [5]:

# --- Parsers (Apache combined + sshd auth.log minimal) ---
import datetime as dt

# One access-log line: ip, two ignored fields, [timestamp], "METHOD path proto",
# 3-digit status, size, then two quoted fields (captured but not used).
access_re = re.compile(r'(?P<ip>\S+) \S+ \S+ \[(?P<time>[^\]]+)\] "(?P<method>\S+) (?P<path>\S+) \S+" (?P<status>\d{3}) (?P<size>\S+) "([^"]*)" "([^"]*)"')

def parse_access(line):
    """Parse one access-log line into a flat record.

    Parameters
    ----------
    line : str
        A raw line from the access log.

    Returns
    -------
    dict or None
        ``dict(ts, ip, method, path, status)`` where ``ts`` is a naive
        ``datetime`` expressed in UTC and ``status`` is an ``int``.
        ``None`` when the line does not match ``access_re`` or its timestamp
        cannot be parsed, so one corrupt line cannot abort a whole load.
    """
    m = access_re.search(line)
    if m is None:
        return None
    try:
        ts = dt.datetime.strptime(m["time"], "%d/%b/%Y:%H:%M:%S %z")
    except ValueError:
        # The regex accepts anything between the brackets; treat an
        # unparseable timestamp like any other unparseable line.
        return None
    # Normalise to UTC, then drop tzinfo so timestamps are naive like the
    # ones produced for the auth log.
    ts = ts.astimezone(dt.timezone.utc).replace(tzinfo=None)
    return dict(ts=ts, ip=m["ip"], method=m["method"], path=m["path"], status=int(m["status"]))

# Password-auth lines; only dotted-quad (IPv4-style) source addresses are matched.
auth_fail = re.compile(r"Failed password .* from (?P<ip>\d+\.\d+\.\d+\.\d+)")
auth_ok   = re.compile(r"Accepted password .* from (?P<ip>\d+\.\d+\.\d+\.\d+)")

def parse_auth(line, year=2025):
    """Parse one auth-log line into a failed/accepted password event.

    Parameters
    ----------
    line : str
        A raw log line whose first 15 characters are a ``"%b %d %H:%M:%S"``
        timestamp (no year).
    year : int, default 2025
        Year to attach to the timestamp, since the log line carries none.
        To use another year with ``load_log``, wrap this function, e.g.
        ``lambda ln: parse_auth(ln, year=2024)``.

    Returns
    -------
    dict or None
        ``dict(ts, ip, auth_fail, auth_ok)`` with exactly one of the two flags
        set to 1, or ``None`` when the timestamp is unparseable or the line is
        not a failed/accepted password event.
    """
    prefix = line[:15]
    try:
        # Parse the year together with the rest: parsing without a year
        # defaults to 1900, which rejects "Feb 29" even in leap years.
        ts = dt.datetime.strptime(f"{prefix} {year}", "%b %d %H:%M:%S %Y")
    except ValueError:
        return None
    if "Failed password" in line:
        m = auth_fail.search(line)
        if m:
            return dict(ts=ts, ip=m.group("ip"), auth_fail=1, auth_ok=0)
    elif "Accepted password" in line:
        m = auth_ok.search(line)
        if m:
            return dict(ts=ts, ip=m.group("ip"), auth_fail=0, auth_ok=1)
    return None

def load_log(path, parser):
    """Run ``parser`` over every line of a text file and tabulate the hits.

    Parameters
    ----------
    path : str or path-like
        File to read; undecodable bytes are ignored.
    parser : callable
        Called with each raw line. Falsy results (e.g. ``None``) are dropped;
        truthy results become rows.

    Returns
    -------
    pandas.DataFrame
        One row per kept parser result, in file order.
    """
    with open(path, "r", errors="ignore") as handle:
        # filter(None, ...) keeps exactly the truthy parser results.
        records = list(filter(None, map(parser, handle)))
    return pd.DataFrame(records)

# Parse each raw log into a DataFrame of structured events.
dfa = load_log(path=ACCESS_LOG, parser=parse_access)
dfu = load_log(path=AUTH_LOG, parser=parse_auth)

# Preview: first rows of each frame followed by their row counts.
preview = (dfa.head(), dfu.head(), dfa.shape[0], dfu.shape[0])
preview


(                   ts            ip method       path  status
 0 2025-08-29 03:00:20  203.0.113.22    GET          /     200
 1 2025-08-29 03:00:20  203.0.113.12    GET     /about     200
 2 2025-08-29 03:00:20  203.0.113.13    GET  /products     200
 3 2025-08-29 03:00:20  203.0.113.28    GET          /     200
 4 2025-08-29 03:00:20  203.0.113.26    GET      /home     200,
                    ts            ip  auth_fail  auth_ok
 0 2025-08-29 03:00:35  203.0.113.25          0        1
 1 2025-08-29 03:06:29  203.0.113.27          0        1
 2 2025-08-29 03:12:00  203.0.113.11          0        1
 3 2025-08-29 03:18:27  203.0.113.14          0        1
 4 2025-08-29 03:24:59  203.0.113.10          0        1,
 4130,
 208)

In [7]:
# --- Feature engineering: 60-second windows per IP ---
WINDOW = "60s"  # pandas offset alias; lowercase because uppercase "60S" is deprecated

# Feature columns produced by agg_access, in order, after the (window, ip) keys.
_ACCESS_FEATURE_COLS = [
    "reqs", "http_401", "post_reqs", "login_reqs", "path_diversity",
    "http_401_ratio", "post_ratio", "login_ratio",
]

def agg_access(df, window=None):
    """Aggregate access-log events into per-(time window, ip) features.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with columns ``ts``, ``ip``, ``method``, ``path``, ``status``.
    window : str, optional
        Pandas offset alias for the bucket size. Defaults to the module-level
        ``WINDOW``.

    Returns
    -------
    pandas.DataFrame
        Columns ``window``, ``ip``, then per-bucket counts (``reqs``,
        ``http_401``, ``post_reqs``, ``login_reqs``), ``path_diversity``
        (distinct paths) and each count's share of ``reqs``. Empty input
        returns an empty frame with this same schema, so a later merge on
        ``["window", "ip"]`` still works.
    """
    if df.empty:
        return pd.DataFrame({
            "window": pd.Series(dtype="datetime64[ns]"),
            "ip": pd.Series(dtype="object"),
            **{col: pd.Series(dtype="float64") for col in _ACCESS_FEATURE_COLS},
        })
    freq = WINDOW if window is None else window
    # Precompute 0/1 flags so the group step is a plain vectorised sum
    # instead of a Python lambda per group.
    flagged = df.assign(
        ts=pd.to_datetime(df["ts"]),
        _is_401=df["status"].eq(401).astype(int),
        _is_post=df["method"].eq("POST").astype(int),
        _is_login=df["path"].eq("/login").astype(int),
    )
    out = (
        flagged.set_index("ts")
        .groupby([pd.Grouper(freq=freq), "ip"])
        .agg(
            reqs=("path", "count"),
            http_401=("_is_401", "sum"),
            post_reqs=("_is_post", "sum"),
            login_reqs=("_is_login", "sum"),
            path_diversity=("path", "nunique"),
        )
        .reset_index()
        .rename(columns={"ts": "window"})
    )
    # Guard the denominator so a zero request count cannot divide by zero.
    denom = out["reqs"].clip(lower=1)
    out["http_401_ratio"] = out["http_401"] / denom
    out["post_ratio"]     = out["post_reqs"] / denom
    out["login_ratio"]    = out["login_reqs"] / denom
    return out

def agg_auth(df, window=None):
    """Aggregate auth-log events into per-(time window, ip) features.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with columns ``ts``, ``ip``, ``auth_fail``, ``auth_ok``.
    window : str, optional
        Pandas offset alias for the bucket size. Defaults to the module-level
        ``WINDOW``.

    Returns
    -------
    pandas.DataFrame
        Columns ``window``, ``ip``, ``auth_failed``, ``auth_ok`` and
        ``auth_fail_ratio`` (failed / (failed + ok), 0.0 when both are zero).
        Empty input returns an empty frame with this same schema, so a later
        merge on ``["window", "ip"]`` still works.
    """
    if df.empty:
        return pd.DataFrame({
            "window": pd.Series(dtype="datetime64[ns]"),
            "ip": pd.Series(dtype="object"),
            "auth_failed": pd.Series(dtype="float64"),
            "auth_ok": pd.Series(dtype="float64"),
            "auth_fail_ratio": pd.Series(dtype="float64"),
        })
    freq = WINDOW if window is None else window
    out = (
        df.assign(ts=pd.to_datetime(df["ts"]))
        .set_index("ts")
        .groupby([pd.Grouper(freq=freq), "ip"])
        .agg(
            auth_failed=("auth_fail", "sum"),
            auth_ok=("auth_ok", "sum"),
        )
        .reset_index()
        .rename(columns={"ts": "window"})
    )
    # A zero attempt total becomes NaN so the division yields NaN, which is
    # then reported as a 0.0 failure ratio.
    attempts = (out["auth_failed"] + out["auth_ok"]).replace(0, np.nan)
    out["auth_fail_ratio"] = (out["auth_failed"] / attempts).fillna(0.0)
    return out

# Per-(window, ip) feature tables, one per log source.
fa = agg_access(dfa)
fu = agg_auth(dfu)

# Outer join keeps (window, ip) pairs seen in only one of the logs; the
# features missing from the other side are filled with 0.
merged = pd.merge(fa, fu, how="outer", on=["window", "ip"])
features = merged.fillna(0).sort_values(["window", "ip"])
(features.head(), features.shape)


(               window            ip  reqs  http_401  post_reqs  login_reqs  \
 0 2025-08-29 03:00:00  203.0.113.11   1.0       0.0        0.0         0.0   
 1 2025-08-29 03:00:00  203.0.113.12   1.0       0.0        0.0         0.0   
 2 2025-08-29 03:00:00  203.0.113.13   1.0       0.0        0.0         0.0   
 3 2025-08-29 03:00:00  203.0.113.22   1.0       0.0        0.0         0.0   
 4 2025-08-29 03:00:00  203.0.113.23   1.0       0.0        0.0         0.0   
 
    path_diversity  http_401_ratio  post_ratio  login_ratio  auth_failed  \
 0             1.0             0.0         0.0          0.0          0.0   
 1             1.0             0.0         0.0          0.0          0.0   
 2             1.0             0.0         0.0          0.0          0.0   
 3             1.0             0.0         0.0          0.0          0.0   
 4             1.0             0.0         0.0          0.0          0.0   
 
    auth_ok  auth_fail_ratio  
 0      0.0              0.0  
 1  

In [8]:

# --- IsolationForest ---
# Expected fraction of anomalous (window, ip) rows. Used both as the model's
# `contamination` and to derive the score cut-off, so the two cannot drift apart.
CONTAMINATION = 0.05

X = features.drop(columns=["window","ip"])
# with_mean=False: scale each feature without centering it.
scaler = StandardScaler(with_mean=False)
Xs = scaler.fit_transform(X)

iso = IsolationForest(random_state=0, contamination=CONTAMINATION)
iso.fit(Xs)
anom_score = -iso.decision_function(Xs)  # higher = more anomalous
features_iso = features.copy()
features_iso["anom_score"] = anom_score
# Flag the top CONTAMINATION share of scores; ties at the threshold are included.
thr = np.quantile(anom_score, 1 - CONTAMINATION)
features_iso["anomaly"] = (features_iso["anom_score"] >= thr).astype(int)

features_iso.sort_values("anom_score", ascending=False).head(10)


Unnamed: 0,window,ip,reqs,http_401,post_reqs,login_reqs,path_diversity,http_401_ratio,post_ratio,login_ratio,auth_failed,auth_ok,auth_fail_ratio,anom_score,anomaly
296,2025-08-29 03:35:00,198.51.100.61,60.0,53.0,60.0,60.0,1.0,0.883333,1.0,1.0,1.0,0.0,1.0,0.086343,1
227,2025-08-29 03:30:00,198.51.100.64,60.0,60.0,60.0,60.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.079769,1
139,2025-08-29 03:18:00,203.0.113.26,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077586,1
235,2025-08-29 03:30:00,203.0.113.25,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077586,1
457,2025-08-29 03:54:00,203.0.113.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.074686,1
46,2025-08-29 03:06:00,203.0.113.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.074686,1
5,2025-08-29 03:00:00,203.0.113.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.074686,1
88,2025-08-29 03:12:00,203.0.113.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.074686,1
176,2025-08-29 03:24:00,203.0.113.10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.074686,1
135,2025-08-29 03:18:00,203.0.113.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.074686,1


In [9]:

# --- DBSCAN on scaled features (density-based anomalies) ---
# Rows that end up with label -1 are treated as outliers.
db = DBSCAN(eps=1.5, min_samples=5)
db.fit(Xs)
cluster_ids = db.labels_
features_db = features.assign(
    dbscan_label=cluster_ids,
    dbscan_outlier=(cluster_ids == -1).astype(int),
)

features_db["dbscan_label"].value_counts()


Unnamed: 0_level_0,count
dbscan_label,Unnamed: 1_level_1
0,351
2,62
4,59
3,9
1,6
-1,3



### Interpreting results (no labels)
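- Sort by **IsolationForest `anom_score`** (higher = more anomalous) and list the IPs behind the top-scoring windows; windows that DBSCAN labels `-1` are a second set of outlier candidates.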
- Cross-check those IPs in raw logs to confirm behavior: bursts of `/login` + many `401` + SSH failures.
- Tune `contamination` (expected anomaly fraction) and window size.
- For production, alert on an IP if it trips anomalies across multiple consecutive windows.
