In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# Sanity check: reaching this line means the imports above succeeded.
print("Project 5 environment ready")


Project 5 environment ready


In [2]:
# Synthetic login log: one [user_id, timestamp, ip, success] record per
# attempt.  Later cells treat success == 0 as a failed attempt.
data = [
    # u001: five failures, one minute apart, from a single address
    ["u001", "2025-01-01 08:01:00", "192.168.1.10", 0],
    ["u001", "2025-01-01 08:02:00", "192.168.1.10", 0],
    ["u001", "2025-01-01 08:03:00", "192.168.1.10", 0],
    ["u001", "2025-01-01 08:04:00", "192.168.1.10", 0],
    ["u001", "2025-01-01 08:05:00", "192.168.1.10", 0],

    # u002: one failure followed by a success
    ["u002", "2025-01-01 09:10:00", "10.0.0.5", 0],
    ["u002", "2025-01-01 09:11:00", "10.0.0.5", 1],

    # u999: five failures, one minute apart, starting at 01:00
    ["u999", "2025-01-01 01:00:00", "203.0.113.9", 0],
    ["u999", "2025-01-01 01:01:00", "203.0.113.9", 0],
    ["u999", "2025-01-01 01:02:00", "203.0.113.9", 0],
    ["u999", "2025-01-01 01:03:00", "203.0.113.9", 0],
    ["u999", "2025-01-01 01:04:00", "203.0.113.9", 0],
]

# Build the frame and parse the timestamp strings into datetimes in one chain.
df = pd.DataFrame(
    data, columns=["user_id", "timestamp", "ip", "success"]
).assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))

df


Unnamed: 0,user_id,timestamp,ip,success
0,u001,2025-01-01 08:01:00,192.168.1.10,0
1,u001,2025-01-01 08:02:00,192.168.1.10,0
2,u001,2025-01-01 08:03:00,192.168.1.10,0
3,u001,2025-01-01 08:04:00,192.168.1.10,0
4,u001,2025-01-01 08:05:00,192.168.1.10,0
5,u002,2025-01-01 09:10:00,10.0.0.5,0
6,u002,2025-01-01 09:11:00,10.0.0.5,1
7,u999,2025-01-01 01:00:00,203.0.113.9,0
8,u999,2025-01-01 01:01:00,203.0.113.9,0
9,u999,2025-01-01 01:02:00,203.0.113.9,0


In [3]:
# Rows for failed login attempts (success == 0).
failed = df[df["success"] == 0]

# Failed-attempt count per user.  Grouping only the failed rows would omit
# every user who never failed, shrinking the baseline and inflating the mean,
# so re-index onto the full user list and count those users as 0.
all_users = pd.Index(df["user_id"].unique(), name="user_id").sort_values()
counts = failed.groupby("user_id").size().reindex(all_users, fill_value=0)

# Standard score against the sample standard deviation (pandas default, ddof=1).
# With n users the largest attainable |z| is (n - 1) / sqrt(n) -- about 1.15
# for three users -- so the threshold of 2 needs at least six users before
# anything can be flagged.  If every count is equal the std is 0, the scores
# are NaN, and nothing is flagged.
z_scores = (counts - counts.mean()) / counts.std()
z_anomalies = z_scores[z_scores.abs() > 2]

z_anomalies


Series([], dtype: float64)

In [4]:
# Tukey fences: a count is an outlier when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile.
Q1, Q3 = counts.quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

outside_fences = counts.lt(lower_fence) | counts.gt(upper_fence)
iqr_anomalies = counts[outside_fences]

iqr_anomalies


Series([], dtype: int64)

In [5]:
def _failure_count(outcomes):
    """Return how many values in a user's ``success`` column equal 0."""
    return (outcomes == 0).sum()


def _span_minutes(stamps):
    """Return the minutes between a user's earliest and latest timestamp."""
    return (stamps.max() - stamps.min()).total_seconds() / 60


# One row of behavioural features per user.
per_user = df.groupby("user_id")
features = per_user.agg(
    failed_attempts=("success", _failure_count),
    unique_ips=("ip", "nunique"),
    active_minutes=("timestamp", _span_minutes),
)

# fit_predict labels each row 1 (inlier) or -1 (outlier); the fixed
# random_state keeps the result repeatable across runs.
model = IsolationForest(contamination=0.2, random_state=42)
features["anomaly"] = model.fit_predict(features)

features


Unnamed: 0_level_0,failed_attempts,unique_ips,active_minutes,anomaly
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u001,5,1,4.0,1
u002,1,1,1.0,-1
u999,5,1,4.0,1


In [7]:
# Users flagged by each detector, keyed by the label shown in the table.
flagged_by_method = {
    "Z-Score": z_anomalies.index.tolist(),
    "IQR": iqr_anomalies.index.tolist(),
    "Isolation Forest": features.loc[features["anomaly"] == -1].index.tolist(),
}

# One record per method so each "Flagged Users" cell holds the whole list.
comparison = pd.DataFrame(
    [
        {"Method": method, "Flagged Users": users}
        for method, users in flagged_by_method.items()
    ]
)

comparison



Unnamed: 0,Method,Flagged Users
0,Z-Score,[]
1,IQR,[]
2,Isolation Forest,[u002]


## Conclusion

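Statistical methods such as Z-score and IQR flagged no users on this dataset because of the
small sample size and lack of extreme variance: with only three users, the largest possible
|z| (using the sample standard deviation) is about 1.15, so the threshold of 2 can never be reached.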

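Isolation Forest did flag a user (`u002`) by modeling behavioral patterns rather than
relying on distribution assumptions. Note that `u002` is simply the account whose features
differ from the other two (1 failed attempt over 1 minute versus 5 over 4 minutes), while
`u001` and `u999` have identical features, so the flag means "unlike the majority", not
necessarily "malicious".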

This demonstrates why unsupervised machine learning methods are preferred in
real-world cybersecurity systems where labeled attack data is limited.
