In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# Reaching this line means the imports above succeeded; print a marker so the
# cell output confirms the environment is usable.
print("Project 4 environment ready")


Project 4 environment ready


In [4]:
# Synthetic login events used as the notebook's toy dataset.
# Each row is [user_id, timestamp, ip, success]; success is 1 for a
# successful login and 0 for a failed one.
data = [
    # u001: five failed attempts, one per minute, all from the same IP
    ["u001", "2025-01-01 08:01:00", "192.168.1.10", 0],
    ["u001", "2025-01-01 08:02:00", "192.168.1.10", 0],
    ["u001", "2025-01-01 08:03:00", "192.168.1.10", 0],
    ["u001", "2025-01-01 08:04:00", "192.168.1.10", 0],
    ["u001", "2025-01-01 08:05:00", "192.168.1.10", 0],

    # u002: one failed attempt followed by a successful login
    ["u002", "2025-01-01 09:10:00", "10.0.0.5", 0],
    ["u002", "2025-01-01 09:11:00", "10.0.0.5", 1],

    # u999: five failed attempts between 01:00 and 01:04 from one IP
    ["u999", "2025-01-01 01:00:00", "203.0.113.9", 0],
    ["u999", "2025-01-01 01:01:00", "203.0.113.9", 0],
    ["u999", "2025-01-01 01:02:00", "203.0.113.9", 0],
    ["u999", "2025-01-01 01:03:00", "203.0.113.9", 0],
    ["u999", "2025-01-01 01:04:00", "203.0.113.9", 0],
]

login_columns = ["user_id", "timestamp", "ip", "success"]

# Build the frame, then parse the timestamp strings into datetimes so later
# cells can do time arithmetic on them.
df = pd.DataFrame(data, columns=login_columns)
df = df.assign(timestamp=pd.to_datetime(df["timestamp"]))
df


Unnamed: 0,user_id,timestamp,ip,success
0,u001,2025-01-01 08:01:00,192.168.1.10,0
1,u001,2025-01-01 08:02:00,192.168.1.10,0
2,u001,2025-01-01 08:03:00,192.168.1.10,0
3,u001,2025-01-01 08:04:00,192.168.1.10,0
4,u001,2025-01-01 08:05:00,192.168.1.10,0
5,u002,2025-01-01 09:10:00,10.0.0.5,0
6,u002,2025-01-01 09:11:00,10.0.0.5,1
7,u999,2025-01-01 01:00:00,203.0.113.9,0
8,u999,2025-01-01 01:01:00,203.0.113.9,0
9,u999,2025-01-01 01:02:00,203.0.113.9,0


In [5]:
# Collapse the event log into one feature row per user.


def _count_failures(outcomes):
    """Return how many entries in ``outcomes`` equal 0 (failed logins)."""
    return (outcomes == 0).sum()


def _span_minutes(stamps):
    """Return the minutes elapsed between the earliest and latest timestamp."""
    elapsed = stamps.max() - stamps.min()
    return elapsed.total_seconds() / 60


per_user = df.groupby("user_id")
features = per_user.agg(
    failed_attempts=("success", _count_failures),
    unique_ips=("ip", "nunique"),
    active_minutes=("timestamp", _span_minutes),
)

features


Unnamed: 0_level_0,failed_attempts,unique_ips,active_minutes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
u001,5,1,4.0
u002,1,1,1.0
u999,5,1,4.0


In [6]:
# Pick out the per-user feature columns that the model is fitted on.
feature_columns = ["failed_attempts", "unique_ips", "active_minutes"]
X = features.loc[:, feature_columns]
X


Unnamed: 0_level_0,failed_attempts,unique_ips,active_minutes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
u001,5,1,4.0
u002,1,1,1.0
u999,5,1,4.0


In [7]:
# Hyperparameters for the Isolation Forest, kept together so they are easy to tune.
forest_params = {
    "n_estimators": 100,     # number of base estimators in the ensemble
    "contamination": 0.25,   # expect ~25% anomalies
    "random_state": 42,      # fixed seed so repeated runs give the same result
}

model = IsolationForest(**forest_params)

# Fit on the per-user feature matrix. As the cell's last expression, the
# fitted estimator is also shown as the cell output.
model.fit(X)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of base estimators in the ensemble.",100
,"max_samples  max_samples: ""auto"", int or float, default=""auto"" The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If ""auto"", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).",'auto'
,"contamination  contamination: 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the samples. - If 'auto', the threshold is determined as in the  original paper. - If float, the contamination should be in the range (0, 0.5]. .. versionchanged:: 0.22  The default value of ``contamination`` changed from 0.1  to ``'auto'``.",0.25
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features. Note: using a float number less than 1.0 or integer less than number of features will enable feature subsampling and leads to a longer runtime.",1.0
,"bootstrap  bootstrap: bool, default=False If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the pseudo-randomness of the selection of the feature and split values for each branching step and each tree in the forest. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",42
,"verbose  verbose: int, default=0 Controls the verbosity of the tree building process.",0
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. .. versionadded:: 0.21",False


In [8]:
# Hard prediction per user from the fitted forest: -1 marks an outlier,
# 1 marks an inlier.
predicted_flags = model.predict(X)
features["anomaly"] = predicted_flags
features


Unnamed: 0_level_0,failed_attempts,unique_ips,active_minutes,anomaly
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u001,5,1,4.0,1
u002,1,1,1.0,-1
u999,5,1,4.0,1


In [9]:
# Continuous score behind the hard prediction. Lower values are more
# anomalous; negative scores correspond to the -1 (outlier) predictions.
raw_scores = model.decision_function(X)
features["anomaly_score"] = raw_scores
features


Unnamed: 0_level_0,failed_attempts,unique_ips,active_minutes,anomaly,anomaly_score
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
u001,5,1,4.0,1,0.123002
u002,1,1,1.0,-1,-0.123002
u999,5,1,4.0,1,0.123002


In [10]:
# Rank users from most to least anomalous (lowest score first).
features.sort_values(by="anomaly_score", ascending=True)


Unnamed: 0_level_0,failed_attempts,unique_ips,active_minutes,anomaly,anomaly_score
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
u002,1,1,1.0,-1,-0.123002
u001,5,1,4.0,1,0.123002
u999,5,1,4.0,1,0.123002


In [11]:
# Human-readable names for the model's -1 / 1 predictions.
anomaly_labels = {1: "Normal", -1: "Anomalous"}
features["label"] = features["anomaly"].map(anomaly_labels)

features


Unnamed: 0_level_0,failed_attempts,unique_ips,active_minutes,anomaly,anomaly_score,label
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
u001,5,1,4.0,1,0.123002,Normal
u002,1,1,1.0,-1,-0.123002,Anomalous
u999,5,1,4.0,1,0.123002,Normal


In [12]:
# Final ranked table for export: most anomalous users first (lowest
# anomaly_score), with user_id moved from the index into a regular column.
#
# kind="stable" keeps users whose scores are identical (u001 and u999 in this
# dataset) in their existing relative order. The default quicksort does not
# guarantee how ties are ordered, so without it the exported ranking is not
# guaranteed to be reproducible.
final_alerts = (
    features
    .sort_values("anomaly_score", kind="stable")
    .reset_index()
)

final_alerts


Unnamed: 0,user_id,failed_attempts,unique_ips,active_minutes,anomaly,anomaly_score,label
0,u002,1,1,1.0,-1,-0.123002,Anomalous
1,u001,5,1,4.0,1,0.123002,Normal
2,u999,5,1,4.0,1,0.123002,Normal


In [14]:
# Write the ranked table to disk; index=False leaves out the positional row index.
alerts_path = "project4_isolation_forest_alerts.csv"
final_alerts.to_csv(alerts_path, index=False)


## Conclusion

This project applied unsupervised machine learning (Isolation Forest) to detect
anomalous login behavior without labeled attack data.

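By aggregating each user's login events into per-user features (failed
attempts, distinct IPs, and active minutes), the model flagged the user whose
login pattern deviated most from the others. In this three-user sample that
user is u002, the only account with a successful login; u001 and u999, which
each show five consecutive failed attempts, have identical features and were
labelled Normal. The result therefore illustrates the workflow rather than
validating detection of attack behavior.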

This approach mirrors real-world security analytics systems used for fraud
detection, insider threat monitoring, and behavioral anomaly detection.
