<a href="https://colab.research.google.com/github/LoVeNurik/datathon-security-assistant/blob/main/notebooks_00_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd, numpy as np
from datetime import datetime, timedelta
import random
import plotly.express as px

random.seed(42); np.random.seed(42)

users = ["alice","bob","carol","dave","erin"]
ips_normal = [f"192.168.0.{i}" for i in range(2,60)]
ip_attacker = "203.0.113.77"  # «злой» IP

start = datetime(2025, 9, 20, 9, 0, 0)
rows = []

# обычные события
ts = start
for i in range(9500):
    ts += timedelta(seconds=np.random.randint(2,8))
    user = random.choice(users)
    ip = random.choice(ips_normal)
    action = np.random.choice(["login_success","login_failed"], p=[0.93,0.07])
    rows.append([ts, user, ip, action])

# атака брутфорсом: bursts
ts_attack = start + timedelta(hours=2)
for burst in range(30):  # 30 «всплесков»
    for j in range(np.random.randint(5,12)):  # 5–11 fail подряд
        ts_attack += timedelta(seconds=np.random.randint(3,10))
        rows.append([ts_attack, "alice", ip_attacker, "login_failed"])
    # иногда «успех» для реалистичности
    if np.random.rand()<0.2:
        ts_attack += timedelta(seconds=np.random.randint(2,6))
        rows.append([ts_attack, "alice", ip_attacker, "login_success"])
    ts_attack += timedelta(minutes=np.random.randint(1,5))

df = pd.DataFrame(rows, columns=["timestamp","user","src_ip","action"]).sort_values("timestamp")
df.to_csv("/content/logs.csv", index=False)
len(df), df.head()


(9738,
             timestamp   user        src_ip         action
 0 2025-09-20 09:00:05  alice   192.168.0.3   login_failed
 1 2025-09-20 09:00:09  carol  192.168.0.17  login_success
 2 2025-09-20 09:00:15    bob  192.168.0.10  login_success
 3 2025-09-20 09:00:19  alice  192.168.0.45  login_success
 4 2025-09-20 09:00:23   erin   192.168.0.7  login_success)

In [7]:
import pandas as pd
import plotly.express as px

df = pd.read_csv("/content/logs.csv", parse_dates=["timestamp"])
df = df.sort_values("timestamp")

# базовая «общая схема» уже есть: timestamp, user, src_ip, action
df["hour"] = df["timestamp"].dt.floor("H")

# график нагрузки по часам
by_hour = df.groupby("hour").size().reset_index(name="events")
px.line(by_hour, x="hour", y="events", title="Events per hour").show()

# топ IP по неудачным логинам
fails_by_ip = (df[df["action"]=="login_failed"]
               .groupby("src_ip").size().reset_index(name="fails")
               .sort_values("fails", ascending=False).head(20))
px.bar(fails_by_ip, x="src_ip", y="fails", title="Top failed-login IPs").show()

# сохраним нормализованный CSV
df.to_csv("/content/data_clean.csv", index=False)



'H' is deprecated and will be removed in a future version, please use 'h' instead.



In [9]:
import pandas as pd

df = pd.read_csv("/content/data_clean.csv", parse_dates=["timestamp"]).sort_values(["src_ip","timestamp"])

# оставим только провалы логина
fails = df[df["action"]=="login_failed"].copy()

# rolling-окно 2 минуты внутри каждого IP
fails = fails.set_index("timestamp")
roll = (fails.groupby("src_ip")
        .rolling("2min").size()
        .reset_index(name="fails_2min"))

# алёрты на уровне событий окна (где порог превышен)
events_alerts = roll[roll["fails_2min"] >= 5].copy()

# список «подозрительных IP» по правилу
rule_ips = events_alerts["src_ip"].unique().tolist()
len(rule_ips), rule_ips[:5]

AttributeError: 'RollingGroupby' object has no attribute 'size'

In [10]:
from sklearn.ensemble import IsolationForest
import numpy as np

g = df.groupby("src_ip", as_index=False)
features = pd.DataFrame({
    "src_ip": g.size()["src_ip"],
    "total_events": g.size()["size"],
    "fail_count": g.apply(lambda x: (x["action"]=="login_failed").sum()).values,
    "success_count": g.apply(lambda x: (x["action"]=="login_success").sum()).values,
    "unique_users": g["user"].nunique().values,
})
features["fail_ratio"] = features["fail_count"] / features["total_events"]

# средний интервал между событиями этого IP (сек)
def mean_interval_seconds(x):
    ts = x["timestamp"].sort_values()
    if len(ts) < 2: return np.nan
    return (ts.diff().dt.total_seconds().dropna().mean())
features["mean_dt_sec"] = g.apply(mean_interval_seconds).values
features["mean_dt_sec"] = features["mean_dt_sec"].fillna(features["mean_dt_sec"].median())

X = features[["total_events","fail_count","fail_ratio","unique_users","mean_dt_sec"]].copy()
iso = IsolationForest(contamination=0.05, random_state=42)
features["anomaly"] = iso.fit_predict(X)   # -1 = аномалия
ml_ips = features.query("anomaly==-1")["src_ip"].tolist()
len(ml_ips), ml_ips[:5]








ValueError: Per-column arrays must each be 1-dimensional