<a href="https://colab.research.google.com/github/LoVeNurik/datathon/blob/main/notebooks/00_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd, numpy as np
from datetime import datetime, timedelta
import random
import plotly.express as px

random.seed(42); np.random.seed(42)

users = ["alice","bob","carol","dave","erin"]
ips_normal = [f"192.168.0.{i}" for i in range(2,60)]
ip_attacker = "203.0.113.77"  # «злой» IP

start = datetime(2025, 9, 20, 9, 0, 0)
rows = []

# обычные события
ts = start
for i in range(9500):
    ts += timedelta(seconds=np.random.randint(2,8))
    user = random.choice(users)
    ip = random.choice(ips_normal)
    action = np.random.choice(["login_success","login_failed"], p=[0.93,0.07])
    rows.append([ts, user, ip, action])

# атака брутфорсом: bursts
ts_attack = start + timedelta(hours=2)
for burst in range(30):  # 30 «всплесков»
    for j in range(np.random.randint(5,12)):  # 5–11 fail подряд
        ts_attack += timedelta(seconds=np.random.randint(3,10))
        rows.append([ts_attack, "alice", ip_attacker, "login_failed"])
    # иногда «успех» для реалистичности
    if np.random.rand()<0.2:
        ts_attack += timedelta(seconds=np.random.randint(2,6))
        rows.append([ts_attack, "alice", ip_attacker, "login_success"])
    ts_attack += timedelta(minutes=np.random.randint(1,5))

df = pd.DataFrame(rows, columns=["timestamp","user","src_ip","action"]).sort_values("timestamp")
df.to_csv("/content/logs.csv", index=False)
len(df), df.head()


(9738,
             timestamp   user        src_ip         action
 0 2025-09-20 09:00:05  alice   192.168.0.3   login_failed
 1 2025-09-20 09:00:09  carol  192.168.0.17  login_success
 2 2025-09-20 09:00:15    bob  192.168.0.10  login_success
 3 2025-09-20 09:00:19  alice  192.168.0.45  login_success
 4 2025-09-20 09:00:23   erin   192.168.0.7  login_success)

In [7]:
import pandas as pd
import plotly.express as px

df = pd.read_csv("/content/logs.csv", parse_dates=["timestamp"])
df = df.sort_values("timestamp")

# базовая «общая схема» уже есть: timestamp, user, src_ip, action
df["hour"] = df["timestamp"].dt.floor("H")

# график нагрузки по часам
by_hour = df.groupby("hour").size().reset_index(name="events")
px.line(by_hour, x="hour", y="events", title="Events per hour").show()

# топ IP по неудачным логинам
fails_by_ip = (df[df["action"]=="login_failed"]
               .groupby("src_ip").size().reset_index(name="fails")
               .sort_values("fails", ascending=False).head(20))
px.bar(fails_by_ip, x="src_ip", y="fails", title="Top failed-login IPs").show()

# сохраним нормализованный CSV
df.to_csv("/content/data_clean.csv", index=False)



'H' is deprecated and will be removed in a future version, please use 'h' instead.



In [11]:
import pandas as pd

# 0) фильтруем только провалы и убеждаемся в типе времени
fails = df[df["action"]=="login_failed"].copy()
fails["timestamp"] = pd.to_datetime(fails["timestamp"])

# 1) сортируем и делаем индекс по времени (обязательно для time-based rolling)
fails = fails.sort_values(["src_ip","timestamp"]).set_index("timestamp")

# 2) индикатор одной неудачи = 1, чтобы потом суммировать в окне
fails["is_fail"] = 1

# 3) скользящее окно 2 минуты внутри каждой группы src_ip
roll = (fails[["src_ip","is_fail"]]
        .groupby("src_ip")
        .rolling("2min")
        .sum()                                  # сумма индикаторов в окне
        .rename(columns={"is_fail":"fails_2min"})
        .reset_index())                         # вернем столбцы src_ip и timestamp

# 4) события, где порог превышен
events_alerts = roll[roll["fails_2min"] >= 5].copy()
events_alerts = events_alerts.rename(columns={"timestamp":"window_end"})

# 5) список «подозрительных» IP по правилу
rule_ips = events_alerts["src_ip"].unique().tolist()
len(rule_ips), rule_ips[:5]


(1, ['203.0.113.77'])

In [13]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import numpy as np

# читаем и сортируем
df = pd.read_csv("/content/data_clean.csv", parse_dates=["timestamp"])
df = df.sort_values(["src_ip","timestamp"])

# 1) агрегаты по IP
features = (df.groupby("src_ip")
              .agg(total_events = ("action", "size"),
                   fail_count   = ("action", lambda x: (x=="login_failed").sum()),
                   success_count= ("action", lambda x: (x=="login_success").sum()),
                   unique_users = ("user", "nunique"))
              .reset_index())

features["fail_ratio"] = features["fail_count"] / features["total_events"]

# 2) средний интервал между событиями этого IP (в секундах)
mean_dt = (df.groupby("src_ip")["timestamp"]
             .apply(lambda s: s.diff().dt.total_seconds().dropna().mean()))
features = features.merge(mean_dt.rename("mean_dt_sec"), on="src_ip", how="left")
features["mean_dt_sec"] = features["mean_dt_sec"].fillna(features["mean_dt_sec"].median())

# 3) модель аномалий
X = features[["total_events","fail_count","fail_ratio","unique_users","mean_dt_sec"]].fillna(0)
iso = IsolationForest(contamination=0.05, random_state=42)
features["anomaly"] = iso.fit_predict(X)   # -1 = аномалия

ml_ips = features.loc[features["anomaly"]==-1, "src_ip"].tolist()
print(len(ml_ips), ml_ips[:5])
features.head()


3 ['192.168.0.13', '192.168.0.5', '203.0.113.77']


Unnamed: 0,src_ip,total_events,fail_count,success_count,unique_users,fail_ratio,mean_dt_sec,anomaly
0,192.168.0.10,165,12,153,5,0.072727,261.439024,1
1,192.168.0.11,167,8,159,5,0.047904,257.60241,1
2,192.168.0.12,156,13,143,5,0.083333,274.258065,1
3,192.168.0.13,175,21,154,5,0.12,238.87931,-1
4,192.168.0.14,179,15,164,5,0.083799,240.202247,1





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.







Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.







Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.







Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




In [14]:
!git config --global user.email "nurislam.zhanibek.2006@gmail.com"
!git config --global user.name "Johny"

In [23]:
!git clone https://github.com/LoVeNurik/datathon.git
%cd datathon
!cp /content/00_pipeline.ipynb notebooks/


Cloning into 'datathon'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 31 (delta 10), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (31/31), 102.30 KiB | 2.92 MiB/s, done.
Resolving deltas: 100% (10/10), done.
/content/datathon/datathon/datathon/datathon/datathon
cp: cannot stat '/content/00_pipeline.ipynb': No such file or directory


In [18]:
!cp /content/00_pipeline.ipynb notebooks/


cp: cannot stat '/content/00_pipeline.ipynb': No such file or directory
