In [1]:
import pandas as pd

# Read the simulated audit log; the "timestamp" column is parsed as datetimes.
df = pd.read_csv(
    "../data/logs/simulated_audit_logs.csv",
    parse_dates=["timestamp"],
)
df.head()  # preview the first rows

Unnamed: 0,timestamp,user,ip_address,event_type,resource
0,2025-07-06 07:27:18.130801,user_4,47.172.78.228,login_success,wp-content
1,2025-07-07 18:23:27.693557,user_9,203.75.32.207,login_success,tag/app/wp-content
2,2025-07-04 07:11:21.453382,user_5,58.217.105.169,login_failure,tags
3,2025-07-05 13:23:49.326303,user_18,42.55.219.177,login_success,tag/search/wp-content
4,2025-07-08 04:07:09.391192,user_14,70.48.153.229,login_success,list/tags


### Basic features

In [2]:
# Hour of the day (0–23) at which the event occurred
df["event_hour"] = df["timestamp"].dt.hour

# Numeric encoding of event type: codes are handed out in order of first appearance
event_type_map = {
    label: code for code, label in enumerate(pd.unique(df["event_type"]))
}
df["event_type_code"] = df["event_type"].map(event_type_map)

# Resource depth: number of "/" separators in the accessed path
df["resource_depth"] = df["resource"].str.count(pat="/")

# Privileged event flag (1/0): privilege escalation or configuration change
df["is_privileged_event"] = (
    df["event_type"]
    .isin(["privilege_escalation", "config_change"])
    .astype(int)
)

# Weekend flag (bool): weekday numbers 5 and 6 are Saturday and Sunday
df["is_weekend"] = df["timestamp"].dt.dayofweek.ge(5)

### Rolling and aggregate features

In [6]:
# Failed logins in the preceding hour.
# For every login_failure event at time t, count the same user's other
# login failures whose timestamp lies in [t - 1h, t). All other events get 0.

# Convert before sorting so the sort is chronological even if the column
# arrived as strings.
df["timestamp"] = pd.to_datetime(df["timestamp"])

# Stable sort + fresh RangeIndex: rows end up in chronological order with a
# unique 0..n-1 index, which the index-aligned write-back below relies on.
df = df.sort_values(by="timestamp", kind="stable", ignore_index=True)

# Filter to just login failures (the copy keeps df's index labels)
failures = df[df["event_type"] == "login_failure"].copy()

# Initialize the new column
failures["failed_logins_last_1h"] = 0

one_hour = pd.Timedelta(hours=1).to_timedelta64()

# Each user's failure times are already sorted, so the number of failures in
# the window is the gap between two binary-search positions rather than a
# full boolean scan per row.
for _, user_times in failures.groupby("user")["timestamp"]:
    stamps = user_times.to_numpy(dtype="datetime64[ns]")
    # side="left" on both bounds: t - 1h is included, anything at exactly t
    # (the event itself and same-instant duplicates) is excluded.
    window_end = stamps.searchsorted(stamps, side="left")
    window_start = stamps.searchsorted(stamps - one_hour, side="left")
    failures.loc[user_times.index, "failed_logins_last_1h"] = window_end - window_start

# Write back by index instead of merging on (timestamp, user). That pair is
# not a unique key: a merge duplicates rows when two events share it and
# hands a failure's count to non-failure events with the same timestamp/user.
# Plain assignment also keeps this cell safe to re-run.
df["failed_logins_last_1h"] = (
    failures["failed_logins_last_1h"]
    .reindex(df.index, fill_value=0)
    .astype(int)
)


In [9]:
# IP frequency: number of rows in the whole frame that share each IP address
ip_counts = df["ip_address"].value_counts()
# Look up every row's address in the per-address totals
df["ip_event_count"] = ip_counts.reindex(df["ip_address"]).to_numpy()

In [11]:
# User activity rate: each user's non-null timestamp count, broadcast to
# every one of that user's rows, divided by a fixed 15-day span.
# NOTE(review): 15 is hard-coded; presumably the log covers 15 days — confirm.
events_per_user = df.groupby("user")["timestamp"].transform("count")
df["user_event_rate"] = events_per_user / 15  # avg per day over 15 days

In [12]:
# Engineered columns selected into X
features = [
    "event_hour",
    "event_type_code",
    "resource_depth",
    "is_privileged_event",
    "is_weekend",
    "failed_logins_last_1h",
    "ip_event_count",
    "user_event_rate",
]

X = df.loc[:, features]
X.head()


Unnamed: 0,event_hour,event_type_code,resource_depth,is_privileged_event,is_weekend,failed_logins_last_1h,ip_event_count,user_event_rate
0,17,0,0,0,False,0,1,18.533333
1,17,3,2,0,False,0,1,14.266667
2,17,2,2,1,False,0,1,17.6
3,17,0,2,0,False,0,1,17.6
4,17,1,2,0,False,0,1,14.4


In [13]:
# Save the frame, including the added feature columns, without the index
df.to_csv(
    '../data/logs/simulated_audit_logs_with_features.csv',
    index=False,
)