In [1]:
import pandas as pd

# Load the simulated audit log; `timestamp` is parsed to datetime while reading.
df = pd.read_csv(
    "../data/logs/noisy_simulated_audit_logs.csv",
    parse_dates=["timestamp"],
)
df.head()

Unnamed: 0,timestamp,user,ip_address,event_type,resource,random_token,label,event_hour,day_of_week
0,2025-07-03 07:52:11.842529,user_1,216.205.10.74,login_failure,explore,b2a41fa01b4079c00cd8e8a28a0e9465a2b3d9ad,1,7,3
1,2025-07-08 12:17:10.296134,user_5,196.89.6.126,login_failure,app,75e9298d641ab53b2f4fe713889026a494b6bbac,0,12,1
2,2025-07-08 01:45:25.601568,user_29,33.178.195.236,login_failure,tags/categories/wp-content,6e63aca43636b2c521cb060c3e35830ef75fc30b,0,1,1
3,2025-07-10 03:02:20.655905,user_2,42.55.219.177,login_success,search/wp-content,c04a568e28514d462157589699fe4b6b8cecd94f,1,3,3
4,2025-07-05 00:56:29.277108,user_17,70.48.153.229,login_failure,tags/posts,f1a309a0fee235ec1f83bb410117c454eb7f122e,0,0,5


### Basic features

In [2]:
# Hour of the event (0–23)
df["event_hour"] = df["timestamp"].dt.hour

# Numeric encoding of event type: codes follow order of first appearance in the data
event_names = df["event_type"].unique()
event_type_map = dict(zip(event_names, range(len(event_names))))
df["event_type_code"] = df["event_type"].map(event_type_map)

# Resource depth: number of "/" separators in the accessed URL path
df["resource_depth"] = df["resource"].str.count("/")

# Privileged event? (privilege escalation or config change), stored as 0/1
privileged_events = ["privilege_escalation", "config_change"]
df["is_privileged_event"] = df["event_type"].isin(privileged_events).astype(int)

# Weekend? (weekday number 5 or above), stored as a boolean
df["is_weekend"] = df["timestamp"].dt.weekday.ge(5)

In [3]:
# Preview the first rows, now including the basic feature columns added above.
df.head()

Unnamed: 0,timestamp,user,ip_address,event_type,resource,random_token,label,event_hour,day_of_week,event_type_code,resource_depth,is_privileged_event,is_weekend
0,2025-07-03 07:52:11.842529,user_1,216.205.10.74,login_failure,explore,b2a41fa01b4079c00cd8e8a28a0e9465a2b3d9ad,1,7,3,0,0,0,False
1,2025-07-08 12:17:10.296134,user_5,196.89.6.126,login_failure,app,75e9298d641ab53b2f4fe713889026a494b6bbac,0,12,1,0,0,0,False
2,2025-07-08 01:45:25.601568,user_29,33.178.195.236,login_failure,tags/categories/wp-content,6e63aca43636b2c521cb060c3e35830ef75fc30b,0,1,1,0,2,0,False
3,2025-07-10 03:02:20.655905,user_2,42.55.219.177,login_success,search/wp-content,c04a568e28514d462157589699fe4b6b8cecd94f,1,3,3,1,1,0,False
4,2025-07-05 00:56:29.277108,user_17,70.48.153.229,login_failure,tags/posts,f1a309a0fee235ec1f83bb410117c454eb7f122e,0,0,5,0,1,0,True


### Rolling features

In [4]:
# Failed logins by the same user in the hour before each login failure.
# The window is half-open, [t - 1h, t): the event itself and any failure with an
# identical timestamp are not counted. Rows that are not login failures get 0.
df["timestamp"] = pd.to_datetime(df["timestamp"])

# Give every row a unique 0..n-1 label in chronological order so the counts can be
# aligned back by index. Joining on (timestamp, user) instead would duplicate rows
# whenever two failures share that key and would leak a failure's count onto any
# other event type with the same key; it also made the cell fail on a second run.
df = df.sort_values(by="timestamp").reset_index(drop=True)

one_hour = pd.Timedelta(hours=1).to_timedelta64()

# Filter to just login failures
failures = df[df["event_type"] == "login_failure"].copy()
failures["failed_logins_last_1h"] = 0

for _, group in failures.groupby("user"):
    # Already chronological within each user because df was sorted above, so two
    # binary searches replace the per-row boolean mask.
    times = group["timestamp"].to_numpy()
    first_in_window = times.searchsorted(times - one_hour, side="left")
    first_not_before = times.searchsorted(times, side="left")
    failures.loc[group.index, "failed_logins_last_1h"] = first_not_before - first_in_window

# Align back onto the full frame by index label; non-failure rows are filled with 0.
df["failed_logins_last_1h"] = (
    failures["failed_logins_last_1h"]
    .reindex(df.index, fill_value=0)
    .astype(int)
)


In [5]:
# IP frequency: number of events in the whole log that share the row's source IP.
events_per_ip = df["ip_address"].value_counts()
df["ip_event_count"] = df["ip_address"].map(events_per_ip)

In [6]:
# User activity rate: each user's total number of events, repeated on every row of
# that user and divided by a fixed 15-day span.
events_per_user = df.groupby("user")["timestamp"].transform("count")
df["user_event_rate"] = events_per_user / 15  # avg per day over 15 days

In [7]:
# Columns that make up the feature matrix, in output order.
features = [
    "event_hour",
    "event_type_code",
    "resource_depth",
    "is_privileged_event",
    "is_weekend",
    "failed_logins_last_1h",
    "ip_event_count",
    "user_event_rate",
]

X = df.loc[:, features]
X.head()


Unnamed: 0,event_hour,event_type_code,resource_depth,is_privileged_event,is_weekend,failed_logins_last_1h,ip_event_count,user_event_rate
0,15,0,1,0,True,0,1,20.933333
1,15,3,2,0,True,0,1,20.066667
2,15,0,1,0,True,0,1,13.8
3,15,1,2,0,True,0,1,20.066667
4,16,3,0,0,True,0,1,20.0


In [8]:
# Write the log with its engineered feature columns to disk; the index is not saved.
df.to_csv(
    path_or_buf="../data/logs/simulated_audit_logs_with_features.csv",
    index=False,
)