In [10]:
import pandas as pd

# Read the authentication log CSV (path is relative to the notebook's
# working directory) into `df`, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/auth_logs.csv")
df.head(n=5)


Unnamed: 0,timestamp,username,source_ip,status,location
0,2025-07-21 03:30:00,hr_user,172.16.0.14,SUCCESS,United Kingdom
1,2025-07-21 08:00:10,root,192.168.1.18,FAIL,China
2,2025-07-21 08:00:13,admin,212.45.99.5,FAIL,Russia
3,2025-07-21 08:00:24,charlie,102.133.9.88,FAIL,Brazil
4,2025-07-21 08:00:37,david,192.168.1.10,FAIL,Germany


In [11]:
# 🔹 LOAD LOGS (Step 1 of your SIEM detection notebook)

import pandas as pd

# (Re)load the authentication logs from disk into `df`.
# Note: this rebinds `df`, so any in-memory changes made to a previously
# loaded frame (e.g. a parsed `timestamp` column) are discarded.
df = pd.read_csv(filepath_or_buffer="../data/auth_logs.csv")

# Preview the first five rows as the cell's output
df.head(5)


Unnamed: 0,timestamp,username,source_ip,status,location
0,2025-07-21 03:30:00,hr_user,172.16.0.14,SUCCESS,United Kingdom
1,2025-07-21 08:00:10,root,192.168.1.18,FAIL,China
2,2025-07-21 08:00:13,admin,212.45.99.5,FAIL,Russia
3,2025-07-21 08:00:24,charlie,102.133.9.88,FAIL,Brazil
4,2025-07-21 08:00:37,david,192.168.1.10,FAIL,Germany


In [12]:
# 🔸 DETECTION LOGIC: Brute-force login alert (5+ FAILs from same IP in 60 seconds)

from datetime import timedelta

# Parse the timestamp column so that differences between rows can be
# measured in seconds (re-running this on an already-parsed column is a no-op).
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Keep only the failed attempts, ordered chronologically within each IP.
failed_logins = (
    df.loc[df['status'] == "FAIL"]
    .copy()
    .sort_values(by=['source_ip', 'timestamp'])
)

# One alert dict per offending IP is collected here.
alerts = []

for ip in failed_logins['source_ip'].unique():
    # All failure times for this IP, already in ascending order.
    fail_times = failed_logins.loc[
        failed_logins['source_ip'] == ip, 'timestamp'
    ].tolist()

    for start in fail_times:
        # Failures that fall in the closed interval [start, start + 60s].
        window = [
            t for t in fail_times
            if 0 <= (t - start).total_seconds() <= 60
        ]

        if len(window) < 5:
            continue  # not enough failures in this window; try the next start

        alerts.append(dict(
            source_ip=ip,
            fail_count=len(window),
            start_time=start,
            end_time=window[-1],
        ))
        break  # report only the first qualifying window for each IP

# Report the outcome
if not alerts:
    print("✅ No brute-force behaviour detected.")
else:
    print("🚨 Brute-force login attempts detected:")
    for alert in alerts:
        print(alert)


✅ No brute-force behaviour detected.


In [10]:
# Show every log row (any status) recorded for one specific source IP.
df.loc[df['source_ip'] == '192.168.1.5']


Unnamed: 0,timestamp,username,source_ip,status,location
32,2025-07-21 08:20:39,alice,192.168.1.5,FAIL,United Kingdom
48,2025-07-21 08:32:26,alice,192.168.1.5,SUCCESS,United Kingdom
69,2025-07-21 08:54:50,alice,192.168.1.5,FAIL,United Kingdom
70,2025-07-21 08:55:00,alice,192.168.1.5,FAIL,United Kingdom
102,2025-07-21 09:12:52,charlie,192.168.1.5,FAIL,Germany
104,2025-07-21 09:16:23,alice,192.168.1.5,FAIL,United Kingdom
127,2025-07-21 09:39:09,alice,192.168.1.5,FAIL,United Kingdom
139,2025-07-21 09:51:43,david,192.168.1.5,FAIL,United Kingdom
145,2025-07-21 09:55:07,admin,192.168.1.5,FAIL,United Kingdom


In [6]:
# 🔓 DETECTION LOGIC: Success After Multiple Failures from the Same IP

from datetime import timedelta  # Used to calculate time windows (e.g. "last 10 minutes")

# --------------------------------------------------------
# STEP 1: Separate SUCCESS and FAIL login attempts
# --------------------------------------------------------

# Work on a copy whose timestamps are guaranteed to be datetimes, so the
# window arithmetic below works even if `df['timestamp']` still holds the raw
# strings from the CSV. `df` itself is not modified by this cell.
auth_events = df.assign(timestamp=pd.to_datetime(df['timestamp']))

# Successful login events
success_logs = auth_events[auth_events['status'] == "SUCCESS"]

# Failed login events
fail_logs = auth_events[auth_events['status'] == "FAIL"]

# --------------------------------------------------------
# STEP 2: Sort logs by IP address and timestamp
# --------------------------------------------------------
# Keeps events in chronological order within each IP address.

success_logs = success_logs.sort_values(by=['source_ip', 'timestamp'])
fail_logs = fail_logs.sort_values(by=['source_ip', 'timestamp'])

# --------------------------------------------------------
# STEP 3: Create an empty list to store any alerts triggered
# --------------------------------------------------------

success_after_fail_alerts = []  # One dictionary per detected event

# --------------------------------------------------------
# STEP 4: For each IP that had a successful login, look backward
# --------------------------------------------------------

# Loop through each unique IP address that had at least one SUCCESS login
for ip in success_logs['source_ip'].unique():

    # Successful and failed login events for the current IP
    ip_successes = success_logs[success_logs['source_ip'] == ip]
    ip_fails = fail_logs[fail_logs['source_ip'] == ip]

    # iterrows() yields (index label, row-as-Series) pairs; only the row is
    # needed here. The loop variable must be the same name that is read below,
    # otherwise the body either raises NameError or silently reuses a stale
    # `success_row` left over in the kernel from an earlier run.
    for _, success_row in ip_successes.iterrows():
        success_time = success_row['timestamp']     # Timestamp of successful login
        username = success_row['username']          # Username that logged in

        # --------------------------------------------------------
        # STEP 5: Look for failed logins within 10 minutes before the success
        # --------------------------------------------------------

        # Beginning of the 10-minute look-back window
        window_start = success_time - timedelta(minutes=10)

        # Failures in the half-open interval [window_start, success_time)
        recent_fails = ip_fails[
            (ip_fails['timestamp'] >= window_start) &
            (ip_fails['timestamp'] < success_time)
        ]

        # 3 or more failures in that window triggers an alert
        if len(recent_fails) >= 3:
            success_after_fail_alerts.append({
                'source_ip': ip,                           # IP the logins came from
                'username': username,                      # Account that logged in
                'fail_count': len(recent_fails),           # Failures inside the window
                'fail_window_start': window_start,         # Start of the look-back window
                'success_time': success_time               # When login was successful
            })

# --------------------------------------------------------
# STEP 6: Print results in a readable format
# --------------------------------------------------------

if success_after_fail_alerts:
    print("🔓 Success-after-failure alerts:\n")
    for alert in success_after_fail_alerts:
        print(alert)
else:
    print("✅ No success-after-failure activity detected.")


✅ No success-after-failure activity detected.


In [7]:
# Peek at what DataFrame.iterrows() produces: each item is an
# (index label, row Series) tuple. Only the first item is printed,
# and nothing is printed when the frame has no rows.
first_item = next(df.iterrows(), None)
if first_item is not None:
    print(first_item)


(0, timestamp    2025-07-21 03:30:00
username                 hr_user
source_ip            172.16.0.14
status                   SUCCESS
location          United Kingdom
Name: 0, dtype: object)


In [19]:
from datetime import timedelta

# Step 1: Ensure timestamps are in proper datetime format
# Required for the time-window comparisons below (no-op if already parsed).
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Step 2: Sort the log data first by IP, then by timestamp
# so each IP's login activity is examined in chronological order.
df = df.sort_values(by=['source_ip', 'timestamp'])

# Reset the flag on every run. Checking `'alert_triggered' in locals()` is
# unreliable in a notebook: a True value left behind by an earlier execution
# would suppress the "no activity" message forever.
alert_triggered = False

# Step 3: Examine login behaviour one source IP at a time
for ip in df['source_ip'].unique():

    # Log entries (any status) for this specific IP
    ip_logs = df[df['source_ip'] == ip]

    # Step 4: Use each login attempt as the start of a 60-second look-ahead window
    for row_index, row_data in ip_logs.iterrows():

        # Time of the current login attempt
        timestamp = row_data["timestamp"]

        # End of the 1-minute window that starts at this attempt
        window_end = timestamp + timedelta(minutes=1)

        # Step 5: All attempts from this IP in the closed interval
        # [timestamp, window_end]
        window_logs = ip_logs[
            (ip_logs['timestamp'] >= timestamp) &
            (ip_logs['timestamp'] <= window_end)
        ]

        # Step 6: Number of distinct usernames attempted in that window
        unique_usernames = window_logs['username'].nunique()

        # Step 7: 5 or more distinct usernames is treated as a likely password spray
        if unique_usernames >= 5:
            print(f"Password spraying detected from IP {ip} at {timestamp}")

            # Every username tried in the window (repeats included), as a plain list
            print(f"Usernames tried: {window_logs['username'].tolist()}")
            print("-" * 50)  # Divider for readability
            alert_triggered = True


# Print a confirmation message if no alerts were triggered
if not alert_triggered:
    print("✅ No password spraying activity detected in this dataset.")


✅ No password spraying activity detected in this dataset.
