In [9]:
# DNS NOTEBOOK — Setup
# Load the generated DNS logs and do minimal hygiene (timestamp parsing)

import pandas as pd
from datetime import timedelta

# Read the DNS log export from ../data and convert the timestamp column to
# datetime in the same expression, so later cells can compare times directly.
df = pd.read_csv("../data/dns_logs.csv").assign(
    timestamp=lambda raw: pd.to_datetime(raw["timestamp"])
)

# Independent copy of the freshly loaded frame; later cells read from df_clean
# (IOC rows are injected into a copy of it further down).
df_clean = df.copy()

# First few rows as a sanity check
df.head()


Unnamed: 0,timestamp,source_ip,queried_domain,response_code,process_name
0,2025-07-21 10:00:00,192.168.1.10,malicious.ru,SERVFAIL,cmd.exe
1,2025-07-21 10:00:20,192.168.1.25,google.com,NXDOMAIN,svchost.exe
2,2025-07-21 10:00:40,192.168.1.25,bbc.co.uk,NXDOMAIN,chrome.exe
3,2025-07-21 10:01:00,192.168.1.10,YXNkZmcxMjMuZXhhbXBsZS5jb20=,NXDOMAIN,cmd.exe
4,2025-07-21 10:01:20,192.168.1.15,google.com,NOERROR,chrome.exe


In [10]:
# Rule 1: Suspicious DNS Query Detection
# (uses df with columns: timestamp, source_ip, queried_domain, response_code, process_name)


import re

# Step 1: Define risky Top-Level Domains (TLDs) to flag (lowercase, with leading dot)
risky_tlds = [".ru", ".xyz", ".top"]

# Step 2: Regex for base64/random-looking leftmost labels:
# 12 or more characters drawn only from the base64 alphabet,
# e.g., "dXNlcjEyMw==.malicious.ru"
encoded_label_re = re.compile(r'^[A-Za-z0-9+/=]{12,}$')

# Step 3: Build boolean masks (use na=False so NAs don't error)
# DNS names are case-insensitive and may be written with a trailing root dot,
# so normalise before the TLD comparison; otherwise "EVIL.RU" or "evil.ru."
# would slip past the suffix check.
domain_normalized = df['queried_domain'].str.lower().str.rstrip('.')
endswith_risky_tld = domain_normalized.str.endswith(tuple(risky_tlds), na=False)

# The encoded-label check runs on the original text: the regex already accepts
# both cases, and lowercasing would not change the outcome.
left_label_encoded = df['queried_domain'].str.split('.').str[0].str.match(encoded_label_re, na=False)

# Upper-case the response code before comparing, matching how Rule 2 treats it.
failed_lookup = df['response_code'].astype(str).str.upper().isin(['NXDOMAIN', 'SERVFAIL'])

# Step 4: Filter and show the results (a row is flagged if ANY signal fires)
suspicious_dns = df[endswith_risky_tld | left_label_encoded | failed_lookup]

print("Suspicious DNS queries detected (Rule 1):")
display(suspicious_dns[['timestamp', 'source_ip', 'queried_domain', 'response_code', 'process_name']])


Suspicious DNS queries detected (Rule 1):


Unnamed: 0,timestamp,source_ip,queried_domain,response_code,process_name
0,2025-07-21 10:00:00,192.168.1.10,malicious.ru,SERVFAIL,cmd.exe
1,2025-07-21 10:00:20,192.168.1.25,google.com,NXDOMAIN,svchost.exe
2,2025-07-21 10:00:40,192.168.1.25,bbc.co.uk,NXDOMAIN,chrome.exe
3,2025-07-21 10:01:00,192.168.1.10,YXNkZmcxMjMuZXhhbXBsZS5jb20=,NXDOMAIN,cmd.exe
5,2025-07-21 10:01:40,192.168.1.15,google.com,SERVFAIL,cmd.exe
...,...,...,...,...,...
93,2025-07-21 10:31:00,192.168.1.15,dGhpcy5pcwBhLnRlc3Q=,NXDOMAIN,svchost.exe
94,2025-07-21 10:31:20,192.168.1.15,phishing.xyz,NXDOMAIN,chrome.exe
96,2025-07-21 10:32:00,172.16.0.7,malicious.ru,NOERROR,chrome.exe
97,2025-07-21 10:32:20,192.168.1.10,secure-login.net,SERVFAIL,powershell.exe


In [11]:
# DNS NOTEBOOK — Cell 2 (Rule 2: CLEAN RUN)
# Detect repeated DNS queries to suspicious domains (beacon-y behavior)

# Tunables
RISKY_KEYWORDS = ["malicious", "xyz", "ru", "top", "cn"]
FAIL_CODES     = {"NXDOMAIN", "SERVFAIL"}
WINDOW_SEC     = 60
THRESHOLD      = 3     # fire if >= 3 queries in the window

def looks_risky_domain(domain: str) -> bool:
    """Quick heuristic for 'risky' domains.

    The domain is lower-cased and split into tokens on '.', '-' and '_'.
    It is considered risky when any entry of RISKY_KEYWORDS equals one of
    those tokens.

    Matching whole tokens (rather than raw substrings) keeps short keywords
    such as "ru", "cn" or "top" from firing inside unrelated names like
    "trusted.com", "cnn.com" or "stop.com".

    Args:
        domain: Queried domain name. Non-string values are converted with
            str() first, so missing values simply do not match.

    Returns:
        True if at least one risky keyword appears as a whole token.
    """
    tokens = set(str(domain).lower().replace("-", ".").replace("_", ".").split("."))
    return any(keyword.lower() in tokens for keyword in RISKY_KEYWORDS)

def find_repeated_queries(dns_df, window_sec=WINDOW_SEC, threshold=THRESHOLD):
    """Find (source_ip, queried_domain) pairs that repeat within a short window.

    A pair is evaluated only if its domain passes looks_risky_domain() or at
    least one of its responses is in FAIL_CODES. For such a pair, every query
    time is tried as a window start; the first window holding `threshold` or
    more queries within `window_sec` seconds (inclusive) produces one alert.

    Args:
        dns_df: Frame with columns source_ip, queried_domain, response_code
            and a datetime-typed timestamp column.
        window_sec: Window length in seconds.
        threshold: Minimum number of queries in a window to raise an alert.

    Returns:
        DataFrame with columns source_ip, domain, count, first_seen,
        last_seen; at most one row per pair, empty if nothing fired.
    """
    columns = ["source_ip", "domain", "count", "first_seen", "last_seen"]
    span = timedelta(seconds=window_sec)
    found = []

    # Group by source and domain so each pair is evaluated independently
    for (src_ip, domain), group in dns_df.groupby(["source_ip", "queried_domain"]):
        any_failed = group["response_code"].astype(str).str.upper().isin(FAIL_CODES).any()
        if not looks_risky_domain(domain) and not any_failed:
            continue

        # Chronological order makes the look-ahead window a simple range filter
        times = group["timestamp"].sort_values().reset_index(drop=True)

        for start in times:
            in_window = times[(times >= start) & (times <= start + span)]
            if len(in_window) >= threshold:
                found.append({
                    "source_ip" : src_ip,
                    "domain"    : domain,
                    "count"     : len(in_window),
                    "first_seen": in_window.iloc[0],
                    "last_seen" : in_window.iloc[-1],
                })
                # One alert per (src, domain) is enough; move on
                break

    return pd.DataFrame(found, columns=columns)


# Evaluate whatever df_clean currently holds. The messages below are kept
# dataset-neutral because this cell is run both before and after IOC injection.
alerts_df = find_repeated_queries(df_clean)

# Pretty output
if not alerts_df.empty:
    print("⚠ Repeated DNS query alerts:\n")
    display(alerts_df)
else:
    print("✅ No repeated DNS query patterns detected.")


✅ No repeated DNS query patterns detected (IOC dataset).


In [12]:
# DNS NOTEBOOK — Cell 3 (IOC for Rule 2)
# Inject a small burst of repeated queries (>=3 within 60s) to a risky domain.

from datetime import timedelta

# Build on the current df_clean without modifying that frame in place
base = df_clean

# Choose an anchor that is definitely *inside* the dataset range
anchor = base["timestamp"].min() + timedelta(minutes=12)

# IOC parameters — keep them realistic but simple
ioc_src_ip  = "192.168.1.50"
ioc_domain  = "stealer.cn"         # matches RISKY_KEYWORDS ("cn") and appears in your data theme
ioc_proc    = "powershell.exe"
ioc_results = ["SERVFAIL", "SERVFAIL", "NXDOMAIN"]  # classic failing lookups

# Make three queries at t, t+20s, t+40s (all within 60s)
ioc_rows = [
    {
        "timestamp"    : (anchor + timedelta(seconds=20*i)).strftime("%Y-%m-%d %H:%M:%S"),
        "source_ip"    : ioc_src_ip,
        "queried_domain": ioc_domain,
        "response_code": rc,
        "process_name" : ioc_proc,
    }
    for i, rc in enumerate(ioc_results)
]
ioc_frame = pd.DataFrame(ioc_rows)
ioc_frame["timestamp"] = pd.to_datetime(ioc_frame["timestamp"])

# Idempotence guard: this cell ends by pointing df_clean at the injected frame,
# so on a second run `base` already contains the burst. Drop rows that match
# the burst's host, domain and timestamps before appending, so re-running the
# cell never duplicates the IOC rows.
already_injected = (
    (base["source_ip"] == ioc_src_ip)
    & (base["queried_domain"] == ioc_domain)
    & base["timestamp"].isin(ioc_frame["timestamp"])
)

# Append and normalize types
df_ioc = pd.concat([base[~already_injected], ioc_frame], ignore_index=True)
df_ioc["timestamp"] = pd.to_datetime(df_ioc["timestamp"])

print("IOC injected for Rule 2 (repeated queries within 60s). Preview rows for that host/domain:\n")
ioc_selector = (df_ioc["source_ip"] == ioc_src_ip) & (df_ioc["queried_domain"] == ioc_domain)
preview = df_ioc[ioc_selector] \
             .sort_values("timestamp")[["timestamp", "source_ip", "queried_domain", "response_code", "process_name"]]
display(preview.tail(5))

# IMPORTANT: point df_clean to our injected copy so re-running Cell 2 picks it up.
# NOTE: from here on df_clean is no longer the untouched data — every later cell
# that reads df_clean sees the injected rows. Re-run the load cell to restore it.
df_clean = df_ioc.copy()


IOC injected for Rule 2 (repeated queries within 60s). Preview rows for that host/domain:



Unnamed: 0,timestamp,source_ip,queried_domain,response_code,process_name
100,2025-07-21 10:12:00,192.168.1.50,stealer.cn,SERVFAIL,powershell.exe
101,2025-07-21 10:12:20,192.168.1.50,stealer.cn,SERVFAIL,powershell.exe
102,2025-07-21 10:12:40,192.168.1.50,stealer.cn,NXDOMAIN,powershell.exe


In [14]:
# Rule 3: DNS Exfiltration Detection
# Detects suspicious queries with base64-style subdomains that often indicate data exfiltration over DNS.

import re

# Define a function to identify base64-style subdomains (commonly used in exfiltration)
def is_encoded_subdomain(domain) -> bool:
    """Return True if the leftmost label of `domain` looks base64-encoded.

    The leftmost label (text before the first dot) must consist of 10 or more
    characters taken only from the base64 alphabet (A-Z, a-z, 0-9, '+', '/', '=').

    Args:
        domain: Queried domain name. Missing or non-string values (e.g. NaN
            from an empty CSV cell) are treated as "not encoded" instead of
            raising, so the function is safe to use with Series.apply.

    Returns:
        True when the leftmost label matches the encoded pattern, else False.
    """
    if not isinstance(domain, str):
        return False
    subdomain = domain.split('.')[0]  # Extract the leftmost label before the first dot
    return bool(re.match(r'^[A-Za-z0-9+/=]{10,}$', subdomain))  # Typical encoded pattern

# A row is a suspect when its leftmost label looks encoded AND the lookup failed
looks_encoded = df_clean['queried_domain'].apply(is_encoded_subdomain)
lookup_failed = df_clean['response_code'].isin(['NXDOMAIN', 'SERVFAIL'])
exfiltration_suspects = df_clean[looks_encoded & lookup_failed]

# Report the outcome with a readable analyst message
if not exfiltration_suspects.empty:
    print("⚠️ Suspicious DNS exfiltration activity detected:")
    display(exfiltration_suspects)
else:
    print("✅ No suspicious DNS exfiltration patterns detected.")


⚠️ Suspicious DNS exfiltration activity detected:


Unnamed: 0,timestamp,source_ip,queried_domain,response_code,process_name
3,2025-07-21 10:01:00,192.168.1.10,YXNkZmcxMjMuZXhhbXBsZS5jb20=,NXDOMAIN,cmd.exe
12,2025-07-21 10:04:00,192.168.1.15,dGhpcy5pcwBhLnRlc3Q=,SERVFAIL,powershell.exe
21,2025-07-21 10:07:00,172.16.0.7,dGhpcy5pcwBhLnRlc3Q=,NXDOMAIN,cmd.exe
42,2025-07-21 10:14:00,192.168.1.10,dGhpcy5pcwBhLnRlc3Q=,NXDOMAIN,outlook.exe
48,2025-07-21 10:16:00,192.168.1.15,dGhpcy5pcwBhLnRlc3Q=,NXDOMAIN,svchost.exe
49,2025-07-21 10:16:20,172.16.0.7,dGhpcy5pcwBhLnRlc3Q=,SERVFAIL,svchost.exe
54,2025-07-21 10:18:00,192.168.1.10,dGhpcy5pcwBhLnRlc3Q=,SERVFAIL,unknown.exe
58,2025-07-21 10:19:20,172.16.0.7,YXNkZmcxMjMuZXhhbXBsZS5jb20=,SERVFAIL,cmd.exe
74,2025-07-21 10:24:40,192.168.1.15,YXNkZmcxMjMuZXhhbXBsZS5jb20=,SERVFAIL,outlook.exe
84,2025-07-21 10:28:00,172.16.0.7,dGhpcy5pcwBhLnRlc3Q=,NXDOMAIN,cmd.exe
