In [1]:
import pandas as pd

# Load the generated DNS logs
df = pd.read_csv("../data/dns_logs.csv")

# Display the first few rows
df.head()


Unnamed: 0,timestamp,source_ip,queried_domain,response_code,process_name
0,2025-07-21 10:00:00,192.168.1.10,malicious.ru,SERVFAIL,cmd.exe
1,2025-07-21 10:00:20,192.168.1.25,google.com,NXDOMAIN,svchost.exe
2,2025-07-21 10:00:40,192.168.1.25,bbc.co.uk,NXDOMAIN,chrome.exe
3,2025-07-21 10:01:00,192.168.1.10,YXNkZmcxMjMuZXhhbXBsZS5jb20=,NXDOMAIN,cmd.exe
4,2025-07-21 10:01:20,192.168.1.15,google.com,NOERROR,chrome.exe


In [2]:
import re  # re = Regular Expression module (used to detect patterns in text)

# Create a list of known suspicious keywords often used in malicious domains
# These are often used by attackers to host malware, phishing pages, or exploit kits
suspicious_keywords = ['malicious', 'xyz', 'ru', 'top', 'cn']

# Regular Expression pattern to match domains that look like base64-encoded strings
# Explanation: 
# - ^ = start of string
# - [A-Za-z0-9+/]{16,} = match 16 or more characters that are upper/lower letters, numbers, +, or /
# This catches domains like 'YXNkZGFzZGZhc2RmYXNk' which are highly random or encoded — suspicious!
base64_pattern = r'^[A-Za-z0-9+/]{16,}$'

# Convert the 'timestamp' column from plain text to real datetime format
# Why? Because you may want to filter logs by time later — datetime format allows comparisons like < or >=
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Step 1: Flag domains that look suspicious either by:
# - Containing a known bad keyword (e.g. "malicious", ".ru")
# - Matching our base64/random-looking pattern
suspicious_dns = df[
    df['queried_domain'].str.contains('|'.join(suspicious_keywords), case=False) |  # Check for bad keywords
    df['queried_domain'].apply(lambda d: bool(re.match(base64_pattern, d)))         # Check for randomness
]

# Step 2: Narrow it down — keep only the ones that also failed to resolve
# - Why? Because failed lookups (NXDOMAIN, SERVFAIL) to bad-looking domains are strong indicators of malware testing out its C2 channels
suspicious_dns = suspicious_dns[
    suspicious_dns['response_code'].isin(['NXDOMAIN', 'SERVFAIL'])
]

# Step 3: Print the final suspicious results
# - Show relevant columns to keep the output clear and focused
if not suspicious_dns.empty:
    print("⚠️ Suspicious DNS queries detected:\n")
    print(suspicious_dns[['timestamp', 'source_ip', 'queried_domain', 'response_code', 'process_name']])
else:
    print("✅ No suspicious DNS activity detected.")


⚠️ Suspicious DNS queries detected:

             timestamp     source_ip queried_domain response_code  \
0  2025-07-21 10:00:00  192.168.1.10   malicious.ru      SERVFAIL   
23 2025-07-21 10:07:40      10.0.0.5     stealer.cn      SERVFAIL   
24 2025-07-21 10:08:00  192.168.1.10   phishing.xyz      SERVFAIL   
39 2025-07-21 10:13:00    172.16.0.7   phishing.xyz      NXDOMAIN   
46 2025-07-21 10:15:20  192.168.1.10     stealer.cn      NXDOMAIN   
51 2025-07-21 10:17:00      10.0.0.5     stealer.cn      SERVFAIL   
57 2025-07-21 10:19:00  192.168.1.15   phishing.xyz      NXDOMAIN   
60 2025-07-21 10:20:00  192.168.1.15   phishing.xyz      SERVFAIL   
66 2025-07-21 10:22:00      10.0.0.5     stealer.cn      SERVFAIL   
73 2025-07-21 10:24:20  192.168.1.10   malicious.ru      NXDOMAIN   
75 2025-07-21 10:25:00  192.168.1.10   phishing.xyz      SERVFAIL   
78 2025-07-21 10:26:00  192.168.1.25   phishing.xyz      SERVFAIL   
89 2025-07-21 10:29:40  192.168.1.10   malicious.ru      SERVFAIL 