# LaTeXpOsEd: Data Mining Stage, Pattern Matching Substep

In this step, pattern matching techniques are used to extract information from the comments that were extracted from the papers. This includes:
- URL extraction
- Urlextract scan
- Secret Patterns Database regex

Before running this notebook:

- Complete: [2_parse.ipynb](2_parse.ipynb)

In [16]:
%pip install -q urlextract tqdm pandas pyyaml

Note: you may need to restart the kernel to use updated packages.


In [None]:
import json
import os
import re
from collections import Counter, defaultdict
from math import ceil

import yaml
from urlextract import URLExtract
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Input: comments extracted from the papers, read line by line as JSON
COMMENTS_JSONL = 'data/paper_comments.jsonl'
# Configuration: merged Secrets Patterns DB (YAML)
SECRETS_DB = 'resources/secrets-patterns-db-merged.yaml'
# Output files written by the cells below
URLS_TXT = 'data/extracted_urls.txt'
IPS_TXT = 'data/extracted_ips.txt'
PUBLIC_IPS_TXT = 'data/extracted_public_ips.txt'
# Result of the manual IP lookup, read back in later
IP_LOOKUP_CSV = 'data/ip_lookup.csv'

In [19]:
# Iterator class for convenient processing of the dataset
class PaperExtractedCommentIterator:
    """Single-pass iterator over a JSON Lines file of extracted comments.

    Each step yields ``(index, record)``: the zero-based position of the
    record and the object decoded from the corresponding line. The number of
    records is counted up front so that ``len()`` works (e.g. for progress
    bars). Blank lines are ignored both when counting and when iterating.

    Args:
        comments_file: Path to the UTF-8 encoded JSON Lines file.

    Raises:
        OSError: If the file cannot be opened.
    """

    def __init__(self, comments_file: str):
        # Set before any I/O so that __del__ is safe even when opening fails.
        self.file_reader = None
        self.comments_file = comments_file
        self.current_paper_index = 0
        # Count the records (non-blank lines) in a first pass.
        with open(comments_file, 'r', encoding='utf-8') as f:
            self.iteration_count = sum(1 for line in f if line.strip())
        # Second handle, kept open for the lifetime of the iteration.
        self.file_reader = open(comments_file, 'r', encoding='utf-8')

    def __iter__(self):
        return self

    def __len__(self):
        return self.iteration_count

    def __del__(self):
        self._close()

    def _close(self) -> None:
        """Close the reader if it was ever opened; safe to call repeatedly."""
        reader = getattr(self, 'file_reader', None)
        if reader is not None:
            reader.close()

    def __next__(self) -> tuple[int, dict[str, str]]:
        if self.current_paper_index >= self.iteration_count:
            self._close()
            raise StopIteration
        # Skip blank lines so they never reach json.loads.
        line = self.file_reader.readline()
        while line and not line.strip():
            line = self.file_reader.readline()
        if not line:
            # The file ended earlier than the initial count suggested.
            self._close()
            raise StopIteration
        content = self.current_paper_index, json.loads(line)
        self.current_paper_index += 1
        return content

## URLs

In [20]:
# Collect the distinct URLs found in the comments of every paper
urls = set()
url_extractor = URLExtract()
paper_iterator = PaperExtractedCommentIterator(COMMENTS_JSONL)
with tqdm(total=len(paper_iterator)) as pbar:
    for _, paper in paper_iterator:
        url_list = url_extractor.find_urls(paper['comments'])
        urls |= set(url_list)
        # Advance the bar first, then show the running number of unique URLs
        pbar.update(1)
        pbar.set_description(f"Extracting URLs: found_urls={len(urls)}")

urls

FileNotFoundError: [Errno 2] No such file or directory: 'data/paper_comments.jsonl'

In [None]:
# Persist the extracted URLs, one per line
url_list = list(urls)
with open(URLS_TXT, 'w', encoding='utf-8') as f:
    f.writelines(f"{url}\n" for url in url_list)

## IP Addresses

In [None]:
# Dotted-quad candidates; the regex alone also accepts octets above 255,
# so every candidate is range-checked before it is kept.
ip_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')


def _is_valid_ipv4(candidate: str) -> bool:
    """Return True if every dot-separated octet of *candidate* is <= 255."""
    return all(int(octet) <= 255 for octet in candidate.split('.'))


# At most one IP is recorded per URL: the first candidate that is a valid
# IPv4 address. NOTE(review): dotted version numbers inside URLs
# (e.g. "1.2.3.4") still look like addresses and are kept.
ips = set()
for url in urls:
    for match in ip_pattern.finditer(url):
        if _is_valid_ipv4(match.group()):
            ips.add(match.group())
            break

len(ips), ips

In [None]:
# Persist every extracted IP address, one per line
ip_list = list(ips)
with open(IPS_TXT, 'w', encoding='utf-8') as f:
    f.writelines(f"{ip}\n" for ip in ip_list)

In [None]:
# Keep the IPs that fall into the 10.x.x.x, 172.16-31.x.x or 192.168.x.x ranges
internal_ip_pattern = re.compile(r'\b(10\.(?:[0-9]{1,3}\.){2}[0-9]{1,3}|172\.(1[6-9]|2[0-9]|3[0-1])\.(?:[0-9]{1,3}\.)[0-9]{1,3}|192\.168\.(?:[0-9]{1,3}\.)[0-9]{1,3})\b')
# filter() keeps an IP whenever the pattern search returns a match object
internal_ips = list(filter(internal_ip_pattern.search, ips))
len(internal_ips), internal_ips

In [None]:
# IPs in the 127.x.x.x loopback block
loopback_ips = list(filter(lambda ip: ip.startswith('127.'), ips))
len(loopback_ips), loopback_ips

In [None]:
# External/Public IPs: everything that is neither internal nor loopback.
# The exclusions are merged into one set so each membership test is O(1)
# instead of scanning two lists for every IP.
non_public_ips = set(internal_ips) | set(loopback_ips)
external_ips = [ip for ip in ips if ip not in non_public_ips]
len(external_ips), external_ips

In [None]:
# Persist the public IPs, one per line, as input for the external lookup tool
with open(PUBLIC_IPS_TXT, 'w', encoding='utf-8') as f:
    f.writelines(f"{ip}\n" for ip in external_ips)

Run the public IPs from `data/extracted_public_ips.txt` through an IP lookup tool (for example: https://www.infobyip.com/ipbulklookup.php) and save the results as CSV to `data/ip_lookup.csv`

In [None]:
# Column layout of the lookup export (kept for reference only)
headers = '"IP","Domain","Country","Region","City","ISP","ASN","Lat","Long","CNAME"'

# Load the lookup results; the first row of the CSV holds the column names
results_df = pd.read_csv(IP_LOOKUP_CSV, header=0)

# Frequency tables for the country and ISP columns
country_toplist, isp_toplist = (
    results_df[column].value_counts() for column in ('Country', 'ISP')
)

country_toplist, isp_toplist

## Domains

In [None]:
# Domain extractor
# The scheme is matched case-insensitively so "HTTP://..." is recognised too.
_DOMAIN_PATTERN = re.compile(r"https?://([A-Za-z0-9.-]+)", re.IGNORECASE)


def extract_domain(url: str) -> "str | None":
    """Return the normalised host name of the first http(s) URL in *url*.

    The host is lower-cased and stripped of trailing dots (e.g. a
    sentence-ending period), so differently written forms of one domain are
    counted under the same key.

    Args:
        url: Text containing a URL with an explicit ``http://`` or
            ``https://`` scheme.

    Returns:
        The host name, or ``None`` if no http(s) URL with a non-empty host
        is present.
    """
    match = _DOMAIN_PATTERN.search(url)
    if not match:
        return None
    domain = match.group(1).rstrip('.').lower()
    return domain or None


tests = [
    ("http://example.com/path", "example.com"),
    ("http://www.ieee.com", "www.ieee.com"),
    ("http://www.dmlr.org/format/natbib.pdf", "www.dmlr.org"),
    ("HTTPS://Example.COM/path", "example.com"),
    ("see http://www.ieee.com.", "www.ieee.com"),
    ("example.com/path", None),
]

for url, expected in tests:
    assert extract_domain(url) == expected, f"Failed for {url}"

In [None]:
# Count domain occurrences; URLs without an extractable domain are skipped
domain_counts = Counter(
    domain for domain in map(extract_domain, urls) if domain
)

domain_counts.most_common(15)

In [None]:
# Horizontal bar chart of the 10 most frequent domains

top_n = 10
top_domains = domain_counts.most_common(top_n)
total_urls = sum(domain_counts.values())

labels = [entry[0] for entry in top_domains]
counts = [entry[1] for entry in top_domains]
# URLs whose domain is outside the top list; computed but not drawn as a bar
other_count = total_urls - sum(counts)


# Label helpers: drop a leading "www." and shorten overly long names
def clean_label(lbl: str) -> str:
    return lbl.removeprefix('www.')


def truncate(lbl: str, max_len: int = 28) -> str:
    if len(lbl) > max_len:
        return lbl[: max_len - 1] + '…'
    return lbl


clean_labels = [truncate(clean_label(name)) for name in labels]
percents = [count / total_urls * 100 for count in counts]

# Colors: repeat the tab20 palette until it covers every bar, then paint all
# bars with the first color of that palette
base_colors = list(plt.cm.tab20.colors)
palette_repeats = ceil(len(clean_labels) / len(base_colors))
colors = (base_colors * palette_repeats)[: len(clean_labels)]
colors = [colors[0]] * len(top_domains)

# The figure grows with the number of bars but is never lower than 6
height = max(6, 0.5 * len(clean_labels) + 1)
fig, ax = plt.subplots(figsize=(8, height))

ypos = list(range(len(clean_labels)))
bars = ax.barh(ypos, counts, color=colors, edgecolor='white')
ax.set_yticks(ypos)
# Blank tick labels: the domain names are drawn as text next to the bars
ax.set_yticklabels([''] * len(clean_labels))
# Largest value at the top
ax.invert_yaxis()

# Percentage of all URLs, written at a fixed x position of 25 on each row
for row, share in enumerate(percents):
    ax.text(
        25, row,
        f"{share:.1f}%",
        va='center', ha='left', color='white', fontsize=10, fontweight='bold'
    )

# Domain name just past the end of each bar
xmax = max(counts) if counts else 1
label_offset = xmax * 0.01
for row, (label, count) in enumerate(zip(clean_labels, counts)):
    ax.text(
        count + label_offset, row,
        label,
        va='center', ha='left', fontsize=10, color='black'
    )

# Extra room on the right for the domain names
ax.set_xlim(0, xmax * 1.15)

# Share of URLs outside the top list (computed, not displayed)
other_pct = (other_count / total_urls * 100) if total_urls else 0.0
ax.grid(axis='x', linestyle='--', alpha=0.3)
plt.tight_layout(rect=[0, 0.03, 1, 0.98])
plt.savefig('common_domains.pdf')
plt.show()

## Custom search patterns

In [None]:
# Compiled regular expressions for the custom searches, keyed by finding type.
# Most patterns are deliberately broad; expect false positives that need
# manual review.
all_search_patterns = {
    # IBAN (International Bank Account Number)
    'ibans': re.compile(r'\b([A-Z]{2}\d{2}[A-Z0-9]{11,30})\b'),
    # Bank account numbers written as two 8-digit groups joined by a hyphen,
    # optionally followed by a third group of up to 8 digits
    'bank_accounts': re.compile(r'\b(\d{8}-\d{8}-?\d{0,8})\b'),
    # Complete PEM private key blocks (BEGIN ... END), spanning multiple lines
    'ssh_private_keys': re.compile(r'(-----BEGIN (?:RSA|DSA|EC|OPENSSH) PRIVATE KEY-----.*?-----END (?:RSA|DSA|EC|OPENSSH) PRIVATE KEY-----)', re.DOTALL),
    # Generic API keys or secrets (32-64 character alphanumeric tokens)
    'api_keys': re.compile(r'\b([A-Za-z0-9_\-]{32,64})\b'),
    # Email addresses. The TLD class is [A-Za-z]; a "|" inside the brackets
    # would be a literal pipe character, not an alternation.
    'emails': re.compile(r'\b([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})\b'),
    # AWS access keys
    'aws_access_keys': re.compile(r'\b(AKIA[0-9A-Z]{16})\b'),
    # AWS secret keys (any 40-character run of base64-style characters)
    'aws_secret_keys': re.compile(r'\b([0-9a-zA-Z/+]{40})\b'),
    # Credit card numbers (Visa, MasterCard, American Express and others)
    'credit_cards': re.compile(r'\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|6(?:011|5[0-9]{2})[0-9]{12}|(?:2131|1800|35\d{3})\d{11})\b'),
    # Social Security Numbers (SSNs) - US format
    'us_ssns': re.compile(r'\b(\d{3}-\d{2}-\d{4})\b'),
    # JWT tokens
    'jwt_tokens': re.compile(r'\b(eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+)\b'),
    # Google API keys
    'google_api_keys': re.compile(r'\b(AIza[0-9A-Za-z\-_]{35})\b'),
    # Github tokens
    'github_tokens': re.compile(r'\b(ghp_[A-Za-z0-9]{36})\b'),
    # Slack tokens
    'slack_tokens': re.compile(r'\b(xox[baprs]-[A-Za-z0-9]{10,48})\b'),
    # Phone numbers (various formats)
    'phone_numbers': re.compile(r'\b(\+?\d{1,3}?[-.\s]??\(?\d{1,4}?\)?[-.\s]??\d{1,4}[-.\s]??\d{1,9})\b'),
}

In [None]:
# Run the custom pattern searches
# Read the JSON Lines file line by line as UTF-8. Iterating the file only
# splits on real newlines, whereas str.splitlines() would also split on
# characters such as U+2028 that may occur inside a JSON string.
with open(COMMENTS_JSONL, "r", encoding="utf-8") as f:
    comments = [json.loads(line) for line in f if line.strip()]

# Collect every match of every pattern, not just the first one per paper
findings = {key: set() for key in all_search_patterns}
for c in tqdm(comments, desc="Pattern searching comments"):
    comment = c['comments']
    for key, pattern in all_search_patterns.items():
        findings[key].update(m.group() for m in pattern.finditer(comment))

# Remove example.com emails
findings['emails'] = {email for email in findings['emails'] if "example.com" not in email}

# Save results, sorted so that repeated runs produce identical files
os.makedirs('data/custom_patterns', exist_ok=True)
for key, items in findings.items():
    with open(f"data/custom_patterns/{key}.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(sorted(items)))
    print(f'{key}: {len(items)}')

emails = findings['emails']

In [None]:
# Count how often each (lower-cased) domain occurs among the found emails
email_domains = [email.split('@')[1].lower() for email in emails]
email_domain_counts = Counter(email_domains)
# Top 15 domains, total number of emails, number of distinct domains
email_domain_counts.most_common(15), sum(email_domain_counts.values()), len(email_domain_counts)

In [None]:
# Count how often each (lower-cased) top-level domain occurs among the found emails
email_tlds = [email.split('@')[1].rsplit('.', 1)[-1].lower() for email in emails]
email_top_domain_counts = Counter(email_tlds)
email_top_domain_counts.most_common(15), len(email_top_domain_counts)

## Secrets Patterns DB

In [None]:
# Load the Secrets Patterns DB; the context manager closes the file handle
with open(SECRETS_DB, encoding='utf-8') as f:
    sdb_patterns = yaml.safe_load(f)['patterns']

# Compile every pattern that is not low-confidence once, up front, instead
# of handing the raw regex string to re.search for every paper.
compiled_patterns = [
    (entry['pattern']['name'], re.compile(entry['pattern']['regex']))
    for entry in sdb_patterns
    if entry['pattern']['confidence'] != 'low'
]

findings = defaultdict(set)
# Number of (paper, pattern) pairs with at least one match
finding_count = 0
for c in tqdm(comments, desc="Pattern searching comments"):
    comment = c['comments']
    for name, regex in compiled_patterns:
        # Record every match in the paper, not just the first one
        matches = {m.group() for m in regex.finditer(comment)}
        if matches:
            findings[name].update(matches)
            finding_count += 1

# Save results, sorted so that repeated runs produce identical files
os.makedirs('data/db_patterns', exist_ok=True)
for key, items in findings.items():
    # Pattern names come from the YAML file; replace path separators so a
    # name can never point outside the output directory.
    file_name = re.sub(r'[\\/]', '_', key)
    with open(f"data/db_patterns/{file_name}.txt", "w", encoding='utf-8') as f:
        f.write("\n".join(sorted(items)))
    print(f'{key}: {len(items)}')

finding_count, len(sdb_patterns)