In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import os
import re
import pickle
from collections import defaultdict
from datetime import datetime
from typing import Any, Dict, Optional, Tuple

import networkx as nx


# -----------------------------
# MaxMind ASN lookup (geoip2 or maxminddb)
# -----------------------------

class ASNLookup:
    """Resolve IP addresses to autonomous system numbers via a MaxMind ASN database.

    The ``geoip2`` reader is tried first. If importing it or opening the
    database through it fails for any reason, the ``maxminddb`` reader is used
    instead. Lookup results, including failed lookups (``None``), are cached
    per IP string for the lifetime of the instance.

    Instances may be used as context managers; the reader is closed on exit::

        with ASNLookup(path) as lookup:
            lookup.lookup_asn("8.8.8.8")
    """

    def __init__(self, asn_mmdb_path: str):
        """Open the database at ``asn_mmdb_path``.

        Raises whatever the ``maxminddb`` fallback raises if neither reader
        can be set up (e.g. ImportError when the package is missing).
        """
        self._mode = None
        self._reader = None
        # IP string -> ASN (or None when the lookup failed / had no ASN).
        self._cache: Dict[str, Optional[int]] = {}
        try:
            import geoip2.database  # type: ignore
            self._mode = "geoip2"
            self._reader = geoip2.database.Reader(asn_mmdb_path)
        except Exception:
            # Deliberately broad: any geoip2 problem triggers the fallback reader.
            self._mode = "maxminddb"
            import maxminddb  # type: ignore
            self._reader = maxminddb.open_database(asn_mmdb_path)

    def __enter__(self) -> "ASNLookup":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        # Always release the reader; exceptions from the with-body propagate.
        self.close()

    def close(self) -> None:
        """Close the underlying reader (best effort). Safe to call repeatedly."""
        # Drop the reference first so a second call is a no-op.
        reader, self._reader = self._reader, None
        if reader is None:
            return
        try:
            reader.close()
        except Exception:
            # Closing is best effort; a failure here must not mask the
            # caller's own error handling.
            pass

    def lookup_asn(self, ip: str) -> Optional[int]:
        """Return the ASN for ``ip`` as an int, or ``None`` if it cannot be resolved.

        Any exception raised by the reader (address not in the database,
        malformed address, closed reader, ...) is treated as "no ASN". The
        outcome is cached, so each distinct IP string hits the reader once.
        """
        if ip in self._cache:
            return self._cache[ip]

        asn = None
        try:
            if self._mode == "geoip2":
                resp = self._reader.asn(ip)  # type: ignore
                asn = getattr(resp, "autonomous_system_number", None)
            else:
                data = self._reader.get(ip)  # type: ignore
                if isinstance(data, dict):
                    asn = data.get("autonomous_system_number")
        except Exception:
            asn = None

        if asn is not None:
            # Normalise to int; anything that is not int-convertible counts as missing.
            try:
                asn = int(asn)
            except Exception:
                asn = None

        self._cache[ip] = asn
        return asn


# -----------------------------
# Utils
# -----------------------------

# Dotted quad made of ASCII digits only. [0-9] is used instead of \d because
# \d also matches non-ASCII Unicode digits, and \Z instead of $ because $ also
# matches just before a trailing newline.
_ip_re = re.compile(r"^[0-9]{1,3}(\.[0-9]{1,3}){3}\Z")

def is_ipv4(ip: str) -> bool:
    """Return True if ``ip`` is a dotted-quad IPv4 address string.

    Exactly four groups of 1-3 ASCII digits separated by dots, each in the
    range 0-255. Leading zeros are accepted ("001.002.003.004"). Surrounding
    whitespace is not stripped; callers must do that themselves. Empty,
    ``None`` and non-string inputs yield False.
    """
    if not isinstance(ip, str) or not _ip_re.match(ip):
        return False
    # The regex guarantees every part is 1-3 ASCII digits, so int() cannot fail.
    return all(int(octet) <= 255 for octet in ip.split("."))


def extract_day_yyyymmdd(ts: str) -> Optional[str]:
    """
    Extract the calendar day of a timestamp as a compact YYYYMMDD string.

    "2025-09-04T08:44:02+08:00" -> "20250904"

    The day is taken as written in the timestamp; no timezone conversion is
    applied. If the string is not parseable by ``datetime.fromisoformat``, the
    first ``YYYY-MM-DD`` pattern found anywhere in it is used instead, provided
    it denotes a real calendar date.

    Returns None for empty, ``None`` or non-string input, and when no valid
    date can be extracted.
    """
    if not ts or not isinstance(ts, str):
        return None
    try:
        parsed = datetime.fromisoformat(ts)
    except ValueError:
        # fallback: grab YYYY-MM-DD
        m = re.search(r"(\d{4})-(\d{2})-(\d{2})", ts)
        if not m:
            return None
        try:
            # Constructing a datetime rejects impossible dates such as
            # month 13 or Feb 30 instead of passing them through.
            parsed = datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)))
        except ValueError:
            return None
    # Format explicitly: strftime("%Y") is not zero-padded for years < 1000
    # on every platform.
    return f"{parsed.year:04d}{parsed.month:02d}{parsed.day:02d}"


# -----------------------------
# Build Resolution Graph
# -----------------------------

def build_resolution_graph(
    dataset_jsonl: str = "../data/datasets/dataset_sample.jsonl",
    asn_mmdb: str = "../data/GeoLite2-ASN_20250702/GeoLite2-ASN.mmdb",
    out_path: str = "../outputs/resolution_graph.gpickle",
) -> nx.DiGraph:
    """
    Build the domain -> ASN resolution graph from a JSONL dataset and pickle it.

    Resolution Graph:
      Nodes: D:<domain>, S:<asn>
      Edge:  D -> S
      Edge attrs:
        - days: set of YYYYMMDD
        - day_counts: dict {YYYYMMDD: count}
        - total_count: int

    Each input line is expected to be a JSON object with a string ``name``
    (the domain), a string ``timestamp`` and ``data.answers``, a list of
    answer objects carrying ``type`` and ``answer``. Lines that are blank, are
    not valid JSON, or do not have this shape are skipped. A record with a
    usable domain and day always creates its ``D:`` node, even when none of
    its answers yields an edge. Only type-A answers holding an IPv4 address
    that resolves to an ASN contribute edges.

    Args:
        dataset_jsonl: Path of the JSONL dataset to read.
        asn_mmdb: Path of the MaxMind ASN database.
        out_path: Where the pickled graph is written; parent directories are
            created if needed.

    Returns:
        The constructed ``nx.DiGraph`` (also written to ``out_path``).

    Raises:
        FileNotFoundError: If ``dataset_jsonl`` or ``asn_mmdb`` does not exist.
    """
    if not os.path.exists(dataset_jsonl):
        raise FileNotFoundError(dataset_jsonl)
    if not os.path.exists(asn_mmdb):
        raise FileNotFoundError(asn_mmdb)

    graph = nx.DiGraph()
    asn_lookup = ASNLookup(asn_mmdb)

    try:
        with open(dataset_jsonl, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except (ValueError, RecursionError):
                    # Malformed (or pathologically nested) JSON: skip the line.
                    continue
                # Valid JSON that is not an object (list, number, ...) carries
                # no record; skip it instead of crashing on .get().
                if not isinstance(obj, dict):
                    continue

                name = obj.get("name")
                domain = name.strip().lower() if isinstance(name, str) else ""
                if not domain:
                    continue

                ts = obj.get("timestamp")
                day = extract_day_yyyymmdd(ts.strip()) if isinstance(ts, str) else None
                if not day:
                    continue

                data = obj.get("data")
                answers = data.get("answers") if isinstance(data, dict) else None
                if not isinstance(answers, list):
                    answers = []

                d_node = f"D:{domain}"
                if d_node not in graph:
                    graph.add_node(d_node, ntype="D", value=domain)

                # Only IPv4 answers from A records are used.
                for ans in answers:
                    if not isinstance(ans, dict):
                        continue
                    if str(ans.get("type", "")).upper() != "A":
                        continue
                    ip = str(ans.get("answer", "")).strip()
                    if not is_ipv4(ip):
                        continue

                    asn = asn_lookup.lookup_asn(ip)
                    if asn is None:
                        continue

                    s_node = f"S:{asn}"
                    if s_node not in graph:
                        graph.add_node(s_node, ntype="S", value=str(asn))

                    # Update the D -> S edge, creating it on first sight.
                    if not graph.has_edge(d_node, s_node):
                        graph.add_edge(
                            d_node, s_node,
                            days=set(),
                            day_counts={},
                            total_count=0
                        )

                    edge = graph[d_node][s_node]
                    edge["days"].add(day)
                    edge["day_counts"][day] = edge["day_counts"].get(day, 0) + 1
                    edge["total_count"] += 1

        # save
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(out_path, "wb") as wf:
            pickle.dump(graph, wf, protocol=pickle.HIGHEST_PROTOCOL)

        print(f"Resolution graph saved to {out_path}")
        print(f"Nodes={graph.number_of_nodes()}, Edges={graph.number_of_edges()}")
        return graph

    finally:
        # Release the database reader whether or not the build succeeded.
        asn_lookup.close()


# Entry point: build and save the resolution graph using the default paths
# when this code is run directly rather than imported.
if __name__ == "__main__":
    build_resolution_graph()


Resolution graph saved to ../outputs/resolution_graph.gpickle
Nodes=4217, Edges=4011


In [3]:
import pickle

# Inspect a single node of the saved resolution graph: its attributes and
# every outgoing / incoming edge with per-edge totals.
GRAPH_PATH = "../outputs/resolution_graph.gpickle"
NODE = "D:yandex.ru"


def _print_edges(title, edges):
    """Print a section header, then one summary line per (u, v, data) edge."""
    print(f"\n=== {title} ===")
    for u, v, data in edges:
        print(f"{u} --> {v}  total={data.get('total_count')}  days={len(data.get('days', []))}")


# NOTE: unpickling can execute arbitrary code from the file. Only load graph
# files you produced yourself or otherwise trust.
with open(GRAPH_PATH, "rb") as f:
    G = pickle.load(f)

# Branch instead of raising SystemExit: exiting the interpreter is not a
# clean way to skip the rest of a notebook cell.
if NODE not in G:
    print("Node not found:", NODE)
else:
    print("Node:", NODE)
    print("Node attrs:", G.nodes[NODE])

    _print_edges("Out edges", G.out_edges(NODE, data=True))
    _print_edges("In edges", G.in_edges(NODE, data=True))

    print(f"\nDone. out_degree={G.out_degree(NODE)} in_degree={G.in_degree(NODE)}")


Node: D:yandex.ru
Node attrs: {'ntype': 'D', 'value': 'yandex.ru'}

=== Out edges ===
D:yandex.ru --> S:40676  total=1  days=1

=== In edges ===

Done. out_degree=1 in_degree=0
