In [13]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

import csv
import json
import logging
import os
import pickle
import ipaddress
from collections import Counter
from typing import Any, Dict, Optional, Tuple

import networkx as nx


# -----------------------------
# MaxMind lookup (geoip2 first, fallback to maxminddb)
# -----------------------------
class MaxMindLookup:
    """Cached IP -> ASN / organization / country lookups over two .mmdb files.

    The ``geoip2`` reader is tried first; if importing it or opening either
    database with it fails for any reason, the ``maxminddb`` reader is used
    instead. ``self.mode`` records the active backend (``"geoip2"`` or
    ``"maxminddb"``).
    """

    def __init__(self, asn_mmdb: str, country_mmdb: str):
        """Open the ASN and country databases.

        Args:
            asn_mmdb: Path to the ASN database file.
            country_mmdb: Path to the country database file.

        Raises:
            Exception: Whatever the ``maxminddb`` fallback raises when it
                cannot be imported or cannot open one of the files.
        """
        self.mode = None
        self.asn_reader = None
        self.country_reader = None
        # Memo of lookup() results keyed by the IP string as passed in.
        self.cache: Dict[str, Dict[str, Any]] = {}

        try:
            import geoip2.database  # type: ignore
            self.mode = "geoip2"
            self.asn_reader = geoip2.database.Reader(asn_mmdb)
            self.country_reader = geoip2.database.Reader(country_mmdb)
        except Exception:
            # One reader may already be open (e.g. the second open failed);
            # release it before switching backends so no handle is leaked.
            self.close()
            self.asn_reader = None
            self.country_reader = None

            import maxminddb  # type: ignore
            self.mode = "maxminddb"
            try:
                self.asn_reader = maxminddb.open_database(asn_mmdb)
                self.country_reader = maxminddb.open_database(country_mmdb)
            except Exception:
                # Same leak guard for the fallback backend, then re-raise.
                self.close()
                raise

    def close(self) -> None:
        """Close both readers, ignoring errors raised while closing."""
        for r in (self.asn_reader, self.country_reader):
            try:
                if r is not None:
                    r.close()
            except Exception:
                # Best effort: a failing close must not mask the caller's flow.
                pass

    def _lookup_asn(self, ip: str) -> Tuple[Any, Any]:
        """Return ``(asn, org)`` for ``ip``; ``(None, None)`` on any failure."""
        try:
            if self.mode == "geoip2":
                a = self.asn_reader.asn(ip)  # type: ignore
                return (
                    getattr(a, "autonomous_system_number", None),
                    getattr(a, "autonomous_system_organization", None),
                )
            a = self.asn_reader.get(ip)  # type: ignore
            if isinstance(a, dict):
                return (
                    a.get("autonomous_system_number"),
                    a.get("autonomous_system_organization"),
                )
        except Exception:
            # A miss or malformed address is treated as "unknown".
            pass
        return None, None

    def _lookup_country(self, ip: str) -> Any:
        """Return the raw ISO country code for ``ip``; ``None`` on any failure."""
        try:
            if self.mode == "geoip2":
                cc = self.country_reader.country(ip)  # type: ignore
                return getattr(getattr(cc, "country", None), "iso_code", None)
            cc = self.country_reader.get(ip)  # type: ignore
            if isinstance(cc, dict):
                return (cc.get("country") or {}).get("iso_code")
        except Exception:
            # A miss or malformed address is treated as "unknown".
            pass
        return None

    def lookup(self, ip: str) -> Dict[str, Any]:
        """Look up ASN, organization and country for one IP address.

        The two databases are queried independently, so a failure in the ASN
        query does not prevent the country from being resolved (and vice
        versa).

        Args:
            ip: IP address string.

        Returns:
            Dict with keys ``"asn"``, ``"org"`` and ``"country"``. Unknown
            values are ``None``; a string country code is lower-cased. The
            same dict object is returned for repeated calls with the same
            ``ip``.
        """
        if ip in self.cache:
            return self.cache[ip]

        asn, org = self._lookup_asn(ip)
        c = self._lookup_country(ip)

        if isinstance(c, str):
            c = c.lower()

        out = {"asn": asn, "org": org, "country": c}
        self.cache[ip] = out
        return out


# -----------------------------
# Small helpers
# -----------------------------
def nid(ntype: str, value: str) -> str:
    """Compose a graph node id of the form ``"<ntype>:<value>"``."""
    return "{}:{}".format(ntype, value)

def is_ipv4(x: str) -> bool:
    """Tell whether ``x`` parses as an IPv4 address.

    Anything ``ipaddress.ip_address`` rejects yields ``False`` instead of an
    exception.
    """
    try:
        parsed = ipaddress.ip_address(x)
    except Exception:
        return False
    return parsed.version == 4

def parse_resolver(s: str) -> Tuple[str, Optional[int]]:
    """Split a resolver string into ``(host, port)``.

    Accepted forms:
        ``"210.2.4.8:53"``     -> ``("210.2.4.8", 53)``
        ``"210.2.4.8"``        -> ``("210.2.4.8", None)``
        ``"2001:db8::1"``      -> ``("2001:db8::1", None)``
        ``"[2001:db8::1]:53"`` -> ``("2001:db8::1", 53)``

    Args:
        s: Resolver string; empty or ``None`` yields ``("", None)``.

    Returns:
        The host with surrounding whitespace removed, and the port as an int,
        or ``None`` when the port is absent or not an integer.
    """
    if not s:
        return "", None

    text = s.strip()

    # A bare IP literal has no port. Checking this first keeps an unbracketed
    # IPv6 address from being split at its last colon.
    try:
        ipaddress.ip_address(text)
        return text, None
    except ValueError:
        pass

    # Bracketed IPv6, optionally followed by ":port".
    if text.startswith("["):
        host, closing, rest = text[1:].partition("]")
        if closing:
            if rest.startswith(":"):
                try:
                    return host.strip(), int(rest[1:])
                except ValueError:
                    return host.strip(), None
            return host.strip(), None

    # "host:port" — split on the last colon only.
    if ":" in text:
        ip, port = text.rsplit(":", 1)
        try:
            return ip.strip(), int(port)
        except ValueError:
            return ip.strip(), None
    return text, None


# -----------------------------
# Builder
# -----------------------------
class EntityGraphBuilder:
    """Incrementally builds a typed entity graph in ``self.G``.

    Node ids are ``"<type>:<value>"`` strings. The type prefixes used by the
    ingest methods are: ``D`` domain, ``G`` domain category, ``C`` country,
    ``P`` policy, ``R`` resolver IP, ``A`` answer IP, ``S`` ASN and
    ``O`` organization. Every node stores ``ntype`` and ``value`` attributes;
    every edge stores an ``rtype`` attribute.
    """

    def __init__(self, asn_mmdb: str, country_mmdb: str, bidirectional: bool = True, log: Optional[logging.Logger] = None):
        """Open the MaxMind databases and create an empty graph.

        Args:
            asn_mmdb: Path to the ASN database file.
            country_mmdb: Path to the country database file.
            bidirectional: When true, every added edge also gets a reverse
                edge whose ``rtype`` carries a ``_rev`` suffix.
            log: Logger to use; defaults to the ``"EntityGraphBuilder"`` logger.
        """
        self.mm = MaxMindLookup(asn_mmdb, country_mmdb)
        self.G = nx.MultiDiGraph()
        self.bi = bidirectional
        self.log = log or logging.getLogger("EntityGraphBuilder")

    def close(self) -> None:
        """Release the MaxMind readers."""
        self.mm.close()

    def ensure_node(self, node: str, **attrs: Any) -> None:
        """Add ``node`` if it is not in the graph yet.

        ``ntype`` and ``value`` are derived by splitting the id at its first
        colon. If the node already exists nothing changes — ``attrs`` from
        later calls are ignored.
        """
        if node not in self.G:
            ntype, value = node.split(":", 1)
            self.G.add_node(node, ntype=ntype, value=value, **attrs)

    def add_edge(self, u: str, v: str, rtype: str, **attrs: Any) -> None:
        """Add a ``u -> v`` edge of type ``rtype`` unless one already exists.

        When the builder is bidirectional, a ``v -> u`` edge with
        ``rtype + "_rev"`` and the same attributes is added as well.
        """
        if self.G.has_edge(u, v):
            # MultiDiGraph allows parallel edges; dedupe on (u, v, rtype).
            for _k, d in self.G[u][v].items():
                if d.get("rtype") == rtype:
                    return
        self.G.add_edge(u, v, rtype=rtype, **attrs)
        if self.bi:
            self.G.add_edge(v, u, rtype=f"{rtype}_rev", **attrs)

    def attach_ip_attrs(self, ip_node: str, ip: str, role: str) -> None:
        """Link an IP node to its ASN, organization and country nodes.

        Args:
            ip_node: Id of the existing IP node.
            ip: The IP address to look up.
            role: Prefix for the edge types, producing ``<role>_in_asn``,
                ``<role>_in_org`` and ``<role>_in_country``.
        """
        info = self.mm.lookup(ip)

        asn = info.get("asn")
        if asn is not None:
            s = nid("S", str(int(asn)))
            self.ensure_node(s)
            self.add_edge(ip_node, s, f"{role}_in_asn")

        org = info.get("org")
        if org:
            # Collapse runs of whitespace so equivalent names share one node.
            org = " ".join(str(org).split()).strip()
            if org:
                o = nid("O", org)
                self.ensure_node(o)
                self.add_edge(ip_node, o, f"{role}_in_org")

        country = info.get("country")
        if country:
            c = nid("C", str(country).lower())
            self.ensure_node(c)
            self.add_edge(ip_node, c, f"{role}_in_country")

    # ---- ingest ----
    def add_domain_info(self, csv_path: str) -> None:
        """Ingest domain metadata from a CSV file.

        Reads the columns ``domain``, ``type``, ``country`` and
        ``description``. Each domain becomes a ``D`` node (with a
        ``description`` attribute), linked to its country and — unless the
        type is empty or ``"other"`` — to a ``G`` category node. A missing
        file is logged as a warning and skipped.
        """
        if not os.path.exists(csv_path):
            self.log.warning("missing: %s", csv_path)
            return
        # utf-8-sig drops a leading BOM so the first header name still
        # matches; newline="" is what the csv module requires of its input.
        with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
            for row in csv.DictReader(f):
                dom = (row.get("domain") or "").strip().lower()
                if not dom:
                    continue
                dtype = (row.get("type") or "").strip().lower()
                country = (row.get("country") or "").strip().lower()
                desc = (row.get("description") or "").strip()

                d = nid("D", dom)
                self.ensure_node(d, description=desc)

                if country:
                    c = nid("C", country)
                    self.ensure_node(c)
                    self.add_edge(d, c, "domain_in_country")

                if dtype and dtype != "other":
                    g = nid("G", dtype)
                    self.ensure_node(g)
                    self.add_edge(d, g, "domain_category")

    def add_policy(self, csv_path: str) -> None:
        """Ingest country policies from a CSV file.

        Reads the columns ``src_country``, ``tgt_country`` and
        ``description``; rows missing any of them are skipped. Each row yields
        ``C(src) -> P(description) -> C(tgt)``. A missing file is logged as a
        warning and skipped.
        """
        if not os.path.exists(csv_path):
            self.log.warning("missing: %s", csv_path)
            return
        # See add_domain_info for the encoding / newline choice.
        with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
            for row in csv.DictReader(f):
                src = (row.get("src_country") or "").strip().lower()
                tgt = (row.get("tgt_country") or "").strip().lower()
                desc = (row.get("description") or "").strip()
                if not (src and tgt and desc):
                    continue

                c_src = nid("C", src)
                c_tgt = nid("C", tgt)
                p = nid("P", desc)

                self.ensure_node(c_src)
                self.ensure_node(c_tgt)
                self.ensure_node(p)

                self.add_edge(c_src, p, "country_has_policy")
                self.add_edge(p, c_tgt, "policy_targets_country")

    def add_dns_records(self, jsonl_path: str) -> None:
        """Ingest DNS records from a JSON-lines file.

        Each line is expected to be an object with a ``name`` and a ``data``
        object holding ``resolver`` (``"ip:port"``) and ``answers`` (a list of
        objects with ``type`` and ``answer``). Lines that are not valid JSON
        objects are logged and skipped; fields of an unexpected type are
        treated as absent rather than raising.

        Adds a ``D`` node per name, an ``R`` node per IPv4 resolver and an
        ``A`` node per IPv4 ``A`` answer, and links the IP nodes to their
        ASN / organization / country.

        NOTE(review): no edge is created between the domain node and its
        resolver or answer nodes — confirm that this is intentional.

        Raises:
            FileNotFoundError: If ``jsonl_path`` does not exist.
        """
        if not os.path.exists(jsonl_path):
            raise FileNotFoundError(jsonl_path)

        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except Exception:
                    self.log.warning("bad json line %d", line_no)
                    continue
                if not isinstance(obj, dict):
                    # Valid JSON but not a record (list, string, number, ...).
                    self.log.warning("non-object json line %d", line_no)
                    continue

                name = obj.get("name")
                dom = name.strip().lower() if isinstance(name, str) else ""
                if not dom:
                    continue

                data = obj.get("data")
                if not isinstance(data, dict):
                    data = {}
                resolver_raw = data.get("resolver")
                resolver_field = resolver_raw.strip() if isinstance(resolver_raw, str) else ""
                answers = data.get("answers")
                if not isinstance(answers, list):
                    answers = []

                d = nid("D", dom)
                self.ensure_node(d)

                # resolver node + attrs
                r_ip, r_port = parse_resolver(resolver_field)
                if r_ip and is_ipv4(r_ip):
                    r = nid("R", r_ip)
                    self.ensure_node(r, port=r_port)
                    self.attach_ip_attrs(r, r_ip, "resolver")

                # A answers
                for ans in answers:
                    if not isinstance(ans, dict):
                        continue
                    if str(ans.get("type", "")).upper() != "A":
                        continue
                    ip = str(ans.get("answer", "")).strip()
                    if not is_ipv4(ip):
                        continue
                    a = nid("A", ip)
                    self.ensure_node(a)
                    self.attach_ip_attrs(a, ip, "answer_ip")


def build_entity_graph(
    dataset_jsonl: str = "../data/datasets/dataset_sample.jsonl",
    asn_mmdb: str = "../data/GeoLite2-ASN_20250702/GeoLite2-ASN.mmdb",
    country_mmdb: str = "../data/GeoLite2-Country_20250702/GeoLite2-Country.mmdb",
    policy_csv: str = "../data/datasets/policy_country.csv",
    domain_info_csv: str = "../data/datasets/monitor_domain_info.csv",
    out_path: str = "../outputs/entity_graph.gpickle",
    bidirectional: bool = True,
) -> nx.MultiDiGraph:
    """Build the entity graph from all inputs, pickle it, and return it.

    Ingest order is domain info, then policy, then DNS records. Node and edge
    totals plus per-type node counts are logged before the graph is written
    to ``out_path`` (its parent directory is created when needed). The
    builder's database readers are closed on every exit path.

    Args:
        dataset_jsonl: JSON-lines file of DNS records.
        asn_mmdb: Path to the ASN database file.
        country_mmdb: Path to the country database file.
        policy_csv: CSV file of country policies.
        domain_info_csv: CSV file of domain metadata.
        out_path: Destination of the pickled graph.
        bidirectional: Whether reverse edges are added alongside each edge.

    Returns:
        The graph that was built and saved.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
    logger = logging.getLogger("build_entity_graph")

    builder = EntityGraphBuilder(asn_mmdb, country_mmdb, bidirectional=bidirectional, log=logger)
    try:
        # (log format, ingest step, input path) in execution order.
        stages = (
            ("domain info: %s", builder.add_domain_info, domain_info_csv),
            ("policy: %s", builder.add_policy, policy_csv),
            ("dns records: %s", builder.add_dns_records, dataset_jsonl),
        )
        for message, ingest, path in stages:
            logger.info(message, path)
            ingest(path)

        graph = builder.G
        logger.info("graph: nodes=%d edges=%d", graph.number_of_nodes(), graph.number_of_edges())

        # node type counts
        type_counts = Counter()
        for _node, node_attrs in graph.nodes(data=True):
            type_counts[node_attrs.get("ntype", "UNK")] += 1
        logger.info("node types: %s", dict(sorted(type_counts.items())))

        parent_dir = os.path.dirname(out_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(out_path, "wb") as sink:
            pickle.dump(graph, sink, protocol=pickle.HIGHEST_PROTOCOL)
        logger.info("saved: %s", out_path)
        return graph
    finally:
        builder.close()


# Entry point: run the whole build with the default input and output paths.
if __name__ == "__main__":
    build_entity_graph()


2026-01-12 03:22:02,888 [INFO] build_entity_graph: domain info: ../data/datasets/monitor_domain_info.csv


2026-01-12 03:22:04,417 [INFO] build_entity_graph: policy: ../data/datasets/policy_country.csv
2026-01-12 03:22:04,428 [INFO] build_entity_graph: dns records: ../data/datasets/dataset_sample.jsonl
2026-01-12 03:22:05,426 [INFO] build_entity_graph: graph: nodes=50084 edges=133302
2026-01-12 03:22:05,453 [INFO] build_entity_graph: node types: {'A': 3690, 'C': 254, 'D': 43288, 'G': 11, 'O': 714, 'P': 43, 'R': 1348, 'S': 736}
2026-01-12 03:22:06,026 [INFO] build_entity_graph: saved: ../outputs/entity_graph.gpickle


In [14]:
import pickle

GRAPH_PATH = "../outputs/entity_graph.gpickle"
IP = "185.53.79.83"

# NOTE: unpickling can execute arbitrary code — only load graph files that
# were produced locally by the build step.
with open(GRAPH_PATH, "rb") as f:
    G = pickle.load(f)

# Prefer the answer-IP node; fall back to the resolver node for the same IP.
candidates = [f"A:{IP}", f"R:{IP}"]
node = next((n for n in candidates if n in G), None)

if node is None:
    # Report and fall through instead of raising SystemExit, which a notebook
    # kernel surfaces as an exception rather than a clean stop.
    print(f"Node not found for {IP}. Tried: {candidates}")
    print("Tip: check if your graph stores it as A:ip or R:ip.")
else:
    print(f"Found node: {node}")
    print("Node attrs:", G.nodes[node])

    # Outgoing edges
    print("\n=== Out edges ===")
    for u, v, k, data in G.out_edges(node, keys=True, data=True):
        print(f"{u} --[{data.get('rtype')}|key={k}]--> {v}   dst_attrs={G.nodes[v]}")

    # Incoming edges
    print("\n=== In edges ===")
    for u, v, k, data in G.in_edges(node, keys=True, data=True):
        print(f"{u} --[{data.get('rtype')}|key={k}]--> {v}   src_attrs={G.nodes[u]}")

    print(f"\nDone. out_degree={G.out_degree(node)} in_degree={G.in_degree(node)}")


Found node: A:185.53.79.83
Node attrs: {'ntype': 'A', 'value': '185.53.79.83'}

=== Out edges ===
A:185.53.79.83 --[answer_ip_in_asn|key=0]--> S:16223   dst_attrs={'ntype': 'S', 'value': '16223'}
A:185.53.79.83 --[answer_ip_in_org|key=0]--> O:Maxnet Telecom, Ltd   dst_attrs={'ntype': 'O', 'value': 'Maxnet Telecom, Ltd'}
A:185.53.79.83 --[answer_ip_in_country|key=0]--> C:ua   dst_attrs={'ntype': 'C', 'value': 'ua'}

=== In edges ===
S:16223 --[answer_ip_in_asn_rev|key=0]--> A:185.53.79.83   src_attrs={'ntype': 'S', 'value': '16223'}
O:Maxnet Telecom, Ltd --[answer_ip_in_org_rev|key=0]--> A:185.53.79.83   src_attrs={'ntype': 'O', 'value': 'Maxnet Telecom, Ltd'}
C:ua --[answer_ip_in_country_rev|key=0]--> A:185.53.79.83   src_attrs={'ntype': 'C', 'value': 'ua'}

Done. out_degree=3 in_degree=3
