### Anon demo

Assignment:

1) Anon the user field and create a mapping
2) Anon the IP address field (which is expressed as a string, but can also map to a 32-bit integer, though only sparsely so in our data)
3) Make vectors of the anonymous data and clear data
4) Generate a "ragged array" of the data in this form (i.e. dump out the records which are of variable length)
Bonus) Tell us which users' jobs fail and how often, as an absolute number and as a fraction of the whole dataset.

In [1]:
# --- Imports
import os, hmac, hashlib, base64, ipaddress, json
from typing import Any, Dict, Iterable, List, Tuple, Optional
import numpy as np
import pandas as pd

DATA_DIR = "../data"

# ==========
# 0) Key setup (replace with KMS in prod)
# ==========
# Demo-only secret: the HMAC key is the SHA-256 digest of a fixed passphrase,
# so every run yields the same pseudonyms. KEY_HEX is the same key as hex text.
KEY = hashlib.sha256(b"random-demo-key").digest()
KEY_HEX = KEY.hex()

# ==========
# 1) HMAC tokenization (deterministic pseudonym)
# ==========
def _to_bytes(x: Any) -> bytes:
    if x is None:
        return b""
    try:
        if pd.isna(x):
            return b""
    except Exception:
        pass
    if isinstance(x, bytes):
        return x
    return str(x).encode("utf-8", errors="ignore")

def hmac_token(value: Any, key: bytes, out_len: int = 22) -> str:
    """Return a deterministic lowercase base32 pseudonym for *value*.

    The value is converted with ``_to_bytes``, signed with HMAC-SHA256 under
    *key*, base32-encoded, stripped of ``=`` padding, cut to *out_len*
    characters and lower-cased.
    """
    digest = hmac.new(key, _to_bytes(value), hashlib.sha256).digest()
    encoded = base64.b32encode(digest).decode("ascii")
    unpadded = encoded.rstrip("=")
    return unpadded[:out_len].lower()

# ==========
# 2) IPv4 anonymization (deterministic 32-bit remap → dotted-quad)
#    Note: This preserves IPv4 format, not subnet/prefix structure (MVP-friendly).
# ==========
def ip_str_to_int(ip_str: str) -> Optional[int]:
    """Parse a dotted-quad IPv4 string into its integer value.

    Returns ``None`` for ``None``, a float NaN, or any text that
    ``ipaddress.IPv4Address`` rejects.
    """
    is_nan = isinstance(ip_str, float) and np.isnan(ip_str)
    if ip_str is None or is_nan:
        return None
    try:
        address = ipaddress.IPv4Address(str(ip_str))
    except Exception:
        return None
    return int(address)

def int_to_ip_str(ip_int: int) -> str:
    """Render the low 32 bits of *ip_int* as a dotted-quad IPv4 string."""
    low_32_bits = ip_int & 0xFFFFFFFF
    address = ipaddress.IPv4Address(low_32_bits)
    return str(address)

def anon_ipv4(ip_value: Any, key: bytes) -> Optional[str]:
    """Map an IPv4 string (or int-like) to a deterministic anon IPv4 string.

    The address is normalized to a 32-bit integer, signed with HMAC-SHA256
    under *key*, and the first four digest bytes are rendered as a dotted
    quad. The output keeps the IPv4 format but not any subnet structure.

    Args:
        ip_value: Dotted-quad text, or an integer in ``0 .. 2**32 - 1``.
        key: HMAC key.

    Returns:
        The anonymized dotted-quad string, or ``None`` when *ip_value* is
        missing (``None``/NaN), a boolean, an integer outside the 32-bit
        range, or text that is not a valid IPv4 address.
    """
    if ip_value is None or (isinstance(ip_value, float) and np.isnan(ip_value)):
        return None
    # bool is a subclass of int; a True/False cell is not an address.
    if isinstance(ip_value, (bool, np.bool_)):
        return None
    if isinstance(ip_value, (int, np.integer)):
        ip_int = int(ip_value)
        # Out-of-range ints would make to_bytes(4, ...) raise OverflowError;
        # treat them like any other invalid address instead.
        if not 0 <= ip_int <= 0xFFFFFFFF:
            return None
    else:
        ip_int = ip_str_to_int(str(ip_value))
        if ip_int is None:
            return None
    mac = hmac.new(key, ip_int.to_bytes(4, "big", signed=False), hashlib.sha256).digest()
    anon_int = int.from_bytes(mac[:4], "big")  # 32-bit remap
    return int_to_ip_str(anon_int)

def anon_ipv4_both(ip_value: Any, key: bytes) -> Tuple[Optional[str], Optional[int]]:
    """Return the anonymized address as both text and integer.

    The integer is ``None`` whenever ``anon_ipv4`` yields no address.
    """
    anon_str = anon_ipv4(ip_value, key)
    if not anon_str:
        return anon_str, None
    return anon_str, int(ipaddress.IPv4Address(anon_str))

# ==========
# 3) Assignment functions
# ==========

# 1) Anon the user field and create a mapping
def anonymize_user_with_mapping(
    df: pd.DataFrame, user_col: str, key: bytes = KEY, token_len: int = 22
) -> Tuple[pd.Series, Dict[Any, str]]:
    """Pseudonymize ``df[user_col]`` and return the tokens plus the lookup table.

    Each distinct value is tokenized once with ``hmac_token`` and remembered;
    ``None`` and float NaN are passed through unchanged (and also recorded).

    Returns:
        (series of tokens aligned with ``df``, dict of original -> token)
    """
    mapping: Dict[Any, str] = {}

    def _tokenize(raw):
        try:
            return mapping[raw]  # value already seen
        except KeyError:
            pass
        is_missing = raw is None or (isinstance(raw, float) and np.isnan(raw))
        token = raw if is_missing else hmac_token(raw, key, out_len=token_len)
        mapping[raw] = token
        return token

    return df[user_col].map(_tokenize), mapping

# 2) Anon the IP address field (works if your data is strings or 32-bit ints)
def anonymize_ip_with_mapping(
    df: pd.DataFrame, ip_col: str, key: bytes = KEY
) -> Tuple[pd.Series, Dict[Any, Dict[str, Any]]]:
    """
    Anonymize ``df[ip_col]`` value by value, computing each distinct value once.

    Returns:
      anon_ip_str_series,
      mapping: { original -> {"anon_ip": str, "anon_ip_int": int} }
      (both entries are None when the original is not a usable IPv4 address)
    """
    mapping: Dict[Any, Dict[str, Any]] = {}

    def _anonymize(raw):
        entry = mapping.get(raw)
        if entry is None:  # first time this value is seen
            anon_str, anon_int = anon_ipv4_both(raw, key)
            entry = {"anon_ip": anon_str, "anon_ip_int": anon_int}
            mapping[raw] = entry
        return entry["anon_ip"]

    return df[ip_col].map(_anonymize), mapping

# 3) Make vectors of the anonymous data and clear data
def make_vectors(
    df: pd.DataFrame,
    anon_cols: Iterable[str],
    clear_cols: Iterable[str],
    user_col: Optional[str] = None,
    ip_col: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build simple numeric vectors:
      - Strings are hashed to integers (stable) with SHA256 and truncated.
      - Numbers remain as-is; datetime → epoch ms; bool → 0/1;
        None/NaN/NaT/pd.NA → -1.
    Produces (X_anon, X_clear) aligned by row position.

    Args:
        df: Source frame.
        anon_cols: Columns for the anonymized matrix (any iterable, consumed once).
        clear_cols: Columns for the clear-text matrix (any iterable, consumed once).
        user_col: Unused; kept for backward compatibility.
        ip_col: Unused; kept for backward compatibility.

    Returns:
        Two float arrays of shape ``(len(df), n_columns)``; the shape is 2-D
        even when the frame or a column list is empty.
    """
    def _numify(val: Any) -> float:
        # Every missing scalar shares the -1 sentinel.
        if val is None or (pd.api.types.is_scalar(val) and pd.isna(val)):
            return -1.0
        if isinstance(val, (bool, np.bool_)):
            return 1.0 if val else 0.0
        if isinstance(val, (int, float, np.integer, np.floating)):
            return float(val)
        if isinstance(val, (pd.Timestamp, np.datetime64)):
            try:
                return pd.to_datetime(val).value / 1e6  # ns → ms
            except Exception:
                # Best effort: unconvertible timestamps fall through to the hash.
                pass
        # strings/other → deterministic hash to 32-bit range
        h = hashlib.sha256(_to_bytes(val)).digest()
        return float(int.from_bytes(h[:4], "big"))  # 0..2^32-1

    def _matrix(cols: Iterable[str]) -> np.ndarray:
        # Materialize once so one-shot iterators work, then convert column by
        # column; positional access is unaffected by duplicate index labels.
        names = list(cols)
        out = np.empty((len(df), len(names)), dtype=float)
        for j, name in enumerate(names):
            out[:, j] = [_numify(v) for v in df[name].tolist()]
        return out

    return _matrix(anon_cols), _matrix(clear_cols)

# 4) Generate a ragged array (variable-length dump)
def to_ragged_records(df: pd.DataFrame, include_cols: Iterable[str]) -> List[List[Tuple[str, Any]]]:
    """
    Represent each record as a list of (key, value) for only present (non-null) columns.
    This yields variable-length records → a ragged array.

    Null means None, NaN, NaT or pd.NA. Columns named in *include_cols* that
    the frame does not have are skipped. *include_cols* may be any iterable;
    it is materialized once so a one-shot iterator applies to every row.
    """
    cols = list(include_cols)
    is_scalar = pd.api.types.is_scalar
    ragged: List[List[Tuple[str, Any]]] = []
    for _, row in df.iterrows():
        items = []
        for c in cols:
            v = row.get(c, None)
            # Only scalars are null-tested; pd.isna on a list-like returns an array.
            if v is None or (is_scalar(v) and pd.isna(v)):
                continue
            items.append((c, v))
        ragged.append(items)
    return ragged

def dump_ragged_jsonl(ragged: List[List[Tuple[str, Any]]], path: str) -> None:
    """Write one JSON line per record to *path* (UTF-8, overwriting).

    Each record is dumped as a list of ``{"k": ..., "v": ...}`` objects.
    NumPy scalars are converted to plain Python values and date/time values
    to ISO-8601 text, since ``json`` cannot encode them natively.

    Raises:
        TypeError: if a value is of any other non-JSON-serializable type.
    """
    def _json_default(obj: Any) -> Any:
        if isinstance(obj, np.datetime64):
            return pd.Timestamp(obj).isoformat()
        if isinstance(obj, np.generic):
            return obj.item()
        if hasattr(obj, "isoformat"):  # pd.Timestamp, datetime, date
            return obj.isoformat()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(
                [{"k": k, "v": v} for k, v in rec],
                ensure_ascii=False,
                default=_json_default,
            ) + "\n"
            for rec in ragged
        )

# Bonus) Failure stats per user (absolute and fraction of dataset)
def failure_stats(
    df: pd.DataFrame,
    user_col: str,
    failed_col: str,          # boolean or {0/1} or {"success"/"failed"} etc.
    normalize: bool = True,
) -> pd.DataFrame:
    """
    Count failed rows per user.

    Returns a DataFrame with:
      user, fails, total, frac_of_user_rows, frac_of_all_rows
    sorted by fails, then total, both descending.

    The failure column is normalized to boolean first:
      - text/object columns: true for "1", "true", "t", "yes", "y", "failed",
        "fail" (case-insensitive), false otherwise;
      - everything else: true for non-zero values.
    Missing values count as "not failed". Rows whose user is missing are kept
    as their own group.

    Args:
        normalize: When False, ``frac_of_all_rows`` is filled with NaN.
    """
    failed = df[failed_col]
    if failed.dtype == "O" or pd.api.types.is_string_dtype(failed.dtype):
        failed_bool = failed.astype(str).str.lower().isin(["1","true","t","yes","y","failed","fail"])
    else:
        # astype(bool) alone would turn NaN into True and count it as a failure.
        failed_bool = (failed.notna() & (failed != 0)).astype(bool)

    g = df.assign(__failed=failed_bool).groupby(user_col, dropna=False)
    per_user = g["__failed"].agg(fails="sum", total="count")
    per_user["frac_of_user_rows"] = per_user["fails"] / per_user["total"]
    per_user = per_user.reset_index()
    if normalize:
        all_total = len(df)
        # max(..., 1) avoids dividing by zero on an empty frame.
        per_user["frac_of_all_rows"] = per_user["fails"] / max(all_total, 1)
    else:
        per_user["frac_of_all_rows"] = np.nan
    return per_user.sort_values(["fails","total"], ascending=[False, False])

# ==========
# Demo wiring (remove/adjust for your data)
# ==========
# Example schema guess; rename these to match your columns:
# Column names used by the demo below.
USER_COL = "user"
IP_COL   = "ip"        # can be IPv4 string or 32-bit int in your data
FAILED_COL = "failed"  # boolean-ish; see failure_stats()

# Minimal demo DataFrame (replace with your read_parquet).
# The last row has no user and an unparseable IP, to exercise the missing/invalid paths.
demo = pd.DataFrame({
    USER_COL:   ["alice","bob","alice","carol", None],
    IP_COL:     ["10.1.2.3", "10.1.2.4", 3232235777, "192.168.1.10", "not-an-ip"],  # third is 192.168.1.1 as int
    FAILED_COL: [True, False, True, False, True],
    "job_id":   [1,2,3,4,5],
})

# 1) User anon + mapping (adds a "user_anon" column; user_map is original -> token)
demo["user_anon"], user_map = anonymize_user_with_mapping(demo, USER_COL, key=KEY)
# 2) IP anon + mapping (adds an "ip_anon" column; ip_map is original -> anon string/int)
demo["ip_anon"], ip_map = anonymize_ip_with_mapping(demo, IP_COL, key=KEY)

# 3) Vectors (choose any columns you want included)
anon_cols  = ["user_anon","ip_anon"]
clear_cols = [USER_COL, IP_COL, FAILED_COL]
X_anon, X_clear = make_vectors(demo, anon_cols=anon_cols, clear_cols=clear_cols)

# 4) Ragged array dump (variable-length records)
# NOTE: the dump lists the clear user/IP columns next to their anonymized versions.
ragged = to_ragged_records(demo, include_cols=[USER_COL,"user_anon",IP_COL,"ip_anon","job_id",FAILED_COL])
dump_ragged_jsonl(ragged, f"{DATA_DIR}/anon_ragged.jsonl")

# Bonus) Failure stats per user
fails = failure_stats(demo, user_col=USER_COL, failed_col=FAILED_COL, normalize=True)

# Report: frame preview, first mapping entries, matrix shapes, failure table.
print("=== Anonymized preview ===")
print(demo)
print("\n=== User mapping (first 3) ===")
print(dict(list(user_map.items())[:3]))
print("\n=== IP mapping (first 3) ===")
print(dict(list(ip_map.items())[:3]))
print("\n=== X_anon shape / X_clear shape ===", X_anon.shape, X_clear.shape)
print("\n=== Failure stats ===")
print(fails)

print(f"\nRagged JSONL written to: {DATA_DIR}/anon_ragged.jsonl")


=== Anonymized preview ===
    user            ip  failed  job_id               user_anon         ip_anon
0  alice      10.1.2.3    True       1  fqeh3jdtz2iyvzdgq3ju4q  250.82.115.216
1    bob      10.1.2.4   False       2  nz6qbszgpehbffbuae62te  93.175.157.185
2  alice    3232235777    True       3  fqeh3jdtz2iyvzdgq3ju4q   20.94.214.208
3  carol  192.168.1.10   False       4  px7kwwhxx52uon5bl6ijtu  77.173.143.144
4   None     not-an-ip    True       5                    None            None

=== User mapping (first 3) ===
{'alice': 'fqeh3jdtz2iyvzdgq3ju4q', 'bob': 'nz6qbszgpehbffbuae62te', 'carol': 'px7kwwhxx52uon5bl6ijtu'}

=== IP mapping (first 3) ===
{'10.1.2.3': {'anon_ip': '250.82.115.216', 'anon_ip_int': 4199707608}, '10.1.2.4': {'anon_ip': '93.175.157.185', 'anon_ip_int': 1571790265}, 3232235777: {'anon_ip': '20.94.214.208', 'anon_ip_int': 341759696}}

=== X_anon shape / X_clear shape === (5, 2) (5, 3)

=== Failure stats ===
    user  fails  total  frac_of_user_rows  frac_o

In [4]:
# --- Imports
import os, hmac, hashlib, base64, ipaddress, json
from typing import Any, Dict, Iterable, List, Tuple, Optional
import numpy as np
import pandas as pd
import glob

DATA_DIR = "../data"

# ==========
# 0) Key setup (replace with KMS in prod)
# ==========
# Demo-only secret: the HMAC key is the SHA-256 digest of a fixed passphrase,
# so every run yields the same pseudonyms. KEY_HEX is the same key as hex text.
KEY = hashlib.sha256(b"random-demo-key").digest()
KEY_HEX = KEY.hex()

# ==========
# 1) HMAC tokenization (deterministic pseudonym)
# ==========
def _to_bytes(x: Any) -> bytes:
    if x is None:
        return b""
    try:
        if pd.isna(x):
            return b""
    except Exception:
        pass
    if isinstance(x, bytes):
        return x
    return str(x).encode("utf-8", errors="ignore")

def hmac_token(value: Any, key: bytes, out_len: int = 22) -> str:
    """Return a deterministic lowercase base32 pseudonym for *value*.

    The value is converted with ``_to_bytes``, signed with HMAC-SHA256 under
    *key*, base32-encoded, stripped of ``=`` padding, cut to *out_len*
    characters and lower-cased.
    """
    digest = hmac.new(key, _to_bytes(value), hashlib.sha256).digest()
    encoded = base64.b32encode(digest).decode("ascii")
    unpadded = encoded.rstrip("=")
    return unpadded[:out_len].lower()

# ==========
# 2) IPv4 anonymization (deterministic 32-bit remap → dotted-quad)
#    Note: This preserves IPv4 format, not subnet/prefix structure (MVP-friendly).
# ==========
def ip_str_to_int(ip_str: str) -> Optional[int]:
    """Parse a dotted-quad IPv4 string into its integer value.

    Returns ``None`` for ``None``, a float NaN, or any text that
    ``ipaddress.IPv4Address`` rejects.
    """
    is_nan = isinstance(ip_str, float) and np.isnan(ip_str)
    if ip_str is None or is_nan:
        return None
    try:
        address = ipaddress.IPv4Address(str(ip_str))
    except Exception:
        return None
    return int(address)

def int_to_ip_str(ip_int: int) -> str:
    """Render the low 32 bits of *ip_int* as a dotted-quad IPv4 string."""
    low_32_bits = ip_int & 0xFFFFFFFF
    address = ipaddress.IPv4Address(low_32_bits)
    return str(address)

def anon_ipv4(ip_value: Any, key: bytes) -> Optional[str]:
    """Map an IPv4 string (or int-like) to a deterministic anon IPv4 string.

    The address is normalized to a 32-bit integer, signed with HMAC-SHA256
    under *key*, and the first four digest bytes are rendered as a dotted
    quad. The output keeps the IPv4 format but not any subnet structure.

    Args:
        ip_value: Dotted-quad text, or an integer in ``0 .. 2**32 - 1``.
        key: HMAC key.

    Returns:
        The anonymized dotted-quad string, or ``None`` when *ip_value* is
        missing (``None``/NaN), a boolean, an integer outside the 32-bit
        range, or text that is not a valid IPv4 address.
    """
    if ip_value is None or (isinstance(ip_value, float) and np.isnan(ip_value)):
        return None
    # bool is a subclass of int; a True/False cell is not an address.
    if isinstance(ip_value, (bool, np.bool_)):
        return None
    if isinstance(ip_value, (int, np.integer)):
        ip_int = int(ip_value)
        # Out-of-range ints would make to_bytes(4, ...) raise OverflowError;
        # treat them like any other invalid address instead.
        if not 0 <= ip_int <= 0xFFFFFFFF:
            return None
    else:
        ip_int = ip_str_to_int(str(ip_value))
        if ip_int is None:
            return None
    mac = hmac.new(key, ip_int.to_bytes(4, "big", signed=False), hashlib.sha256).digest()
    anon_int = int.from_bytes(mac[:4], "big")  # 32-bit remap
    return int_to_ip_str(anon_int)

def anon_ipv4_both(ip_value: Any, key: bytes) -> Tuple[Optional[str], Optional[int]]:
    """Return the anonymized address as both text and integer.

    The integer is ``None`` whenever ``anon_ipv4`` yields no address.
    """
    anon_str = anon_ipv4(ip_value, key)
    if not anon_str:
        return anon_str, None
    return anon_str, int(ipaddress.IPv4Address(anon_str))

# ==========
# 3) Assignment functions
# ==========

# 1) Anon the user field and create a mapping
def anonymize_user_with_mapping(
    df: pd.DataFrame, user_col: str, key: bytes = KEY, token_len: int = 22
) -> Tuple[pd.Series, Dict[Any, str]]:
    """Pseudonymize ``df[user_col]`` and return the tokens plus the lookup table.

    Each distinct value is tokenized once with ``hmac_token`` and remembered;
    ``None`` and float NaN are passed through unchanged (and also recorded).

    Returns:
        (series of tokens aligned with ``df``, dict of original -> token)
    """
    mapping: Dict[Any, str] = {}

    def _tokenize(raw):
        try:
            return mapping[raw]  # value already seen
        except KeyError:
            pass
        is_missing = raw is None or (isinstance(raw, float) and np.isnan(raw))
        token = raw if is_missing else hmac_token(raw, key, out_len=token_len)
        mapping[raw] = token
        return token

    return df[user_col].map(_tokenize), mapping

# 2) Anon the IP address field (works if your data is strings or 32-bit ints)
def anonymize_ip_with_mapping(
    df: pd.DataFrame, ip_col: str, key: bytes = KEY
) -> Tuple[pd.Series, Dict[Any, Dict[str, Any]]]:
    """
    Anonymize ``df[ip_col]`` value by value, computing each distinct value once.

    Returns:
      anon_ip_str_series,
      mapping: { original -> {"anon_ip": str, "anon_ip_int": int} }
      (both entries are None when the original is not a usable IPv4 address)
    """
    mapping: Dict[Any, Dict[str, Any]] = {}

    def _anonymize(raw):
        entry = mapping.get(raw)
        if entry is None:  # first time this value is seen
            anon_str, anon_int = anon_ipv4_both(raw, key)
            entry = {"anon_ip": anon_str, "anon_ip_int": anon_int}
            mapping[raw] = entry
        return entry["anon_ip"]

    return df[ip_col].map(_anonymize), mapping

# 3) Make vectors of the anonymous data and clear data
def make_vectors(
    df: pd.DataFrame,
    anon_cols: Iterable[str],
    clear_cols: Iterable[str],
    user_col: Optional[str] = None,
    ip_col: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build simple numeric vectors:
      - Strings are hashed to integers (stable) with SHA256 and truncated.
      - Numbers remain as-is; datetime → epoch ms; bool → 0/1;
        None/NaN/NaT/pd.NA → -1.
    Produces (X_anon, X_clear) aligned by row position.

    Args:
        df: Source frame.
        anon_cols: Columns for the anonymized matrix (any iterable, consumed once).
        clear_cols: Columns for the clear-text matrix (any iterable, consumed once).
        user_col: Unused; kept for backward compatibility.
        ip_col: Unused; kept for backward compatibility.

    Returns:
        Two float arrays of shape ``(len(df), n_columns)``; the shape is 2-D
        even when the frame or a column list is empty.
    """
    def _numify(val: Any) -> float:
        # Every missing scalar shares the -1 sentinel.
        if val is None or (pd.api.types.is_scalar(val) and pd.isna(val)):
            return -1.0
        if isinstance(val, (bool, np.bool_)):
            return 1.0 if val else 0.0
        if isinstance(val, (int, float, np.integer, np.floating)):
            return float(val)
        if isinstance(val, (pd.Timestamp, np.datetime64)):
            try:
                return pd.to_datetime(val).value / 1e6  # ns → ms
            except Exception:
                # Best effort: unconvertible timestamps fall through to the hash.
                pass
        # strings/other → deterministic hash to 32-bit range
        h = hashlib.sha256(_to_bytes(val)).digest()
        return float(int.from_bytes(h[:4], "big"))  # 0..2^32-1

    def _matrix(cols: Iterable[str]) -> np.ndarray:
        # Materialize once so one-shot iterators work, then convert column by
        # column; positional access is unaffected by duplicate index labels.
        names = list(cols)
        out = np.empty((len(df), len(names)), dtype=float)
        for j, name in enumerate(names):
            out[:, j] = [_numify(v) for v in df[name].tolist()]
        return out

    return _matrix(anon_cols), _matrix(clear_cols)

# 4) Generate a ragged array (variable-length dump)
def to_ragged_records(df: pd.DataFrame, include_cols: Iterable[str]) -> List[List[Tuple[str, Any]]]:
    """
    Represent each record as a list of (key, value) for only present (non-null) columns.
    This yields variable-length records → a ragged array.

    Null means None, NaN, NaT or pd.NA. Columns named in *include_cols* that
    the frame does not have are skipped. *include_cols* may be any iterable;
    it is materialized once so a one-shot iterator applies to every row.
    """
    cols = list(include_cols)
    is_scalar = pd.api.types.is_scalar
    ragged: List[List[Tuple[str, Any]]] = []
    for _, row in df.iterrows():
        items = []
        for c in cols:
            v = row.get(c, None)
            # Only scalars are null-tested; pd.isna on a list-like returns an array.
            if v is None or (is_scalar(v) and pd.isna(v)):
                continue
            items.append((c, v))
        ragged.append(items)
    return ragged

def dump_ragged_jsonl(ragged: List[List[Tuple[str, Any]]], path: str) -> None:
    """Write one JSON line per record to *path* (UTF-8, overwriting).

    Each record is dumped as a list of ``{"k": ..., "v": ...}`` objects.
    NumPy scalars are converted to plain Python values and date/time values
    to ISO-8601 text, since ``json`` cannot encode them natively.

    Raises:
        TypeError: if a value is of any other non-JSON-serializable type.
    """
    def _json_default(obj: Any) -> Any:
        if isinstance(obj, np.datetime64):
            return pd.Timestamp(obj).isoformat()
        if isinstance(obj, np.generic):
            return obj.item()
        if hasattr(obj, "isoformat"):  # pd.Timestamp, datetime, date
            return obj.isoformat()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(
                [{"k": k, "v": v} for k, v in rec],
                ensure_ascii=False,
                default=_json_default,
            ) + "\n"
            for rec in ragged
        )

# Bonus) Failure stats per user (absolute and fraction of dataset)
def failure_stats(
    df: pd.DataFrame,
    user_col: str,
    failed_col: str,          # boolean or {0/1} or {"success"/"failed"} etc.
    normalize: bool = True,
) -> pd.DataFrame:
    """
    Count failed rows per user.

    Returns a DataFrame with:
      user, fails, total, frac_of_user_rows, frac_of_all_rows
    sorted by fails, then total, both descending.

    The failure column is normalized to boolean first:
      - text/object columns: true for "1", "true", "t", "yes", "y", "failed",
        "fail" (case-insensitive), false otherwise;
      - everything else: true for non-zero values.
    Missing values count as "not failed". Rows whose user is missing are kept
    as their own group.

    Args:
        normalize: When False, ``frac_of_all_rows`` is filled with NaN.
    """
    failed = df[failed_col]
    if failed.dtype == "O" or pd.api.types.is_string_dtype(failed.dtype):
        failed_bool = failed.astype(str).str.lower().isin(["1","true","t","yes","y","failed","fail"])
    else:
        # astype(bool) alone would turn NaN into True and count it as a failure.
        failed_bool = (failed.notna() & (failed != 0)).astype(bool)

    g = df.assign(__failed=failed_bool).groupby(user_col, dropna=False)
    per_user = g["__failed"].agg(fails="sum", total="count")
    per_user["frac_of_user_rows"] = per_user["fails"] / per_user["total"]
    per_user = per_user.reset_index()
    if normalize:
        all_total = len(df)
        # max(..., 1) avoids dividing by zero on an empty frame.
        per_user["frac_of_all_rows"] = per_user["fails"] / max(all_total, 1)
    else:
        per_user["frac_of_all_rows"] = np.nan
    return per_user.sort_values(["fails","total"], ascending=[False, False])

# ==========
# Demo wiring (remove/adjust for your data)
# ==========
# Example schema guess; rename these to match your columns:
# Columns of the batch-history data that this cell works with.
anonymized_columns = ["x509UserProxyEmail","User","JobsubClientIpAddress"]
USER_COL = "User"
IP_COL   = "JobsubClientIpAddress"        # can be IPv4 string or 32-bit int in your data
# NOTE(review): failure_stats() treats any non-zero value as "failed";
# confirm that is the intended reading of this column.
FAILED_COL = "DAG_NodesFailed"

# Load the first n_files parquet files. The glob result is sorted so the same
# files are picked on every run (glob.glob returns them in arbitrary order).
fnames_in = f"{DATA_DIR}/fifebatch-history-*.parquet"
files = sorted(glob.glob(fnames_in))
print(f"Found {len(files)} files")
if not files:
    # pd.concat([]) would fail with a much less helpful message.
    raise FileNotFoundError(f"No parquet files match {fnames_in}")
n_files = 2
# Read and concatenate
demo = pd.concat([pd.read_parquet(f, engine="fastparquet") for f in files[:n_files]], ignore_index=True)
print(f"Read {len(files[:n_files])} and loaded demo:{len(demo)}")
# Toy frame for quick experiments without the parquet files:
# demo = pd.DataFrame({
#     USER_COL:   ["alice","bob","alice","carol", None],
#     IP_COL:     ["10.1.2.3", "10.1.2.4", 3232235777, "192.168.1.10", "not-an-ip"],  # third is 192.168.1.1 as int
#     FAILED_COL: [True, False, True, False, True],
#     "job_id":   [1,2,3,4,5],
# })

# 1) User anon + mapping
demo["user_anon"], user_map = anonymize_user_with_mapping(demo, USER_COL, key=KEY)
print("1) Anonymized user with mapping")
# 2) IP anon + mapping
demo["ip_anon"], ip_map = anonymize_ip_with_mapping(demo, IP_COL, key=KEY)
print("2) Anonymized IP with mapping")
# 3) Vectors (choose any columns you want included)
anon_cols  = ["user_anon","ip_anon"]
clear_cols = [USER_COL, IP_COL, FAILED_COL]
X_anon, X_clear = make_vectors(demo, anon_cols=anon_cols, clear_cols=clear_cols)
print("3) Made vectors")
# 4) Ragged array dump (variable-length records)
# NOTE(review): "job_id" comes from the toy frame; to_ragged_records() silently
# skips columns the data does not have — confirm the intended job-id column.
# The dump also lists the clear user/IP columns next to their anonymized versions.
ragged = to_ragged_records(demo, include_cols=[USER_COL,"user_anon",IP_COL,"ip_anon","job_id",FAILED_COL])
dump_ragged_jsonl(ragged, f"{DATA_DIR}/anon_ragged.jsonl")
print("4) Made ragged array")
# Bonus) Failure stats per user
fails = failure_stats(demo, user_col=USER_COL, failed_col=FAILED_COL, normalize=True)

# Report: frame preview, first mapping entries, matrix shapes, failure table.
print("=== Anonymized preview ===")
print(demo)
print("\n=== User mapping (first 3) ===")
print(dict(list(user_map.items())[:3]))
print("\n=== IP mapping (first 3) ===")
print(dict(list(ip_map.items())[:3]))
print("\n=== X_anon shape / X_clear shape ===", X_anon.shape, X_clear.shape)
print("\n=== Failure stats ===")
print(fails)

print(f"\nRagged JSONL written to: {DATA_DIR}/anon_ragged.jsonl")


Found 6 files
Read 2 and loaded demo:74500
1) Anonymized user with mapping
2) Anonymized IP with mapping


  demo["user_anon"], user_map = anonymize_user_with_mapping(demo, USER_COL, key=KEY)
  demo["ip_anon"], ip_map = anonymize_ip_with_mapping(demo, IP_COL, key=KEY)


2) Made vectors
4) Made ragged array
=== Anonymized preview ===
      @timestamp @version              AccountingGroup AccountingGroupOSG  \
0            NaT        1  group_uboone.prod.uboonepro               None   
1            NaT        1  group_uboone.prod.uboonepro               None   
2            NaT        1  group_uboone.prod.uboonepro               None   
3            NaT        1  group_uboone.prod.uboonepro               None   
4            NaT        1   group_icarus.pro.icaruspro               None   
...          ...      ...                          ...                ...   
74495        NaT        1   group_icarus.pro.icaruspro               None   
74496        NaT        1   group_icarus.pro.icaruspro               None   
74497        NaT        1  group_uboone.prod.uboonepro               None   
74498        NaT        1   group_icarus.pro.icaruspro               None   
74499        NaT        1   group_icarus.pro.icaruspro               None   

      AcctG