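### First pass of exploring FNAL data

Loads one `fifebatch-history` Parquet batch, inspects its columns, and writes two anonymized copies of the identity columns (one in HMAC mode, one in FPE mode).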

In [1]:
import sys, os
import pandas as pd

In [2]:
# Directory that holds the input Parquet batch and receives the anonymized outputs.
# The path is relative, so it resolves against the kernel's working directory.
DATA_DIR = "../data"

In [3]:
# Make the parent directory importable so the `lib.*` modules can be loaded.
# Registering it only when absent keeps re-runs of this cell from stacking
# duplicate entries on sys.path.
lib_path = '../'
if sys.path.count(lib_path) == 0:
    sys.path.append(lib_path)

In [4]:
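# Dependency: fastparquet (the Parquet engine used below). If it is missing, install it into this kernel's environment with: %pip install fastparquet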

In [5]:
# Raw input batch that the rest of the notebook reads and anonymizes.
fname_in = f"{DATA_DIR}/fifebatch-history-v3-2024.01.01-0002.parquet"
# NOTE(review): fname_out is not referenced by any cell visible here; the files
# written later use fname_hmac_out / fname_fpe_out. Confirm whether it is still needed.
fname_out = f"{DATA_DIR}/anonymized-batch-0002.parquet"

In [6]:
# Output paths, one per anonymization mode, so the two variants never overwrite each other.
fname_hmac_out = f"{DATA_DIR}/hmac-anonymized-batch-0002.parquet"
fname_fpe_out = f"{DATA_DIR}/fpe-anonymized-batch-0002.parquet"

In [7]:
# Load the raw batch into a DataFrame and list its schema.
df = pd.read_parquet(fname_in, engine="fastparquet")  # pandas also accepts engine="pyarrow"
print(df.dtypes)       # Column names + types

@timestamp                datetime64[us]
@version                          object
AccountingGroup                   object
AccountingGroupOSG                object
AcctGroup                         object
                               ...      
x509UserProxyFirstFQAN            object
x509UserProxyVOName               object
x509userproxy                     object
x509userproxysubject              object
xcount                            object
Length: 595, dtype: object


In [8]:
# Preview the first rows of the raw frame.
# NOTE(review): this is the un-anonymized data; the saved output includes identity
# columns such as x509UserProxyEmail and x509userproxysubject. Clear this cell's
# output before sharing or committing the notebook.
df.head()

Unnamed: 0,@timestamp,@version,AccountingGroup,AccountingGroupOSG,AcctGroup,AcctGroupUser,AllowOpportunistic,Args,Arguments,AutoClusterAttrs,...,use_x509userproxy,x509UserProxyEmail,x509UserProxyExpiration,x509UserProxyExpiration_ms,x509UserProxyFQAN,x509UserProxyFirstFQAN,x509UserProxyVOName,x509userproxy,x509userproxysubject,xcount
0,NaT,1,group_gm2.prod.gm2pro,,,,,multipleroot True muon True noGrid False input...,,,...,,,2024-01-07 16:58:02,1704668000000.0,,,,gm2pro.Production.proxy,/DC=org/DC=incommon/C=US/ST=Illinois/O=Fermi R...,
1,NaT,1,group_icarus.pro.icaruspro,,,,,--debug --find_setups --source-unquote /cvmfs/...,,,...,,,2024-01-07 16:58:07,1704668000000.0,,,,icaruspro.Production.proxy,/DC=org/DC=incommon/C=US/ST=Illinois/O=Fermi R...,
2,NaT,1,group_icarus.pro.icaruspro,,,,,--debug --find_setups --source-unquote /cvmfs/...,,,...,,,2024-01-07 16:58:07,1704668000000.0,,,,icaruspro.Production.proxy,/DC=org/DC=incommon/C=US/ST=Illinois/O=Fermi R...,
3,NaT,1,group_uboone.prod.uboonepro,,,,,--nfile 1 --group uboone -g -c wrapper.fcl --u...,,,...,,,2024-01-07 16:58:08,1704668000000.0,,,,uboonepro.Production.proxy,/DC=org/DC=incommon/C=US/ST=Illinois/O=Fermi R...,
4,NaT,1,group_pip2.mars.amakovec,,,,,,,,...,,amakovec@fnal.gov,2024-01-07 15:29:15,1704663000000.0,,,,x509up_pip2_MARS_59791,/DC=org/DC=cilogon/C=US/O=Fermi National Accel...,


In [9]:
from hashlib import sha256
from lib.anonymize_kv_mvp import anonymize_df
from lib.utils_secrets import gen_secret_hex

In [10]:
# Anonymize the identity-bearing columns in "hmac" mode.
# The key comes from gen_secret_hex in "dev" mode rather than from a literal
# embedded in the notebook; how it is generated or stored is defined in
# lib.utils_secrets (not shown here).
key_hex = gen_secret_hex(mode="dev", data_dir=DATA_DIR)  # reuse the configured data directory
df_hmac = anonymize_df(
    df,
    columns=["AccountingGroup","x509UserProxyEmail","x509userproxy","x509userproxysubject"],
    key_hex=key_hex,
    mode="hmac",
    email_preserve_domain=True,   # token@domain
)

In [11]:
# Spot-check the first two rows of the HMAC-anonymized frame.
df_hmac.head(2)

Unnamed: 0,@timestamp,@version,AccountingGroup,AccountingGroupOSG,AcctGroup,AcctGroupUser,AllowOpportunistic,Args,Arguments,AutoClusterAttrs,...,use_x509userproxy,x509UserProxyEmail,x509UserProxyExpiration,x509UserProxyExpiration_ms,x509UserProxyFQAN,x509UserProxyFirstFQAN,x509UserProxyVOName,x509userproxy,x509userproxysubject,xcount
0,NaT,1,vpnpukz5fin54v5yeb45cf,,,,,multipleroot True muon True noGrid False input...,,,...,,,2024-01-07 16:58:02,1704668000000.0,,,,lbuh7uw4tttqgjlqe3dnp5,admdemnkv33g5y7sq57beo,
1,NaT,1,zlu2bll5t2fsn6aqmpcspe,,,,,--debug --find_setups --source-unquote /cvmfs/...,,,...,,,2024-01-07 16:58:07,1704668000000.0,,,,nok2bvvc4atg2xpgaloqyp,rktnuakrlitifmbbh5ouby,


In [12]:
# Persist the HMAC-anonymized frame; index=False keeps the DataFrame index out of the file.
df_hmac.to_parquet(fname_hmac_out, index=False, engine="fastparquet")

In [13]:
# Second variant: the same raw frame, columns and key as the HMAC run,
# but processed in "fpe" mode.
df_fpe = anonymize_df(
    df,
    mode="fpe",
    key_hex=key_hex,
    columns=[
        "AccountingGroup",
        "x509UserProxyEmail",
        "x509userproxy",
        "x509userproxysubject",
    ],
    email_preserve_domain=True,  # only the local part of an email is anonymized
)

In [14]:
# Persist the FPE-anonymized frame; index=False keeps the DataFrame index out of the file.
df_fpe.to_parquet(fname_fpe_out, index=False, engine="fastparquet")