# Mini Functional Similarity (FS) Pipeline

A minimal notebook that demonstrates the core steps of the FS pipeline, from FI archetype construction through to FS family clustering.

1. FI bootstrap for FS: parse → normalise → FI table
2. Build `fi_hash → canonical sequence` map for FS
3. Compute and save the FS matrix (structural Levenshtein)
4. Align archetypes to FS matrix order
5. Cluster archetypes into FS families (Agglomerative)
6. Summarise FS families and members


In [1]:
# Bootstrap project root + Python path
import json
import subprocess
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Resolve the repository root through git so the notebook works from any
# working directory inside the checkout.
ROOT = Path(subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"],
    text=True,
).strip())

# Put <repo>/src at the front of sys.path BEFORE importing fi_fs; otherwise a
# fresh kernel can only import the package if it happens to be installed.
# The membership check keeps the cell idempotent when it is re-run.
src_dir = str(ROOT / "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

import fi_fs

print("ROOT project:", ROOT.name)

# NOTE(review): the star import hides where the pipeline helpers used below
# come from; consider replacing it with explicit names once the set is stable.
from fi_fs import *

PYTHON: cpython 3.12.0
BASHLEX: 0.18
ROOT project: PhD


## Step 1 - Minimal FI bootstrap

Core flow: parse → normalise → FI table

1. Load input dataset
2. Parse command sequences into `(op, args, conn)` triplets
3. Apply normalisations:
   - alias replacement,
   - argument placeholders,
   - α-renumbering.
4. Build FI table with canonical sequences and FI hashes.

In [2]:
# FI bootstrap for FS: parse → normalise → FI table

DATASET = "Cowrie.csv"
# Derive the dataset name from the filename stem. str.strip(".csv") is NOT a
# suffix removal: it strips any of the characters '.', 'c', 's', 'v' from both
# ends, so a name such as "sessions.csv" would silently become "ession".
DATASET_NAME = Path(DATASET).stem
INPUT = ROOT / "projects" / "fi_fs" / "data" / "processed" / DATASET_NAME / DATASET

agg, stats = load_aggregated_csv(str(INPUT))
print("\n--- FI bootstrap for FS ---")
print(f"Dataset selected: {DATASET_NAME}")
print(f"Aggregated sessions: {stats['n_sessions']}\n")

# 1) bashlex → triplets
print("--- AST Parsing to triplets ---")
seqs, parsed_parts, parse_df, problems = parse_dataframe_to_triplets(
    agg,
    progress=True,
    with_diagnostics=True,
    strict=False,
)
print(f"\nParsed sequences: {len(seqs)}")
print(f"Sucess rate: {len(seqs) / len(agg) * 100:.2f}%")

# 1b) restrict agg to successfully parsed sessions only
# (seqs is keyed by session id, so its keys are the sessions that parsed)
parsed_sids = set(seqs.keys())
agg_ok = agg[agg["session"].isin(parsed_sids)].copy()

# 2) normalisation: aliases → placeholders → α-renumber
alias_map_path = ROOT / "src" / "fi_fs" / "resources" / "alias_map.yaml"
alias_map = load_alias_map_yaml(alias_map_path)
seqs_alias, alias_changes = apply_aliases(seqs, alias_map)
print("Alias changes:", 0 if alias_changes is None else len(alias_changes))

seqs_ph, dbg = apply_placeholders_args_only(
    seqs_alias,
    debug=True,
    preview_changed_first_n_sessions=5,
    sample_per_reason=5,
)
# Check the placeholder pass against the alias-normalised input before α-renumbering.
assert_connectors_preserved(seqs_alias, seqs_ph)
seqs_alpha = alpha_renumber(seqs_ph, check_idempotent=True)

# 3) FI table
assert_serialisation_deterministic(seqs_alpha)
fi_df = build_fi_dataframe(agg_ok, seqs_alpha, commands_col="commands_joined")
print("\n--- Results ---")
print(f"Sessions Parsed: {len(seqs_alpha)}")
print(f"FI-classes (fi_hash): {fi_df['fi_hash'].nunique()}")
print(f"Reduction Factor: {len(seqs_alpha) / fi_df['fi_hash'].nunique():.2f}x")

fi_df.head(5)



--- FI bootstrap for FS ---
Dataset selected: Cowrie
Aggregated sessions: 98513

--- AST Parsing to triplets ---


Parsing sessions (bashlex): 100%|██████████| 98513/98513 [01:05<00:00, 1494.36it/s]



Parsed sequences: 98425
Sucess rate: 99.91%
Alias changes: 488

--- Results ---
Sessions Parsed: 98425
FI-classes (fi_hash): 607
Reduction Factor: 162.15x


Unnamed: 0,session,n_rows,fi_hash,commands_clean,canonical_json
0,00031aeff1a6,5,9c07a2ac9b0db760,sh\nshell\nenable\necho 'nameserver 95.214.27....,"[[""sh"",[],"";""],[""shell"",[],"";""],[""enable"",[],""..."
1,0003e7887230,5,9c07a2ac9b0db760,sh\nshell\nenable\necho 'nameserver 95.214.27....,"[[""sh"",[],"";""],[""shell"",[],"";""],[""enable"",[],""..."
2,0003f10a2103,4,ec2785b8610be5a7,sh\nshell\nenable\ncat /bin/echo || while read...,"[[""sh"",[],"";""],[""shell"",[],"";""],[""enable"",[],""..."
3,0004025aa9c2,15,5e1318289a0eede4,sh\nshell\nenable\ncd ~ && rm -rf .ssh && mkdi...,"[[""sh"",[],"";""],[""shell"",[],"";""],[""enable"",[],""..."
4,000537a9f749,1,c1f4de5103d6ebc1,uname -s -v -n -r -m,"[[""uname"",[""-s"",""-v"",""-n"",""-r"",""-m""],""EOS""]]"


## Step 2 - Build `fi_hash -> canonical sequence` map for FS

In [3]:
# Build the archetype map from the FI table. Per this step's heading it maps
# fi_hash -> canonical sequence (build_archetypes comes from fi_fs — TODO confirm
# the exact value type).
arche_map = build_archetypes(fi_df)
# Report the number of map entries (one per FI class, going by the label).
print("FI-unique archetypes (by fi_hash):", len(arche_map))

FI-unique archetypes (by fi_hash): 607


## Step 3 - Compute FS matrix (structural Levenshtein)


In [4]:
# Compute the pairwise FS matrix over the FI-unique archetypes and persist it.
# `Path` is already imported in the bootstrap cell, so no aliased re-import is
# needed here.
N = len(arche_map)
print(f"Computing structural Levenshtein FS over N={N} FI-unique archetypes...")
labels_lev, FS_lev = fs_levenshtein_structural(arche_map, include_connectors=True, progress=True,)

# The matrix is saved without its labels, so check now that rows and labels
# agree; later steps align archetypes to the matrix purely by `labels_lev` order.
assert FS_lev.shape[0] == len(labels_lev), "FS matrix rows do not match label count"

DATASET_NAME = Path(DATASET).stem
fs_dir = ROOT / "projects" / "fi_fs" / "data" / "output" / DATASET_NAME / "FS_eval" / "NumPy_Arrays"
fs_dir.mkdir(parents=True, exist_ok=True)

# The filename records the matrix size so runs over different archetype sets
# do not overwrite each other.
fs_filename = f"{DATASET_NAME}_FS_Lev_opconn_N{FS_lev.shape[0]}.npy"
fs_path = fs_dir / fs_filename
np.save(fs_path, FS_lev)

print("Saved FS array to:", fs_path.relative_to(ROOT))


Computing structural Levenshtein FS over N=607 FI-unique archetypes...


Levenshtein-struct (rows):   0%|          | 0/607 [00:00<?, ?it/s]

Levenshtein-struct (pairs):   0%|          | 0/183921 [00:00<?, ?it/s]

Saved FS array to: projects/fi_fs/data/output/Cowrie/FS_eval/NumPy_Arrays/Cowrie_FS_Lev_opconn_N607.npy


## Step 4 - Align archetypes to FS matrix order

In [5]:
def decode_canonical(canon_json: str):
    """Decode a canonical JSON string into a list of ``(op, args, conn)`` triplets.

    The JSON payload is a list of three-element rows; the ``args`` element of
    each row is converted to a tuple so the resulting triplet is hashable.
    """
    triplets = []
    for op, args, conn in json.loads(canon_json):
        triplets.append((op, tuple(args), conn))
    return triplets

# One representative session per FI class: the session with the most rows
# (ties broken by the smallest session id), reordered to follow `labels_lev`
# so that row i of `archetypes` corresponds to row i of the FS matrix.
archetypes = (
    fi_df
    .sort_values(by=["n_rows", "session"], ascending=[False, True])
    .drop_duplicates(subset="fi_hash", keep="first")
    .set_index("fi_hash")
    .loc[labels_lev]
    .reset_index()
    .loc[:, ["fi_hash", "session", "n_rows", "canonical_json"]]
    .assign(seq=lambda frame: frame["canonical_json"].map(decode_canonical))
)

# Plain-list views in the same (matrix) order, for the downstream helpers.
fi_hashes = archetypes["fi_hash"].tolist()
seqs_unique = archetypes["seq"].tolist()

print(f"Aligned archetypes to FS matrix order. Rows: {len(archetypes)}")
display(archetypes.head(5))


Aligned archetypes to FS matrix order. Rows: 607


Unnamed: 0,fi_hash,session,n_rows,canonical_json,seq
0,0014c5294a1b8182,b898ce35a477,5,"[[""curl"",[""PH_URL_1""],""|""],[""sudo"",[""python3"",...","[(curl, (PH_URL_1,), |), (sudo, (python3, -, -..."
1,0129d0d0e783bb89,60a7ef53c929,1,"[[""cd"",[""PH_PATH_1""],"";""],[""wget"",[""PH_URL_1""]...","[(cd, (PH_PATH_1,), ;), (wget, (PH_URL_1,), ;)..."
2,015931310783b74b,88f6d0afbaf6,9,"[[""free"",[],"";""],[""lscpu"",[],"";""],[""top"",[],"";...","[(free, (), ;), (lscpu, (), ;), (top, (), ;), ..."
3,02038519ce400bdc,13166095803e,6,"[[""linuxshell"",[],"";""],[""sh"",[],"";""],[""enable""...","[(linuxshell, (), ;), (sh, (), ;), (enable, ()..."
4,021cda85adade07c,1474808cc7d5,1,"[[""rm"",[""-rf"",""PH_PATH_1""],"";""],[""wget"",[""PH_U...","[(rm, (-rf, PH_PATH_1), ;), (wget, (PH_URL_1, ..."


## Step 5 - Cluster archetypes into FS families (Agglomerative)

In [9]:
TAU = 0.75

# Cluster the archetypes from the FS matrix at threshold TAU and attach the
# resulting family label to each archetype row.
labels_agg = agglomerative_from_fs(FS_lev, tau=TAU)
archetypes["family_agg"] = labels_agg

stats = evaluate_fs_clustering(FS_lev, labels_agg, tau=TAU)

print(f"FS Agglomerative clustering @ tau={TAU}")
for caption, metric in (
    ("Families discovered:", "n_clusters"),
    ("Singleton families:", "n_singletons"),
):
    print(caption, stats[metric])

print("\nInternal metrics:")
display(pd.DataFrame([stats]))

# Cluster id -> member row indices, and the medoid row index of each cluster.
groups = group_indices_from_labels(labels_agg)
medoids = medoid_indices(FS_lev, groups)
print(f"Clusters: {len(groups)}")

print("Example cluster → members (first 5 clusters):")
for gid in sorted(groups.keys())[:5]:
    members = groups[gid]
    # Show at most ten members and flag truncation with an ellipsis.
    tail = " ..." if len(members) > 10 else ""
    print(f"  Cluster {gid}: indices {members[:10]}{tail}")

print("\nMedoid indices per cluster (first 5):")
for gid in sorted(medoids.keys())[:5]:
    medoid = medoids[gid]
    print(f"  Cluster {gid}: medoid index {medoid}")

FS Agglomerative clustering @ tau=0.75
Families discovered: 298
Singleton families: 204

Internal metrics:


Unnamed: 0,config,tau,N,n_clusters,n_singletons,max_cluster_size,median_cluster_size,cohesion_min_FS,cohesion_mean_FS,silhouette,calinski_harabasz,davies_bouldin,dunn
0,,0.75,607,298,204,49,1.0,0.633333,0.961235,0.454071,150.145207,0.412721,0.30303


Clusters: 298
Example cluster → members (first 5 clusters):
  Cluster 0: indices [415, 456, 520]
  Cluster 1: indices [35, 48, 55, 57, 122, 274, 299, 317, 410, 469] ...
  Cluster 2: indices [39, 41, 147, 168, 246, 321, 324, 372, 397, 455] ...
  Cluster 3: indices [19, 37, 124, 180, 231, 252, 288, 338, 381, 417] ...
  Cluster 4: indices [83, 198, 509]

Medoid indices per cluster (first 5):
  Cluster 0: medoid index 520
  Cluster 1: medoid index 35
  Cluster 2: medoid index 321
  Cluster 3: medoid index 37
  Cluster 4: medoid index 83


## Step 6 - Summarise FS families and members

In [12]:
# Rebuild the hash list from `archetypes` so it is in FS-matrix order.
fi_hashes = archetypes["fi_hash"].tolist()

summ_df, family_groups, medoids = summarise_families(
    FS=FS_lev, seqs_unique=seqs_unique, labels=labels_agg, fi_hashes=fi_hashes,
)

# Columns shown in the summary table, in display order.
cols = [
    "family_id", "size", "mean_FS", "sd_FS",
    "medoid_idx", "medoid_fi_hash",
    "top_ops", "consensus_skeleton_pairs",
]

# A singleton family is one whose summary row reports size 1.
n_singletons = int(summ_df["size"].eq(1).sum())
n_families = len(summ_df)
print(f"\nFamilies total: {n_families} | Singleton families: {n_singletons}")
print("\nFamily summaries (top 10 by size, then mean_FS):")
display(summ_df.loc[:, cols].head(10))



Families total: 298 | Singleton families: 204

Family summaries (top 10 by size, then mean_FS):


Unnamed: 0,family_id,size,mean_FS,sd_FS,medoid_idx,medoid_fi_hash,top_ops,consensus_skeleton_pairs
0,45,49,1.0,0.0,16,0575fce921790578,"[(busybox, 49)]","[(busybox, EOS)]"
1,69,22,1.0,0.0,50,153fd69fc12292c2,"[(echo, 22)]","[(echo, EOS)]"
2,3,14,0.9202,0.0819,37,0e2f33f329730941,"[(cd, 70), (sh, 57), (chmod, 42), (rm, 29), (t...","[(cd, ||), (cd, ||), (cd, ||), (cd, ||), (cd, ..."
3,2,13,0.7998,0.0834,321,8e6a4487862fa88a,"[(ps, 22), (grep, 22), (cat, 21), (echo, 13), ...","[(ps, |), (grep, ;), (echo, |), (cat, EOS)]"
4,44,12,0.9186,0.037,40,0fbbb73a992a1549,"[(cp, 297), (chmod, 209), (cd, 122), (rm, 111)...","[(cd, &&), (rm, &&), (mkdir, &&), (echo, &&), ..."
5,1,11,0.8149,0.0908,35,0ca225e81884a110,"[(cd, 49), (sh, 32), (chmod, 31), (tftp, 20), ...","[(cd, ||), (cd, ||), (cd, ;), (wget, ;), (chmo..."
6,73,10,0.9492,0.0346,73,1dcdd0685e28acdb,"[(rm, 35), (tftp, 20), (enable, 10), (system, ...","[(enable, ;), (system, ;), (shell, ;), (sh, ;)..."
7,8,10,0.8545,0.0809,106,2b5150c23e9a475b,"[(cd, 50), (cat, 18), (chmod, 14), (wget, 10),...","[(cd, ||), (cd, ||), (cd, ||), (cd, ||), (cd, ..."
8,136,9,0.9655,0.0278,58,16584cd6f1989339,"[(echo, 575), (rm, 63), (chmod, 27), (tftp, 18...","[(enable, ;), (system, ;), (shell, ;), (sh, ;)..."
9,61,8,0.9152,0.0751,54,15c5aa567680fe3f,"[(chmod, 8), (PH_EXEC_1, 8), (wget, 7), (cd, 6...","[(chmod, ;), (PH_EXEC_1, EOS)]"
