From 0b7a9d65f6516b30a342c394a0e5f7d8d12f0d15 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sat, 22 Nov 2025 11:05:11 +0100 Subject: [PATCH 1/3] Document anomaly detection pipeline architecture --- domains/anomaly-detection/README.md | 27 + .../documentation/Architecture.gv | 194 +++++ .../documentation/Architecture.svg | 702 ++++++++++++++++++ .../anomaly-detection/documentation/README.md | 11 + .../documentation/renderArchitecture.sh | 20 + renovate.json | 1 + scripts/visualization/renderGraphVizSVG.sh | 53 ++ 7 files changed, 1008 insertions(+) create mode 100644 domains/anomaly-detection/README.md create mode 100644 domains/anomaly-detection/documentation/Architecture.gv create mode 100644 domains/anomaly-detection/documentation/Architecture.svg create mode 100644 domains/anomaly-detection/documentation/README.md create mode 100755 domains/anomaly-detection/documentation/renderArchitecture.sh create mode 100755 scripts/visualization/renderGraphVizSVG.sh diff --git a/domains/anomaly-detection/README.md b/domains/anomaly-detection/README.md new file mode 100644 index 000000000..8c5ceb754 --- /dev/null +++ b/domains/anomaly-detection/README.md @@ -0,0 +1,27 @@ +# Anomaly Detection Domain + +This directory contains the implementation and resources related to the Anomaly Detection domain within the Code Graph Analysis Pipeline project. + +## Entry Points + +The following scripts serve as entry points for various anomaly detection tasks and reports. They will be invoked by [AllReports.sh](./../../scripts/reports/compilations/AllReports.sh) an its sub-scripts dynamically by their names. + +- [anomalyDetectionCsv.sh](./anomalyDetectionCsv.sh): Entry point for CSV reports based solely on Graph queries. +- [anomalyDetectionPython.sh](./anomalyDetectionPython.sh): Entry point for Python-based anomaly detection tasks and reports. 
+- [anomalyDetectionVisualization.sh](./anomalyDetectionVisualization.sh): Entry point for Graph visualization reports. +- [anomalyDetectionMarkdown.sh](./anomalyDetectionMarkdown.sh): Entry point for generating the Markdown summary report. + +## Folder Structure + +- [documentation](./documentation): Contains documentation including architecture diagrams. +- [explore](./explore/): Jupyter notebooks for interactive, exploratory anomaly detection analysis. +- [features](./features/): Cypher queries to extract features and run graph algorithms relevant for anomaly detection. +- [graphs](./graphs/): Cypher queries and GraphViz templates for Graph visualizations related to anomaly detection. +- [labels](./labels/): Cypher queries label nodes that represent specific archetypes. +- [queries](./queries/): Cypher queries to identify anomalies based on various (deterministic/explainable) criteria. +- [reset](./reset/): Cypher queries to reset the graph database state related to anomaly detection. +- [summary](./summary/): Markdown templates and resources for generating the summary report. 
+ +## Pipeline Architecture Overview + +![Anomaly Detection Architecture](./documentation/Architecture.svg) \ No newline at end of file diff --git a/domains/anomaly-detection/documentation/Architecture.gv b/domains/anomaly-detection/documentation/Architecture.gv new file mode 100644 index 000000000..24ce04d5d --- /dev/null +++ b/domains/anomaly-detection/documentation/Architecture.gv @@ -0,0 +1,194 @@ +digraph AnomalyDetectionPipeline { + rankdir=LR; + node [fontname="Helvetica", fontsize=10]; + + // Leiden community detection + subgraph cluster_leiden { + label="Leiden Community Detection"; + style=filled; color=lightblue; + node [shape=box, style=filled, fillcolor=white]; + + Tuning_Leiden [label="Tuning\n(Optuna)"]; + Leiden_Gamma [label="gamma", shape=diamond] + Leiden_Theta [label="theta", shape=diamond] + Leiden_Algorithm [label="Leiden Community Detection"]; + CommunityId [label="Community", shape=ellipse]; + } + + // --- Leiden Community Detection relationships --- + Tuning_Leiden -> Leiden_Gamma; + Tuning_Leiden -> Leiden_Theta; + Leiden_Gamma -> Leiden_Algorithm + Leiden_Theta -> Leiden_Algorithm + Leiden_Algorithm -> Tuning_Leiden [label="modularity", style="dashed"] + Leiden_Algorithm -> Tuning_Leiden [label="size", style="dashed"] + Leiden_Algorithm -> CommunityId; + + // Fast Random Projection (FastRP) + subgraph cluster_fastRP { + label="Fast Random Projection (FastRP)"; + style=filled; color=lightpink; + node [shape=box, style=filled, fillcolor=white]; + + Tuning_FastRP [label="Tuning\n(Optuna)"]; + FastRP_Dimension [label="dimension", shape=diamond]; + FastRP_Normalization_Strength [label="normalization strength", shape="diamond"]; + FastRP_Forth_Iteration_Weight [label="forth iteration weight", shape="diamond"]; + FastRP_Algorithm [label="FastRP"]; + NodeEmbeddings [label="Node Embeddings", shape=ellipse]; + } + + // --- FastRP relationships --- + Tuning_FastRP -> FastRP_Dimension; + Tuning_FastRP -> FastRP_Normalization_Strength; + Tuning_FastRP 
-> FastRP_Forth_Iteration_Weight; + FastRP_Dimension -> FastRP_Algorithm; + FastRP_Normalization_Strength -> FastRP_Algorithm; + FastRP_Forth_Iteration_Weight -> FastRP_Algorithm + FastRP_Algorithm -> Tuning_FastRP [label="adjusted mutual info score\n(incl. preview clustering)", style="dashed"] + FastRP_Algorithm -> NodeEmbeddings; + + // Uniform Manifold Approximation and Projection (UMAP) + subgraph cluster_UMAP { + label="Uniform Manifold Approximation and Projection (UMAP)\nDimensionality Reduction for Visualization"; + style=filled; color=lightgrey; + node [shape=box, style=filled, fillcolor=white]; + + UMAP_Algorithm [label="UMAP"]; + UMAP_Coordinates [label="2D Coordinates", shape=ellipse]; + } + + // UMAP relationships + NodeEmbeddings -> UMAP_Algorithm + UMAP_Algorithm -> UMAP_Coordinates + + // HDBSCAN clustering and tuning + subgraph cluster_hdbscan { + label="Hierarchical Density-Based Spatial Clustering (HDBSCAN)"; + style=filled; color=lightgoldenrod; + node [shape=box, style=filled, fillcolor=white]; + + Tuning_HDBSCAN [label="Tuning\n(Optuna)"]; + HDBSCAN_Node [label="HDBSCAN"]; + HDBSCAN_Min_Cluster_Size [label="Min Cluster Size", shape=diamond]; + HDBSCAN_Min_Samples [label="Min Samples", shape=diamond]; + + ClusterLabel [label="Label", shape=ellipse]; + ClusterRadius [label="Radius\n(avg,max)", shape=ellipse]; + ClusterSize [label="Size", shape=ellipse]; + NormDistToMedoid [label="Normalized Distance\nTo Medoid", shape=ellipse]; + ClusterNoise [label="Noise\n(label=-1)", shape=ellipse]; + ClusterProbability [label="Probability", shape=ellipse]; + ClusterApproximationOutlierScore [label="Approximation\nOutlierScore\n(= 1 - Probability)", shape=ellipse]; + } + + // --- Inputs into HDBSCAN --- + CommunityId -> Tuning_HDBSCAN [label="reference"]; + NodeEmbeddings -> HDBSCAN_Node; + + Tuning_HDBSCAN -> HDBSCAN_Min_Cluster_Size + Tuning_HDBSCAN -> HDBSCAN_Min_Samples + HDBSCAN_Min_Cluster_Size -> HDBSCAN_Node; + HDBSCAN_Min_Samples -> HDBSCAN_Node; + + 
HDBSCAN_Node -> Tuning_HDBSCAN [label="adjusted mutual info score", style=dashed]; + + // HDBSCAN outputs (cluster features) + HDBSCAN_Node -> ClusterLabel; + HDBSCAN_Node -> ClusterNoise; + HDBSCAN_Node -> ClusterRadius; + HDBSCAN_Node -> ClusterSize; + HDBSCAN_Node -> NormDistToMedoid; + HDBSCAN_Node -> ClusterProbability; + HDBSCAN_Node -> ClusterApproximationOutlierScore; + + // Graph algorithm based features + subgraph cluster_graph_features { + label="Graph (Algorithm) Features"; + style=filled; color=lightcyan; + node [shape=ellipse, style=filled, fillcolor=white]; + + ArticleRank [label="ArticleRank"]; + PageRank [label="PageRank"]; + PageRank_minus_ArticleRank [label="PageRank -\nArticleRank"]; + BetweennessCentrality [label="Betweenness\nCentrality"]; + LocalClusteringCoefficient [label="Local Clustering\nCoefficient"]; + Degree [label="Degree\n(in, out, sum)"]; + } + + // Anomaly detection model area + subgraph cluster_anomaly { + label="Anomaly Detection Model"; + style=filled; color=lightgreen; penwidth=4; pencolor=green; margin="50,50"; + node [shape=box, style=filled, fillcolor=white]; + + TuningAnomaly [label="Tuning\n(Optuna)"]; + IsolationMinCluster [label="Min Cluster Size", shape=diamond]; + IsolationEstimators [label="n estimators", shape=diamond]; + + ProxyEstimators [label="n estimators", shape=diamond]; + ProxyMaxDepth [label="max depth", shape=diamond]; + + AnomalyStandardizer [label="Standardizer"] + AnomalyPCA [label="Principal Component\nAnalysis (PCA)"] + IsolationForest [label="Isolation Forest\nAnomaly Detector", margin="0.4,0.4"]; + ProxyRandomForest [label="RandomForest\n(Proxy)"]; + AnomalyScore [label="Score", shape=ellipse]; + AnomalyLabel [label="Label", shape=ellipse]; + } + + // Embeddings feed anomaly model + NodeEmbeddings -> AnomalyPCA; + + // HDBSCAN-derived features feed anomaly model + ClusterRadius -> AnomalyStandardizer; + NormDistToMedoid -> AnomalyStandardizer; + ClusterApproximationOutlierScore -> 
AnomalyStandardizer; + + // Graph Algorithm Features feed anomaly model + ArticleRank -> AnomalyStandardizer; + PageRank -> AnomalyStandardizer; + PageRank_minus_ArticleRank -> AnomalyStandardizer; + BetweennessCentrality -> AnomalyStandardizer; + LocalClusteringCoefficient -> AnomalyStandardizer; + Degree -> AnomalyStandardizer; + + // Proxy RandomForest used as a backing/tuning model for the Isolation Forest + TuningAnomaly -> IsolationMinCluster; + TuningAnomaly -> IsolationEstimators; + IsolationMinCluster -> IsolationForest + IsolationEstimators -> IsolationForest + + TuningAnomaly -> ProxyEstimators + TuningAnomaly -> ProxyMaxDepth + ProxyEstimators -> ProxyRandomForest + ProxyMaxDepth -> ProxyRandomForest + + AnomalyStandardizer -> IsolationForest; + AnomalyPCA -> IsolationForest; + IsolationForest -> ProxyRandomForest [label="reference", style="dashed"]; + ProxyRandomForest -> TuningAnomaly [label="f1 score\n(cross validation)", style="dashed"]; + + IsolationForest -> AnomalyLabel + IsolationForest -> AnomalyScore + + // Explainable AI / SHAP + subgraph cluster_explainability { + label="Explainable AI (SHAP)"; + style=filled; color=lavender; + node [shape=note, style=filled, fillcolor=white]; + + SHAP [label="SHAP TreeExplainer"]; + + SHAP_Values [label="Top SHAP Values", shape=ellipse]; + SHAP_Features [label="Top Features", shape=ellipse]; + SHAP_Embedding_Sum [label="Node Embeddings\nSHAP Sum", shape=ellipse]; + } + + // Explainability connections (RandomForest -> SHAP) + ProxyRandomForest -> SHAP; + SHAP -> SHAP_Values; + SHAP -> SHAP_Features; + SHAP -> SHAP_Embedding_Sum; + +} \ No newline at end of file diff --git a/domains/anomaly-detection/documentation/Architecture.svg b/domains/anomaly-detection/documentation/Architecture.svg new file mode 100644 index 000000000..e59b1662b --- /dev/null +++ b/domains/anomaly-detection/documentation/Architecture.svg @@ -0,0 +1,702 @@ + + + + + + +AnomalyDetectionPipeline + + +cluster_leiden + +Leiden Community 
Detection + + +cluster_fastRP + +Fast Random Projection (FastRP) + + +cluster_UMAP + +Uniform Manifold Approximation and Projection (UMAP) +Dimensionality Reduction for Visualization + + +cluster_hdbscan + +Hierarchical Density-Based Spatial Clustering (HDBSCAN) + + +cluster_graph_features + +Graph (Algorithm) Features + + +cluster_anomaly + +Anomaly Detection Model + + +cluster_explainability + +Explainable AI (SHAP) + + + +Tuning_Leiden + +Tuning +(Optuna) + + + +Leiden_Gamma + +gamma + + + +Tuning_Leiden->Leiden_Gamma + + + + + +Leiden_Theta + +theta + + + +Tuning_Leiden->Leiden_Theta + + + + + +Leiden_Algorithm + +Leiden Community Detection + + + +Leiden_Gamma->Leiden_Algorithm + + + + + +Leiden_Theta->Leiden_Algorithm + + + + + +Leiden_Algorithm->Tuning_Leiden + + +modularity + + + +Leiden_Algorithm->Tuning_Leiden + + +size + + + +CommunityId + +Community + + + +Leiden_Algorithm->CommunityId + + + + + +Tuning_HDBSCAN + +Tuning +(Optuna) + + + +CommunityId->Tuning_HDBSCAN + + +reference + + + +Tuning_FastRP + +Tuning +(Optuna) + + + +FastRP_Dimension + +dimension + + + +Tuning_FastRP->FastRP_Dimension + + + + + +FastRP_Normalization_Strength + +normalization strength + + + +Tuning_FastRP->FastRP_Normalization_Strength + + + + + +FastRP_Forth_Iteration_Weight + +forth iteration weight + + + +Tuning_FastRP->FastRP_Forth_Iteration_Weight + + + + + +FastRP_Algorithm + +FastRP + + + +FastRP_Dimension->FastRP_Algorithm + + + + + +FastRP_Normalization_Strength->FastRP_Algorithm + + + + + +FastRP_Forth_Iteration_Weight->FastRP_Algorithm + + + + + +FastRP_Algorithm->Tuning_FastRP + + +adjusted mutual info score +(incl. 
preview clustering) + + + +NodeEmbeddings + +Node Embeddings + + + +FastRP_Algorithm->NodeEmbeddings + + + + + +UMAP_Algorithm + +UMAP + + + +NodeEmbeddings->UMAP_Algorithm + + + + + +HDBSCAN_Node + +HDBSCAN + + + +NodeEmbeddings->HDBSCAN_Node + + + + + +AnomalyPCA + +Principal Component +Analysis (PCA) + + + +NodeEmbeddings->AnomalyPCA + + + + + +UMAP_Coordinates + +2D Coordinates + + + +UMAP_Algorithm->UMAP_Coordinates + + + + + +HDBSCAN_Min_Cluster_Size + +Min Cluster Size + + + +Tuning_HDBSCAN->HDBSCAN_Min_Cluster_Size + + + + + +HDBSCAN_Min_Samples + +Min Samples + + + +Tuning_HDBSCAN->HDBSCAN_Min_Samples + + + + + +HDBSCAN_Node->Tuning_HDBSCAN + + +adjusted mutual info score + + + +ClusterLabel + +Label + + + +HDBSCAN_Node->ClusterLabel + + + + + +ClusterRadius + +Radius +(avg,max) + + + +HDBSCAN_Node->ClusterRadius + + + + + +ClusterSize + +Size + + + +HDBSCAN_Node->ClusterSize + + + + + +NormDistToMedoid + +Normalized Distance +To Medoid + + + +HDBSCAN_Node->NormDistToMedoid + + + + + +ClusterNoise + +Noise +(label=-1) + + + +HDBSCAN_Node->ClusterNoise + + + + + +ClusterProbability + +Probability + + + +HDBSCAN_Node->ClusterProbability + + + + + +ClusterApproximationOutlierScore + +Approximation +OutlierScore +(= 1 - Probability) + + + +HDBSCAN_Node->ClusterApproximationOutlierScore + + + + + +HDBSCAN_Min_Cluster_Size->HDBSCAN_Node + + + + + +HDBSCAN_Min_Samples->HDBSCAN_Node + + + + + +AnomalyStandardizer + +Standardizer + + + +ClusterRadius->AnomalyStandardizer + + + + + +NormDistToMedoid->AnomalyStandardizer + + + + + +ClusterApproximationOutlierScore->AnomalyStandardizer + + + + + +ArticleRank + +ArticleRank + + + +ArticleRank->AnomalyStandardizer + + + + + +PageRank + +PageRank + + + +PageRank->AnomalyStandardizer + + + + + +PageRank_minus_ArticleRank + +PageRank - +ArticleRank + + + +PageRank_minus_ArticleRank->AnomalyStandardizer + + + + + +BetweennessCentrality + +Betweenness +Centrality + + + +BetweennessCentrality->AnomalyStandardizer + + + + + 
+LocalClusteringCoefficient + +Local Clustering +Coefficient + + + +LocalClusteringCoefficient->AnomalyStandardizer + + + + + +Degree + +Degree +(in, out, sum) + + + +Degree->AnomalyStandardizer + + + + + +TuningAnomaly + +Tuning +(Optuna) + + + +IsolationMinCluster + +Min Cluster Size + + + +TuningAnomaly->IsolationMinCluster + + + + + +IsolationEstimators + +n estimators + + + +TuningAnomaly->IsolationEstimators + + + + + +ProxyEstimators + +n estimators + + + +TuningAnomaly->ProxyEstimators + + + + + +ProxyMaxDepth + +max depth + + + +TuningAnomaly->ProxyMaxDepth + + + + + +IsolationForest + +Isolation Forest +Anomaly Detector + + + +IsolationMinCluster->IsolationForest + + + + + +IsolationEstimators->IsolationForest + + + + + +ProxyRandomForest + +RandomForest +(Proxy) + + + +ProxyEstimators->ProxyRandomForest + + + + + +ProxyMaxDepth->ProxyRandomForest + + + + + +AnomalyStandardizer->IsolationForest + + + + + +AnomalyPCA->IsolationForest + + + + + +IsolationForest->ProxyRandomForest + + +reference + + + +AnomalyScore + +Score + + + +IsolationForest->AnomalyScore + + + + + +AnomalyLabel + +Label + + + +IsolationForest->AnomalyLabel + + + + + +ProxyRandomForest->TuningAnomaly + + +f1 score +(cross validation) + + + +SHAP + + + +SHAP TreeExplainer + + + +ProxyRandomForest->SHAP + + + + + +SHAP_Values + +Top SHAP Values + + + +SHAP->SHAP_Values + + + + + +SHAP_Features + +Top Features + + + +SHAP->SHAP_Features + + + + + +SHAP_Embedding_Sum + +Node Embeddings +SHAP Sum + + + +SHAP->SHAP_Embedding_Sum + + + + + + diff --git a/domains/anomaly-detection/documentation/README.md b/domains/anomaly-detection/documentation/README.md new file mode 100644 index 000000000..0432ff144 --- /dev/null +++ b/domains/anomaly-detection/documentation/README.md @@ -0,0 +1,11 @@ +# Documentation for Anomaly Detection Domain + +This directory contains resources and documentation related to the Anomaly Detection domain within the Code Graph Analysis Pipeline project. 
+ +## Generate Architecture Diagram + +To generate the architecture diagram for the Anomaly Detection domain, you can use the [renderArchitecture.sh](./renderArchitecture.sh) script in this directory. It utilizes Graphviz to create a visual representations of the anomaly detection pipeline architecture described in [Architecture.gv](./Architecture.gv) to render a SVG file. + +The generated SVG file will also be added to the summary report Appendix section. + +:warning: Currently, the architecture description in `Architecture.gv` is manually maintained. The same applies to the SVG file, that needs to be regenerated manually when changes are made to the `.gv` file. diff --git a/domains/anomaly-detection/documentation/renderArchitecture.sh b/domains/anomaly-detection/documentation/renderArchitecture.sh new file mode 100755 index 000000000..73ef7142f --- /dev/null +++ b/domains/anomaly-detection/documentation/renderArchitecture.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +# Renders the described Graph in Architecture.gv as a SVG image. +# +# Requires renderGraphVizSVG.sh +# +# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) +set -o errexit -o pipefail + +## Get this "scripts/reports" directory if not already set +# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. +# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. +# This way non-standard tools like readlink aren't needed. +ANOMALY_DETECTION_DOCS_DIR=${ANOMALY_DETECTION_DOCS_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)} # Directory containing documentation for the anomaly detection +# Get the "scripts" directory by taking the path of this script and going one directory up. 
+SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_DOCS_DIR}/../../../scripts"} # Repository directory containing the shell scripts +# Get the "scripts/visualization" directory. +VISUALIZATION_SCRIPTS_DIR=${VISUALIZATION_SCRIPTS_DIR:-"${SCRIPTS_DIR}/visualization"} # Repository directory containing the shell scripts for visualization + +source "${VISUALIZATION_SCRIPTS_DIR}/renderGraphVizSVG.sh" "${ANOMALY_DETECTION_DOCS_DIR}/Architecture.gv" \ No newline at end of file diff --git a/renovate.json b/renovate.json index 1fbc4eec6..ba0a808af 100644 --- a/renovate.json +++ b/renovate.json @@ -178,6 +178,7 @@ "fileMatch": [ "^scripts/[^/]*\\.sh$", "^scripts/visualization/[^/]*\\.sh$", + "^domains/anomaly-detection/documentation/[^/]*\\.sh$", "^(workflow-templates|\\.github/workflows)\\/[^/]+\\.ya?ml$", "(^|\\/)action\\.ya?ml$]" ], diff --git a/scripts/visualization/renderGraphVizSVG.sh b/scripts/visualization/renderGraphVizSVG.sh new file mode 100755 index 000000000..164cb7ce2 --- /dev/null +++ b/scripts/visualization/renderGraphVizSVG.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +# Renders the given GraphViz file as a SVG image. +# +# Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands) +set -o errexit -o pipefail + +# Local constants +SCRIPT_NAME=$(basename "${0}") + +# Read the first unnamed input argument containing the version of the project +inputGvFileName="${1}" + +if [ -z "${inputGvFileName}" ]; then + echo "${SCRIPT_NAME}: Error: Please specify the GraphViz *.gv file as input parameter." + exit 1 +fi + +if [ ! -f "${inputGvFileName}" ]; then + echo "${SCRIPT_NAME}: Error: GraphViz file not found: ${inputGvFileName}" + exit 1 +fi + +number_of_input_file_lines=$(wc -l < "${inputGvFileName}" | awk '{print $1}') +if [ "${number_of_input_file_lines}" -le 1 ]; then + echo "${SCRIPT_NAME}: Info: Input file is empty. Skipping *.svg file generation." + return 0 +fi + +echo "${SCRIPT_NAME}: Rendering ${inputGvFileName}..." 
+ +graphName=$(basename -- "${inputGvFileName}") +graphName="${graphName%.*}" # Remove file extension +graphName=${graphName//-/_} # Replace all dashes in the graphName by underscores +inputGvFilePath=$(dirname "${inputGvFileName}") + +if command -v "dot" &> /dev/null ; then + echo "${SCRIPT_NAME}: Info: Rendering ${inputGvFileName} using preinstalled GraphViz dot command line interface..." + dot -T svg "${inputGvFilePath}/${graphName}.gv" > "${inputGvFilePath}/${graphName}.svg" + return 0 +fi + +if ! command -v "npx" &> /dev/null ; then + echo "${SCRIPT_NAME}: Error: Command npx (to run npm locally) not found. It's needed for Graph visualization with GraphViz." >&2 + exit 1 +fi + +# Run GraphViz command line interface (CLI) wrapped utilizing WASM (WebAssembly) +# to convert the DOT file to SVG operating system independently. +# Use "npm install" first to create local "node_modules" and be able to run it after that in offline mode. +echo "${SCRIPT_NAME}: Info: Rendering ${inputGvFileName} using npx to run GraphViz CLI Web Assembly Wrapper..." 
npm install @hpcc-js/wasm-graphviz-cli@1.6.0 --silent --no-progress --loglevel=error > /dev/null
+npx --yes @hpcc-js/wasm-graphviz-cli@1.6.0 -T svg "${inputGvFileName}" > "${inputGvFilePath}/${graphName}.svg"
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SUMMARY_DIR}/../../../scripts"} # Repository directory containing the shell scripts @@ -180,6 +182,7 @@ anomaly_detection_finalize_report() { # Collect static Markdown includes (after cleanup to not remove one-liner) cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_no_dependency_data.template.md" "${report_include_directory}/report_no_dependency_data.md" cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_no_anomaly_detection_treemaps.template.md" "${report_include_directory}/report_no_anomaly_detection_treemaps.md" + cp -f "${ANOMALY_DETECTION_DOCS_DIR}/Architecture.svg" "${FULL_REPORT_DIRECTORY}/AnomalyDetectionArchitecture.svg" # Assemble final report by applying includes to the main template cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report.template.md" "${FULL_REPORT_DIRECTORY}/report.template.md" diff --git a/domains/anomaly-detection/summary/report.template.md b/domains/anomaly-detection/summary/report.template.md index 407cfc5a1..164bec860 100644 --- a/domains/anomaly-detection/summary/report.template.md +++ b/domains/anomaly-detection/summary/report.template.md @@ -268,3 +268,7 @@ archetypes: * Cluster Radius (avg, max) * Cluster Size * Node Embedding (PCA 20–35 dims) + +### 6.3 Architecture Diagram + +![Anomaly Detection Architecture](./AnomalyDetectionArchitecture.svg) \ No newline at end of file From a53aa52deca0a9a6193e7dcff4c2157ddd0a716c Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 23 Nov 2025 10:40:59 +0100 Subject: [PATCH 3/3] Add anomaly detector input feature visualization --- ...yDetectionIsolationForestExploration.ipynb | 289 +++++++++++++++++- .../tunedAnomalyDetectionExplained.py | 142 ++++++++- 2 files changed, 419 insertions(+), 12 deletions(-) diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb index 757966892..c354ba369 100644 --- 
a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb +++ b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb @@ -67,6 +67,7 @@ "from optuna import Study, create_study\n", "\n", "import shap # Explainable AI tool\n", + "import umap\n", "\n", "import matplotlib.pyplot as plot" ] @@ -921,6 +922,7 @@ " cluster_label_column: str = \"clusterLabel\",\n", " cluster_medoid_column: str = \"clusterMedoid\",\n", " cluster_size_column: str = \"clusterSize\",\n", + " cluster_color_map: str = \"tab20\",\n", " anomaly_label_column: str = \"anomalyLabel\",\n", " anomaly_score_column: str = \"anomalyScore\",\n", " size_column: str = \"articleRank\",\n", @@ -929,6 +931,8 @@ " annotate_top_n_anomalies: int = 10,\n", " annotate_top_n_non_anomalies: int = 5,\n", " annotate_top_n_clusters: int = 20,\n", + " percentile_of_distance_to_center: float = 0.8,\n", + " no_cluster_coloring: bool = False,\n", ") -> None:\n", " \n", " if clustering_visualization_dataframe.empty:\n", @@ -966,7 +970,7 @@ "\n", " distances_to_center = calculate_distances_to_center(clustering_visualization_dataframe, x_position_column, y_position_column)\n", " top_anomaly_columns_mask = mask_top_anomaly_columns(clustering_visualization_dataframe, anomaly_score_column, annotate_top_n_anomalies)\n", - " clustering_visualization_dataframe_zoomed = zoom_into_center_while_preserving_masked_rows(clustering_visualization_dataframe, distances_to_center, top_anomaly_columns_mask)\n", + " clustering_visualization_dataframe_zoomed = zoom_into_center_while_preserving_masked_rows(clustering_visualization_dataframe, distances_to_center, top_anomaly_columns_mask, percentile_of_distance_to_center)\n", "\n", " cluster_anomalies = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[anomaly_label_column] == 1]\n", " cluster_without_anomalies = 
clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[anomaly_label_column] != 1]\n", @@ -982,7 +986,7 @@ " y=cluster_noise[y_position_column],\n", " s=cluster_noise[size_column] * 60 + 2,\n", " color='lightgrey',\n", - " alpha=0.4,\n", + " alpha=0.3,\n", " label='Noise'\n", " )\n", "\n", @@ -991,9 +995,9 @@ " x=cluster_non_noise[x_position_column],\n", " y=cluster_non_noise[y_position_column],\n", " s=cluster_non_noise[size_column] * 60 + 2,\n", - " c=cluster_non_noise[cluster_label_column],\n", - " cmap='tab20',\n", - " alpha=0.7,\n", + " c=cluster_non_noise[cluster_label_column] if not no_cluster_coloring else 'silver',\n", + " cmap=cluster_color_map if not no_cluster_coloring else None,\n", + " alpha=0.5,\n", " label='Clusters'\n", " )\n", "\n", @@ -1085,7 +1089,7 @@ " plot.annotate(\n", " text=f\"#{index + 1}: {truncate(row[code_unit_column])} ({row[anomaly_score_column]:.3f})\",\n", " xy=(row[x_position_column], row[y_position_column]),\n", - " xytext=(5, 5 + (index % 5) * 10),\n", + " xytext=(5, 5 + (index % 5) * 15),\n", " color='red',\n", " **plot_annotation_style\n", " )\n", @@ -1103,12 +1107,210 @@ "plot_anomalies(java_package_anomaly_detection_features, title_prefix=\"Java Package Anomalies\")" ] }, + { + "cell_type": "markdown", + "id": "77dee89a", + "metadata": {}, + "source": [ + "#### 1.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n", + "\n", + "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. 
Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n", + "\n", + "- Red: detected anomalies \n", + "- Lightgrey: code units labeled as noise by HDBSCAN \n", + "- Greys: cluster labels \n", + "- Size: Article Rank (larger = more important)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c30a29f8", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_features_for_2d_visualization(features: np.ndarray, anomaly_detection_results: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Reduces the dimensionality of the features down to two dimensions for 2D visualization using UMAP.\n", + " see https://umap-learn.readthedocs.io\n", + " \"\"\"\n", + "\n", + " # Check if features are empty\n", + " if features is None or len(features) == 0:\n", + " print(\"No feature data available\")\n", + " return anomaly_detection_results\n", + "\n", + " # Check if features and anomaly_detection_results have compatible lengths\n", + " if features.shape[0] != anomaly_detection_results.shape[0]:\n", + " raise ValueError(\"Features and anomaly_detection_results must have the same number of samples.\")\n", + "\n", + " # Use UMAP to reduce the dimensionality to 2D for visualization\n", + " umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1)\n", + " two_dimensional_features = umap_reducer.fit_transform(features)\n", + " \n", + " # Convert to dense numpy array (works for both sparse and dense input)\n", + " feature_coordinates = np.asarray(two_dimensional_features)\n", + "\n", + " anomaly_detection_results['featureVisualizationX'] = feature_coordinates[:, 0]\n", + " anomaly_detection_results['featureVisualizationY'] = feature_coordinates[:, 1]\n", + "\n", + " return anomaly_detection_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f02b5dec", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_features_with_anomalies(\n", + " 
clustering_visualization_dataframe: pd.DataFrame,\n", + " title_prefix: str,\n", + " code_unit_column: str = \"shortCodeUnitName\",\n", + " cluster_label_column: str = \"clusterLabel\",\n", + " anomaly_label_column: str = \"anomalyLabel\",\n", + " anomaly_score_column: str = \"anomalyScore\",\n", + " size_column: str = \"articleRank\",\n", + " x_position_column: str = 'embeddingVisualizationX',\n", + " y_position_column: str = 'embeddingVisualizationY',\n", + " annotate_top_n_anomalies: int = 10,\n", + " annotate_fully_top_n_anomalies: int = 3,\n", + ") -> None:\n", + " \n", + " if clustering_visualization_dataframe.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + " \n", + " def truncate(text: str, max_length: int = 22):\n", + " if len(text) <= max_length:\n", + " return text\n", + " return text[:max_length - 3] + \"...\"\n", + "\n", + "\n", + " cluster_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] == 1]\n", + " cluster_without_anomalies = clustering_visualization_dataframe[clustering_visualization_dataframe[anomaly_label_column] != 1]\n", + " cluster_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] == -1]\n", + " cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1]\n", + "\n", + " plot.figure(figsize=(10, 10))\n", + " plot.title(f\"{title_prefix} (size={size_column}, red=anomaly, blue=noise)\", pad=20)\n", + "\n", + " # Plot noise (from clustering)\n", + " plot.scatter(\n", + " x=cluster_noise[x_position_column],\n", + " y=cluster_noise[y_position_column],\n", + " s=cluster_noise[size_column] * 20 + 2,\n", + " color='lightblue',\n", + " alpha=0.4,\n", + " label='Noise'\n", + " )\n", + "\n", + " # Plot clusters\n", + " plot.scatter(\n", + " x=cluster_non_noise[x_position_column],\n", + " y=cluster_non_noise[y_position_column],\n", + " s=cluster_non_noise[size_column] * 20 + 2,\n", + " 
color='lightgrey',\n", + " alpha=0.6,\n", + " label='Clusters'\n", + " )\n", + "\n", + " # Plot anomalies\n", + " plot.scatter(\n", + " x=cluster_anomalies[x_position_column],\n", + " y=cluster_anomalies[y_position_column],\n", + " s=cluster_anomalies[size_column] * 10 + 2,\n", + " c=cluster_anomalies[anomaly_score_column],\n", + " cmap=\"Reds\",\n", + " alpha=0.95,\n", + " label='Anomaly',\n", + " )\n", + "\n", + " # Annotate top anomalies\n", + " anomalies = cluster_anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(annotate_top_n_anomalies)\n", + " anomalies_in_reversed_order = anomalies.iloc[::-1] # plot most important annotations last to overlap less important ones\n", + " for dataframe_index, row in anomalies_in_reversed_order.iterrows():\n", + " index = typing.cast(int, dataframe_index)\n", + " text = f\"{index + 1}\"\n", + " xytext = (5, 5)\n", + " if index < annotate_fully_top_n_anomalies:\n", + " text = f\"{text}: {truncate(row[code_unit_column])}\"\n", + " xytext = (5, 5 + (index % 4) * 12)\n", + "\n", + " plot.annotate(\n", + " text=text,\n", + " xy=(row[x_position_column], row[y_position_column]),\n", + " xytext=xytext,\n", + " color='red',\n", + " **plot_annotation_style\n", + " )\n", + "\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6af9eb9", + "metadata": {}, + "outputs": [], + "source": [ + "java_package_anomaly_detection_features = prepare_features_for_2d_visualization(\n", + " java_package_anomaly_detection_features_prepared,\n", + " java_package_anomaly_detection_features\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a679562", + "metadata": {}, + "outputs": [], + "source": [ + "plot_features_with_anomalies(\n", + " java_package_anomaly_detection_features,\n", + " title_prefix=\"Java Package Anomalies (2D Feature Visualization)\",\n", + " x_position_column='featureVisualizationX',\n", + " 
y_position_column='featureVisualizationY',\n", + " annotate_top_n_anomalies=5,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f9832cc9", + "metadata": {}, + "source": [ + "##### 1.4b/2 Plot features zoomed with highlighted top anomalies in a 2D scatter plot (UMAP reduction)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acbe2034", + "metadata": {}, + "outputs": [], + "source": [ + "plot_anomalies(\n", + " java_package_anomaly_detection_features,\n", + " title_prefix=\"Java Package Anomalies (2D Feature Visualization Zoomed)\",\n", + " x_position_column='featureVisualizationX',\n", + " y_position_column='featureVisualizationY',\n", + " annotate_top_n_clusters=0,\n", + " annotate_top_n_non_anomalies=0,\n", + " percentile_of_distance_to_center=0.7,\n", + " no_cluster_coloring=True\n", + ")" + ] + }, { "cell_type": "markdown", "id": "0f1b08b6", "metadata": {}, "source": [ - "#### 1.4b Plot anomalies solely based on embeddings" + "#### 1.4c Plot anomalies solely based on embeddings" ] }, { @@ -1914,12 +2116,83 @@ "plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")" ] }, + { + "cell_type": "markdown", + "id": "6eb52ab0", + "metadata": {}, + "source": [ + "#### 2.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n", + "\n", + "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. 
Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n", + "\n", + "- Red: detected anomalies \n", + "- Lightblue: code units labeled as noise by HDBSCAN \n", + "- Lightgrey: clusters \n", + "- Size: Article Rank (larger = more important)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "129cced0", + "metadata": {}, + "outputs": [], + "source": [ + "java_type_anomaly_detection_features = prepare_features_for_2d_visualization(\n", + " java_type_anomaly_detection_features_prepared,\n", + " java_type_anomaly_detection_features\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f05ef08c", + "metadata": {}, + "outputs": [], + "source": [ + "plot_features_with_anomalies(\n", + " java_type_anomaly_detection_features,\n", + " title_prefix=\"Java Type Anomalies (2D Feature Visualization)\",\n", + " x_position_column='featureVisualizationX',\n", + " y_position_column='featureVisualizationY',\n", + " annotate_top_n_anomalies=30\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3472efed", + "metadata": {}, + "source": [ + "##### 2.4b/2 Plot features zoomed with highlighted top anomalies in a 2D scatter plot (UMAP reduction)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c44f04e9", + "metadata": {}, + "outputs": [], + "source": [ + "plot_anomalies(\n", + " java_type_anomaly_detection_features,\n", + " title_prefix=\"Java Type Anomalies (2D Feature Visualization Zoomed)\",\n", + " x_position_column='featureVisualizationX',\n", + " y_position_column='featureVisualizationY',\n", + " annotate_top_n_clusters=0,\n", + " annotate_top_n_non_anomalies=0,\n", + " percentile_of_distance_to_center=0.7,\n", + " no_cluster_coloring=True\n", + ")" + ] + }, { "cell_type": "markdown", "id": "05275be7", "metadata": {}, "source": [ - "#### 2.4.b Plot anomalies solely based on embeddings" + "#### 2.4c Plot anomalies solely based on embeddings" ] }, { diff --git 
a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index 722aa997d..83daea96c 100755 --- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -37,9 +37,10 @@ import shap # Explainable AI tool +import umap # Dimensionality reduction for visualization import matplotlib.pyplot as plot -from visualization import annotate_each, annotate_each_with_index, scale_marker_sizes, zoom_into_center_while_preserving_top_scores +from visualization import annotate_each, annotate_each_with_index, scale_marker_sizes, zoom_into_center_while_preserving_top_scores, plot_annotation_style class Parameters: required_parameters_ = ["projection_node_label"] @@ -149,10 +150,17 @@ def get_file_path(name: str, parameters: Parameters, extension: str = 'svg') -> return name +def get_neo4j_password() -> str: + password = os.environ.get("NEO4J_INITIAL_PASSWORD") + if password is None: + raise RuntimeError("Environment variable NEO4J_INITIAL_PASSWORD is not set. Please set it to the Neo4j password.") + return password + + def get_graph_database_driver() -> Driver: driver = GraphDatabase.driver( uri="bolt://localhost:7687", - auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD")) + auth=("neo4j", get_neo4j_password()) ) driver.verify_connectivity() return driver @@ -544,6 +552,34 @@ def add_anomaly_detection_results_to_features( return features +def prepare_features_for_2d_visualization(features: np.ndarray, anomaly_detection_results: pd.DataFrame) -> pd.DataFrame: + """ + Reduces the dimensionality of the features down to two dimensions for 2D visualization using UMAP. 
+ see https://umap-learn.readthedocs.io + """ + + # Check if features are empty + if features is None or len(features) == 0: + print("No feature data available") + return anomaly_detection_results + + # Check if features and anomaly_detection_results have compatible lengths + if features.shape[0] != anomaly_detection_results.shape[0]: + raise ValueError("Features and anomaly_detection_results must have the same number of samples.") + + # Use UMAP to reduce the dimensionality to 2D for visualization + umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1) + two_dimensional_features = umap_reducer.fit_transform(features) + + # Convert to dense numpy array (works for both sparse and dense input) + feature_coordinates = np.asarray(two_dimensional_features) + + anomaly_detection_results['featureVisualizationX'] = feature_coordinates[:, 0] + anomaly_detection_results['featureVisualizationY'] = feature_coordinates[:, 1] + + return anomaly_detection_results + + def get_top_10_anomalies( anomaly_detected_features: pd.DataFrame, anomaly_label_column: str = "anomalyLabel", @@ -609,7 +645,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1] plot.figure(figsize=(10, 10)) - plot.title(f"{title_prefix} (size={size_column}, main-color=cluster, red=anomaly, green=non-anomaly)", pad=20) + plot.title(f"{title_prefix} Anomalies (size={size_column}, main-color=cluster, red=anomaly, green=non-anomaly)", pad=20) # Plot noise (from clustering) plot.scatter( @@ -678,6 +714,93 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict: plot.close() +def plot_features_with_anomalies( + features_to_visualize: pd.DataFrame, + title_prefix: str, + plot_file_path: str, + code_unit_column: str = "shortCodeUnitName", + cluster_label_column: str = "clusterLabel", + anomaly_label_column: str = "anomalyLabel", + anomaly_score_column: str = "anomalyScore", + 
size_column: str = "articleRank", + x_position_column: str = 'featureVisualizationX', + y_position_column: str = 'featureVisualizationY', + annotate_top_n_anomalies: int = 5, + annotate_fully_top_n_anomalies: int = 3, +) -> None: + + if features_to_visualize.empty: + print("No projected data to plot available") + return + + def truncate(text: str, max_length: int = 22): + if len(text) <= max_length: + return text + return text[:max_length - 3] + "..." + + features_to_visualize.loc[:, size_column + '_scaled'] = scale_marker_sizes(features_to_visualize[size_column]) + def get_common_plot_parameters(data: pd.DataFrame) -> dict: + return { + "x": data[x_position_column], + "y": data[y_position_column], + "s": data[size_column + '_scaled'], + } + cluster_anomalies = features_to_visualize[features_to_visualize[anomaly_label_column] == 1] + cluster_without_anomalies = features_to_visualize[features_to_visualize[anomaly_label_column] != 1] + cluster_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] == -1] + cluster_non_noise = cluster_without_anomalies[cluster_without_anomalies[cluster_label_column] != -1] + + plot.figure(figsize=(10, 10)) + plot.title(f"{title_prefix} Anomaly Detection Features (size={size_column}, red=anomaly, blue=noise)", pad=20) + + # Plot noise (from clustering) + plot.scatter( + **get_common_plot_parameters(cluster_noise), + color='lightblue', + alpha=0.4, + label='Noise' + ) + + # Plot clusters + plot.scatter( + **get_common_plot_parameters(cluster_non_noise), + color='lightgrey', + alpha=0.6, + label='Clusters' + ) + + # Plot anomalies + plot.scatter( + **get_common_plot_parameters(cluster_anomalies), + c=cluster_anomalies[anomaly_score_column], + cmap="Reds", + alpha=0.95, + label='Anomaly', + ) + + # Annotate top anomalies + anomalies = cluster_anomalies.sort_values(by=anomaly_score_column, ascending=False).reset_index(drop=True).head(annotate_top_n_anomalies) + anomalies_in_reversed_order = anomalies.iloc[::-1] # 
plot most important annotations last to overlap less important ones + for dataframe_index, row in anomalies_in_reversed_order.iterrows(): + index = typing.cast(int, dataframe_index) + text = f"{index + 1}" + xytext = (5, 5) + if index < annotate_fully_top_n_anomalies: + text = f"{text}: {truncate(row[code_unit_column])}" + xytext = (5, 5 + (index % 4) * 12) + + plot.annotate( + text=text, + xy=(row[x_position_column], row[y_position_column]), + xytext=xytext, + color='red', + **plot_annotation_style + ) + + plot.savefig(plot_file_path) + plot.close() + + DType = typing.TypeVar("DType", bound=np.generic) Numpy_Array = numpy_typing.NDArray[DType] Two_Dimensional_Vector = typing.Annotated[Numpy_Array, typing.Literal[2]] @@ -1050,10 +1173,21 @@ def output_top_shap_explained_global_features_as_markdown_table( plot_anomalies( features_to_visualize=features, - title_prefix="Java Package Anomalies", + title_prefix=parameters.get_title_prefix(), plot_file_path=get_file_path("Anomalies", parameters) ) +features = prepare_features_for_2d_visualization( + features_prepared, + features +) + +plot_features_with_anomalies( + features_to_visualize=features, + title_prefix=parameters.get_title_prefix(), + plot_file_path=get_file_path("AnomalyDetectionFeatures", parameters), +) + if parameters.is_verbose(): feature_importances = pd.Series(anomaly_detection_results.feature_importances, index=feature_names).sort_values(ascending=False) print("tunedAnomalyDetectionExplained: Most influential features for anomaly detection according to the proxy model directly without SHAP (top 10):")