Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions domains/anomaly-detection/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Anomaly Detection Domain

This directory contains the implementation and resources related to the Anomaly Detection domain within the Code Graph Analysis Pipeline project.

## Entry Points

The following scripts serve as entry points for various anomaly detection tasks and reports. They are invoked dynamically by name by [AllReports.sh](./../../scripts/reports/compilations/AllReports.sh) and its sub-scripts.

- [anomalyDetectionCsv.sh](./anomalyDetectionCsv.sh): Entry point for CSV reports based solely on Graph queries.
- [anomalyDetectionPython.sh](./anomalyDetectionPython.sh): Entry point for Python-based anomaly detection tasks and reports.
- [anomalyDetectionVisualization.sh](./anomalyDetectionVisualization.sh): Entry point for Graph visualization reports.
- [anomalyDetectionMarkdown.sh](./anomalyDetectionMarkdown.sh): Entry point for generating the Markdown summary report.

## Folder Structure

- [documentation](./documentation): Contains documentation including architecture diagrams.
- [explore](./explore/): Jupyter notebooks for interactive, exploratory anomaly detection analysis.
- [features](./features/): Cypher queries to extract features and run graph algorithms relevant for anomaly detection.
- [graphs](./graphs/): Cypher queries and GraphViz templates for Graph visualizations related to anomaly detection.
- [labels](./labels/): Cypher queries to label nodes that represent specific archetypes.
- [queries](./queries/): Cypher queries to identify anomalies based on various (deterministic/explainable) criteria.
- [reset](./reset/): Cypher queries to reset the graph database state related to anomaly detection.
- [summary](./summary/): Markdown templates and resources for generating the summary report.

## Pipeline Architecture Overview

![Anomaly Detection Architecture](./documentation/Architecture.svg)
194 changes: 194 additions & 0 deletions domains/anomaly-detection/documentation/Architecture.gv
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
digraph AnomalyDetectionPipeline {
rankdir=LR;
node [fontname="Helvetica", fontsize=10];

// Cluster: Leiden community detection together with its Optuna tuning loop.
subgraph cluster_leiden {
    label = "Leiden Community Detection";
    style = "filled";
    color = "lightblue";
    node [shape="box", style="filled", fillcolor="white"];

    // Tuner, the two diamond-shaped parameter nodes, the algorithm, and its ellipse-shaped output.
    Tuning_Leiden    [label="Tuning\n(Optuna)"];
    Leiden_Gamma     [label="gamma", shape="diamond"];
    Leiden_Theta     [label="theta", shape="diamond"];
    Leiden_Algorithm [label="Leiden Community Detection"];
    CommunityId      [label="Community", shape="ellipse"];
}

// Leiden wiring: the tuner drives gamma and theta, both feed the algorithm.
// Dashed edges carry the modularity and size results back to the tuner;
// the solid edge out of the algorithm produces the community node.
Tuning_Leiden    -> Leiden_Gamma;
Tuning_Leiden    -> Leiden_Theta;
Leiden_Gamma     -> Leiden_Algorithm;
Leiden_Theta     -> Leiden_Algorithm;
Leiden_Algorithm -> Tuning_Leiden [label="modularity", style="dashed"];
Leiden_Algorithm -> Tuning_Leiden [label="size", style="dashed"];
Leiden_Algorithm -> CommunityId;

// Cluster: Fast Random Projection (FastRP) node embeddings together with its Optuna tuning loop.
subgraph cluster_fastRP {
    label = "Fast Random Projection (FastRP)";
    style = "filled";
    color = "lightpink";
    node [shape="box", style="filled", fillcolor="white"];

    // Tuner and the three diamond-shaped parameter nodes it controls.
    Tuning_FastRP                 [label="Tuning\n(Optuna)"];
    FastRP_Dimension              [label="dimension", shape="diamond"];
    FastRP_Normalization_Strength [label="normalization strength", shape="diamond"];
    // The node ID keeps its historical spelling ("Forth") because edges outside this
    // cluster reference it; only the rendered label is corrected to "fourth".
    FastRP_Forth_Iteration_Weight [label="fourth iteration weight", shape="diamond"];

    // Algorithm and its ellipse-shaped output.
    FastRP_Algorithm              [label="FastRP"];
    NodeEmbeddings                [label="Node Embeddings", shape="ellipse"];
}

// FastRP wiring: the tuner drives the three parameter nodes, which all feed the algorithm.
// The dashed edge carries the score back to the tuner; the solid edge emits the embeddings.
Tuning_FastRP                 -> FastRP_Dimension;
Tuning_FastRP                 -> FastRP_Normalization_Strength;
Tuning_FastRP                 -> FastRP_Forth_Iteration_Weight;
FastRP_Dimension              -> FastRP_Algorithm;
FastRP_Normalization_Strength -> FastRP_Algorithm;
FastRP_Forth_Iteration_Weight -> FastRP_Algorithm;
FastRP_Algorithm              -> Tuning_FastRP [label="adjusted mutual info score\n(incl. preview clustering)", style="dashed"];
FastRP_Algorithm              -> NodeEmbeddings;

// Cluster: UMAP dimensionality reduction, used for visualization (see cluster label).
subgraph cluster_UMAP {
    label = "Uniform Manifold Approximation and Projection (UMAP)\nDimensionality Reduction for Visualization";
    style = "filled";
    color = "lightgrey";
    node [shape="box", style="filled", fillcolor="white"];

    // Algorithm node and its ellipse-shaped output.
    UMAP_Algorithm   [label="UMAP"];
    UMAP_Coordinates [label="2D Coordinates", shape="ellipse"];
}

// UMAP wiring: node embeddings go in, 2D coordinates come out.
NodeEmbeddings -> UMAP_Algorithm;
UMAP_Algorithm -> UMAP_Coordinates;

// Cluster: HDBSCAN clustering, its Optuna tuning loop, and the cluster-derived outputs.
subgraph cluster_hdbscan {
    label = "Hierarchical Density-Based Spatial Clustering (HDBSCAN)";
    style = "filled";
    color = "lightgoldenrod";
    node [shape="box", style="filled", fillcolor="white"];

    // Tuner, algorithm, and the two diamond-shaped parameter nodes.
    Tuning_HDBSCAN           [label="Tuning\n(Optuna)"];
    HDBSCAN_Node             [label="HDBSCAN"];
    HDBSCAN_Min_Cluster_Size [label="Min Cluster Size", shape="diamond"];
    HDBSCAN_Min_Samples      [label="Min Samples", shape="diamond"];

    // Ellipse-shaped output nodes produced by the clustering.
    ClusterLabel                     [label="Label", shape="ellipse"];
    ClusterRadius                    [label="Radius\n(avg,max)", shape="ellipse"];
    ClusterSize                      [label="Size", shape="ellipse"];
    NormDistToMedoid                 [label="Normalized Distance\nTo Medoid", shape="ellipse"];
    ClusterNoise                     [label="Noise\n(label=-1)", shape="ellipse"];
    ClusterProbability               [label="Probability", shape="ellipse"];
    ClusterApproximationOutlierScore [label="Approximation\nOutlierScore\n(= 1 - Probability)", shape="ellipse"];
}

// HDBSCAN inputs: the Leiden community serves as the tuning reference,
// and the node embeddings are the data being clustered.
CommunityId    -> Tuning_HDBSCAN [label="reference"];
NodeEmbeddings -> HDBSCAN_Node;

// Tuning loop: the tuner drives both parameter nodes, which feed the algorithm.
Tuning_HDBSCAN           -> HDBSCAN_Min_Cluster_Size;
Tuning_HDBSCAN           -> HDBSCAN_Min_Samples;
HDBSCAN_Min_Cluster_Size -> HDBSCAN_Node;
HDBSCAN_Min_Samples      -> HDBSCAN_Node;

// Dashed feedback edge from the algorithm to the tuner.
HDBSCAN_Node -> Tuning_HDBSCAN [label="adjusted mutual info score", style="dashed"];

// Outputs: every cluster-derived node hangs off the algorithm node.
HDBSCAN_Node -> ClusterLabel;
HDBSCAN_Node -> ClusterNoise;
HDBSCAN_Node -> ClusterRadius;
HDBSCAN_Node -> ClusterSize;
HDBSCAN_Node -> NormDistToMedoid;
HDBSCAN_Node -> ClusterProbability;
HDBSCAN_Node -> ClusterApproximationOutlierScore;

// Cluster: features derived from graph algorithms; every node here is an ellipse.
subgraph cluster_graph_features {
    label = "Graph (Algorithm) Features";
    style = "filled";
    color = "lightcyan";
    node [shape="ellipse", style="filled", fillcolor="white"];

    ArticleRank                [label="ArticleRank"];
    PageRank                   [label="PageRank"];
    PageRank_minus_ArticleRank [label="PageRank -\nArticleRank"];
    BetweennessCentrality      [label="Betweenness\nCentrality"];
    LocalClusteringCoefficient [label="Local Clustering\nCoefficient"];
    Degree                     [label="Degree\n(in, out, sum)"];
}

// Cluster: the anomaly detection model, drawn with a thick green border and a wide margin.
subgraph cluster_anomaly {
    label = "Anomaly Detection Model";
    style = "filled";
    color = "lightgreen";
    penwidth = 4;
    pencolor = "green";
    margin = "50,50";
    node [shape="box", style="filled", fillcolor="white"];

    // Tuner and the diamond-shaped parameter nodes wired to the Isolation Forest.
    TuningAnomaly       [label="Tuning\n(Optuna)"];
    IsolationMinCluster [label="Min Cluster Size", shape="diamond"];
    IsolationEstimators [label="n estimators", shape="diamond"];

    // Diamond-shaped parameter nodes wired to the proxy RandomForest.
    ProxyEstimators [label="n estimators", shape="diamond"];
    ProxyMaxDepth   [label="max depth", shape="diamond"];

    // Preprocessing steps, the two models, and the ellipse-shaped outputs.
    AnomalyStandardizer [label="Standardizer"];
    AnomalyPCA          [label="Principal Component\nAnalysis (PCA)"];
    IsolationForest     [label="Isolation Forest\nAnomaly Detector", margin="0.4,0.4"];
    ProxyRandomForest   [label="RandomForest\n(Proxy)"];
    AnomalyScore        [label="Score", shape="ellipse"];
    AnomalyLabel        [label="Label", shape="ellipse"];
}

// Node embeddings enter the anomaly model through the PCA step.
NodeEmbeddings -> AnomalyPCA;

// Selected HDBSCAN outputs enter through the standardizer.
ClusterRadius                    -> AnomalyStandardizer;
NormDistToMedoid                 -> AnomalyStandardizer;
ClusterApproximationOutlierScore -> AnomalyStandardizer;

// All graph algorithm features enter through the standardizer as well.
ArticleRank                -> AnomalyStandardizer;
PageRank                   -> AnomalyStandardizer;
PageRank_minus_ArticleRank -> AnomalyStandardizer;
BetweennessCentrality      -> AnomalyStandardizer;
LocalClusteringCoefficient -> AnomalyStandardizer;
Degree                     -> AnomalyStandardizer;

// Tuner -> Isolation Forest parameter nodes -> Isolation Forest.
TuningAnomaly       -> IsolationMinCluster;
TuningAnomaly       -> IsolationEstimators;
IsolationMinCluster -> IsolationForest;
IsolationEstimators -> IsolationForest;

// Tuner -> proxy RandomForest parameter nodes -> proxy RandomForest.
TuningAnomaly   -> ProxyEstimators;
TuningAnomaly   -> ProxyMaxDepth;
ProxyEstimators -> ProxyRandomForest;
ProxyMaxDepth   -> ProxyRandomForest;

// Preprocessed features feed the Isolation Forest. Dashed edges: the Isolation Forest
// serves as reference for the proxy RandomForest, whose cross-validated f1 score
// is fed back to the tuner.
AnomalyStandardizer -> IsolationForest;
AnomalyPCA          -> IsolationForest;
IsolationForest     -> ProxyRandomForest [label="reference", style="dashed"];
ProxyRandomForest   -> TuningAnomaly [label="f1 score\n(cross validation)", style="dashed"];

// Model outputs.
IsolationForest -> AnomalyLabel;
IsolationForest -> AnomalyScore;

// Cluster: explainability via SHAP; nodes default to the "note" shape here.
subgraph cluster_explainability {
    label = "Explainable AI (SHAP)";
    style = "filled";
    color = "lavender";
    node [shape="note", style="filled", fillcolor="white"];

    // The explainer itself keeps the default note shape.
    SHAP [label="SHAP TreeExplainer"];

    // Its outputs override the default with ellipses.
    SHAP_Values        [label="Top SHAP Values", shape="ellipse"];
    SHAP_Features      [label="Top Features", shape="ellipse"];
    SHAP_Embedding_Sum [label="Node Embeddings\nSHAP Sum", shape="ellipse"];
}

// SHAP wiring: the proxy RandomForest feeds the explainer, which yields three outputs.
ProxyRandomForest -> SHAP;
SHAP              -> SHAP_Values;
SHAP              -> SHAP_Features;
SHAP              -> SHAP_Embedding_Sum;

}
Loading
Loading