Merged
41 changes: 41 additions & 0 deletions .github/workflows/internal-check-python-venv-support.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name: Check Python venv virtual environment

on:
pull_request:
branches:
- main
# Only watch changes related to the Python venv virtual environment
paths:
- 'requirements.txt'
- 'scripts/activatePythonEnvironment.sh'
- '.github/workflows/internal-check-python-venv-support.yml' # or when this file changed

jobs:
check-python-venv-environment:
runs-on: ubuntu-22.04
strategy:
matrix:
include:
- python: 3.12

steps:
- name: Checkout GIT Repository
uses: actions/checkout@v4

- name: (Python Setup) Use version ${{ matrix.python }} with venv environment management module
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
cache: 'pip'

- name: Activate virtual environment using venv and check if the required packages were installed
env:
USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV: "true"
# 1. Run the script under test to create and activate the virtual environment and install the requirements
# 2a. Run pip in dry-run mode without installing or resolving dependencies
# 2b. Suppress pip's error output (stderr)
# 2c. Check with grep if pip *would install* anything
# 2d. If packages would still be installed, the environment is incomplete: exit with code 1
run: |
./scripts/activatePythonEnvironment.sh
if pip install --dry-run --no-deps --requirement "./requirements.txt" 2>/dev/null | grep -q "Would install"; then exit 1; fi
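The dry-run check in this step can be exercised in isolation. The sketch below factors the grep decision into a small helper; the helper name is illustrative and not part of the repository:

```shell
# Hypothetical helper: given the captured output of
# `pip install --dry-run --no-deps --requirement requirements.txt`,
# decide whether the environment already satisfies all requirements.
requirements_satisfied() {
  # Returns 1 (failure) if pip would still install something,
  # 0 (success) if all requirements are already met.
  if printf '%s\n' "$1" | grep -q "Would install"; then
    return 1
  fi
  return 0
}
```

For example, `requirements_satisfied "Would install pandas-2.2.0"` fails, signalling an incomplete environment, while output containing only "Requirement already satisfied" lines succeeds.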
19 changes: 18 additions & 1 deletion .github/workflows/public-analyze-code-graph.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ on:
required: false
type: string
default: 'true'
use-venv_virtual_python_environment:
description: >
Use venv ("true") instead of Conda ("false", default) to manage the virtual Python environment.
required: false
type: string
default: 'false'
outputs:
uploaded-analysis-results:
description: >
Expand Down Expand Up @@ -103,16 +109,26 @@ jobs:

# "Setup Python" can be skipped if jupyter notebook analysis-results aren't needed
- name: (Python Setup) Use version ${{ matrix.python }} with Conda package manager Miniforge
if: inputs.use-venv_virtual_python_environment == 'false'
id: prepare-conda-environment
uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3
with:
python-version: ${{ matrix.python }}
miniforge-version: ${{ matrix.miniforge }}
activate-environment: codegraph
environment-file: ./jupyter/environment.yml
environment-file: ./conda-environment.yml
auto-activate-base: false
show-channel-urls: true

- name: (Python Setup) Use version ${{ matrix.python }} with venv environment management module
if: inputs.use-venv_virtual_python_environment == 'true'
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
cache: 'pip'

- name: (Python Setup) Conda environment info
if: inputs.use-venv_virtual_python_environment == 'false'
shell: bash -el {0}
run: |
conda info
Expand Down Expand Up @@ -168,6 +184,7 @@ jobs:
ENABLE_JUPYTER_NOTEBOOK_PDF_GENERATION: ${{ inputs.jupyter-pdf }}
IMPORT_GIT_LOG_DATA_IF_SOURCE_IS_PRESENT: "" # Options: "none", "aggregated", "full". default = "plugin" or ""
PREPARE_CONDA_ENVIRONMENT: "false" # Had already been done in step with id "prepare-conda-environment".
USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV: ${{ inputs.use-venv_virtual_python_environment }}
run: |
TYPESCRIPT_SCAN_HEAP_MEMORY=${{ inputs.typescript-scan-heap-memory }} ./../../scripts/analysis/analyze.sh ${{ inputs.analysis-arguments }}

Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ __pycache__/

# Python environments
.conda
.venv/
*.pyc

# Optuna (and other) Database data
*.db
12 changes: 6 additions & 6 deletions COMMANDS.md
Original file line number Diff line number Diff line change
Expand Up @@ -389,23 +389,23 @@ Here is an example on how to use [executeJupyterNotebook.sh](./scripts/executeJu
conda activate codegraph
```

or by using the environment file [codegraph-environment.yml](./jupyter/environment.yml):
or by using the codegraph environment file [conda-environment.yml](./conda-environment.yml):

```shell
conda env create --file ./jupyter/environment.yml
conda env create --file ./conda-environment.yml
conda activate codegraph
```

- Export full environment.yml
- Export full conda-environment.yml

```shell
conda env export --name codegraph > full-codegraph-environment.yml
conda env export --name codegraph > full-codegraph-conda-environment.yml
```

- Export only explicit environment.yml
- Export only explicit conda-environment.yml

```shell
conda env export --from-history --name codegraph | grep -v "^prefix: " > explicit-codegraph-environment.yml
conda env export --from-history --name codegraph | grep -v "^prefix: " > explicit-codegraph-conda-environment.yml
```
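The `grep -v "^prefix: "` filter removes the machine-specific `prefix:` line that `conda env export` appends. A minimal sketch of the same filtering, simulated with sample text instead of a real `conda env export` run:

```shell
# Strip the machine-specific "prefix:" line from a Conda environment export,
# so the resulting environment file is portable across machines.
strip_prefix_line() {
  grep -v "^prefix: "
}

# Simulated export output (a real run would pipe `conda env export` instead):
sample_export='name: codegraph
dependencies:
- python=3.12
prefix: /home/user/miniconda3/envs/codegraph'

portable_export=$(printf '%s\n' "$sample_export" | strip_prefix_line)
```

The anchored pattern `^prefix: ` only matches the top-level `prefix:` key, so indented dependency lines are left untouched.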

### Executing Jupyter Notebooks with nbconvert
Expand Down
6 changes: 3 additions & 3 deletions GETTING_STARTED.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,13 @@ Use these optional command line options as needed:
./../../scripts/analysis/analyze.sh --report Csv
```

- Jupyter notebook reports when Python and Conda are installed (and Chromium Browser for PDF generation):
- Jupyter notebook reports when Python and Conda (or venv) are installed (and Chromium Browser for PDF generation):

```shell
./../../scripts/analysis/analyze.sh --report Jupyter
```

- Python reports when Python and Conda are installed (without Chromium Browser for PDF generation):
- Python reports when Python and Conda (or venv) are installed (without Chromium Browser for PDF generation):

```shell
./../../scripts/analysis/analyze.sh --report Python
Expand All @@ -102,7 +102,7 @@ Use these optional command line options as needed:
./../../scripts/analysis/analyze.sh --report Visualization
```

- All reports with Python, Conda, Node.js and npm installed:
- All reports with Python, Conda (or venv), Node.js and npm installed:

```shell
./../../scripts/analysis/analyze.sh
Expand Down
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,10 @@ Here are some fully automated graph visualizations utilizing [GraphViz](https://

### Additional Prerequisites for Python and Jupyter Notebooks

- Python is required for Jupyter Notebook reports.
- A conda package manager like [Miniconda](https://docs.conda.io/projects/miniconda/en/latest) or [Anaconda](https://www.anaconda.com/download)(Recommended for Windows) is required for Jupyter Notebook reports.
- Python is required for Jupyter Notebook and Python reports.
- Either [Conda](https://docs.conda.io) or Python's built-in module [venv](https://docs.python.org/3/library/venv.html) is required as environment manager.
- For Conda, use for example [Miniconda](https://docs.conda.io/projects/miniconda/en/latest) or [Anaconda](https://www.anaconda.com/download) (recommended for Windows).
- To use venv, no additional installation is needed. Set the environment variable `USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV` to `true` to activate it.
- Chromium will automatically be downloaded if needed for Jupyter Notebook PDF reports generation.

### Additional Prerequisites for Graph Visualization
Expand Down Expand Up @@ -131,13 +133,14 @@ The [Code Structure Analysis Pipeline](./.github/workflows/internal-java-code-an
- [Checkout GIT Repository](https://github.com/actions/checkout)
- [Setup Java](https://github.com/actions/setup-java)
- [Setup Python with Conda](https://github.com/conda-incubator/setup-miniconda) package manager [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge)
- [Setup Python with venv](https://docs.python.org/3/library/venv.html)
- Download artifacts and optionally source code that contain the code to be analyzed [scripts/downloader](./scripts/downloader)
- Setup [Neo4j](https://neo4j.com) Graph Database ([analysis.sh](./scripts/analysis/analyze.sh))
- Setup [jQAssistant](https://jqassistant.github.io/jqassistant/current) for Java and [Typescript](https://github.com/jqassistant-plugin/jqassistant-typescript-plugin) analysis ([analysis.sh](./scripts/analysis/analyze.sh))
- Start [Neo4j](https://neo4j.com) Graph Database ([analysis.sh](./scripts/analysis/analyze.sh))
- Generate CSV Reports [scripts/reports](./scripts/reports) using the command line JSON parser [jq](https://jqlang.github.io/jq)
- Uses [Neo4j Graph Data Science](https://neo4j.com/product/graph-data-science) for community detection, centrality, similarity, node embeddings and topological sort ([analysis.sh](./scripts/analysis/analyze.sh))
- Generate [Jupyter Notebook](https://jupyter.org) reports using these libraries specified in the [environment.yml](./jupyter/environment.yml):
- Generate [Jupyter Notebook](https://jupyter.org) reports using these libraries specified in the [conda-environment.yml](./conda-environment.yml):
- [Python](https://www.python.org)
- [jupyter](https://jupyter.org)
- [matplotlib](https://matplotlib.org)
Expand Down
File renamed without changes.
50 changes: 38 additions & 12 deletions domains/anomaly-detection/anomalyDetectionPython.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,24 @@ source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
# Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
source "${SCRIPTS_DIR}/projectionFunctions.sh"

# Define functions (like is_csv_column_greater_zero) to parse CSV format strings from Cypher query results.
source "${SCRIPTS_DIR}/parseCsvFunctions.sh"

is_sufficient_data_available() {
language=$( extractQueryParameter "projection_language" "${@}" )
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

query_result=$( execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionNodeCount.cypher" "${@}" )
node_count=$(get_csv_column_value "${query_result}" "node_count")
if [ "${node_count}" -lt 15 ]; then
echo "anomalyDetectionPipeline: Warning: Skipping anomaly detection. Only ${node_count} ${language} ${nodeLabel} nodes. At least 15 required."
false
else
echo "anomalyDetectionPipeline: Info: Running anomaly detection with ${node_count} ${language} ${nodeLabel} nodes."
true
fi
}
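The 15-node threshold applied by `is_sufficient_data_available` relies on bare `true`/`false` calls to set the function's return status. A stripped-down, illustrative variant that takes the already-parsed node count directly:

```shell
# Hypothetical stand-alone version of the minimum-node-count guard:
# succeeds (exit status 0) only when at least 15 nodes are available,
# mirroring how the last command (`true` or `false`) determines the
# return status of is_sufficient_data_available above.
has_sufficient_nodes() {
  if [ "$1" -lt 15 ]; then
    false
  else
    true
  fi
}
```

`has_sufficient_nodes 14` fails while `has_sufficient_nodes 15` succeeds, so callers can guard projection creation with a plain `if`, exactly as the sections below do.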

# Query or recalculate features.
#
# Required Parameters:
Expand Down Expand Up @@ -158,30 +176,38 @@ EMBEDDING_PROPERTY="embedding_property=embeddingsFastRandomProjectionTunedForClu

# -- Java Artifact Node Embeddings -------------------------------

if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java"; then
createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
if is_sufficient_data_available "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight"; then
if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java"; then
createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
fi
fi

# -- Java Package Node Embeddings --------------------------------

if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" "${PROJECTION_LANGUAGE}=Java"; then
createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" "${PROJECTION_LANGUAGE}=Java"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
if is_sufficient_data_available "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces"; then
if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" "${PROJECTION_LANGUAGE}=Java"; then
createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" "${PROJECTION_LANGUAGE}=Java"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
fi
fi

# -- Java Type Node Embeddings -----------------------------------

if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
if is_sufficient_data_available "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight"; then
if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
fi
fi

# -- Typescript Module Node Embeddings ---------------------------

if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"; then
createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${ALGORITHM_LANGUAGE}=Typescript" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
if is_sufficient_data_available "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight"; then
if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"; then
createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${ALGORITHM_LANGUAGE}=Typescript" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
fi
fi

# ---------------------------------------------------------------
Expand Down
10 changes: 10 additions & 0 deletions domains/anomaly-detection/queries/AnomalyDetectionNodeCount.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Count the number of nodes with dependencies. Variables: projection_node_label, projection_weight_property

MATCH (source)-[dependency:DEPENDS_ON]->(target)
WHERE $projection_node_label IN labels(source)
AND $projection_node_label IN labels(target)
AND $projection_weight_property IN keys(dependency)
WITH collect(DISTINCT source.name) AS sources
,collect(DISTINCT target.name) AS targets
UNWIND sources + targets AS source_or_target
RETURN count(DISTINCT source_or_target) AS node_count
14 changes: 10 additions & 4 deletions domains/anomaly-detection/tunedAnomalyDetectionExplained.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,11 @@ def no_anomalies(cls):

def tune_anomaly_detection_models(
feature_matrix: np.ndarray,
parameters: Parameters,
contamination: float | typing.Literal["auto"] = 0.05,
random_seed: int = 42,
number_of_trials: int = 25,
optimization_timeout_in_seconds: int = 60
optimization_timeout_in_seconds: int = 50
) -> AnomalyDetectionResults:
"""
Tunes both Isolation Forest and a proxy Random Forest using Optuna, maximizing the F1 score
Expand Down Expand Up @@ -464,7 +465,7 @@ def objective(trial) -> float:

# Print the number of samples and features in the feature matrix
n_samples = feature_matrix.shape[0]
print(f"tunedAnomalyDetectionExplained: Tuned Anomaly Detection: Number of samples: {n_samples}, Number of features: {feature_matrix.shape[1]}, Number of trials: {number_of_trials}")
print(f"tunedAnomalyDetectionExplained: Tuning Anomaly Detection: Number of samples: {n_samples}, Number of features: {feature_matrix.shape[1]}, Number of trials: {number_of_trials}")

# Run Optuna optimization
study = create_study(direction="maximize", sampler=TPESampler(seed=random_seed), study_name="AnomalyDetection_Tuning")
Expand All @@ -480,7 +481,12 @@ def objective(trial) -> float:
study.enqueue_trial({'isolation_max_samples': 0.10015063610944819, 'isolation_n_estimators': 329, 'proxy_n_estimators': 314, 'proxy_max_depth': 8})

study.optimize(objective, n_trials=number_of_trials, timeout=optimization_timeout_in_seconds)
output_optuna_tuning_results(study, study.study_name)

# Output tuning results
print(f"Best Isolation & Random Forest parameters for {parameters.get_plot_prefix()} after {len(study.trials)}/{number_of_trials} trials with best #{study.best_trial.number} (Optuna):", study.best_params)

if parameters.is_verbose():
output_optuna_tuning_results(study, study.study_name)

if np.isclose(study.best_value, 0.0, rtol=1e-09, atol=1e-09):
red = "\x1b[31;20m"
Expand Down Expand Up @@ -869,7 +875,7 @@ def add_top_shap_features_to_anomalies(
features_prepared = np.hstack([features_standardized, node_embeddings_reduced])
feature_names = list(features_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(node_embeddings_reduced.shape[1])]

anomaly_detection_results = tune_anomaly_detection_models(features_prepared)
anomaly_detection_results = tune_anomaly_detection_models(features_prepared, parameters)
if anomaly_detection_results.is_empty():
sys.exit(0)

Expand Down
4 changes: 2 additions & 2 deletions domains/anomaly-detection/tunedLeidenCommunityDetection.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,10 +359,10 @@ def objective(trial):
study.enqueue_trial({'gamma': 1.14, 'theta': 0.001, 'max_levels': 10})

# Execute the hyperparameter tuning
study.optimize(objective, n_trials=20, timeout=30)
study.optimize(objective, n_trials=20, timeout=20)

# Output tuning results
print(f"Best Leiden Community Detection parameters for {parameters.get_projection_name()} (Optuna):", study.best_params)
print(f"Best Leiden Community Detection parameters for {parameters.get_projection_name()} after {len(study.trials)}/20 trials with best #{study.best_trial.number} (Optuna):", study.best_params)
if parameters.is_verbose():
output_detailed_optuna_tuning_results(study)

Expand Down