diff --git a/.github/workflows/internal-check-python-venv-support.yml b/.github/workflows/internal-check-python-venv-support.yml
new file mode 100644
index 000000000..46dec4c5f
--- /dev/null
+++ b/.github/workflows/internal-check-python-venv-support.yml
@@ -0,0 +1,41 @@
+name: Check Python venv virtual environment
+
+on:
+  pull_request:
+    branches:
+      - main
+    # Only watch changes related to the Python virtual environment (venv)
+    paths:
+      - 'requirements.txt'
+      - 'scripts/activatePythonEnvironment.sh'
+      - '.github/workflows/internal-check-python-venv-support.yml' # or when this file changes
+
+jobs:
+  check-python-venv-environment:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        include:
+          - python: 3.12
+
+    steps:
+      - name: Checkout GIT Repository
+        uses: actions/checkout@v4
+
+      - name: (Python Setup) Use version ${{ matrix.python }} with venv environment management module
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+          cache: 'pip'
+
+      - name: Activate virtual environment using venv and check if the required packages were installed
+        env:
+          USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV: "true"
+        # 1. Source the script under test to create, activate and install the virtual environment
+        # 2a. Run pip in dry-run mode without installing or resolving dependencies
+        # 2b. Suppress pip's error output (stderr)
+        # 2c. Check with grep if pip *would install* anything
+        # 2d. If pip would still install something, the environment is incomplete: exit with code 1 to fail the check
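+        # Illustrative example of the dry-run output this check reacts to, assuming a
+        # pip version that prints a "Would install" summary line for missing packages:
+        #
+        #   Would install numpy-1.26.4 pandas-2.2.3
+        #
+        # No "Would install" line means all requirements are already satisfied.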
default = "plugin" or "" PREPARE_CONDA_ENVIRONMENT: "false" # Had already been done in step with id "prepare-conda-environment". + USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV: ${{ inputs.use-venv_virtual_python_environment }} run: | TYPESCRIPT_SCAN_HEAP_MEMORY=${{ inputs.typescript-scan-heap-memory }} ./../../scripts/analysis/analyze.sh ${{ inputs.analysis-arguments }} diff --git a/.gitignore b/.gitignore index 6d3a34750..e393661b2 100644 --- a/.gitignore +++ b/.gitignore @@ -98,6 +98,8 @@ __pycache__/ # Python environments .conda +.venv/ +*.pyc # Optuna (and other) Database data *.db \ No newline at end of file diff --git a/COMMANDS.md b/COMMANDS.md index 5c30f0099..99fac5470 100644 --- a/COMMANDS.md +++ b/COMMANDS.md @@ -389,23 +389,23 @@ Here is an example on how to use [executeJupyterNotebook.sh](./scripts/executeJu conda activate codegraph ``` - or by using the environment file [codegraph-environment.yml](./jupyter/environment.yml): + or by using the codegraph environment file [conda-environment.yml](./conda-environment.yml): ```shell - conda env create --file ./jupyter/environment.yml + conda env create --file ./conda-environment.yml conda activate codegraph ``` -- Export full environment.yml +- Export full conda-environment.yml ```shell - conda env export --name codegraph > full-codegraph-environment.yml + conda env export --name codegraph > full-codegraph-conda-environment.yml ``` -- Export only explicit environment.yml +- Export only explicit conda-environment.yml ```shell - conda env export --from-history --name codegraph | grep -v "^prefix: " > explicit-codegraph-environment.yml + conda env export --from-history --name codegraph | grep -v "^prefix: " > explicit-codegraph-conda-environment.yml ``` ### Executing Jupyter Notebooks with nbconvert diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md index 8850965e2..f2e431496 100644 --- a/GETTING_STARTED.md +++ b/GETTING_STARTED.md @@ -84,13 +84,13 @@ Use these optional command line options as needed: ./../../scripts/analysis/analyze.sh --report Csv ``` -- Jupyter notebook reports when Python and Conda are installed (and Chromium Browser for PDF generation): +- Jupyter notebook reports when Python and Conda (or venv) are installed (and Chromium Browser for PDF generation): ```shell ./../../scripts/analysis/analyze.sh --report Jupyter ``` -- Python reports when Python and Conda are installed (without Chromium Browser for PDF generation): +- Python reports when Python and Conda (or venv) are installed (without Chromium Browser for PDF generation): ```shell ./../../scripts/analysis/analyze.sh --report Python @@ -102,7 +102,7 @@ Use these optional command line options as needed: ./../../scripts/analysis/analyze.sh --report Visualization ``` -- All reports with Python, Conda, Node.js and npm installed: +- All reports with Python, Conda (or venv), Node.js and npm installed: ```shell ./../../scripts/analysis/analyze.sh diff --git a/README.md b/README.md index 2634945d3..f002c6147 100644 --- a/README.md +++ b/README.md @@ -86,8 +86,10 @@ Here are some fully automated graph visualizations utilizing [GraphViz](https:// ### Additional Prerequisites for Python and Jupyter Notebooks -- Python is required for Jupyter Notebook reports. -- A conda package manager like [Miniconda](https://docs.conda.io/projects/miniconda/en/latest) or [Anaconda](https://www.anaconda.com/download)(Recommended for Windows) is required for Jupyter Notebook reports. +- Python is required for Jupyter Notebook and Python reports. 
 
 ### Additional Prerequisites for Graph Visualization
@@ -131,13 +133,14 @@ The [Code Structure Analysis Pipeline](./.github/workflows/internal-java-code-an
 
 - [Checkout GIT Repository](https://github.com/actions/checkout)
 - [Setup Java](https://github.com/actions/setup-java)
 - [Setup Python with Conda](https://github.com/conda-incubator/setup-miniconda) package manager [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge)
+- [Setup Python with venv](https://docs.python.org/3/library/venv.html)
 - Download artifacts and optionally source code that contain the code to be analyzed [scripts/downloader](./scripts/downloader)
 - Setup [Neo4j](https://neo4j.com) Graph Database ([analysis.sh](./scripts/analysis/analyze.sh))
 - Setup [jQAssistant](https://jqassistant.github.io/jqassistant/current) for Java and [Typescript](https://github.com/jqassistant-plugin/jqassistant-typescript-plugin) analysis ([analysis.sh](./scripts/analysis/analyze.sh))
 - Start [Neo4j](https://neo4j.com) Graph Database ([analysis.sh](./scripts/analysis/analyze.sh))
 - Generate CSV Reports [scripts/reports](./scripts/reports) using the command line JSON parser [jq](https://jqlang.github.io/jq)
 - Uses [Neo4j Graph Data Science](https://neo4j.com/product/graph-data-science) for community detection, centrality, similarity, node embeddings and topological sort ([analysis.sh](./scripts/analysis/analyze.sh))
-- Generate [Jupyter Notebook](https://jupyter.org) reports using these libraries specified in the [environment.yml](./jupyter/environment.yml):
+- Generate [Jupyter Notebook](https://jupyter.org) reports using these libraries specified in the [conda-environment.yml](./conda-environment.yml):
   - [Python](https://www.python.org)
   - [jupyter](https://jupyter.org)
   - [matplotlib](https://matplotlib.org)
diff --git a/jupyter/environment.yml b/conda-environment.yml
similarity index 100%
rename from jupyter/environment.yml
rename to conda-environment.yml
diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh
index 4bd69c2a3..bd12c2f87 100755
--- a/domains/anomaly-detection/anomalyDetectionPython.sh
+++ b/domains/anomaly-detection/anomalyDetectionPython.sh
@@ -60,6 +60,24 @@ source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
 # Define functions to create and delete Graph Projections like "createUndirectedDependencyProjection"
 source "${SCRIPTS_DIR}/projectionFunctions.sh"
 
+# Define functions (like is_csv_column_greater_zero) to parse CSV format strings from Cypher query results.
+source "${SCRIPTS_DIR}/parseCsvFunctions.sh"
+
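+# Check whether there are enough nodes for a meaningful anomaly detection.
+# Returns true (exit code 0) if at least 15 matching nodes exist, false (exit code 1) otherwise.
+#
+# Required Parameters (names taken from the callers below):
+# - projection_node_label=...
+#   Label of the nodes to count, e.g. "Package"
+# - projection_weight_property=...
+#   Weight property of the dependency relationships, e.g. "weight"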
+source "${SCRIPTS_DIR}/parseCsvFunctions.sh" + +is_sufficient_data_available() { + language=$( extractQueryParameter "projection_language" "${@}" ) + nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) + + query_result=$( execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionNodeCount.cypher" "${@}" ) + node_count=$(get_csv_column_value "${query_result}" "node_count") + if [ "${node_count}" -lt 15 ]; then + echo "anomalyDetectionPipeline: Warning: Skipping anomaly detection. Only ${node_count} ${language} ${nodeLabel} nodes. At least 15 required." + false + else + echo "anomalyDetectionPipeline: Info: Running anomaly detection with ${node_count} ${language} ${nodeLabel} nodes." + true + fi +} + # Query or recalculate features. # # Required Parameters: @@ -158,30 +176,38 @@ EMBEDDING_PROPERTY="embedding_property=embeddingsFastRandomProjectionTunedForClu # -- Java Artifact Node Embeddings ------------------------------- -if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java"; then - createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java" - anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +if is_sufficient_data_available "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight"; then + if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java"; then + createDirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection-directed" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java" + anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=artifact-anomaly-detection" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" + fi fi # -- Java Package Node Embeddings -------------------------------- -if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" "${PROJECTION_LANGUAGE}=Java"; then - createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" "${PROJECTION_LANGUAGE}=Java" - anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +if is_sufficient_data_available "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces"; then + if createUndirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" "${PROJECTION_LANGUAGE}=Java"; then + createDirectedDependencyProjection "${PROJECTION_NAME}=package-anomaly-detection-directed" "${PROJECTION_NODE}=Package" "${PROJECTION_WEIGHT}=weight25PercentInterfaces" "${PROJECTION_LANGUAGE}=Java" + anomaly_detection_python_reports 
"${ALGORITHM_PROJECTION}=package-anomaly-detection" "${ALGORITHM_NODE}=Package" "${ALGORITHM_WEIGHT}=weight25PercentInterfaces" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" + fi fi # -- Java Type Node Embeddings ----------------------------------- -if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then - createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed" - anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +if is_sufficient_data_available "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight"; then + if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then + createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed" + anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" + fi fi # -- Typescript Module Node Embeddings --------------------------- -if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"; then - createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript" - anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${ALGORITHM_LANGUAGE}=Typescript" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" +if is_sufficient_data_available "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight"; then + if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"; then + createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript" + anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${ALGORITHM_LANGUAGE}=Typescript" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}" + fi fi # --------------------------------------------------------------- diff --git a/domains/anomaly-detection/queries/AnomalyDetectionNodeCount.cypher b/domains/anomaly-detection/queries/AnomalyDetectionNodeCount.cypher new file mode 100644 index 000000000..30b27e72a --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionNodeCount.cypher @@ -0,0 +1,10 @@ +// Count the number of nodes with dependencies. 
+
+ MATCH (source)-[dependency:DEPENDS_ON]->(target)
+ WHERE $projection_node_label IN labels(source)
+   AND $projection_node_label IN labels(target)
+   AND $projection_weight_property IN keys(dependency)
+  WITH collect(DISTINCT source.name) AS sources
+      ,collect(DISTINCT target.name) AS targets
+UNWIND sources + targets AS source_or_target
+RETURN count(DISTINCT source_or_target) AS node_count
\ No newline at end of file
diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py
index 7c2356474..4760e5f4a 100755
--- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py
+++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py
@@ -384,10 +384,11 @@ def no_anomalies(cls):
 
 def tune_anomaly_detection_models(
     feature_matrix: np.ndarray,
+    parameters: Parameters,
     contamination: float | typing.Literal["auto"] = 0.05,
     random_seed: int = 42,
     number_of_trials: int = 25,
-    optimization_timeout_in_seconds: int = 60
+    optimization_timeout_in_seconds: int = 50
 ) -> AnomalyDetectionResults:
     """
     Tunes both Isolation Forest and a proxy Random Forest using Optuna, maximizing the F1 score
@@ -464,7 +465,7 @@ def objective(trial) -> float:
 
     # Print the number of samples and features in the feature matrix
     n_samples = feature_matrix.shape[0]
-    print(f"tunedAnomalyDetectionExplained: Tuned Anomaly Detection: Number of samples: {n_samples}, Number of features: {feature_matrix.shape[1]}, Number of trials: {number_of_trials}")
+    print(f"tunedAnomalyDetectionExplained: Tuning Anomaly Detection: Number of samples: {n_samples}, Number of features: {feature_matrix.shape[1]}, Number of trials: {number_of_trials}")
 
     # Run Optuna optimization
     study = create_study(direction="maximize", sampler=TPESampler(seed=random_seed), study_name="AnomalyDetection_Tuning")
@@ -480,7 +481,12 @@ def objective(trial) -> float:
     study.enqueue_trial({'isolation_max_samples': 0.10015063610944819, 'isolation_n_estimators': 329, 'proxy_n_estimators': 314, 'proxy_max_depth': 8})
 
     study.optimize(objective, n_trials=number_of_trials, timeout=optimization_timeout_in_seconds)
-    output_optuna_tuning_results(study, study.study_name)
+
+    # Output tuning results
+    print(f"Best Isolation & Random Forest parameters for {parameters.get_plot_prefix()} after {len(study.trials)}/{number_of_trials} trials with best #{study.best_trial.number} (Optuna):", study.best_params)
+
+    if parameters.is_verbose():
+        output_optuna_tuning_results(study, study.study_name)
 
     if np.isclose(study.best_value, 0.0, rtol=1e-09, atol=1e-09):
         red = "\x1b[31;20m"
@@ -869,7 +875,7 @@ def add_top_shap_features_to_anomalies(
 features_prepared = np.hstack([features_standardized, node_embeddings_reduced])
 feature_names = list(features_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(node_embeddings_reduced.shape[1])]
 
-anomaly_detection_results = tune_anomaly_detection_models(features_prepared)
+anomaly_detection_results = tune_anomaly_detection_models(features_prepared, parameters)
 
 if anomaly_detection_results.is_empty():
     sys.exit(0)
diff --git a/domains/anomaly-detection/tunedLeidenCommunityDetection.py b/domains/anomaly-detection/tunedLeidenCommunityDetection.py
index 832a25f13..64d5a2a58 100755
--- a/domains/anomaly-detection/tunedLeidenCommunityDetection.py
+++ b/domains/anomaly-detection/tunedLeidenCommunityDetection.py
@@ -359,10 +359,10 @@ def objective(trial):
     study.enqueue_trial({'gamma': 1.14, 'theta': 0.001, 'max_levels': 10})
 
     # Execute the hyperparameter tuning
-    study.optimize(objective, n_trials=20, timeout=30)
+    study.optimize(objective, n_trials=20, timeout=20)
 
     # Output tuning results
-    print(f"Best Leiden Community Detection parameters for {parameters.get_projection_name()} (Optuna):", study.best_params)
+    print(f"Best Leiden Community Detection parameters for {parameters.get_projection_name()} after {len(study.trials)}/20 trials with best #{study.best_trial.number} (Optuna):", study.best_params)
 
     if parameters.is_verbose():
         output_detailed_optuna_tuning_results(study)
diff --git a/domains/anomaly-detection/tunedNodeEmbeddingClustering.py b/domains/anomaly-detection/tunedNodeEmbeddingClustering.py
index 2aae7e7cd..514f317da 100755
--- a/domains/anomaly-detection/tunedNodeEmbeddingClustering.py
+++ b/domains/anomaly-detection/tunedNodeEmbeddingClustering.py
@@ -308,7 +308,7 @@ def objective(trial):
 
     # Start the hyperparameter tuning
     study.optimize(objective, n_trials=20, timeout=10)
-    print(f"Best HDBSCAN parameters (Optuna):", study.best_params)
+    print(f"Best HDBSCAN parameters after {len(study.trials)}/20 trials with best #{study.best_trial.number} (Optuna):", study.best_params)
 
     if parameters.is_verbose():
         output_detailed_optuna_tuning_results(study, 'HDBSCAN')
@@ -709,10 +709,12 @@ def objective(trial):
     study.enqueue_trial({'embedding_dimension': 128, 'forth_iteration_weight': 1.0, 'normalization_strength': 0.5})
     study.enqueue_trial({'embedding_dimension': 256, 'forth_iteration_weight': 0.5, 'normalization_strength': 0.3})
     study.enqueue_trial({'embedding_dimension': 256, 'forth_iteration_weight': 1.0, 'normalization_strength': 0.3})
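+    # Seed the study with additional starting configurations so that Optuna begins
+    # from known parameter combinations before exploring the search space further.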
+    study.enqueue_trial({'embedding_dimension': 64, 'normalization_strength': -0.4, 'forth_iteration_weight': 1.4})
+    study.enqueue_trial({'embedding_dimension': 256, 'normalization_strength': 0.3, 'forth_iteration_weight': 1.0})
 
     # Start the hyperparameter tuning
     study.optimize(objective, n_trials=80, timeout=40)
-    print(f"Best Fast Random Projection (FastRP) parameters for {parameters.get_projection_name()} (Optuna):", study.best_params)
+    print(f"Best Fast Random Projection (FastRP) parameters for {parameters.get_projection_name()} after {len(study.trials)}/80 trials with best #{study.best_trial.number} (Optuna):", study.best_params)
 
     if parameters.is_verbose():
         output_detailed_optuna_tuning_results(study, 'Fast Random Projection (FastRP)')
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..5a8642692
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,32 @@
+# --- Core Python version ---
+# NOTE: Python version must be >= 3.12 for compatibility
+# This should be enforced by the user/environment, not pip
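+#
+# To create and populate the environment manually (a minimal sketch of what
+# scripts/activatePythonEnvironment.sh automates):
+#   python3 -m venv .venv
+#   source .venv/bin/activate
+#   pip install --requirement requirements.txt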
+
+# --- Core tools ---
+jupyter==1.1.*
+matplotlib==3.10.*
+nbconvert[webpdf]==7.16.*
+numpy==1.26.*
+pandas==2.2.*
+pip==25.0.*
+setuptools==75.8.* # opentsne uses sklearn.base uses joblib uses distutils missing in Python >= 3.12 (TODO use native openTSNE?)
+typing-extensions==4.12.* # Needed for opentsne and Python >= 3.12
+
+# --- Visualization ---
+wordcloud==1.9.*
+monotonic==1.*
+plotly[kaleido]==6.2.*
+seaborn==0.13 # To visualize clustering results
+
+# --- Machine Learning / Optimization ---
+scikit-learn==1.6.*
+optuna==4.3.*
+umap-learn==0.5.* # Dimensionality reduction to visualize node embeddings in 2D
+
+# --- Database connector ---
+neo4j==5.23.*
+
+# --- Native/scientific packages (may require compilation) ---
+# These are included but may cause install errors in pip/venv
+opentsne==1.0.* # Dimensionality reduction to visualize node embeddings in 2D. Might get replaced by umap.
+shap==0.48.* # For e.g. explaining anomaly detection results
\ No newline at end of file
diff --git a/scripts/activateCondaEnvironment.sh b/scripts/activateCondaEnvironment.sh
index 7e12dc718..5ebbef857 100755
--- a/scripts/activateCondaEnvironment.sh
+++ b/scripts/activateCondaEnvironment.sh
@@ -1,17 +1,33 @@
 #!/usr/bin/env bash
 
-# Activates the Conda (Python package manager) environment "codegraph" with all packages needed to execute the Jupyter Notebooks.
+# Activates the Conda (Python package manager) environment "codegraph" with all packages needed to run the included Jupyter Notebooks and Python scripts.
 
 # Note: This script uses the conda environment defined in CODEGRAPH_CONDA_ENVIRONMENT (defaults to "codegraph").
-# If the environment hadn't been created yet it will use "environment.yml"
-# in the same directory as the given jupyter notebook ipynb file
-# to create the environment.
+# If the environment hasn't been created yet, it will use "conda-environment.yml" from the root directory to create the environment.
 
 # Requires operatingSystemFunctions.sh
 
 # Fail on any error ("-e" = exit on first error, "-o pipefail" exist on errors within piped commands)
 set -o errexit -o pipefail
 
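+# Note: The two early exits below run before any Conda lookups, so skipping the
+# activation does not require a working Conda installation.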
+PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Whether to prepare a Python environment with Conda if needed (default, "true") or use an already prepared Conda environment ("false")
+
+if [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then
+    echo "activateCondaEnvironment: Skipping activation. An already activated environment and installed dependencies are expected (PREPARE_CONDA_ENVIRONMENT=false)."
+    # "return" needs to be used here instead of "exit".
+    # This script is included in another script by using "source".
+    # "exit" would end the main script, "return" just ends this sub script.
+    return 0
+fi
+
+if [ "${USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV}" = "true" ]; then
+    echo "activateCondaEnvironment: Skipping activation. venv will be used instead of conda (USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV=true)."
+    # "return" needs to be used here instead of "exit".
+    # This script is included in another script by using "source".
+    # "exit" would end the main script, "return" just ends this sub script.
+    return 0
+fi
+
 ## Get this "scripts" directory if not already set
 # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
 # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
@@ -25,7 +41,7 @@ echo "activateCondaEnvironment: JUPYTER_NOTEBOOK_DIRECTORY=${JUPYTER_NOTEBOOK_DI
 
 # Get the file name of the environment description file for the conda package and environment manager
 # that contains all dependencies and their versions.
-CONDA_ENVIRONMENT_FILE=${CONDA_ENVIRONMENT_FILE:-"${JUPYTER_NOTEBOOK_DIRECTORY}/environment.yml"} # Conda (package manager for Python) environment file path
+CONDA_ENVIRONMENT_FILE=${CONDA_ENVIRONMENT_FILE:-"${JUPYTER_NOTEBOOK_DIRECTORY}/../conda-environment.yml"} # Conda (package manager for Python) environment file path
 if [ ! -f "${CONDA_ENVIRONMENT_FILE}" ] ; then
     echo "activateCondaEnvironment: Couldn't find environment file ${CONDA_ENVIRONMENT_FILE}."
     exit 2
 fi
@@ -37,16 +53,6 @@ echo "activateCondaEnvironment: CONDA_PREFIX=${CONDA_PREFIX}"
 echo "activateCondaEnvironment: Current conda environment=${CONDA_DEFAULT_ENV}"
 echo "activateCondaEnvironment: Target conda environment=${CODEGRAPH_CONDA_ENVIRONMENT}"
 
-PREPARE_CONDA_ENVIRONMENT=${PREPARE_CONDA_ENVIRONMENT:-"true"} # Wether to prepare a Python environment with Conda if needed (default, "true") or use an already prepared Conda environment ("false")
-
-if [ "${PREPARE_CONDA_ENVIRONMENT}" = "false" ]; then
-    echo "activateCondaEnvironment: Skipping activation. ${PREPARE_CONDA_ENVIRONMENT} is set to false."
-    # "return" needs to be used here instead of "exit".
-    # This script is included in another script by using "source".
-    # "exit" would end the main script, "return" just ends this sub script.
-    return 0
-fi
-
 # Include operation system function to for example detect Windows.
 source "${SCRIPTS_DIR}/operatingSystemFunctions.sh"
diff --git a/scripts/activatePythonEnvironment.sh b/scripts/activatePythonEnvironment.sh
new file mode 100755
index 000000000..0c1a6d0e8
--- /dev/null
+++ b/scripts/activatePythonEnvironment.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+
+# Activates the .venv environment (Python's built-in virtual environments) with all packages necessary to run the included Jupyter Notebooks and Python scripts.
+
+# Note: If the environment hasn't been created yet, it will use "requirements.txt" from the root directory to create the environment.
+
+# Requires operatingSystemFunctions.sh
+
+# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
+set -o errexit -o pipefail
+
+USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV=${USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV:-"false"} # Use "venv" for virtual Python environments ("true") or use an already prepared (e.g. conda) environment (default, "false").
+
+if [ "${USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV}" = "false" ]; then
+    echo "activatePythonEnvironment: Skipping activation. An already activated environment and installed dependencies are expected, e.g. by using conda (USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV=false)."
+    # "return" needs to be used here instead of "exit".
+    # This script is included in another script by using "source".
+    # "exit" would end the main script, "return" just ends this sub script.
+    return 0
+fi
+
+## Get this "scripts" directory if not already set
+# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by other shells and is therefore the preferred solution here.
+# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
+# This way non-standard tools like readlink aren't needed.
+SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts
+echo "activatePythonEnvironment: SCRIPTS_DIR=${SCRIPTS_DIR}"
+
+# Get the root directory by taking the path of this script and going one directory up.
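+# The three steps further below are idempotent: the .venv directory is only created
+# if it is missing, it is only activated if it isn't the current VIRTUAL_ENV, and
+# dependencies are only installed if a pip dry run reports missing packages.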
+ROOT_DIRECTORY=${ROOT_DIRECTORY:-"$(dirname "${SCRIPTS_DIR}")"} # Repository root directory containing "requirements.txt"
+echo "activatePythonEnvironment: ROOT_DIRECTORY=${ROOT_DIRECTORY}"
+
+# Get the file name of the environment description file for the pip package installer
+# that contains all dependencies and their versions.
+PYTHON_ENVIRONMENT_FILE=${PYTHON_ENVIRONMENT_FILE:-"${ROOT_DIRECTORY}/requirements.txt"} # Pip (package manager for Python) environment file path
+if [ ! -f "${PYTHON_ENVIRONMENT_FILE}" ] ; then
+    echo "activatePythonEnvironment: Couldn't find environment file ${PYTHON_ENVIRONMENT_FILE}."
+    exit 2
+fi
+
+deactivate_conda_if_necessary() {
+    # Include operating system functions to, for example, detect Windows.
+    source "${SCRIPTS_DIR}/operatingSystemFunctions.sh"
+
+    # Determine the path to "conda"
+    if [ -n "${CONDA}" ]; then
+        if isWindows; then
+            pathToConda="${CONDA}\\Scripts\\" # the trailing backslash character is required
+        else
+            pathToConda="${CONDA}/bin/" # the trailing slash character is required
+        fi
+    else
+        pathToConda=""
+    fi
+    echo "activatePythonEnvironment: pathToConda=${pathToConda} (for deactivation)"
+
+    scriptExtension=$(ifWindows ".bat" "")
+    echo "activatePythonEnvironment: scriptExtension=${scriptExtension}"
+
+    if "${pathToConda}conda" deactivate >/dev/null 2>&1; then
+        # Call "deactivate" a second time to also deactivate the "base" environment
+        "${pathToConda}conda" deactivate >/dev/null 2>&1;
+        echo "activatePythonEnvironment: Conda deactivated"
+    else
+        echo "activatePythonEnvironment: Conda not found. Deactivation skipped"
+    fi
+}
+
+VENV_DIRECTORY=".venv"
+
+# Create the virtual environment if needed
+if [ ! -d "${ROOT_DIRECTORY}/${VENV_DIRECTORY}" ]; then
+    deactivate_conda_if_necessary
+    echo "activatePythonEnvironment: Creating ${VENV_DIRECTORY} environment..."
+    python3 -m venv "${ROOT_DIRECTORY}/${VENV_DIRECTORY}"
+else
+    echo "activatePythonEnvironment: Already created ${VENV_DIRECTORY} environment."
+fi
+
+# Activate the virtual environment if needed
+if [ "${VIRTUAL_ENV}" != "${ROOT_DIRECTORY}/${VENV_DIRECTORY}" ]; then
+    echo "activatePythonEnvironment: Activate ${VENV_DIRECTORY} environment..."
+    source "${ROOT_DIRECTORY}/${VENV_DIRECTORY}/bin/activate"
+else
+    echo "activatePythonEnvironment: Already activated ${VENV_DIRECTORY} environment."
+fi
+
+# Install the dependencies into the virtual environment if needed
+if pip install --dry-run --no-deps --requirement "${PYTHON_ENVIRONMENT_FILE}" 2>/dev/null | grep -q "Would install"; then
+    echo "activatePythonEnvironment: Install environment dependencies..."
+    pip install -q --requirement "${PYTHON_ENVIRONMENT_FILE}"
+else
+    echo "activatePythonEnvironment: Already installed environment dependencies."
+fi
+
+echo "activatePythonEnvironment: Python installation: $(which python)"
\ No newline at end of file
diff --git a/scripts/executeJupyterNotebook.sh b/scripts/executeJupyterNotebook.sh
index 4e852011b..980b7156a 100755
--- a/scripts/executeJupyterNotebook.sh
+++ b/scripts/executeJupyterNotebook.sh
@@ -11,9 +11,12 @@
 # The original ones are typically saved with all outputs cleared to be able to better compare their changes with git diff.
 
 # Note: This script uses the conda environment defined in CODEGRAPH_CONDA_ENVIRONMENT (defaults to "codegraph").
-# If the environment hadn't been created yet it will use "environment.yml"
-# in the same directory as the given jupyter notebook ipynb file
-# to create the environment.
+# If the environment hasn't been created yet it will use "conda-environment.yml"
+# in the root directory of this repository to create the environment.
+
+# Note: Besides Conda, Python's built-in virtual environment module "venv" is also supported.
+# If the environment hasn't been created yet it will use "requirements.txt"
+# in the root directory of this repository to create the environment.
 
 # Requires juypter nbconvert,operatingSystemFunctions.sh
 
@@ -85,8 +88,12 @@ if [ ! -f "${jupyter_notebook_file_path}/.env" ] ; then
     echo "NEO4J_INITIAL_PASSWORD=${NEO4J_INITIAL_PASSWORD}" > "${jupyter_notebook_file_path}/.env"
 fi
 
-# Create and activate (if necessary) Conda environment as defined in environment variable CODEGRAPH_CONDA_ENVIRONMENT (default "codegraph")
-source "${SCRIPTS_DIR}/activateCondaEnvironment.sh"
+# Create and activate (if necessary) a virtual environment (Conda or venv).
+# For Conda, the environment name is taken from the environment variable CODEGRAPH_CONDA_ENVIRONMENT (default "codegraph")
+# and the dependencies are listed in the root directory file "conda-environment.yml".
+# For venv, the dependencies are listed in the root directory file "requirements.txt".
+time source "${SCRIPTS_DIR}/activateCondaEnvironment.sh"
+time source "${SCRIPTS_DIR}/activatePythonEnvironment.sh"
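+# Note: Each activation script returns right away when it isn't selected, so exactly
+# one of the two environments ends up active: Conda by default, venv when
+# USE_VIRTUAL_PYTHON_ENVIRONMENT_VENV is set to "true".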
 
 jupyter --version || exit 1
diff --git a/scripts/reports/compilations/PythonReports.sh b/scripts/reports/compilations/PythonReports.sh
index b46e89bf5..730736916 100755
--- a/scripts/reports/compilations/PythonReports.sh
+++ b/scripts/reports/compilations/PythonReports.sh
@@ -22,10 +22,20 @@ echo "PythonReports: REPORT_COMPILATIONS_SCRIPT_DIR=${REPORT_COMPILATIONS_SCRIPT
 REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR:-$(dirname -- "${REPORT_COMPILATIONS_SCRIPT_DIR}")}
 echo "PythonReports: REPORTS_SCRIPT_DIR=${REPORTS_SCRIPT_DIR}"
 
+SCRIPTS_DIR=${SCRIPTS_DIR:-$(dirname -- "${REPORTS_SCRIPT_DIR}")}
+echo "PythonReports: SCRIPTS_DIR=${SCRIPTS_DIR}"
+
 # Get the "domains" directory that contains analysis and report scripts by functionality.
 DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY:-"${REPORTS_SCRIPT_DIR}/../../domains"}
 echo "PythonReports: DOMAINS_DIRECTORY=${DOMAINS_DIRECTORY}"
 
+# Create and activate (if necessary) a virtual environment (Conda or venv).
+# For Conda, the environment name is taken from the environment variable CODEGRAPH_CONDA_ENVIRONMENT (default "codegraph")
+# and the dependencies are listed in the root directory file "conda-environment.yml".
+# For venv, the dependencies are listed in the root directory file "requirements.txt".
+time source "${SCRIPTS_DIR}/activateCondaEnvironment.sh"
+time source "${SCRIPTS_DIR}/activatePythonEnvironment.sh"
+
 # Run all Python report scripts (filename ending with Csv.sh) in the REPORTS_SCRIPT_DIR and DOMAINS_DIRECTORY directories.
 for directory in "${REPORTS_SCRIPT_DIR}" "${DOMAINS_DIRECTORY}"; do
     if [ ! -d "${directory}" ]; then