From 6e14e7a8e1808fe013aa50c39b80406005b5c434 Mon Sep 17 00:00:00 2001 From: MelleB Date: Wed, 23 Jun 2021 09:18:23 +0200 Subject: [PATCH] (fix) Bump to airflow 1.10.12 based on PR #623 --- Dockerfile | 5 +- config/airflow.cfg | 96 +++++++++++++++++++++++++++++-- docker-compose-CeleryExecutor.yml | 8 +-- docker-compose-LocalExecutor.yml | 2 +- script/entrypoint.sh | 6 +- 5 files changed, 101 insertions(+), 16 deletions(-) diff --git a/Dockerfile b/Dockerfile index 02782d0c..e9a946af 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ -# VERSION 1.10.9 +# VERSION 1.10.12 # AUTHOR: Matthieu "Puckel_" Roisil +# UPGRADE BY David Wong # DESCRIPTION: Basic Airflow container # BUILD: docker build --rm -t puckel/docker-airflow . # SOURCE: https://github.com/puckel/docker-airflow @@ -12,7 +13,7 @@ ENV DEBIAN_FRONTEND noninteractive ENV TERM linux # Airflow -ARG AIRFLOW_VERSION=1.10.9 +ARG AIRFLOW_VERSION=1.10.12 ARG AIRFLOW_USER_HOME=/usr/local/airflow ARG AIRFLOW_DEPS="" ARG PYTHON_DEPS="" diff --git a/config/airflow.cfg b/config/airflow.cfg index 9e4d5229..906f9149 100644 --- a/config/airflow.cfg +++ b/config/airflow.cfg @@ -110,6 +110,12 @@ sql_alchemy_pool_pre_ping = True # SqlAlchemy supports databases with the concept of multiple schemas. sql_alchemy_schema = +# Import path for connect args in SqlAlchemy. Default to an empty dict. +# This is useful when you want to configure db engine args that SqlAlchemy won't parse +# in connection string. +# See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args +# sql_alchemy_connect_args = + # The amount of parallelism as a setting to the executor. This defines # the max number of task instances that should run simultaneously # on this airflow installation @@ -124,11 +130,16 @@ dags_are_paused_at_creation = True # The maximum number of active DAG runs per DAG max_active_runs_per_dag = 16 -# Whether to load the examples that ship with Airflow. It's good to +# Whether to load the DAG examples that ship with Airflow. It's good to # get started, but you probably want to set this to False in a production # environment load_examples = True +# Whether to load the default connections that ship with Airflow. It's good to +# get started, but you probably want to set this to False in a production +# environment +load_default_connections = False + # Where your Airflow plugins are stored plugins_folder = /usr/local/airflow/plugins @@ -184,7 +195,7 @@ dag_discovery_safe_mode = True # The number of retries each task is going to have by default. Can be overridden at dag or task level. default_task_retries = 0 -# Whether to serialises DAGs and persist them in DB. +# Whether to serialise DAGs and persist them in DB. # If set to True, Webserver reads from DB instead of parsing DAG files # More details: https://airflow.apache.org/docs/stable/dag-serialization.html store_serialized_dags = False @@ -192,9 +203,43 @@ store_serialized_dags = False # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. min_serialized_dag_update_interval = 30 +# Fetching serialized DAG can not be faster than a minimum interval to reduce database +# read rate. This config controls when your DAGs are updated in the Webserver +min_serialized_dag_fetch_interval = 10 + +# Whether to persist DAG files code in DB. +# If set to True, Webserver reads file contents from DB instead of +# trying to access files in a DAG folder. Defaults to same as the +# ``store_serialized_dags`` setting. 
+# Example: store_dag_code = False +# store_dag_code = + +# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store +# in the Database. +# When Dag Serialization is enabled (``store_serialized_dags=True``), all the template_fields +# for each of Task Instance are stored in the Database. +# Keeping this number small may cause an error when you try to view ``Rendered`` tab in +# TaskInstance view for older tasks. +max_num_rendered_ti_fields_per_task = 30 + # On each dagrun check against defined SLAs check_slas = True +# Path to custom XCom class that will be used to store and resolve operators results +# Example: xcom_backend = path.to.CustomXCom +xcom_backend = airflow.models.xcom.BaseXCom + +[secrets] +# Full class name of secrets backend to enable (will precede env vars and metastore in search path) +# Example: backend = airflow.contrib.secrets.aws_systems_manager.SystemsManagerParameterStoreBackend +backend = + +# The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. +# See documentation for the secrets backend you are using. JSON is expected. +# Example for AWS Systems Manager ParameterStore: +# ``{{"connections_prefix": "/airflow/connections", "profile_name": "default"}}`` +backend_kwargs = + [cli] # In what way should the cli access the API. The LocalClient will use the # database directly, while the json_client will use the api running on the @@ -212,7 +257,9 @@ endpoint_url = http://localhost:8080 fail_fast = False [api] -# How to authenticate users of the API +# How to authenticate users of the API. See +# https://airflow.apache.org/docs/stable/security.html for possible values. +# ("airflow.api.auth.backend.default" allows all requests for historic reasons) auth_backend = airflow.api.auth.backend.default [lineage] @@ -245,6 +292,12 @@ default_hive_mapred_queue = # airflow sends to point links to the right web server base_url = http://localhost:8080 +# Default timezone to display all dates in the RBAC UI, can be UTC, system, or +# any IANA timezone string (e.g. Europe/Amsterdam). If left empty the +# default value of core/default_timezone will be used +# Example: default_ui_timezone = America/New_York +default_ui_timezone = + # The ip specified when starting the web server web_server_host = 0.0.0.0 @@ -273,6 +326,10 @@ worker_refresh_batch_size = 1 # Number of seconds to wait before refreshing a batch of workers. worker_refresh_interval = 30 +# If set to True, Airflow will track files in plugins_folder directory. When it detects changes, +# then reload the gunicorn. +reload_on_plugin_change = False + # Secret key used to run your flask app # It should be as random as possible secret_key = temporary_key @@ -734,18 +791,30 @@ verify_certs = True [kubernetes] # The repository, tag and imagePullPolicy of the Kubernetes Image for the Worker to Run worker_container_repository = + +# Path to the YAML pod file. If set, all other kubernetes-related fields are ignored. +# (This feature is experimental) +pod_template_file = worker_container_tag = worker_container_image_pull_policy = IfNotPresent -# If True (default), worker pods will be deleted upon termination +# If True, all worker pods will be deleted upon termination delete_worker_pods = True +# If False (and delete_worker_pods is True), +# failed worker pods will not be deleted so users can investigate them. 
+delete_worker_pods_on_failure = False + # Number of Kubernetes Worker Pod creation calls per scheduler loop worker_pods_creation_batch_size = 1 # The Kubernetes namespace where airflow workers should be created. Defaults to ``default`` namespace = default +# Allows users to launch pods in multiple namespaces. +# Will require creating a cluster-role for the scheduler +multi_namespace_mode = False + # The name of the Kubernetes ConfigMap containing the Airflow Configuration (this file) # Example: airflow_configmap = airflow-configmap airflow_configmap = @@ -782,6 +851,9 @@ dags_in_image = False # For either git sync or volume mounted DAGs, the worker will look in this subpath for DAGs dags_volume_subpath = +# For either git sync or volume mounted DAGs, the worker will mount the volume in this path +dags_volume_mount_point = + # For DAGs mounted via a volume claim (mutually exclusive with git-sync and host path) dags_volume_claim = @@ -810,6 +882,10 @@ env_from_secret_ref = # Git credentials and repository for DAGs mounted via Git (mutually exclusive with volume claim) git_repo = git_branch = + +# Use a shallow clone with a history truncated to the specified number of commits. +# 0 - do not use shallow clone. +git_sync_depth = 1 git_subpath = # The specific rev or hash the git_sync init container will checkout @@ -931,10 +1007,18 @@ tolerations = # Note that if no _request_timeout is specified, the kubernetes client will wait indefinitely # for kubernetes api responses, which will cause the scheduler to hang. # The timeout is specified as [connect timeout, read timeout] -kube_client_request_args = {{"_request_timeout" : [60,60] }} +kube_client_request_args = + +# Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client +# ``core_v1_api`` method when using the Kubernetes Executor. 
+# This should be an object and can contain any of the options listed in the ``v1DeleteOptions`` +# class defined here: +# https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19 +# Example: delete_option_kwargs = {{"grace_period_seconds": 10}} +delete_option_kwargs = # Specifies the uid to run the first process of the worker pods containers as -run_as_user = +run_as_user = 50000 # Specifies a gid to associate with all containers in the worker pods # if using a git_ssh_key_secret_name use an fs_group diff --git a/docker-compose-CeleryExecutor.yml b/docker-compose-CeleryExecutor.yml index de4f5dac..d5d0a281 100644 --- a/docker-compose-CeleryExecutor.yml +++ b/docker-compose-CeleryExecutor.yml @@ -16,7 +16,7 @@ services: # - ./pgdata:/var/lib/postgresql/data/pgdata webserver: - image: puckel/docker-airflow:1.10.9 + image: puckel/docker-airflow:1.10.12 restart: always depends_on: - postgres @@ -43,7 +43,7 @@ services: retries: 3 flower: - image: puckel/docker-airflow:1.10.9 + image: puckel/docker-airflow:1.10.12 restart: always depends_on: - redis @@ -55,7 +55,7 @@ services: command: flower scheduler: - image: puckel/docker-airflow:1.10.9 + image: puckel/docker-airflow:1.10.12 restart: always depends_on: - webserver @@ -74,7 +74,7 @@ services: command: scheduler worker: - image: puckel/docker-airflow:1.10.9 + image: puckel/docker-airflow:1.10.12 restart: always depends_on: - scheduler diff --git a/docker-compose-LocalExecutor.yml b/docker-compose-LocalExecutor.yml index 26e9e92e..3378d1c8 100644 --- a/docker-compose-LocalExecutor.yml +++ b/docker-compose-LocalExecutor.yml @@ -12,7 +12,7 @@ services: max-file: "3" webserver: - image: puckel/docker-airflow:1.10.9 + image: puckel/docker-airflow:1.10.12 restart: always depends_on: - postgres diff --git a/script/entrypoint.sh b/script/entrypoint.sh index 166f4837..34bbee7b 100755 --- a/script/entrypoint.sh +++ b/script/entrypoint.sh @@ -37,7 +37,7 @@ wait_for_port() { echo >&2 "$(date) - $host:$port still not reachable, giving up" exit 1 fi - echo "$(date) - waiting for $name... $j/$TRY_LOOP" + echo "$(date) - waiting for $name($host:$port)... $j/$TRY_LOOP" sleep 5 done } @@ -100,8 +100,8 @@ if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then else # Derive useful variables from the AIRFLOW__ variables provided explicitly by the user REDIS_ENDPOINT=$(echo -n "$AIRFLOW__CELERY__BROKER_URL" | cut -d '/' -f3 | sed -e 's,.*@,,') - REDIS_HOST=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f1) - REDIS_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2) + REDIS_HOST=$(echo -n "$REDIS_ENDPOINT" | cut -d ':' -f1) + REDIS_PORT=$(echo -n "$REDIS_ENDPOINT" | cut -d ':' -f2) fi wait_for_port "Redis" "$REDIS_HOST" "$REDIS_PORT"
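
A note on the new airflow.cfg keys added above: instead of baking values into config/airflow.cfg, they can also be overridden per container through Airflow's AIRFLOW__<SECTION>__<KEY> environment-variable convention, the same mechanism the entrypoint already relies on for AIRFLOW__CORE__EXECUTOR and AIRFLOW__CELERY__BROKER_URL. A minimal sketch; the two keys shown are only examples picked from this patch, not settings the patch itself exports:

    # Equivalent to load_default_connections = False under [core]
    export AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
    # Equivalent to delete_worker_pods_on_failure = False under [kubernetes]
    export AIRFLOW__KUBERNETES__DELETE_WORKER_PODS_ON_FAILURE=False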
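
On the script/entrypoint.sh change: the previous code derived REDIS_HOST and REDIS_PORT from POSTGRES_ENDPOINT, so a user-supplied Celery broker URL was effectively ignored when waiting for Redis. A quick illustration of what the corrected pipeline produces, using a made-up broker URL purely for illustration:

    # Hypothetical value; host, port and password are placeholders.
    AIRFLOW__CELERY__BROKER_URL="redis://:s3cret@redis:6379/1"
    # Take the authority component and drop the credentials, keeping "host:port".
    REDIS_ENDPOINT=$(echo -n "$AIRFLOW__CELERY__BROKER_URL" | cut -d '/' -f3 | sed -e 's,.*@,,')  # -> redis:6379
    REDIS_HOST=$(echo -n "$REDIS_ENDPOINT" | cut -d ':' -f1)                                      # -> redis
    REDIS_PORT=$(echo -n "$REDIS_ENDPOINT" | cut -d ':' -f2)                                      # -> 6379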
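
To sanity-check the bump locally, one possible smoke test (a suggestion, not part of the patch; the tag matches what the compose files now reference, so they pick up the local build instead of pulling from Docker Hub):

    # Build the image with the tag the compose files expect.
    docker build --rm -t puckel/docker-airflow:1.10.12 .
    # Print the packaged Airflow version without starting any services.
    docker run --rm --entrypoint airflow puckel/docker-airflow:1.10.12 version   # expect 1.10.12
    # Bring up the single-node stack touched by this patch.
    docker-compose -f docker-compose-LocalExecutor.yml up -d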