diff --git a/docs/auto-imports.d.ts b/docs/auto-imports.d.ts index cb9b8ec01..e0efe2bfd 100644 --- a/docs/auto-imports.d.ts +++ b/docs/auto-imports.d.ts @@ -1,7 +1,6 @@ /* eslint-disable */ /* prettier-ignore */ // @ts-nocheck -// noinspection JSUnusedGlobalSymbols // Generated by unplugin-auto-import export {} declare global { diff --git a/docs/nightly/en/summary.yml b/docs/nightly/en/summary.yml index a6ef5afc9..b704b870e 100644 --- a/docs/nightly/en/summary.yml +++ b/docs/nightly/en/summary.yml @@ -96,6 +96,7 @@ - quick-start - cluster-deployment - region-migration + - region-failover - monitoring - tracing # TODO diff --git a/docs/nightly/en/user-guide/operations/configuration.md b/docs/nightly/en/user-guide/operations/configuration.md index afae342b0..4a16c84c1 100644 --- a/docs/nightly/en/user-guide/operations/configuration.md +++ b/docs/nightly/en/user-guide/operations/configuration.md @@ -153,31 +153,31 @@ with_metric_engine = true The following table describes the options in detail: -| Option | Key | Type | Description | -| ---------- | ------------------ | ------- | ------------------------------------------------------------------------------- | -| http | | | HTTP server options | -| | addr | String | Server address, "127.0.0.1:4000" by default | -| | timeout | String | HTTP request timeout, "30s" by default | -| | body_limit | String | HTTP max body size, "64MB" by default | -| | is_strict_mode | Boolean | Whether to enable the strict verification mode of the protocol, which will slightly affect performance. False by default. | -| grpc | | | gRPC server options | -| | addr | String | Server address, "127.0.0.1:4001" by default | -| | runtime_size | Integer | The number of server worker threads, 8 by default | -| mysql | | | MySQL server options | -| | enable | Boolean | Whether to enable MySQL protocol, true by default | -| | add | String | Server address, "127.0.0.1:4002" by default | -| | runtime_size | Integer | The number of server worker threads, 2 by default | -| influxdb | | | InfluxDB Protocol options | -| | enable | Boolean | Whether to enable InfluxDB protocol in HTTP API, true by default | -| opentsdb | | | OpenTSDB Protocol options | -| | enable | Boolean | Whether to enable OpenTSDB protocol in HTTP API, true by default | -| prom_store | | | Prometheus remote storage options | -| | enable | Boolean | Whether to enable Prometheus Remote Write and read in HTTP API, true by default | -| | with_metric_engine | Boolean | Whether to use the metric engine on Prometheus Remote Write, true by default | -| postgres | | | PostgresSQL server options | -| | enable | Boolean | Whether to enable PostgresSQL protocol, true by default | -| | addr | String | Server address, "127.0.0.1:4003" by default | -| | runtime_size | Integer | The number of server worker threads, 2 by default | +| Option | Key | Type | Description | +| ---------- | ------------------ | ------- | ------------------------------------------------------------------------------------------------------------------------- | +| http | | | HTTP server options | +| | addr | String | Server address, "127.0.0.1:4000" by default | +| | timeout | String | HTTP request timeout, "30s" by default | +| | body_limit | String | HTTP max body size, "64MB" by default | +| | is_strict_mode | Boolean | Whether to enable the strict verification mode of the protocol, which will slightly affect performance. False by default. 
| +| grpc | | | gRPC server options | +| | addr | String | Server address, "127.0.0.1:4001" by default | +| | runtime_size | Integer | The number of server worker threads, 8 by default | +| mysql | | | MySQL server options | +| | enable | Boolean | Whether to enable MySQL protocol, true by default | +| | addr | String | Server address, "127.0.0.1:4002" by default | +| | runtime_size | Integer | The number of server worker threads, 2 by default | +| influxdb | | | InfluxDB Protocol options | +| | enable | Boolean | Whether to enable InfluxDB protocol in HTTP API, true by default | +| opentsdb | | | OpenTSDB Protocol options | +| | enable | Boolean | Whether to enable OpenTSDB protocol in HTTP API, true by default | +| prom_store | | | Prometheus remote storage options | +| | enable | Boolean | Whether to enable Prometheus Remote Write and read in HTTP API, true by default | +| | with_metric_engine | Boolean | Whether to use the metric engine on Prometheus Remote Write, true by default | +| postgres | | | PostgresSQL server options | +| | enable | Boolean | Whether to enable PostgresSQL protocol, true by default | +| | addr | String | Server address, "127.0.0.1:4003" by default | +| | runtime_size | Integer | The number of server worker threads, 2 by default | ### Storage options @@ -240,7 +240,7 @@ secret_access_key = "" ### Storage engine provider -`[[storage.providers]]` setups the table storage engine providers. Based on these providers, you can create a table with a specified storage, see [create table](/reference/sql/create#create-table): +`[[storage.providers]]` setups the table storage engine providers. Based on these providers, you can create a table with a specified storage, see [create table](/reference/sql/create#create-table): ```toml # Allows using multiple storages @@ -282,8 +282,11 @@ The `cache_path` is the local file directory that keeps cache files, and the `ca The `[wal]` section in datanode or standalone config file configures the options of Write-Ahead-Log: +#### Local WAL + ```toml [wal] +provider = "raft_engine" file_size = "256MB" purge_threshold = "4GB" purge_interval = "10m" @@ -296,6 +299,28 @@ sync_write = false - `purge_threshold` and `purge_interval`: control the purging of wal files, default is `4GB`. - `sync_write`: whether to call `fsync` when writing every log. +#### Remote WAL + +```toml +[wal] +provider = "kafka" +broker_endpoints = ["127.0.0.1:9092"] +max_batch_bytes = "1MB" +consumer_wait_timeout = "100ms" +backoff_init = "500ms" +backoff_max = "10s" +backoff_base = 2 +backoff_deadline = "5mins" +``` + +- `broker_endpoints`: The Kafka broker endpoints. +- `max_batch_bytes`: The max size of a single producer batch. +- `consumer_wait_timeout`: The consumer wait timeout. +- `backoff_init`: The initial backoff delay. +- `backoff_max`: The maximum backoff delay. +- `backoff_base`: The exponential backoff rate. +- `backoff_deadline`: The deadline of retries. + ### Logging options `frontend`, `metasrv`, `datanode` and `standalone` can all configure log and tracing related parameters in the `[logging]` section: @@ -324,7 +349,6 @@ How to use distributed tracing, please reference [Tracing](./tracing.md#tutorial The parameters corresponding to different storage engines can be configured for `datanode` and `standalone` in the `[region_engine]` section. Currently, only options for `mito` region engine is available. 
- Frequently used options: ```toml @@ -353,8 +377,8 @@ intermediate_path = "" type = "time_series" ``` - The `mito` engine provides an experimental memtable which optimizes for write performance and memory efficiency under large amounts of time-series. Its read performance might not as fast as the default `time_series` memtable. + ```toml [region_engine.mito.memtable] type = "partition_tree" @@ -463,7 +487,6 @@ headers = { Authorization = "Basic Z3JlcHRpbWVfdXNlcjpncmVwdGltZV9wd2Q=" } - `url`: URL specified by Prometheus Remote-Write protocol. - `headers`: Some optional HTTP parameters, such as authentication information. - ### Mode option The `mode` option is valid in `datanode`, `frontend` and `standalone`, which specify the running mode of the component. @@ -498,16 +521,73 @@ store_addr = "127.0.0.1:2379" selector = "LeaseBased" # Store data in memory, false by default. use_memory_store = false +## Whether to enable region failover. +## This feature is only available on GreptimeDB running on cluster mode and +## - Using Remote WAL +## - Using shared storage (e.g., s3). +enable_region_failover = false + +[wal] +# Available wal providers: +# - `raft_engine` (default): there're none raft-engine wal config since metasrv only involves in remote wal currently. +# - `kafka`: metasrv **have to be** configured with kafka wal config when using kafka wal provider in datanode. +provider = "raft_engine" + +# Kafka wal config. + +## The broker endpoints of the Kafka cluster. +broker_endpoints = ["127.0.0.1:9092"] + +## Number of topics to be created upon start. +num_topics = 64 + +## Topic selector type. +## Available selector types: +## - `round_robin` (default) +selector_type = "round_robin" + +## A Kafka topic is constructed by concatenating `topic_name_prefix` and `topic_id`. +topic_name_prefix = "greptimedb_wal_topic" + +## Expected number of replicas of each partition. +replication_factor = 1 + +## Above which a topic creation operation will be cancelled. +create_topic_timeout = "30s" +## The initial backoff for kafka clients. +backoff_init = "500ms" + +## The maximum backoff for kafka clients. +backoff_max = "10s" + +## Exponential backoff rate, i.e. next backoff = base * current backoff. +backoff_base = 2 + +## Stop reconnecting if the total wait time reaches the deadline. If this config is missing, the reconnecting won't terminate. +backoff_deadline = "5mins" ``` -| Key | Type | Description | -| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| data_home | String | The working home of Metasrv, `"/tmp/metasrv/"` by default | -| bind_addr | String | The bind address of Metasrv, `"127.0.0.1:3002"` by default. | -| server_addr | String | The communication server address for frontend and datanode to connect to Metasrv, `"127.0.0.1:3002"` by default for localhost | -| store_addr | String | etcd server addresses, `"127.0.0.1:2379"` by default, server address separated by commas, in the format of `"ip1:port1,ip2:port2,..."`. | -| selector | String | Load balance strategy to choose datanode when creating new tables, see [Selector](/contributor-guide/metasrv/selector.md) | -| use_memory_store | Boolean | Only used for testing when you don't have an etcd cluster, store data in memory, `false` by default. 
| +| Key | Type | Default | Descriptions | +| -------------------------- | ------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `data_home` | String | `/tmp/metasrv/` | The working home directory. | +| `bind_addr` | String | `127.0.0.1:3002` | The bind address of metasrv. | +| `server_addr` | String | `127.0.0.1:3002` | The communication server address for frontend and datanode to connect to metasrv, "127.0.0.1:3002" by default for localhost. | +| `store_addr` | String | `127.0.0.1:2379` | Etcd server address. | +| `selector` | String | `lease_based` | Datanode selector type.
- `lease_based` (default value).
- `load_based`
For details, see [Selector](/contributor-guide/metasrv/selector.md) |
+| `use_memory_store` | Bool | `false` | Store data in memory; only used for testing when you don't have an etcd cluster. |
+| `enable_region_failover` | Bool | `false` | Whether to enable region failover.
This feature is only available on GreptimeDB running on cluster mode and
- Using Remote WAL
- Using shared storage (e.g., s3). | +| `wal` | -- | -- | -- | +| `wal.provider` | String | `raft_engine` | -- | +| `wal.broker_endpoints` | Array | -- | The broker endpoints of the Kafka cluster. | +| `wal.num_topics` | Integer | `64` | Number of topics to be created upon start. | +| `wal.selector_type` | String | `round_robin` | Topic selector type.
Available selector types:
- `round_robin` (default) |
+| `wal.topic_name_prefix` | String | `greptimedb_wal_topic` | A Kafka topic is constructed by concatenating `topic_name_prefix` and `topic_id`. |
+| `wal.replication_factor` | Integer | `1` | Expected number of replicas of each partition. |
+| `wal.create_topic_timeout` | String | `30s` | The timeout above which a topic creation operation will be cancelled. |
+| `wal.backoff_init` | String | `500ms` | The initial backoff for Kafka clients. |
+| `wal.backoff_max` | String | `10s` | The maximum backoff for Kafka clients. |
+| `wal.backoff_base` | Integer | `2` | Exponential backoff rate, i.e. next backoff = base \* current backoff. |
+| `wal.backoff_deadline` | String | `5mins` | Stop reconnecting if the total wait time reaches the deadline. If this config is missing, reconnecting won't terminate. |

 ### Datanode-only configuration

diff --git a/docs/nightly/en/user-guide/operations/region-failover.md b/docs/nightly/en/user-guide/operations/region-failover.md
new file mode 100644
index 000000000..fa76e280c
--- /dev/null
+++ b/docs/nightly/en/user-guide/operations/region-failover.md
@@ -0,0 +1,81 @@
+# Region Failover
+
+Region Failover provides the ability to recover regions from region failures without losing data. This is implemented via [Region Migration](/user-guide/operations/region-migration).
+
+## Enable Region Failover
+
+This feature is only available on GreptimeDB running in distributed mode and
+
+- Using Kafka WAL
+- Using [shared storage](/user-guide/operations/configuration.md#storage-options) (e.g., AWS S3)
+
+### Via configuration file
+Set `enable_region_failover=true` in the [metasrv](/user-guide/operations/configuration.md#metasrv-only-configuration) configuration file.
+
+### Via GreptimeDB Operator
+
+Set `meta.enableRegionFailover=true`, e.g.,
+```bash
+helm install greptimedb greptime/greptimedb-cluster \
+  --set meta.enableRegionFailover=true \
+  ...
+```
+
+## The recovery time of Region Failover
+
+The recovery time of Region Failover depends on:
+
+- the number of regions per Topic.
+- the read throughput of the Kafka cluster.
+
+### Read amplification
+
+As a best practice, [the number of topics/partitions supported by a Kafka cluster is limited](https://docs.aws.amazon.com/msk/latest/developerguide/bestpractices.html) (exceeding this number can degrade Kafka cluster performance).
+Therefore, we allow multiple regions to share a single topic as the WAL.
+However, this may cause a read amplification issue.
+
+The data belonging to a specific region consists of data files plus data in the WAL (typically `WAL[LastCheckpoint...Latest]`). The failover of a specific region only requires reading the region's WAL data to reconstruct the memory state, which is called region replaying. However, if multiple regions share a single topic, replaying data for a specific region from the topic requires filtering out unrelated data (i.e., data from other regions). This means replaying data for a specific region from the topic requires reading more data than the actual size of the region's data in the topic, a phenomenon known as read amplification.
+
+Although sharing the same topic among multiple regions allows the Datanode to support more regions, the cost of this approach is read amplification during WAL replay.
+
+For example, if 128 topics are configured for [metasrv](/user-guide/operations/configuration.md#metasrv-only-configuration) and the whole cluster holds 1024 regions (physical regions), every 8 regions will share one topic.
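+
+For reference, a minimal metasrv WAL sketch for this example might look like the following (the broker endpoint is illustrative and only the WAL-related keys are shown):
+
+```toml
+# metasrv [wal] sketch: 128 topics shared by 1024 regions, i.e. 8 regions per topic
+[wal]
+provider = "kafka"
+broker_endpoints = ["127.0.0.1:9092"]
+num_topics = 128
+```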
+ +![Read Amplification](/remote-wal-read-amplification.png) + +

(Figure 1: recovering Region 3 needs to read redundant data 7 times larger than its actual size)

+
+
+A simple model to estimate the read amplification factor (replay data size/actual data size):
+
+- For a single topic, if we try to replay all regions that belong to the topic, then the amplification factor would be 7+6+...+1 = 28 times. (The region WAL data distribution is shown in Figure 1: replaying Region 3 reads redundant data 7 times larger than its actual size, replaying Region 6 reads 6 times larger, and so on.)
+- When recovering 100 regions (requiring about 13 topics), the amplification factor is approximately 28 \* 13 = 364 times.
+
+Assuming we have 100 regions to recover, and the actual data size of all regions is 0.5GB, the following table shows the replay data size based on the number of regions per topic.
+
+| Number of regions per Topic | Number of topics required for 100 Regions | Single topic read amplification factor | Total read amplification factor | Replay data size (GB) |
+| --------------------------- | ----------------------------------------- | -------------------------------------- | ------------------------------- | --------------------- |
+| 1 | 100 | 0 | 0 | 0.5 |
+| 2 | 50 | 1 | 50 | 25.5 |
+| 4 | 25 | 6 | 150 | 75.5 |
+| 8 | 13 | 28 | 364 | 182.5 |
+| 16 | 7 | 120 | 840 | 420.5 |
+
+The following table shows the recovery time of 100 regions under different read throughput conditions of the Kafka cluster. For example, with a read throughput of 300MB/s, recovering 100 regions requires approximately 10 minutes (182.5GB / 0.3GB/s ≈ 608 seconds ≈ 10 minutes).
+
+| Number of regions per Topic | Replay data size (GB) | Kafka throughput 300MB/s - Recovery time (secs) | Kafka throughput 1000MB/s - Recovery time (secs) |
+| --------------------------- | --------------------- | ----------------------------------------------- | ------------------------------------------------ |
+| 1 | 0.5 | 2 | 1 |
+| 2 | 25.5 | 85 | 26 |
+| 4 | 75.5 | 252 | 76 |
+| 8 | 182.5 | 608 | 183 |
+| 16 | 420.5 | 1402 | 421 |
+
+### Suggestions for improving recovery time
+
+In the above example, we calculated the recovery time under different numbers of regions per Topic for reference.
+In actual scenarios, the read amplification may be larger than this model suggests.
+If you are very sensitive to recovery time, we recommend that each region have its own topic (i.e., the number of regions per Topic is 1).
+
diff --git a/docs/nightly/en/user-guide/operations/region-migration.md b/docs/nightly/en/user-guide/operations/region-migration.md
index 29a5c260f..5c065210e 100644
--- a/docs/nightly/en/user-guide/operations/region-migration.md
+++ b/docs/nightly/en/user-guide/operations/region-migration.md
@@ -3,7 +3,7 @@
 Region Migration allows users to move regions between the Datanode.
:::warning Warning -This feature is only available on GreptimeDB running on cluster mode and +This feature is only available on GreptimeDB running on distributed mode and - Using Kafka WAL - Using [shared storage](/user-guide/operations/configuration.md#storage-options) (e.g., AWS S3) diff --git a/docs/nightly/zh/summary-i18n.yml b/docs/nightly/zh/summary-i18n.yml index e80458906..473afb357 100644 --- a/docs/nightly/zh/summary-i18n.yml +++ b/docs/nightly/zh/summary-i18n.yml @@ -11,6 +11,7 @@ Continuous-Aggregation: 持续聚合 Logs: 日志 Python-Scripts: Python 脚本 Operations: 运维操作 +Remote-WAL: Remote-WAL Deploy-on-Kubernetes: 部署到 Kubernetes Table-Sharding: 表分片 Prometheus: Prometheus diff --git a/docs/nightly/zh/user-guide/operations/configuration.md b/docs/nightly/zh/user-guide/operations/configuration.md index ae452198f..3a55a92ce 100644 --- a/docs/nightly/zh/user-guide/operations/configuration.md +++ b/docs/nightly/zh/user-guide/operations/configuration.md @@ -149,13 +149,13 @@ enable = true | | addr | 字符串 | 服务器地址,默认为 "127.0.0.1:4000" | | | timeout | 字符串 | HTTP 请求超时时间,默认为 "30s" | | | body_limit | 字符串 | HTTP 最大体积大小,默认为 "64MB" | -| | is_strict_mode | 布尔值 | 是否启用协议的严格校验模式,启用会轻微影响性能,默认为false | +| | is_strict_mode | 布尔值 | 是否启用协议的严格校验模式,启用会轻微影响性能,默认为false | | grpc | | | gRPC 服务器选项 | | | addr | 字符串 | 服务器地址,默认为 "127.0.0.1:4001" | | | runtime_size | 整数 | 服务器工作线程数量,默认为 8 | | mysql | | | MySQL 服务器选项 | | | enable | 布尔值 | 是否启用 MySQL 协议,默认为 true | -| | add | 字符串 | 服务器地址,默认为 "127.0.0.1:4002" | +| | addr | 字符串 | 服务器地址,默认为 "127.0.0.1:4002" | | | runtime_size | 整数 | 服务器工作线程数量,默认为 2 | | influxdb | | | InfluxDB 协议选项 | | | enable | 布尔值 | 是否在 HTTP API 中启用 InfluxDB 协议,默认为 true | @@ -271,6 +271,8 @@ cache_capacity = "256MiB" datanode 和 standalone 在 `[wal]` 部分可以配置 Write-Ahead-Log 的对应参数: +#### Local WAL + ```toml [wal] file_size = "256MB" @@ -285,6 +287,29 @@ sync_write = false - `purge_threshold` 和 `purge_interval`: 控制清除任务的触发阈值和间隔 - `sync_write`: 是否在写入每条日志的时候调用 l `fsync` 刷盘。 + +#### Remote WAL + +```toml +[wal] +provider = "kafka" +broker_endpoints = ["127.0.0.1:9092"] +max_batch_bytes = "1MB" +consumer_wait_timeout = "100ms" +backoff_init = "500ms" +backoff_max = "10s" +backoff_base = 2 +backoff_deadline = "5mins" +``` + +- `broker_endpoints`:Kafka 端点 +- `max_batch_bytes`:单个 producer batch 的最大值 +- `consumer_wait_timeout`:consumer 的等待超时时间 +- `backoff_init`:backoff 初始延迟 +- `backoff_max`::backoff 最大延迟 +- `backoff_base`::backoff 指数 +- `backoff_deadline`:重试的截止时间 + ### Logging 选项 `frontend`、`metasrv`、`datanode` 和 `standalone` 都可以在 `[logging]` 部分配置 log、tracing 相关参数: @@ -353,29 +378,28 @@ fork_dictionary_bytes = "1GiB" 以下是可供使用的选项 -| 键 | 类型 | 默认值 | 描述 | -| --- | -----| ------- | ----------- | -| `num_workers` | 整数 | `8` | 写入线程数量 | -| `manifest_checkpoint_distance` | 整数 | `10` | 每写入 `manifest_checkpoint_distance` 个 manifest 文件创建一次 checkpoint | -| `max_background_jobs` | 整数 | `4` | 后台线程数量 | -| `auto_flush_interval` | 字符串 | `1h` | 自动 flush 超过 `auto_flush_interval` 没 flush 的 region | -| `global_write_buffer_size` | 字符串 | `1GB` | 写入缓冲区大小,默认值为内存总量的 1/8,但不会超过 1GB | -| `global_write_buffer_reject_size` | 字符串 | `2GB` | 写入缓冲区内数据的大小超过 `global_write_buffer_reject_size` 后拒 -绝写入请求,默认为 `global_write_buffer_size` 的 2 倍 | -| `sst_meta_cache_size` | 字符串 | `128MB` | SST 元数据缓存大小。设为 0 可关闭该缓存
默认为内存的 1/32,不超过 128MB | -| `vector_cache_size` | 字符串 | `512MB` | 内存向量和 arrow array 的缓存大小。设为 0 可关闭该缓存
默认为内存的 1/16,不超过 512MB | -| `page_cache_size` | 字符串 | `512MB` | SST 数据页的缓存。设为 0 可关闭该缓存
默认为内存的 1/16,不超过 512MB | -| `sst_write_buffer_size` | 字符串 | `8MB` | SST 的写缓存大小 | -| `scan_parallelism` | 整数 | `0` | 扫描并发度 (默认 1/4 CPU 核数)
- `0`: 使用默认值 (1/4 CPU 核数)
- `1`: 单线程扫描
- `n`: 按并行度 n 扫描 | -| `inverted_index.create_on_flush` | 字符串 | `auto` | 是否在 flush 时构建索引
- `auto`: 自动
- `disable`: 从不 | -| `inverted_index.create_on_compaction` | 字符串 | `auto` | 是否在 compaction 时构建索引
- `auto`: 自动
- `disable`: 从不 | -| `inverted_index.apply_on_query` | 字符串 | `auto` | 是否在查询时使用索引
- `auto`: 自动
- `disable`: 从不 | -| `inverted_index.mem_threshold_on_create` | 字符串 | `64M` | 创建索引时如果超过该内存阈值则改为使用外部排序
设置为空会关闭外排,在内存中完成所有排序 | -| `inverted_index.intermediate_path` | 字符串 | `""` | 存放外排临时文件的路径 (默认 `{data_home}/index_intermediate`). | -| `memtable.type` | 字符串 | `time_series` | Memtable type.
- `time_series`: time-series memtable
- `partition_tree`: partition tree memtable (实验性功能) | -| `memtable.index_max_keys_per_shard` | 整数 | `8192` | 一个 shard 内的主键数
只对 `partition_tree` memtable 生效 | -| `memtable.data_freeze_threshold` | 整数 | `32768` | 一个 shard 内写缓存可容纳的最大行数
只对 `partition_tree` memtable 生效 | -| `memtable.fork_dictionary_bytes` | 字符串 | `1GiB` | 主键字典的大小
只对 `partition_tree` memtable 生效 | +| 键 | 类型 | 默认值 | 描述 | +| ---------------------------------------- | ------ | ------------- | ---------------------------------------------------------------------------------------------------------------------- | +| `num_workers` | 整数 | `8` | 写入线程数量 | +| `manifest_checkpoint_distance` | 整数 | `10` | 每写入 `manifest_checkpoint_distance` 个 manifest 文件创建一次 checkpoint | +| `max_background_jobs` | 整数 | `4` | 后台线程数量 | +| `auto_flush_interval` | 字符串 | `1h` | 自动 flush 超过 `auto_flush_interval` 没 flush 的 region | +| `global_write_buffer_size` | 字符串 | `1GB` | 写入缓冲区大小,默认值为内存总量的 1/8,但不会超过 1GB | +| `global_write_buffer_reject_size` | 字符串 | `2GB` | 写入缓冲区内数据的大小超过 `global_write_buffer_reject_size` 后拒绝写入请求,默认为 `global_write_buffer_size` 的 2 倍 | +| `sst_meta_cache_size` | 字符串 | `128MB` | SST 元数据缓存大小。设为 0 可关闭该缓存
默认为内存的 1/32,不超过 128MB | +| `vector_cache_size` | 字符串 | `512MB` | 内存向量和 arrow array 的缓存大小。设为 0 可关闭该缓存
默认为内存的 1/16,不超过 512MB | +| `page_cache_size` | 字符串 | `512MB` | SST 数据页的缓存。设为 0 可关闭该缓存
默认为内存的 1/16,不超过 512MB | +| `sst_write_buffer_size` | 字符串 | `8MB` | SST 的写缓存大小 | +| `scan_parallelism` | 整数 | `0` | 扫描并发度 (默认 1/4 CPU 核数)
- `0`: 使用默认值 (1/4 CPU 核数)
- `1`: 单线程扫描
- `n`: 按并行度 n 扫描 | +| `inverted_index.create_on_flush` | 字符串 | `auto` | 是否在 flush 时构建索引
- `auto`: 自动
- `disable`: 从不 | +| `inverted_index.create_on_compaction` | 字符串 | `auto` | 是否在 compaction 时构建索引
- `auto`: 自动
- `disable`: 从不 | +| `inverted_index.apply_on_query` | 字符串 | `auto` | 是否在查询时使用索引
- `auto`: 自动
- `disable`: 从不 | +| `inverted_index.mem_threshold_on_create` | 字符串 | `64M` | 创建索引时如果超过该内存阈值则改为使用外部排序
设置为空会关闭外排,在内存中完成所有排序 | +| `inverted_index.intermediate_path` | 字符串 | `""` | 存放外排临时文件的路径 (默认 `{data_home}/index_intermediate`). | +| `memtable.type` | 字符串 | `time_series` | Memtable type.
- `time_series`: time-series memtable
- `partition_tree`: partition tree memtable (实验性功能) | +| `memtable.index_max_keys_per_shard` | 整数 | `8192` | 一个 shard 内的主键数
只对 `partition_tree` memtable 生效 | +| `memtable.data_freeze_threshold` | 整数 | `32768` | 一个 shard 内写缓存可容纳的最大行数
只对 `partition_tree` memtable 生效 | +| `memtable.fork_dictionary_bytes` | 字符串 | `1GiB` | 主键字典的大小
只对 `partition_tree` memtable 生效 | @@ -473,32 +497,89 @@ mode = "standalone" ### 仅限于 Metasrv 的配置 ```toml -# The working home directory. +# 工作主目录。 data_home = "/tmp/metasrv/" -# The bind address of metasrv, "127.0.0.1:3002" by default. +# metasrv 的绑定地址,默认为 "127.0.0.1:3002"。 bind_addr = "127.0.0.1:3002" -# The communication server address for frontend and datanode to connect to metasrv, "127.0.0.1:3002" by default for localhost. +# frontend 和 datanode 连接到 metasrv 的通信服务器地址,本地默认为 "127.0.0.1:3002"。 server_addr = "127.0.0.1:3002" -# Etcd server addresses, "127.0.0.1:2379" by default. +# Etcd 服务器地址,默认为 "127.0.0.1:2379"。 store_addr = "127.0.0.1:2379" -# Datanode selector type. -# - "lease_based" (default value). +# Datanode 选择器类型。 +# - "lease_based" (默认值) # - "load_based" -# For details, please see "https://docs.greptime.com/contributor-guide/meta/selector". +# 详情请参阅 "https://docs.greptime.com/contributor-guide/meta/selector" selector = "lease_based" -# Store data in memory, false by default. +# 将数据存储在内存中,默认值为 false。 use_memory_store = false -``` - -| 键 | 类型 | 描述 | -| ---------------- | ------ | ------------------------------------------------------------------------------------------------------ | -| data_home | 字符串 | Metasrv 的工作目录,默认为 `"/tmp/metasrv/"` | -| bind_addr | 字符串 | Metasrv 的绑定地址,默认为 `"127.0.0.1:3002"`。 | -| server_addr | 字符串 | 前端和数据节点连接到 Metasrv 的通信服务器地址,默认为 `"127.0.0.1:3002"`(适用于本地主机) | -| store_addr | 字符串 | etcd 服务器地址,默认为 `"127.0.0.1:2379"`,服务器地址由逗号分隔,格式为 `"ip1:port1,ip2:port2,..."`。 | -| selector | 字符串 | 创建新表时选择数据节点的负载均衡策略,参见 [选择器](/contributor-guide/metasrv/selector.md) | -| use_memory_store | 布尔值 | 仅在测试时使用,当你没有 etcd 集群时,将数据存储在内存中,默认为 `false` | +## 是否启用 region failover。 +## 该功能仅适用于运行在集群模式下的 GreptimeDB,并且 +## - 使用 Remote WAL +## - 使用共享存储(例如 s3)。 +enable_region_failover = false +[wal] +# 可用的 WAL 提供者: +# - `raft_engine`(默认):由于 metasrv 目前仅涉及远程 WAL,因此没有 raft-engine WAL 配置。 +# - `kafka`:在 datanode 中使用 kafka WAL 提供者时,metasrv **必须** 配置 kafka WAL 配置。 +provider = "raft_engine" + +# Kafka WAL 配置。 + +## Kafka 集群的代理端点。 +broker_endpoints = ["127.0.0.1:9092"] + +## 启动时创建的 topic 数量。 +num_topics = 64 + +## topic selector 类型。 +## 可用的 selector 类型: +## - `round_robin`(默认) +selector_type = "round_robin" + +## Kafka topic 通过连接 `topic_name_prefix` 和 `topic_id` 构建。 +topic_name_prefix = "greptimedb_wal_topic" + +## 每个分区的预期副本数。 +replication_factor = 1 + +## 超过此时间创建 topic 的操作将被取消。 +create_topic_timeout = "30s" + +## Kafka 客户端的 backoff 初始时间。 +backoff_init = "500ms" + +## Kafka 客户端的 backoff 最大时间。 +backoff_max = "10s" + +## backoff 指数,即下一个 backoff 时间 = 该指数 * 当前 backoff 时间。 +backoff_base = 2 + +## 如果总等待时间达到截止时间,则停止重新连接。如果此配置缺失,则重新连接不会终止。 +backoff_deadline = "5mins" + + +| 键 | 类型 | 默认值 | 描述 | +| ------------------------ | ------- | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `data_home` | String | `/tmp/metasrv/` | 工作目录。 | +| `bind_addr` | String | `127.0.0.1:3002` | Metasrv 的绑定地址。 | +| `server_addr` | String | `127.0.0.1:3002` | 前端和 datanode 连接到 Metasrv 的通信服务器地址,默认为本地主机的 `127.0.0.1:3002`。 | +| `store_addr` | String | `127.0.0.1:2379` | etcd 服务器地址,默认值为 `127.0.0.1:2379`,多个服务器地址用逗号分隔,格式为 `"ip1:port1,ip2:port2,..."`。 | +| `selector` | String | `lease_based` | 创建新表时选择 datanode 的负载均衡策略,详见 [选择器](/contributor-guide/metasrv/selector.md)。 | +| `use_memory_store` | Boolean | `false` | 仅用于在没有 etcd 集群时的测试,将数据存储在内存中,默认值为 `false`。 | +| enable_region_failover | Bool | false | 是否启用 region failover。
该功能仅在以集群模式运行的 GreptimeDB 上可用,并且
- 使用远程 WAL
- 使用共享存储(如 s3)。 | +| wal | -- | -- | -- | +| wal.provider | String | raft_engine | -- | +| wal.broker_endpoints | Array | -- | Kafka 集群的端点 | +| wal.num_topics | Integer | 64 | 启动时创建的 topic数 | +| wal.selector_type | String | round_robin | topic selector 类型
可用 selector 类型:
- round_robin(默认) | +| wal.topic_name_prefix | String | greptimedb_wal_topic | 一个 Kafka topic 是通过连接 topic_name_prefix 和 topic_id 构建的 | +| wal.replication_factor | Integer | 1 | 每个分区的副本数 | +| wal.create_topic_timeout | String | 30s | 超过该时间后,topic 创建操作将被取消 | +| wal.backoff_init | String | 500ms | Kafka 客户端的 backoff 初始时间 | +| wal.backoff_max | String | 10s | Kafka 客户端的 backoff 最大时间 | +| wal.backoff_base | Integer | 2 | backoff 指数,即下一个 backoff 时间 = 该指数 * 当前 backoff 时间 | +| wal.backoff_deadline | String | 5mins | 如果总等待时间达到截止时间,则停止重新连接。如果此配置缺失,则重新连接不会终止 | ### 仅限于 `Datanode` 的配置 @@ -509,10 +590,10 @@ rpc_addr = "127.0.0.1:3001" rpc_runtime_size = 8 ``` -| Key | Type | Description | -| ---------------- | ------- | ------------------------------------------- | -| node_id | 整数 | 该 `datanode` 的唯一标识符。 | -| rpc_hostname | 字符串 | 该 `datanode` 的 Hostname。 | -| rpc_addr | 字符串 | gRPC 服务端地址,默认为`"127.0.0.1:3001"`。 | -| rpc_runtime_size | 整数 | gRPC 服务器工作线程数,默认为 8。 | +| Key | Type | Description | +| ---------------- | ------ | ------------------------------------------- | +| node_id | 整数 | 该 `datanode` 的唯一标识符。 | +| rpc_hostname | 字符串 | 该 `datanode` 的 Hostname。 | +| rpc_addr | 字符串 | gRPC 服务端地址,默认为`"127.0.0.1:3001"`。 | +| rpc_runtime_size | 整数 | gRPC 服务器工作线程数,默认为 8。 | diff --git a/docs/nightly/zh/user-guide/operations/region-failover.md b/docs/nightly/zh/user-guide/operations/region-failover.md new file mode 100644 index 000000000..1e23cee39 --- /dev/null +++ b/docs/nightly/zh/user-guide/operations/region-failover.md @@ -0,0 +1,81 @@ +# Region Failover + +Region Failover 提供了在不丢失数据的情况下从 Region 故障中恢复的能力。这是通过 [Region 迁移](/user-guide/operations/region-migration) 实现的。 + +## 开启 Region Failover + + +该功能仅在 GreptimeDB 集群模式下可用,并且需要满足以下条件 + +- 使用 Kafka WAL +- 使用[共享存储](/user-guide/operations/configuration.md#storage-options) (例如:AWS S3) + + +### 通过配置文件 + +在 [metasrv](/user-guide/operations/configuration.md#metasrv-only-configuration) 配置文件中设置 `enable_region_failover=true`. + +### 通过 GreptimeDB Operator + +通过设置 `meta.enableRegionFailover=true`, 例如 + +```bash +helm install greptimedb greptime/greptimedb-cluster \ + --set meta.enableRegionFailover=true \ + ... +``` + +## Region Failover 的恢复用时 + +Region Failover 的恢复时间取决于: + +- 每个 Topic 的 region 数量 +- Kafka 集群的读取吞吐性能 + + +### 读放大 + +在最佳实践中,[Kafka 集群所支持的 topics/partitions 数量是有限的](https://docs.aws.amazon.com/msk/latest/developerguide/bestpractices.html)(超过这个数量可能会导致 Kafka 集群性能下降)。 +因此,GreptimeDB 允许多个 regions 共享一个 topic 作为 WAL,然而这可能会带来读放大的问题。 + +属于特定 Region 的数据由数据文件和 WAL 中的数据(通常为 WAL[LastCheckpoint...Latest])组成。特定 Region 的 failover 只需要读取该 Region 的 WAL 数据以重建内存状态,这被称为 Region 重放(region replaying)。然而,如果多个 Region 共享一个 Topic,则从 Topic 重放特定 Region 的数据需要过滤掉不相关的数据(即其他 Region 的数据)。这意味着从 Topic 重放特定 Region 的数据需要读取比该 Region 实际 WAL 数据大小更多的数据,这种现象被称为读取放大(read amplification)。 + +尽管多个 Region 共享同一个 Topic,可以让 Datanode 支持更多的 Region,但这种方法的代价是在 Region 重放过程中产生读取放大。 + +例如,为 [metasrv](/user-guide/operations/configuration.md#metasrv-only-configuration) 配置 128 个 Topic,如果整个集群包含 1024 个 Region(物理 Region),那么每 8 个 Region 将共享一个 Topic。 + +![Read Amplification](/remote-wal-read-amplification.png) + +

(图 1:恢复 Region 3 需要读取比实际大小大 7 倍的冗余数据)

+ +估算读取放大倍数(重放数据大小/实际数据大小)的简单模型: + +- 对于单个 Topic,如果我们尝试重放属于该 Topic 的所有 Region,那么放大倍数将是 7+6+...+1 = 28 倍。(图 1 显示了 Region WAL 数据分布。重放 Region 3 将读取约为实际大小 7 倍的数据;重放 Region 6 将读取约为实际大小 6 倍的数据,以此类推) +- 在恢复 100 个 Region 时(需要大约 13 个 Topic),放大倍数大约为 28 \* 13 = 364 倍。 + +假设要恢复 100 个 Region,所有 Region 的实际数据大小是 0.5 GB,下表根据每个 Topic 的 Region 数量展示了数据重放的总量。 + +| 每个 Topic 的 Region 数量 | 100 个 Region 所需 Topic 数量 | 单个 Topic 读放大系数 | 总读放大系数 | 重放数据大小(GB) | +| ------------------------- | ----------------------------- | --------------------- | ------------ | ------------------ | +| 1 | 100 | 0 | 0 | 0.5 | +| 2 | 50 | 1 | 50 | 25.5 | +| 4 | 25 | 6 | 150 | 75.5 | +| 8 | 13 | 28 | 364 | 182.5 | +| 16 | 7 | 120 | 840 | 420.5 | + +下表展示了在 Kafka 集群在不同读取吞吐量情况下,100 个 region 的恢复时间。例如在提供 300MB/s 的读取吞吐量的情况下,恢复 100 个 Region 大约需要 10 分钟(182.5GB/0.3GB = 10 分钟)。 + +| 每个主题的区域数 | 重放数据大小(GB) | Kafka 吞吐量 300MB/s- 恢复时间(秒) | Kafka 吞吐量 1000MB/s- 恢复时间(秒) | +| ---------------- | ------------------ | ------------------------------------ | ------------------------------------- | +| 1 | 0.5 | 2 | 1 | +| 2 | 25.5 | 85 | 26 | +| 4 | 75.5 | 252 | 76 | +| 8 | 182.5 | 608 | 183 | +| 16 | 420.5 | 1402 | 421 | + + +### 改进恢复时间的建议 + +在上文中我们根据不同的每个 Topic 包含的 Region 数量计算了恢复时间以供参考。 +在实际场景中,读取放大的现象可能会比这个模型更为严重。 +如果您对恢复时间非常敏感,我们建议每个 Region 都有自己的 Topic(即,每个 Topic 包含的 Region 数量为 1)。 diff --git a/docs/nightly/zh/user-guide/overview.md b/docs/nightly/zh/user-guide/overview.md index f8a0113a2..23a172ea7 100644 --- a/docs/nightly/zh/user-guide/overview.md +++ b/docs/nightly/zh/user-guide/overview.md @@ -52,7 +52,7 @@ ALIGN '5s' BY (host) FILL PREV - **独特的数据模型:** 独特的[数据模型](/user-guide/concepts/data-model.md)搭配时间索引和全文索引,大大提升了查询性能,并在超大数据集上也经受住了考验。它不仅支持[数据指标的插入](/user-guide/write-data/overview.md)和[查询](/user-guide/query-data/overview.md),也提供了非常友好的方式便于日志的[写入](/user-guide/logs/write-logs.md)和[查询](/user-guide/logs/query-logs.md)。 - **范围查询:** GreptimeDB 支持[范围查询](/user-guide/query-data/sql#aggregate-data-by-time-window)来计算一段时间内的[表达式](/reference/sql/functions/overview.md),从而了解指标趋势。你还可以[持续聚合](/user-guide/continuous-aggregation/overview)数据以进行进一步分析。 - **SQL 和多种协议:** GreptimeDB 使用 SQL 作为主要查询语言,并支持[多种协议](/user-guide/clients/overview.md#protocols),大大降低了学习曲线和接入成本。你可以轻松从 Prometheus 或 [Influxdb 迁移](/user-guide/migrate-to-greptimedb/migrate-from-influxdb)至 GreptimeDB,或者从 0 接入 GreptimeDB。 -- **JOIN 操作:** GreptimeDB 的时间序列表的数据模型,使其成为第一个支持[JOIN 操作](reference/sql/join.md)的时序数据库。 +- **JOIN 操作:** GreptimeDB 的时间序列表的数据模型,使其成为第一个支持[JOIN 操作](/reference/sql/join.md)的时序数据库。 了解了这些功能后,你现在可以直接探索感兴趣的功能,或按顺序继续阅读下一步骤。 diff --git a/docs/public/remote-wal-read-amplification.png b/docs/public/remote-wal-read-amplification.png new file mode 100644 index 000000000..e01a4ebec Binary files /dev/null and b/docs/public/remote-wal-read-amplification.png differ