From 841625e2f901ebd3a9748338b36a1d3af772d9b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Tue, 2 Sep 2025 15:45:41 +0100 Subject: [PATCH 01/34] fix: runtime setup error (#1520) --- config/runtime.exs | 4 ++-- mix.exs | 2 +- run.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/runtime.exs b/config/runtime.exs index 39310f093..ac0a2569b 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -3,12 +3,12 @@ import Config defmodule Env do def get_integer(env, default) do value = System.get_env(env) - if value, do: String.to_integer(env), else: default + if value, do: String.to_integer(value), else: default end def get_charlist(env, default) do value = System.get_env(env) - if value, do: String.to_charlist(env), else: default + if value, do: String.to_charlist(value), else: default end def get_boolean(env, default) do diff --git a/mix.exs b/mix.exs index d0f8a267b..13ffe985a 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.46.2", + version: "2.46.3", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/run.sh b/run.sh index 2dddbc1b8..66585dc2b 100755 --- a/run.sh +++ b/run.sh @@ -90,7 +90,7 @@ if [ "${ENABLE_ERL_CRASH_DUMP:-false}" = true ]; then trap upload_crash_dump_to_s3 INT TERM KILL EXIT fi -if [[ -n "${GENERATE_CLUSTER_CERTS}" ]] ; then +if [[ -n "${GENERATE_CLUSTER_CERTS:-}" ]] ; then generate_certs fi From 1b63b4fe2d34f063b6b0afbe7e6133df42e95e93 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Wed, 3 Sep 2025 16:49:57 +1200 Subject: [PATCH 02/34] fix: use primary instead of replica on rename_settings_field (#1521) --- lib/realtime/api.ex | 9 +++------ mix.exs | 2 +- test/realtime/api_test.exs | 4 ---- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/lib/realtime/api.ex b/lib/realtime/api.ex index 23e28feab..c504d0187 100644 --- 
a/lib/realtime/api.ex +++ b/lib/realtime/api.ex @@ -186,12 +186,9 @@ defmodule Realtime.Api do |> repo.preload(:extensions) end - def list_extensions(type \\ "postgres_cdc_rls") do - from(e in Extensions, - where: e.type == ^type, - select: e - ) - |> Replica.replica().all() + defp list_extensions(type \\ "postgres_cdc_rls") do + from(e in Extensions, where: e.type == ^type, select: e) + |> Repo.all() end def rename_settings_field(from, to) do diff --git a/mix.exs b/mix.exs index 13ffe985a..c0d4e1516 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.46.3", + version: "2.46.4", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/api_test.exs b/test/realtime/api_test.exs index 1c4a816b0..55dc609eb 100644 --- a/test/realtime/api_test.exs +++ b/test/realtime/api_test.exs @@ -236,10 +236,6 @@ defmodule Realtime.ApiTest do end end - test "list_extensions/1 ", %{tenants: tenants} do - assert length(Api.list_extensions()) == length(tenants) - end - describe "preload_counters/1" do test "preloads counters for a given tenant ", %{tenants: [tenant | _]} do tenant = Repo.reload!(tenant) From da3404aec8da76c1c3a617d9b7e5185e25806416 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Fri, 5 Sep 2025 09:20:47 +1200 Subject: [PATCH 03/34] feat: upgrade cowboy & ranch (#1523) --- lib/realtime/api.ex | 2 +- .../monitoring/prom_ex/plugins/phoenix.ex | 13 ++++--------- mix.exs | 2 +- mix.lock | 6 +++--- .../monitoring/prom_ex/plugins/phoenix_test.exs | 17 +++++++++++------ 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/lib/realtime/api.ex b/lib/realtime/api.ex index c504d0187..16dc2bcd0 100644 --- a/lib/realtime/api.ex +++ b/lib/realtime/api.ex @@ -186,7 +186,7 @@ defmodule Realtime.Api do |> repo.preload(:extensions) end - defp list_extensions(type \\ "postgres_cdc_rls") do + defp list_extensions(type) do from(e 
in Extensions, where: e.type == ^type, select: e) |> Repo.all() end diff --git a/lib/realtime/monitoring/prom_ex/plugins/phoenix.ex b/lib/realtime/monitoring/prom_ex/plugins/phoenix.ex index d3f64afbe..6cc3709d2 100644 --- a/lib/realtime/monitoring/prom_ex/plugins/phoenix.ex +++ b/lib/realtime/monitoring/prom_ex/plugins/phoenix.ex @@ -57,15 +57,10 @@ if Code.ensure_loaded?(Phoenix) do def execute_metrics do active_conn = - case :ets.lookup(:ranch_server, {:listener_sup, HTTP}) do - [] -> - -1 - - _ -> - HTTP - |> :ranch_server.get_connections_sup() - |> :supervisor.count_children() - |> Keyword.get(:active) + if :ranch.info()[HTTP] do + :ranch.info(HTTP)[:all_connections] + else + -1 end :telemetry.execute(@event_all_connections, %{active: active_conn}, %{}) diff --git a/mix.exs b/mix.exs index c0d4e1516..f39513c99 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.46.4", + version: "2.47.0", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/mix.lock b/mix.lock index 76eb0d980..dd95486b6 100644 --- a/mix.lock +++ b/mix.lock @@ -7,9 +7,9 @@ "castore": {:hex, :castore, "1.0.11", "4bbd584741601eb658007339ea730b082cc61f3554cf2e8f39bf693a11b49073", [:mix], [], "hexpm", "e03990b4db988df56262852f20de0f659871c35154691427a5047f4967a16a62"}, "chatterbox": {:hex, :ts_chatterbox, "0.15.1", "5cac4d15dd7ad61fc3c4415ce4826fc563d4643dee897a558ec4ea0b1c835c9c", [:rebar3], [{:hpack, "~> 0.3.0", [hex: :hpack_erl, repo: "hexpm", optional: false]}], "hexpm", "4f75b91451338bc0da5f52f3480fa6ef6e3a2aeecfc33686d6b3d0a0948f31aa"}, "corsica": {:hex, :corsica, "2.1.3", "dccd094ffce38178acead9ae743180cdaffa388f35f0461ba1e8151d32e190e6", [:mix], [{:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", 
"616c08f61a345780c2cf662ff226816f04d8868e12054e68963e95285b5be8bc"}, - "cowboy": {:hex, :cowboy, "2.12.0", "f276d521a1ff88b2b9b4c54d0e753da6c66dd7be6c9fca3d9418b561828a3731", [:make, :rebar3], [{:cowlib, "2.13.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "8a7abe6d183372ceb21caa2709bec928ab2b72e18a3911aa1771639bef82651e"}, + "cowboy": {:hex, :cowboy, "2.13.0", "09d770dd5f6a22cc60c071f432cd7cb87776164527f205c5a6b0f24ff6b38990", [:make, :rebar3], [{:cowlib, ">= 2.14.0 and < 3.0.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, ">= 1.8.0 and < 3.0.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "e724d3a70995025d654c1992c7b11dbfea95205c047d86ff9bf1cda92ddc5614"}, "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"}, - "cowlib": {:hex, :cowlib, "2.13.0", "db8f7505d8332d98ef50a3ef34b34c1afddec7506e4ee4dd4a3a266285d282ca", [:make, :rebar3], [], "hexpm", "e1e1284dc3fc030a64b1ad0d8382ae7e99da46c3246b815318a4b848873800a4"}, + "cowlib": {:hex, :cowlib, "2.15.0", "3c97a318a933962d1c12b96ab7c1d728267d2c523c25a5b57b0f93392b6e9e25", [:make, :rebar3], [], "hexpm", "4f00c879a64b4fe7c8fcb42a4281925e9ffdb928820b03c3ad325a617e857532"}, "credo": {:hex, :credo, "1.7.11", "d3e805f7ddf6c9c854fd36f089649d7cf6ba74c42bc3795d587814e3c9847102", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "56826b4306843253a66e47ae45e98e7d284ee1f95d53d1612bb483f88a8cf219"}, "ctx": {:hex, :ctx, "0.6.0", 
"8ff88b70e6400c4df90142e7f130625b82086077a45364a78d208ed3ed53c7fe", [:rebar3], [], "hexpm", "a14ed2d1b67723dbebbe423b28d7615eb0bdcba6ff28f2d1f1b0a7e1d4aa5fc2"}, "db_connection": {:hex, :db_connection, "2.8.0", "64fd82cfa6d8e25ec6660cea73e92a4cbc6a18b31343910427b702838c4b33b2", [:mix], [{:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "008399dae5eee1bf5caa6e86d204dcb44242c82b1ed5e22c881f2c34da201b15"}, @@ -82,7 +82,7 @@ "postgres_replication": {:git, "https://github.com/filipecabaco/postgres_replication.git", "69129221f0263aa13faa5fbb8af97c28aeb4f71c", []}, "postgrex": {:hex, :postgrex, "0.20.0", "363ed03ab4757f6bc47942eff7720640795eb557e1935951c1626f0d303a3aed", [:mix], [{:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "d36ef8b36f323d29505314f704e21a1a038e2dc387c6409ee0cd24144e187c0f"}, "prom_ex": {:hex, :prom_ex, "1.9.0", "63e6dda6c05cdeec1f26c48443dcc38ffd2118b3665ae8d2bd0e5b79f2aea03e", [:mix], [{:absinthe, ">= 1.6.0", [hex: :absinthe, repo: "hexpm", optional: true]}, {:broadway, ">= 1.0.2", [hex: :broadway, repo: "hexpm", optional: true]}, {:ecto, ">= 3.5.0", [hex: :ecto, repo: "hexpm", optional: true]}, {:finch, "~> 0.15", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.2", [hex: :jason, repo: "hexpm", optional: false]}, {:oban, ">= 2.4.0", [hex: :oban, repo: "hexpm", optional: true]}, {:octo_fetch, "~> 0.3", [hex: :octo_fetch, repo: "hexpm", optional: false]}, {:phoenix, ">= 1.5.0", [hex: :phoenix, repo: "hexpm", optional: true]}, {:phoenix_live_view, ">= 0.14.0", [hex: :phoenix_live_view, repo: "hexpm", optional: true]}, {:plug, ">= 1.12.1", [hex: :plug, repo: "hexpm", optional: true]}, {:plug_cowboy, "~> 2.5", [hex: :plug_cowboy, repo: 
"hexpm", optional: false]}, {:telemetry, ">= 1.0.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:telemetry_metrics, "~> 0.6", [hex: :telemetry_metrics, repo: "hexpm", optional: false]}, {:telemetry_metrics_prometheus_core, "~> 1.0", [hex: :telemetry_metrics_prometheus_core, repo: "hexpm", optional: false]}, {:telemetry_poller, "~> 1.0", [hex: :telemetry_poller, repo: "hexpm", optional: false]}], "hexpm", "01f3d4f69ec93068219e686cc65e58a29c42bea5429a8ff4e2121f19db178ee6"}, - "ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"}, + "ranch": {:hex, :ranch, "2.2.0", "25528f82bc8d7c6152c57666ca99ec716510fe0925cb188172f41ce93117b1b0", [:make, :rebar3], [], "hexpm", "fa0b99a1780c80218a4197a59ea8d3bdae32fbff7e88527d7d8a4787eff4f8e7"}, "recon": {:hex, :recon, "2.5.6", "9052588e83bfedfd9b72e1034532aee2a5369d9d9343b61aeb7fbce761010741", [:mix, :rebar3], [], "hexpm", "96c6799792d735cc0f0fd0f86267e9d351e63339cbe03df9d162010cefc26bb0"}, "req": {:hex, :req, "0.5.10", "a3a063eab8b7510785a467f03d30a8d95f66f5c3d9495be3474b61459c54376c", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "8a604815743f8a2d3b5de0659fa3137fa4b1cffd636ecb69b30b2b9b2c2559be"}, "sleeplocks": {:hex, :sleeplocks, "1.1.3", "96a86460cc33b435c7310dbd27ec82ca2c1f24ae38e34f8edde97f756503441a", [:rebar3], [], "hexpm", "d3b3958552e6eb16f463921e70ae7c767519ef8f5be46d7696cc1ed649421321"}, diff --git 
a/test/realtime/monitoring/prom_ex/plugins/phoenix_test.exs b/test/realtime/monitoring/prom_ex/plugins/phoenix_test.exs index a73e6e2f5..ad9198c97 100644 --- a/test/realtime/monitoring/prom_ex/plugins/phoenix_test.exs +++ b/test/realtime/monitoring/prom_ex/plugins/phoenix_test.exs @@ -1,6 +1,7 @@ defmodule Realtime.PromEx.Plugins.PhoenixTest do use Realtime.DataCase, async: false alias Realtime.PromEx.Plugins + alias Realtime.Integration.WebsocketClient defmodule MetricsTest do use PromEx, otp_app: :realtime_test_phoenix @@ -13,16 +14,20 @@ defmodule Realtime.PromEx.Plugins.PhoenixTest do describe "pooling metrics" do setup do start_supervised!(MetricsTest) - :ok + %{tenant: Containers.checkout_tenant(run_migrations: true)} end - test "number of connections" do - # Trigger a connection by making a request to the endpoint - url = RealtimeWeb.Endpoint.url() <> "/healthcheck" - Req.get!(url) + test "number of connections", %{tenant: tenant} do + {:ok, token} = token_valid(tenant, "anon", %{}) + + {:ok, _} = + WebsocketClient.connect(self(), uri(tenant, 4002), Phoenix.Socket.V1.JSONSerializer, [{"x-api-key", token}]) + + {:ok, _} = + WebsocketClient.connect(self(), uri(tenant, 4002), Phoenix.Socket.V1.JSONSerializer, [{"x-api-key", token}]) Process.sleep(200) - assert metric_value() > 0 + assert metric_value() >= 2 end end From bd2c141386d88f328749f80b0904e8261a4a78bd Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Mon, 8 Sep 2025 20:57:43 +1200 Subject: [PATCH 04/34] fix: Fix GenRpc to not try to connect to nodes that are not alive (#1525) --- lib/realtime/gen_rpc.ex | 17 +++++++++++++++++ mix.exs | 2 +- test/realtime/gen_rpc_test.exs | 12 ++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/lib/realtime/gen_rpc.ex b/lib/realtime/gen_rpc.ex index bb7099242..3487cc933 100644 --- a/lib/realtime/gen_rpc.ex +++ b/lib/realtime/gen_rpc.ex @@ -41,6 +41,23 @@ defmodule Realtime.GenRpc do @spec call(node, module, atom, list(any), keyword()) :: result 
def call(node, mod, func, args, opts) when is_atom(node) and is_atom(mod) and is_atom(func) and is_list(args) and is_list(opts) do + if node == node() or node in Node.list() do + do_call(node, mod, func, args, opts) + else + tenant_id = Keyword.get(opts, :tenant_id) + + log_error( + "ErrorOnRpcCall", + %{target: node, mod: mod, func: func, error: :badnode}, + project: tenant_id, + external_id: tenant_id + ) + + {:error, :rpc_error, :badnode} + end + end + + defp do_call(node, mod, func, args, opts) do timeout = Keyword.get(opts, :timeout, default_rpc_timeout()) tenant_id = Keyword.get(opts, :tenant_id) key = Keyword.get(opts, :key, nil) diff --git a/mix.exs b/mix.exs index f39513c99..67d1f7706 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.47.0", + version: "2.47.1", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/gen_rpc_test.exs b/test/realtime/gen_rpc_test.exs index dd837aaf8..e14d2d054 100644 --- a/test/realtime/gen_rpc_test.exs +++ b/test/realtime/gen_rpc_test.exs @@ -172,6 +172,18 @@ defmodule Realtime.GenRpcTest do mechanism: :gen_rpc }} end + + test "bad node" do + node = :"unknown@1.1.1.1" + + log = + capture_log(fn -> + assert GenRpc.call(node, Map, :fetch, [%{a: 1}, :a], tenant_id: 123) == {:error, :rpc_error, :badnode} + end) + + assert log =~ + ~r/project=123 external_id=123 \[error\] ErrorOnRpcCall: %{+error: :badnode, mod: Map, func: :fetch, target: :"#{node}"/ + end end describe "multicast/4" do From 6cfe6e18ecb37bc87049feecdac640b04484313e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Mon, 8 Sep 2025 23:32:18 +0100 Subject: [PATCH 05/34] fix: enable presence on track message (#1527) currently the user would need to have enabled from the beginning of the channel. 
this will enable users to enable presence later in the flow by sending a track message which will enable presence messages for them --- lib/realtime/api.ex | 5 +- lib/realtime_web/channels/realtime_channel.ex | 2 +- .../realtime_channel/presence_handler.ex | 16 ++-- mix.exs | 2 +- test/integration/rt_channel_test.exs | 50 +++++++++++ .../presence_handler_test.exs | 82 +++++++++++++++++-- 6 files changed, 137 insertions(+), 20 deletions(-) diff --git a/lib/realtime/api.ex b/lib/realtime/api.ex index 16dc2bcd0..f612a5c1e 100644 --- a/lib/realtime/api.ex +++ b/lib/realtime/api.ex @@ -187,8 +187,9 @@ defmodule Realtime.Api do end defp list_extensions(type) do - from(e in Extensions, where: e.type == ^type, select: e) - |> Repo.all() + query = from(e in Extensions, where: e.type == ^type, select: e) + + Repo.all(query) end def rename_settings_field(from, to) do diff --git a/lib/realtime_web/channels/realtime_channel.ex b/lib/realtime_web/channels/realtime_channel.ex index 26c033f5c..03bd91347 100644 --- a/lib/realtime_web/channels/realtime_channel.ex +++ b/lib/realtime_web/channels/realtime_channel.ex @@ -376,7 +376,7 @@ defmodule RealtimeWeb.RealtimeChannel do end def handle_in("presence", payload, %{assigns: %{private?: false}} = socket) do - with {:ok, socket} <- PresenceHandler.handle(payload, socket) do + with {:ok, socket} <- PresenceHandler.handle(payload, nil, socket) do {:reply, :ok, socket} else {:error, :rate_limit_exceeded} -> diff --git a/lib/realtime_web/channels/realtime_channel/presence_handler.ex b/lib/realtime_web/channels/realtime_channel/presence_handler.ex index 00ce77c02..9dc23d219 100644 --- a/lib/realtime_web/channels/realtime_channel/presence_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/presence_handler.ex @@ -52,28 +52,22 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do end end - @spec handle(map(), Socket.t()) :: - {:ok, Socket.t()} | {:error, :rls_policy_error | :unable_to_set_policies | :rate_limit_exceeded} - def 
handle(_, %{assigns: %{presence_enabled?: false}} = socket), do: {:ok, socket} - def handle(payload, socket) when not is_private?(socket), do: handle(payload, nil, socket) - @spec handle(map(), pid() | nil, Socket.t()) :: {:ok, Socket.t()} | {:error, :rls_policy_error | :unable_to_set_policies | :rate_limit_exceeded | :unable_to_track_presence} - def handle(_, _, %{assigns: %{presence_enabled?: false}} = socket), do: {:ok, socket} - def handle(%{"event" => event} = payload, db_conn, socket) do event = String.downcase(event, :ascii) handle_presence_event(event, payload, db_conn, socket) end - def handle(_payload, _db_conn, socket), do: {:ok, socket} + def handle(_, _, socket), do: {:ok, socket} - defp handle_presence_event("track", payload, _db_conn, socket) when not is_private?(socket) do + defp handle_presence_event("track", payload, _, socket) when not is_private?(socket) do track(socket, payload) end - defp handle_presence_event("track", payload, db_conn, socket) when is_nil(socket.assigns.policies.presence.write) do + defp handle_presence_event("track", payload, db_conn, socket) + when is_private?(socket) and is_nil(socket.assigns.policies.presence.write) do %{assigns: %{authorization_context: authorization_context, policies: policies}} = socket case Authorization.get_write_authorizations(policies, db_conn, authorization_context) do @@ -111,6 +105,8 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do end defp track(socket, payload) do + socket = assign(socket, :presence_enabled?, true) + %{assigns: %{presence_key: presence_key, tenant_topic: tenant_topic}} = socket payload = Map.get(payload, "payload", %{}) diff --git a/mix.exs b/mix.exs index 67d1f7706..f4beca664 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.47.1", + version: "2.47.2", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git 
a/test/integration/rt_channel_test.exs b/test/integration/rt_channel_test.exs index 806a5ad7e..36955e5b8 100644 --- a/test/integration/rt_channel_test.exs +++ b/test/integration/rt_channel_test.exs @@ -909,6 +909,56 @@ defmodule Realtime.Integration.RtChannelTest do assert_receive %Message{event: "phx_reply", payload: %{"status" => "ok"}}, 500 refute_receive %Message{event: "presence_state"}, 500 end + + test "presence automatically enabled when user sends track message for public channel", %{tenant: tenant} do + {socket, _} = get_connection(tenant) + config = %{presence: %{key: "", enabled: false}, private: false} + topic = "realtime:any" + + WebsocketClient.join(socket, topic, %{config: config}) + + assert_receive %Message{event: "phx_reply", payload: %{"status" => "ok"}, topic: ^topic}, 300 + refute_receive %Message{event: "presence_state"}, 500 + + payload = %{ + type: "presence", + event: "TRACK", + payload: %{name: "realtime_presence_96", t: 1814.7000000029802} + } + + WebsocketClient.send_event(socket, topic, "presence", payload) + + assert_receive %Message{event: "presence_diff", payload: %{"joins" => joins, "leaves" => %{}}, topic: ^topic} + + join_payload = joins |> Map.values() |> hd() |> get_in(["metas"]) |> hd() + assert get_in(join_payload, ["name"]) == payload.payload.name + assert get_in(join_payload, ["t"]) == payload.payload.t + end + + @tag policies: [:authenticated_read_broadcast_and_presence, :authenticated_write_broadcast_and_presence] + test "presence automatically enabled when user sends track message for private channel", + %{tenant: tenant, topic: topic} do + {socket, _} = get_connection(tenant, "authenticated") + config = %{presence: %{key: "", enabled: false}, private: true} + topic = "realtime:#{topic}" + + WebsocketClient.join(socket, topic, %{config: config}) + assert_receive %Message{event: "phx_reply", payload: %{"status" => "ok"}, topic: ^topic}, 300 + refute_receive %Message{event: "presence_state"}, 500 + + payload = %{ + type: 
"presence", + event: "TRACK", + payload: %{name: "realtime_presence_96", t: 1814.7000000029802} + } + + WebsocketClient.send_event(socket, topic, "presence", payload) + + assert_receive %Message{event: "presence_diff", payload: %{"joins" => joins, "leaves" => %{}}, topic: ^topic}, 500 + join_payload = joins |> Map.values() |> hd() |> get_in(["metas"]) |> hd() + assert get_in(join_payload, ["name"]) == payload.payload.name + assert get_in(join_payload, ["t"]) == payload.payload.t + end end describe "token handling" do diff --git a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs index e5ecd32ad..0cdf422e2 100644 --- a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs +++ b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs @@ -99,7 +99,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do end end - describe "handle/2" do + describe "handle/3" do test "with true policy and is private, user can track their presence and changes", %{ tenant: tenant, topic: topic, @@ -142,7 +142,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do policies = %Policies{presence: %PresencePolicies{read: false, write: false}} socket = socket_fixture(tenant, topic, key, policies: policies, private?: false) - assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "track"}, socket) + assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "track"}, nil, socket) topic = socket.assigns.tenant_topic assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} @@ -229,6 +229,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do assert {:ok, socket} = PresenceHandler.handle( %{"event" => "track", "payload" => %{"metadata" => random_string()}}, + nil, socket ) @@ -248,7 +249,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do assert log =~ "UnknownPresenceEvent" end - 
test "socket with presence enabled false will ignore presence events in public channel", %{ + test "socket with presence enabled false will ignore non-track presence events in public channel", %{ tenant: tenant, topic: topic } do @@ -256,12 +257,12 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do policies = %Policies{presence: %PresencePolicies{read: true, write: true}} socket = socket_fixture(tenant, topic, key, policies: policies, private?: false, enabled?: false) - assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "track"}, socket) + assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "untrack"}, nil, socket) topic = socket.assigns.tenant_topic refute_receive %Broadcast{topic: ^topic, event: "presence_diff"} end - test "socket with presence enabled false will ignore presence events in private channel", %{ + test "socket with presence enabled false will ignore non-track presence events in private channel", %{ tenant: tenant, topic: topic, db_conn: db_conn @@ -270,11 +271,80 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do policies = %Policies{presence: %PresencePolicies{read: true, write: true}} socket = socket_fixture(tenant, topic, key, policies: policies, private?: false, enabled?: false) - assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "track"}, db_conn, socket) + assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "untrack"}, db_conn, socket) topic = socket.assigns.tenant_topic refute_receive %Broadcast{topic: ^topic, event: "presence_diff"} end + test "socket with presence disabled will enable presence on track message for public channel", %{ + tenant: tenant, + topic: topic + } do + key = random_string() + policies = %Policies{presence: %PresencePolicies{read: true, write: true}} + socket = socket_fixture(tenant, topic, key, policies: policies, private?: false, enabled?: false) + + refute socket.assigns.presence_enabled? 
+ + assert {:ok, updated_socket} = PresenceHandler.handle(%{"event" => "track"}, nil, socket) + + assert updated_socket.assigns.presence_enabled? + topic = socket.assigns.tenant_topic + assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} + assert Map.has_key?(joins, key) + end + + test "socket with presence disabled will enable presence on track message for private channel", %{ + tenant: tenant, + topic: topic, + db_conn: db_conn + } do + key = random_string() + policies = %Policies{presence: %PresencePolicies{read: true, write: true}} + socket = socket_fixture(tenant, topic, key, policies: policies, private?: true, enabled?: false) + + refute socket.assigns.presence_enabled? + + assert {:ok, updated_socket} = PresenceHandler.handle(%{"event" => "track"}, db_conn, socket) + + assert updated_socket.assigns.presence_enabled? + topic = socket.assigns.tenant_topic + assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} + assert Map.has_key?(joins, key) + end + + test "socket with presence disabled will not enable presence on untrack message", %{ + tenant: tenant, + topic: topic, + db_conn: db_conn + } do + key = random_string() + policies = %Policies{presence: %PresencePolicies{read: true, write: true}} + socket = socket_fixture(tenant, topic, key, policies: policies, enabled?: false) + + refute socket.assigns.presence_enabled? + + assert {:ok, updated_socket} = PresenceHandler.handle(%{"event" => "untrack"}, db_conn, socket) + + refute updated_socket.assigns.presence_enabled? 
+ topic = socket.assigns.tenant_topic + refute_receive %Broadcast{topic: ^topic, event: "presence_diff"} + end + + test "socket with presence disabled will not enable presence on unknown event", %{ + tenant: tenant, + topic: topic, + db_conn: db_conn + } do + key = random_string() + policies = %Policies{presence: %PresencePolicies{read: true, write: true}} + socket = socket_fixture(tenant, topic, key, policies: policies, enabled?: false) + + refute socket.assigns.presence_enabled? + + assert {:error, :unknown_presence_event} = PresenceHandler.handle(%{"event" => "unknown"}, db_conn, socket) + end + @tag policies: [:authenticated_read_broadcast_and_presence, :authenticated_write_broadcast_and_presence] test "rate limit is checked on private channel", %{tenant: tenant, topic: topic, db_conn: db_conn} do key = random_string() From b13bb214ca7abe5988d122854994c7963f844416 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Thu, 11 Sep 2025 08:58:42 +1200 Subject: [PATCH 06/34] fix: set cowboy active_n=100 as cowboy 2.12.0 (#1530) cowboy 2.13.0 set the default active_n=1 --- lib/realtime_web/endpoint.ex | 6 ++++++ mix.exs | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/realtime_web/endpoint.ex b/lib/realtime_web/endpoint.ex index 917ab65b9..17ee13747 100644 --- a/lib/realtime_web/endpoint.ex +++ b/lib/realtime_web/endpoint.ex @@ -16,6 +16,12 @@ defmodule RealtimeWeb.Endpoint do connect_info: [:peer_data, :uri, :x_headers], fullsweep_after: 20, max_frame_size: 8_000_000, + # https://github.com/ninenines/cowboy/blob/24d32de931a0c985ff7939077463fc8be939f0e9/doc/src/manual/cowboy_websocket.asciidoc#L228 + # active_n: The number of packets Cowboy will request from the socket at once. + # This can be used to tweak the performance of the server. Higher values reduce + # the number of times Cowboy need to request more packets from the port driver at + # the expense of potentially higher memory being used. 
+ active_n: 100, serializer: [ {Phoenix.Socket.V1.JSONSerializer, "~> 1.0.0"}, {Phoenix.Socket.V2.JSONSerializer, "~> 2.0.0"} diff --git a/mix.exs b/mix.exs index f4beca664..41f81567e 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.47.2", + version: "2.47.3", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, From a17ce3e59aa73ba73816c923917d7e5f838f0e88 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Fri, 12 Sep 2025 14:36:53 +1200 Subject: [PATCH 07/34] fix: provide error_code metadata on RealtimeChannel.Logging (#1531) --- config/test.exs | 2 +- .../channels/realtime_channel/logging.ex | 10 +++---- mix.exs | 2 +- .../realtime_channel/logging_test.exs | 27 ++++++++++++------- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/config/test.exs b/config/test.exs index 4c7c66ae8..a69c51701 100644 --- a/config/test.exs +++ b/config/test.exs @@ -47,7 +47,7 @@ config :logger, # Configures Elixir's Logger config :logger, :console, format: "$time $metadata[$level] $message\n", - metadata: [:request_id, :project, :external_id, :application_name, :sub, :iss, :exp] + metadata: [:error_code, :request_id, :project, :external_id, :application_name, :sub, :iss, :exp] config :opentelemetry, span_processor: :simple, diff --git a/lib/realtime_web/channels/realtime_channel/logging.ex b/lib/realtime_web/channels/realtime_channel/logging.ex index 296dce1bc..2f6c91fdb 100644 --- a/lib/realtime_web/channels/realtime_channel/logging.ex +++ b/lib/realtime_web/channels/realtime_channel/logging.ex @@ -21,7 +21,7 @@ defmodule RealtimeWeb.RealtimeChannel.Logging do def log_error(socket, code, msg) do msg = build_msg(code, msg) emit_system_error(:error, code) - log(socket, :error, msg) + log(socket, :error, code, msg) {:error, %{reason: msg}} end @@ -32,7 +32,7 @@ defmodule RealtimeWeb.RealtimeChannel.Logging do {:error, %{reason: binary}} def 
log_warning(socket, code, msg) do msg = build_msg(code, msg) - log(socket, :warning, msg) + log(socket, :warning, code, msg) {:error, %{reason: msg}} end @@ -59,16 +59,16 @@ defmodule RealtimeWeb.RealtimeChannel.Logging do if code, do: "#{code}: #{msg}", else: msg end - defp log(%{assigns: %{tenant: tenant, access_token: access_token}}, level, msg) do + defp log(%{assigns: %{tenant: tenant, access_token: access_token}}, level, code, msg) do Logger.metadata(external_id: tenant, project: tenant) if level in [:error, :warning], do: update_metadata_with_token_claims(access_token) - Logger.log(level, msg) + Logger.log(level, msg, error_code: code) end defp maybe_log(%{assigns: %{log_level: log_level}} = socket, level, code, msg) do msg = build_msg(code, msg) emit_system_error(level, code) - if Logger.compare_levels(log_level, level) != :gt, do: log(socket, level, msg) + if Logger.compare_levels(log_level, level) != :gt, do: log(socket, level, code, msg) if level in [:error, :warning], do: {:error, %{reason: msg}}, else: :ok end diff --git a/mix.exs b/mix.exs index 41f81567e..d12783f2a 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.47.3", + version: "2.47.4", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime_web/channels/realtime_channel/logging_test.exs b/test/realtime_web/channels/realtime_channel/logging_test.exs index 92634daef..cd131d16e 100644 --- a/test/realtime_web/channels/realtime_channel/logging_test.exs +++ b/test/realtime_web/channels/realtime_channel/logging_test.exs @@ -37,6 +37,7 @@ defmodule RealtimeWeb.RealtimeChannel.LoggingTest do assert log =~ "sub=#{sub}" assert log =~ "exp=#{exp}" assert log =~ "iss=#{iss}" + assert log =~ "error_code=TestError" end end @@ -57,6 +58,7 @@ defmodule RealtimeWeb.RealtimeChannel.LoggingTest do assert log =~ "sub=#{sub}" assert log =~ "exp=#{exp}" assert log =~ 
"iss=#{iss}" + assert log =~ "error_code=TestWarning" end end @@ -67,10 +69,14 @@ defmodule RealtimeWeb.RealtimeChannel.LoggingTest do for log_level <- log_levels do socket = %{assigns: %{log_level: log_level, tenant: random_string(), access_token: "test_token"}} - assert capture_log(fn -> - assert Logging.maybe_log_error(socket, "TestCode", "test message") == - {:error, %{reason: "TestCode: test message"}} - end) =~ "TestCode: test message" + log = + capture_log(fn -> + assert Logging.maybe_log_error(socket, "TestCode", "test message") == + {:error, %{reason: "TestCode: test message"}} + end) + + assert log =~ "TestCode: test message" + assert log =~ "error_code=TestCode" assert capture_log(fn -> assert Logging.maybe_log_error(socket, "TestCode", %{a: "b"}) == @@ -103,11 +109,14 @@ defmodule RealtimeWeb.RealtimeChannel.LoggingTest do for log_level <- log_levels do socket = %{assigns: %{log_level: log_level, tenant: random_string(), access_token: "test_token"}} - assert capture_log(fn -> - assert Logging.maybe_log_warning(socket, "TestCode", "test message") == - {:error, %{reason: "TestCode: test message"}} - end) =~ - "TestCode: test message" + log = + capture_log(fn -> + assert Logging.maybe_log_warning(socket, "TestCode", "test message") == + {:error, %{reason: "TestCode: test message"}} + end) + + assert log =~ "TestCode: test message" + assert log =~ "error_code=TestCode" assert capture_log(fn -> assert Logging.maybe_log_warning(socket, "TestCode", %{a: "b"}) == From eeba3067b269bf0e316f3d18e484688007b5ea51 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Mon, 15 Sep 2025 09:58:32 +1200 Subject: [PATCH 08/34] feat: disable UTF8 validation on websocket frames (#1532) Currently all text frames as handled only with JSON which already requires UTF-8 --- lib/realtime_web/endpoint.ex | 3 +++ mix.exs | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/realtime_web/endpoint.ex b/lib/realtime_web/endpoint.ex index 17ee13747..190e1a917 100644 --- 
a/lib/realtime_web/endpoint.ex +++ b/lib/realtime_web/endpoint.ex @@ -22,6 +22,9 @@ defmodule RealtimeWeb.Endpoint do # the number of times Cowboy need to request more packets from the port driver at # the expense of potentially higher memory being used. active_n: 100, + # Skip validating UTF8 for faster frame processing. + # Currently all text frames as handled only with JSON which already requires UTF-8 + validate_utf8: false, serializer: [ {Phoenix.Socket.V1.JSONSerializer, "~> 1.0.0"}, {Phoenix.Socket.V2.JSONSerializer, "~> 2.0.0"} diff --git a/mix.exs b/mix.exs index d12783f2a..849a97b7b 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.47.4", + version: "2.48.0", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, From 70339c737f54855c200ab1c8ae671bc6171f480a Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Tue, 16 Sep 2025 09:42:22 +1200 Subject: [PATCH 09/34] fix: move DB setup to happen after Connect.init (#1533) This change reduces the impact of slow DB setup impacting other tenants trying to connect at the same time that landed on the same partition --- lib/realtime/syn_handler.ex | 6 +- lib/realtime/tenants/connect.ex | 47 ++++++++------- .../tenants/connect/check_connection.ex | 4 +- .../tenants/connect/start_counters.ex | 60 ------------------- mix.exs | 2 +- test/integration/rt_channel_test.exs | 6 +- test/realtime/syn_handler_test.exs | 16 +++-- test/realtime/tenants/connect_test.exs | 20 +++---- 8 files changed, 52 insertions(+), 109 deletions(-) delete mode 100644 lib/realtime/tenants/connect/start_counters.ex diff --git a/lib/realtime/syn_handler.ex b/lib/realtime/syn_handler.ex index 397c8cf8f..d2fa5541c 100644 --- a/lib/realtime/syn_handler.ex +++ b/lib/realtime/syn_handler.ex @@ -10,9 +10,9 @@ defmodule Realtime.SynHandler do @behaviour :syn_event_handler @impl true - def on_registry_process_updated(Connect, 
tenant_id, _pid, %{conn: conn}, :normal) when is_pid(conn) do + def on_registry_process_updated(Connect, tenant_id, pid, %{conn: conn}, :normal) when is_pid(conn) do # Update that a database connection is ready - Endpoint.local_broadcast(Connect.syn_topic(tenant_id), "ready", %{conn: conn}) + Endpoint.local_broadcast(Connect.syn_topic(tenant_id), "ready", %{pid: pid, conn: conn}) end def on_registry_process_updated(PostgresCdcRls, tenant_id, _pid, meta, _reason) do @@ -38,7 +38,7 @@ defmodule Realtime.SynHandler do end topic = topic(mod) - Endpoint.local_broadcast(topic <> ":" <> name, topic <> "_down", nil) + Endpoint.local_broadcast(topic <> ":" <> name, topic <> "_down", %{pid: pid, reason: reason}) :ok end diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex index b9bf00eb4..3c206a785 100644 --- a/lib/realtime/tenants/connect.ex +++ b/lib/realtime/tenants/connect.ex @@ -19,7 +19,6 @@ defmodule Realtime.Tenants.Connect do alias Realtime.Tenants.Connect.GetTenant alias Realtime.Tenants.Connect.Piper alias Realtime.Tenants.Connect.RegisterProcess - alias Realtime.Tenants.Connect.StartCounters alias Realtime.Tenants.Migrations alias Realtime.Tenants.ReplicationConnection alias Realtime.UsersCounter @@ -83,14 +82,13 @@ defmodule Realtime.Tenants.Connect do | {:error, :tenant_database_connection_initializing} def get_status(tenant_id) do case :syn.lookup(__MODULE__, tenant_id) do - {_pid, %{conn: nil}} -> - wait_for_connection(tenant_id) + {pid, %{conn: nil}} -> + wait_for_connection(pid, tenant_id) {_, %{conn: conn}} -> {:ok, conn} :undefined -> - Logger.warning("Connection process starting up") {:error, :tenant_database_connection_initializing} error -> @@ -101,7 +99,7 @@ defmodule Realtime.Tenants.Connect do def syn_topic(tenant_id), do: "connect:#{tenant_id}" - defp wait_for_connection(tenant_id) do + defp wait_for_connection(pid, tenant_id) do RealtimeWeb.Endpoint.subscribe(syn_topic(tenant_id)) # We do a lookup after subscribing 
because we could've missed a message while subscribing @@ -112,9 +110,18 @@ defmodule Realtime.Tenants.Connect do _ -> # Wait for up to 5 seconds for the ready event receive do - %{event: "ready", payload: %{conn: conn}} -> {:ok, conn} + %{event: "ready", payload: %{pid: ^pid, conn: conn}} -> + {:ok, conn} + + %{event: "connect_down", payload: %{pid: ^pid, reason: {:shutdown, :tenant_db_too_many_connections}}} -> + {:error, :tenant_db_too_many_connections} + + %{event: "connect_down", payload: %{pid: ^pid, reason: _reason}} -> + metadata = [external_id: tenant_id, project: tenant_id] + log_error("UnableToConnectToTenantDatabase", "Unable to connect to tenant database", metadata) + {:error, :tenant_database_unavailable} after - 5_000 -> {:error, :initializing} + 15_000 -> {:error, :initializing} end end after @@ -139,16 +146,6 @@ defmodule Realtime.Tenants.Connect do {:error, {:already_started, _}} -> get_status(tenant_id) - {:error, {:shutdown, :tenant_db_too_many_connections}} -> - {:error, :tenant_db_too_many_connections} - - {:error, {:shutdown, :tenant_not_found}} -> - {:error, :tenant_not_found} - - {:error, :shutdown} -> - log_error("UnableToConnectToTenantDatabase", "Unable to connect to tenant database", metadata) - {:error, :tenant_database_unavailable} - {:error, error} -> log_error("UnableToConnectToTenantDatabase", error, metadata) {:error, :tenant_database_unavailable} @@ -209,30 +206,33 @@ defmodule Realtime.Tenants.Connect do def init(%{tenant_id: tenant_id} = state) do Logger.metadata(external_id: tenant_id, project: tenant_id) + {:ok, state, {:continue, :db_connect}} + end + + @impl true + def handle_continue(:db_connect, state) do pipes = [ GetTenant, CheckConnection, - StartCounters, RegisterProcess ] case Piper.run(pipes, state) do {:ok, acc} -> - {:ok, acc, {:continue, :run_migrations}} + {:noreply, acc, {:continue, :run_migrations}} {:error, :tenant_not_found} -> - {:stop, {:shutdown, :tenant_not_found}} + {:stop, {:shutdown, 
:tenant_not_found}, state} {:error, :tenant_db_too_many_connections} -> - {:stop, {:shutdown, :tenant_db_too_many_connections}} + {:stop, {:shutdown, :tenant_db_too_many_connections}, state} {:error, error} -> log_error("UnableToConnectToTenantDatabase", error) - {:stop, :shutdown} + {:stop, :shutdown, state} end end - @impl true def handle_continue(:run_migrations, state) do %{tenant: tenant, db_conn_pid: db_conn_pid} = state Logger.warning("Tenant #{tenant.external_id} is initializing: #{inspect(node())}") @@ -375,6 +375,7 @@ defmodule Realtime.Tenants.Connect do ## Private functions defp call_external_node(tenant_id, opts) do + Logger.warning("Connection process starting up") rpc_timeout = Keyword.get(opts, :rpc_timeout, @rpc_timeout_default) with tenant <- Tenants.Cache.get_tenant_by_external_id(tenant_id), diff --git a/lib/realtime/tenants/connect/check_connection.ex b/lib/realtime/tenants/connect/check_connection.ex index 697c08b6c..53cd8e480 100644 --- a/lib/realtime/tenants/connect/check_connection.ex +++ b/lib/realtime/tenants/connect/check_connection.ex @@ -2,16 +2,14 @@ defmodule Realtime.Tenants.Connect.CheckConnection do @moduledoc """ Check tenant database connection. """ - alias Realtime.Database @behaviour Realtime.Tenants.Connect.Piper @impl true def run(acc) do %{tenant: tenant} = acc - case Database.check_tenant_connection(tenant) do + case Realtime.Database.check_tenant_connection(tenant) do {:ok, conn} -> - Process.link(conn) db_conn_reference = Process.monitor(conn) {:ok, %{acc | db_conn_pid: conn, db_conn_reference: db_conn_reference}} diff --git a/lib/realtime/tenants/connect/start_counters.ex b/lib/realtime/tenants/connect/start_counters.ex deleted file mode 100644 index f8ce6c378..000000000 --- a/lib/realtime/tenants/connect/start_counters.ex +++ /dev/null @@ -1,60 +0,0 @@ -defmodule Realtime.Tenants.Connect.StartCounters do - @moduledoc """ - Start tenant counters. 
- """ - - alias Realtime.RateCounter - alias Realtime.Tenants - - @behaviour Realtime.Tenants.Connect.Piper - - @impl true - def run(acc) do - %{tenant: tenant} = acc - - with :ok <- start_joins_per_second_counter(tenant), - :ok <- start_max_events_counter(tenant), - :ok <- start_db_events_counter(tenant) do - {:ok, acc} - end - end - - def start_joins_per_second_counter(tenant) do - res = - tenant - |> Tenants.joins_per_second_rate() - |> RateCounter.new() - - case res do - {:ok, _} -> :ok - {:error, {:already_started, _}} -> :ok - {:error, reason} -> {:error, reason} - end - end - - def start_max_events_counter(tenant) do - res = - tenant - |> Tenants.events_per_second_rate() - |> RateCounter.new() - - case res do - {:ok, _} -> :ok - {:error, {:already_started, _}} -> :ok - {:error, reason} -> {:error, reason} - end - end - - def start_db_events_counter(tenant) do - res = - tenant - |> Tenants.db_events_per_second_rate() - |> RateCounter.new() - - case res do - {:ok, _} -> :ok - {:error, {:already_started, _}} -> :ok - {:error, reason} -> {:error, reason} - end - end -end diff --git a/mix.exs b/mix.exs index 849a97b7b..75a7bbd6f 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.48.0", + version: "2.48.1", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/integration/rt_channel_test.exs b/test/integration/rt_channel_test.exs index 36955e5b8..2ae4cd449 100644 --- a/test/integration/rt_channel_test.exs +++ b/test/integration/rt_channel_test.exs @@ -653,8 +653,8 @@ defmodule Realtime.Integration.RtChannelTest do :syn.update_registry(Connect, tenant.external_id, fn _pid, meta -> %{meta | conn: nil} end) payload = %{"event" => "TEST", "payload" => %{"msg" => 1}, "type" => "broadcast"} WebsocketClient.send_event(service_role_socket, topic, "broadcast", payload) - # Waiting more than 5 seconds as this is the amount of time we 
will wait for the Connection to be ready - refute_receive %Message{event: "broadcast", payload: ^payload, topic: ^topic}, 6000 + # Waiting more than 15 seconds as this is the amount of time we will wait for the Connection to be ready + refute_receive %Message{event: "broadcast", payload: ^payload, topic: ^topic}, 16000 end) assert log =~ "UnableToHandleBroadcast" @@ -831,7 +831,7 @@ defmodule Realtime.Integration.RtChannelTest do refute_receive %Message{event: "presence_diff"}, 500 # Waiting more than 5 seconds as this is the amount of time we will wait for the Connection to be ready - refute_receive %Message{event: "phx_leave", topic: ^topic}, 6000 + refute_receive %Message{event: "phx_leave", topic: ^topic}, 16000 end) assert log =~ "UnableToHandlePresence" diff --git a/test/realtime/syn_handler_test.exs b/test/realtime/syn_handler_test.exs index 2b27cf322..1cf0d3bad 100644 --- a/test/realtime/syn_handler_test.exs +++ b/test/realtime/syn_handler_test.exs @@ -168,32 +168,40 @@ defmodule Realtime.SynHandlerTest do test "it handles :syn_conflict_resolution reason" do reason = :syn_conflict_resolution + pid = self() log = capture_log(fn -> - assert SynHandler.on_process_unregistered(@mod, @name, self(), %{}, reason) == :ok + assert SynHandler.on_process_unregistered(@mod, @name, pid, %{}, reason) == :ok end) topic = "#{@topic}:#{@name}" event = "#{@topic}_down" assert log =~ "#{@mod} terminated due to syn conflict resolution: #{inspect(@name)} #{inspect(self())}" - assert_receive %Phoenix.Socket.Broadcast{topic: ^topic, event: ^event, payload: nil} + assert_receive %Phoenix.Socket.Broadcast{topic: ^topic, event: ^event, payload: %{reason: ^reason, pid: ^pid}} end test "it handles other reasons" do reason = :other_reason + pid = self() log = capture_log(fn -> - assert SynHandler.on_process_unregistered(@mod, @name, self(), %{}, reason) == :ok + assert SynHandler.on_process_unregistered(@mod, @name, pid, %{}, reason) == :ok end) topic = "#{@topic}:#{@name}" event = 
"#{@topic}_down" refute log =~ "#{@mod} terminated: #{inspect(@name)} #{node()}" - assert_receive %Phoenix.Socket.Broadcast{topic: ^topic, event: ^event, payload: nil}, 500 + + assert_receive %Phoenix.Socket.Broadcast{ + topic: ^topic, + event: ^event, + payload: %{reason: ^reason, pid: ^pid} + }, + 500 end end end diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index 290fb1c8d..18cb6e7f7 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -78,31 +78,27 @@ defmodule Realtime.Tenants.ConnectTest do assert_receive {:ok, ^pid} end - test "more than 5 seconds passed error out", %{tenant: tenant} do + test "more than 15 seconds passed error out", %{tenant: tenant} do parent = self() # Let's slow down Connect starting expect(Database, :check_tenant_connection, fn t -> - :timer.sleep(5500) + Process.sleep(15500) call_original(Database, :check_tenant_connection, [t]) end) connect = fn -> send(parent, Connect.lookup_or_start_connection(tenant.external_id)) end - # Start an early connect - spawn(connect) - :timer.sleep(100) - - # Start others spawn(connect) spawn(connect) - {:error, :tenant_database_unavailable} = Connect.lookup_or_start_connection(tenant.external_id) + {:error, :initializing} = Connect.lookup_or_start_connection(tenant.external_id) + # The above call waited 15 seconds + assert_receive {:error, :initializing} + assert_receive {:error, :initializing} - # Only one will succeed the others timed out waiting - assert_receive {:error, :tenant_database_unavailable} - assert_receive {:error, :tenant_database_unavailable} - assert_receive {:ok, _pid}, 7000 + # This one will succeed + {:ok, _pid} = Connect.lookup_or_start_connection(tenant.external_id) end end From 50891cd8b9ca8fd7c0760d8badbb8de48c4be770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Mon, 15 Sep 2025 23:11:25 +0100 Subject: [PATCH 10/34] fix: handle wal bloat (#1528) Verify that 
replication connection is able to reconnect when faced with WAL bloat issues --- lib/realtime/tenants/connect.ex | 76 ++++++---- .../tenants/replication_connection.ex | 2 +- mix.exs | 2 +- test/integration/rt_channel_test.exs | 130 ++++++++++++++++++ test/realtime/tenants/connect_test.exs | 26 +++- .../tenants/replication_connection_test.exs | 36 +++++ test/support/containers.ex | 8 +- 7 files changed, 245 insertions(+), 35 deletions(-) diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex index 3c206a785..920205e95 100644 --- a/lib/realtime/tenants/connect.ex +++ b/lib/realtime/tenants/connect.ex @@ -252,31 +252,10 @@ defmodule Realtime.Tenants.Connect do end def handle_continue(:start_replication, state) do - %{tenant: tenant} = state - - with {:ok, replication_connection_pid} <- ReplicationConnection.start(tenant, self()) do - replication_connection_reference = Process.monitor(replication_connection_pid) - - state = %{ - state - | replication_connection_pid: replication_connection_pid, - replication_connection_reference: replication_connection_reference - } - - {:noreply, state, {:continue, :setup_connected_user_events}} - else - {:error, :max_wal_senders_reached} -> - log_error("ReplicationMaxWalSendersReached", "Tenant database has reached the maximum number of WAL senders") - {:stop, :shutdown, state} - - {:error, error} -> - log_error("StartReplicationFailed", error) - {:stop, :shutdown, state} + case start_replication_connection(state) do + {:ok, state} -> {:noreply, state, {:continue, :setup_connected_user_events}} + {:error, state} -> {:stop, :shutdown, state} end - rescue - error -> - log_error("StartReplicationFailed", error) - {:stop, :shutdown, state} end def handle_continue(:setup_connected_user_events, state) do @@ -348,13 +327,30 @@ defmodule Realtime.Tenants.Connect do {:stop, :shutdown, state} end + @replication_recovery_backoff 1000 + # Handle replication connection termination def handle_info( {:DOWN, 
replication_connection_reference, _, _, _}, %{replication_connection_reference: replication_connection_reference} = state ) do - Logger.warning("Replication connection has died") - {:stop, :shutdown, state} + log_warning("ReplicationConnectionDown", "Replication connection has been terminated") + Process.send_after(self(), :recover_replication_connection, @replication_recovery_backoff) + state = %{state | replication_connection_pid: nil, replication_connection_reference: nil} + {:noreply, state} + end + + @replication_connection_query "SELECT 1 from pg_stat_activity where application_name='realtime_replication_connection'" + def handle_info(:recover_replication_connection, state) do + with %{num_rows: 0} <- Postgrex.query!(state.db_conn_pid, @replication_connection_query, []), + {:ok, state} <- start_replication_connection(state) do + {:noreply, state} + else + _ -> + log_error("ReplicationConnectionRecoveryFailed", "Replication connection recovery failed") + Process.send_after(self(), :recover_replication_connection, @replication_recovery_backoff) + {:noreply, state} + end end def handle_info(_, state), do: {:noreply, state} @@ -414,4 +410,32 @@ defmodule Realtime.Tenants.Connect do defp tenant_suspended?(_), do: :ok defp rebalance_check_interval_in_ms(), do: Application.fetch_env!(:realtime, :rebalance_check_interval_in_ms) + + defp start_replication_connection(state) do + %{tenant: tenant} = state + + with {:ok, replication_connection_pid} <- ReplicationConnection.start(tenant, self()) do + replication_connection_reference = Process.monitor(replication_connection_pid) + + state = %{ + state + | replication_connection_pid: replication_connection_pid, + replication_connection_reference: replication_connection_reference + } + + {:ok, state} + else + {:error, :max_wal_senders_reached} -> + log_error("ReplicationMaxWalSendersReached", "Tenant database has reached the maximum number of WAL senders") + {:error, state} + + {:error, error} -> + 
log_error("StartReplicationFailed", error) + {:error, state} + end + rescue + error -> + log_error("StartReplicationFailed", error) + {:error, state} + end end diff --git a/lib/realtime/tenants/replication_connection.ex b/lib/realtime/tenants/replication_connection.ex index 45e03c66e..58b1de191 100644 --- a/lib/realtime/tenants/replication_connection.ex +++ b/lib/realtime/tenants/replication_connection.ex @@ -144,8 +144,8 @@ defmodule Realtime.Tenants.ReplicationConnection do port: connection_opts.port, socket_options: connection_opts.socket_options, ssl: connection_opts.ssl, - backoff_type: :stop, sync_connect: true, + auto_reconnect: false, parameters: [application_name: "realtime_replication_connection"] ] diff --git a/mix.exs b/mix.exs index 75a7bbd6f..372ff12c4 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.48.1", + version: "2.48.2", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/integration/rt_channel_test.exs b/test/integration/rt_channel_test.exs index 2ae4cd449..23b1a3a7f 100644 --- a/test/integration/rt_channel_test.exs +++ b/test/integration/rt_channel_test.exs @@ -25,6 +25,7 @@ defmodule Realtime.Integration.RtChannelTest do alias Realtime.Tenants alias Realtime.Tenants.Authorization alias Realtime.Tenants.Connect + alias Realtime.Tenants.ReplicationConnection alias RealtimeWeb.RealtimeChannel.Tracker alias RealtimeWeb.SocketDisconnect @@ -2354,6 +2355,135 @@ defmodule Realtime.Integration.RtChannelTest do assert count == 2 end + describe "WAL bloat handling" do + setup %{tenant: tenant} do + topic = random_string() + {:ok, db_conn} = Database.connect(tenant, "realtime_test", :stop) + + %{rows: [[max_wal_size]]} = Postgrex.query!(db_conn, "SHOW max_wal_size", []) + %{rows: [[wal_keep_size]]} = Postgrex.query!(db_conn, "SHOW wal_keep_size", []) + %{rows: [[max_slot_wal_keep_size]]} = 
Postgrex.query!(db_conn, "SHOW max_slot_wal_keep_size", []) + + assert max_wal_size == "32MB" + assert wal_keep_size == "32MB" + assert max_slot_wal_keep_size == "32MB" + + Postgrex.query!(db_conn, "CREATE TABLE IF NOT EXISTS wal_test (id INT, data TEXT)", []) + + Postgrex.query!( + db_conn, + """ + CREATE OR REPLACE FUNCTION wal_test_trigger_func() RETURNS TRIGGER AS $$ + BEGIN + PERFORM realtime.send(json_build_object ('value', 'test' :: text)::jsonb, 'test', '#{topic}', false); + RETURN NULL; + END; + $$ LANGUAGE plpgsql; + """, + [] + ) + + Postgrex.query!(db_conn, "DROP TRIGGER IF EXISTS wal_test_trigger ON wal_test", []) + + Postgrex.query!( + db_conn, + """ + CREATE TRIGGER wal_test_trigger + AFTER INSERT OR UPDATE OR DELETE ON wal_test + FOR EACH ROW + EXECUTE FUNCTION wal_test_trigger_func() + """, + [] + ) + + GenServer.stop(db_conn) + + on_exit(fn -> + {:ok, db_conn} = Database.connect(tenant, "realtime_test", :stop) + + Postgrex.query!(db_conn, "DROP TABLE IF EXISTS wal_test CASCADE", []) + end) + + %{topic: topic} + end + + test "track PID changes during WAL bloat creation", %{tenant: tenant, topic: topic} do + {socket, _} = get_connection(tenant, "authenticated") + config = %{broadcast: %{self: true}, private: false} + full_topic = "realtime:#{topic}" + + active_slot_query = + "SELECT active_pid FROM pg_replication_slots where active_pid is not null and slot_name = 'supabase_realtime_messages_replication_slot_'" + + WebsocketClient.join(socket, full_topic, %{config: config}) + + assert_receive %Message{event: "phx_reply", payload: %{"status" => "ok"}}, 500 + assert_receive %Message{event: "presence_state"}, 500 + + assert Connect.ready?(tenant.external_id) + + {:ok, db_conn} = Connect.lookup_or_start_connection(tenant.external_id) + + original_connect_pid = Connect.whereis(tenant.external_id) + original_replication_pid = ReplicationConnection.whereis(tenant.external_id) + %{rows: [[original_db_pid]]} = Postgrex.query!(db_conn, active_slot_query, []) + 
+ tasks = + for _ <- 1..5 do + Task.async(fn -> + {:ok, bloat_conn} = Database.connect(tenant, "realtime_bloat", :stop) + + Postgrex.transaction(bloat_conn, fn conn -> + Postgrex.query(conn, "INSERT INTO wal_test SELECT generate_series(1, 100000), repeat('x', 2000)", []) + {:error, "test"} + end) + + Process.exit(bloat_conn, :normal) + end) + end + + Task.await_many(tasks, 20000) + + # Kill all pending transactions still running + Postgrex.query!( + db_conn, + "SELECT pg_terminate_backend(pid) from pg_stat_activity where application_name='realtime_bloat'", + [] + ) + + # Does it recover? + assert Connect.ready?(tenant.external_id) + {:ok, db_conn} = Connect.lookup_or_start_connection(tenant.external_id) + Process.sleep(1000) + %{rows: [[new_db_pid]]} = Postgrex.query!(db_conn, active_slot_query, []) + + assert new_db_pid != original_db_pid + assert ^original_connect_pid = Connect.whereis(tenant.external_id) + assert original_replication_pid != ReplicationConnection.whereis(tenant.external_id) + + # Check if socket is still connected + payload = %{"event" => "TEST", "payload" => %{"msg" => 1}, "type" => "broadcast"} + WebsocketClient.send_event(socket, full_topic, "broadcast", payload) + assert_receive %Message{event: "broadcast", payload: ^payload, topic: ^full_topic}, 500 + + # Check if we are receiving the message from replication connection + Postgrex.query!(db_conn, "INSERT INTO wal_test VALUES (1, 'test')", []) + + assert_receive %Phoenix.Socket.Message{ + event: "broadcast", + payload: %{ + "event" => "test", + "payload" => %{"value" => "test"}, + "type" => "broadcast" + }, + join_ref: nil, + ref: nil, + topic: ^full_topic + }, + 5000 + end + end + defp mode(%{mode: :distributed}) do tenant = Api.get_tenant_by_external_id("dev_tenant") diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index 18cb6e7f7..fdc3d6385 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -348,11 
+348,13 @@ defmodule Realtime.Tenants.ConnectTest do assert replication_connection_before == replication_connection_after end - test "on replication connection postgres pid being stopped, also kills the Connect module", %{tenant: tenant} do + test "on replication connection postgres pid being stopped, Connect module recovers it", %{tenant: tenant} do assert {:ok, db_conn} = Connect.lookup_or_start_connection(tenant.external_id) assert Connect.ready?(tenant.external_id) replication_connection_pid = ReplicationConnection.whereis(tenant.external_id) + Process.monitor(replication_connection_pid) + assert Process.alive?(replication_connection_pid) pid = Connect.whereis(tenant.external_id) @@ -362,21 +364,33 @@ defmodule Realtime.Tenants.ConnectTest do [] ) - assert_process_down(replication_connection_pid) - assert_process_down(pid) + assert_receive {:DOWN, _, :process, ^replication_connection_pid, _} + + Process.sleep(1500) + new_replication_connection_pid = ReplicationConnection.whereis(tenant.external_id) + + assert replication_connection_pid != new_replication_connection_pid + assert Process.alive?(new_replication_connection_pid) + assert Process.alive?(pid) end - test "on replication connection exit, also kills the Connect module", %{tenant: tenant} do + test "on replication connection exit, Connect module recovers it", %{tenant: tenant} do assert {:ok, _db_conn} = Connect.lookup_or_start_connection(tenant.external_id) assert Connect.ready?(tenant.external_id) replication_connection_pid = ReplicationConnection.whereis(tenant.external_id) + Process.monitor(replication_connection_pid) assert Process.alive?(replication_connection_pid) pid = Connect.whereis(tenant.external_id) Process.exit(replication_connection_pid, :kill) + assert_receive {:DOWN, _, :process, ^replication_connection_pid, _} - assert_process_down(replication_connection_pid) - assert_process_down(pid) + Process.sleep(1500) + new_replication_connection_pid = 
ReplicationConnection.whereis(tenant.external_id) + + assert replication_connection_pid != new_replication_connection_pid + assert Process.alive?(new_replication_connection_pid) + assert Process.alive?(pid) end test "handles max_wal_senders by logging the correct operational code", %{tenant: tenant} do diff --git a/test/realtime/tenants/replication_connection_test.exs b/test/realtime/tenants/replication_connection_test.exs index 783270313..2d367a846 100644 --- a/test/realtime/tenants/replication_connection_test.exs +++ b/test/realtime/tenants/replication_connection_test.exs @@ -331,6 +331,26 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do assert {:error, :max_wal_senders_reached} = ReplicationConnection.start(tenant, self()) end + + test "handles WAL pressure gracefully", %{tenant: tenant} do + {:ok, replication_pid} = ReplicationConnection.start(tenant, self()) + + {:ok, conn} = Database.connect(tenant, "realtime_test", :stop) + on_exit(fn -> Process.exit(conn, :normal) end) + + large_payload = String.duplicate("x", 10 * 1024 * 1024) + + for i <- 1..5 do + message_fixture_with_conn(tenant, conn, %{ + "topic" => "stress_#{i}", + "private" => true, + "event" => "INSERT", + "payload" => %{"data" => large_payload} + }) + end + + assert Process.alive?(replication_pid) + end end describe "whereis/1" do @@ -409,4 +429,20 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do ref = Process.monitor(pid) assert_receive {:DOWN, ^ref, :process, ^pid, _reason}, timeout end + + defp message_fixture_with_conn(_tenant, conn, override) do + create_attrs = %{ + "topic" => random_string(), + "extension" => "broadcast" + } + + override = override |> Enum.map(fn {k, v} -> {"#{k}", v} end) |> Map.new() + + {:ok, message} = + create_attrs + |> Map.merge(override) + |> TenantConnection.create_message(conn) + + message + end end diff --git a/test/support/containers.ex b/test/support/containers.ex index cd66f2699..bc49fa275 100644 --- a/test/support/containers.ex +++ 
b/test/support/containers.ex @@ -267,7 +267,13 @@ defmodule Containers do @image, "postgres", "-c", - "config_file=/etc/postgresql/postgresql.conf" + "config_file=/etc/postgresql/postgresql.conf", + "-c", + "wal_keep_size=32MB", + "-c", + "max_wal_size=32MB", + "-c", + "max_slot_wal_keep_size=32MB" ]) end end From 5ccea17be1a7c6220b8f742aa7b8fb3dede22e53 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Tue, 16 Sep 2025 12:16:32 +1200 Subject: [PATCH 11/34] feat: replay realtime.messages (#1526) A new index was created on inserted_at DESC, topic WHERE private IS TRUE AND extension = "broadcast" The hardcoded limit is 25 for now. --- README.md | 3 +- lib/realtime/api/message.ex | 4 +- lib/realtime/messages.ex | 55 +++++ lib/realtime/tenants/batch_broadcast.ex | 38 +-- lib/realtime/tenants/migrations.ex | 6 +- .../tenants/replication_connection.ex | 8 +- ...0905041441_create_messages_replay_index.ex | 11 + .../channels/payloads/broadcast.ex | 2 + .../channels/payloads/broadcast/replay.ex | 17 ++ lib/realtime_web/channels/realtime_channel.ex | 51 +++- .../realtime_channel/message_dispatcher.ex | 41 ++- mix.exs | 2 +- test/realtime/messages_test.exs | 233 ++++++++++++++++-- .../tenants/janitor/maintenance_task_test.exs | 11 +- test/realtime/tenants/janitor_test.exs | 14 +- .../tenants/replication_connection_test.exs | 37 ++- .../channels/payloads/join_test.exs | 17 +- .../message_dispatcher_test.exs | 47 +++- .../channels/realtime_channel_test.exs | 162 ++++++++++++ 19 files changed, 678 insertions(+), 81 deletions(-) create mode 100644 lib/realtime/tenants/repo/migrations/20250905041441_create_messages_replay_index.ex create mode 100644 lib/realtime_web/channels/payloads/broadcast/replay.ex diff --git a/README.md b/README.md index 2235bf388..6a16a79ba 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ You can add your own by making a `POST` request to the server.
You must change b "region": "us-west-1", "poll_interval_ms": 100, "poll_max_record_bytes": 1048576, - "ssl_enforced": false + "ssl_enforced": false } } ] @@ -284,6 +284,7 @@ This is the list of operational codes that can help you understand your deployme | UnknownErrorOnController | An error we are not handling correctly was triggered on a controller | | UnknownErrorOnChannel | An error we are not handling correctly was triggered on a channel | | PresenceRateLimitReached | Limit of presence events reached | +| UnableToReplayMessages | An error while replaying messages | ## License diff --git a/lib/realtime/api/message.ex b/lib/realtime/api/message.ex index 90ebc5bc9..18bbc9a87 100644 --- a/lib/realtime/api/message.ex +++ b/lib/realtime/api/message.ex @@ -8,6 +8,8 @@ defmodule Realtime.Api.Message do @primary_key {:id, Ecto.UUID, autogenerate: true} @schema_prefix "realtime" + @type t :: %__MODULE__{} + schema "messages" do field(:topic, :string) field(:extension, Ecto.Enum, values: [:broadcast, :presence]) @@ -39,7 +41,7 @@ defmodule Realtime.Api.Message do end defp maybe_put_timestamp(changeset, field) do - case Map.get(changeset.data, field) do + case get_field(changeset, field) do nil -> put_timestamp(changeset, field) _ -> changeset end diff --git a/lib/realtime/messages.ex b/lib/realtime/messages.ex index c6d571db7..804a48d66 100644 --- a/lib/realtime/messages.ex +++ b/lib/realtime/messages.ex @@ -3,6 +3,61 @@ defmodule Realtime.Messages do Handles `realtime.messages` table operations """ + alias Realtime.Api.Message + + import Ecto.Query, only: [from: 2] + + @hard_limit 25 + @default_timeout 5_000 + + @doc """ + Fetch last `limit ` messages for a given `topic` inserted after `since` + + Automatically uses RPC if the database connection is not in the same node + + Only allowed for private channels + """ + @spec replay(pid, String.t(), non_neg_integer, non_neg_integer) :: + {:ok, Message.t(), [String.t()]} | {:error, term} | {:error, :rpc_error, term} + def 
replay(conn, topic, since, limit) when node(conn) == node() and is_integer(since) and is_integer(limit) do + limit = max(min(limit, @hard_limit), 1) + + with {:ok, since} <- DateTime.from_unix(since, :millisecond), + {:ok, messages} <- messages(conn, topic, since, limit) do + {:ok, Enum.reverse(messages), MapSet.new(messages, & &1.id)} + else + {:error, :postgrex_exception} -> {:error, :failed_to_replay_messages} + {:error, :invalid_unix_time} -> {:error, :invalid_replay_params} + error -> error + end + end + + def replay(conn, topic, since, limit) when is_integer(since) and is_integer(limit) do + Realtime.GenRpc.call(node(conn), __MODULE__, :replay, [conn, topic, since, limit], key: topic) + end + + def replay(_, _, _, _), do: {:error, :invalid_replay_params} + + defp messages(conn, topic, since, limit) do + since = DateTime.to_naive(since) + # We want to avoid searching partitions in the future as they should be empty + # so we limit to 1 minute in the future to account for any potential drift + now = NaiveDateTime.utc_now() |> NaiveDateTime.add(1, :minute) + + query = + from m in Message, + where: + m.topic == ^topic and + m.private == true and + m.extension == :broadcast and + m.inserted_at >= ^since and + m.inserted_at < ^now, + limit: ^limit, + order_by: [desc: m.inserted_at] + + Realtime.Repo.all(conn, query, Message, timeout: @default_timeout) + end + @doc """ Deletes messages older than 72 hours for a given tenant connection """ diff --git a/lib/realtime/tenants/batch_broadcast.ex b/lib/realtime/tenants/batch_broadcast.ex index 4fc31aa0f..98427621b 100644 --- a/lib/realtime/tenants/batch_broadcast.ex +++ b/lib/realtime/tenants/batch_broadcast.ex @@ -29,7 +29,9 @@ defmodule Realtime.Tenants.BatchBroadcast do @spec broadcast( auth_params :: map() | nil, tenant :: Tenant.t(), - messages :: %{messages: list(%{topic: String.t(), payload: map(), event: String.t(), private: boolean()})}, + messages :: %{ + messages: list(%{id: String.t(), topic: String.t(), 
payload: map(), event: String.t(), private: boolean()}) + }, super_user :: boolean() ) :: :ok | {:error, atom()} def broadcast(auth_params, tenant, messages, super_user \\ false) @@ -59,8 +61,8 @@ defmodule Realtime.Tenants.BatchBroadcast do # Handle events for public channel events |> Map.get(false, []) - |> Enum.each(fn %{topic: sub_topic, payload: payload, event: event} -> - send_message_and_count(tenant, events_per_second_rate, sub_topic, event, payload, true) + |> Enum.each(fn message -> + send_message_and_count(tenant, events_per_second_rate, message, true) end) # Handle events for private channel @@ -69,14 +71,14 @@ defmodule Realtime.Tenants.BatchBroadcast do |> Enum.group_by(fn event -> Map.get(event, :topic) end) |> Enum.each(fn {topic, events} -> if super_user do - Enum.each(events, fn %{topic: sub_topic, payload: payload, event: event} -> - send_message_and_count(tenant, events_per_second_rate, sub_topic, event, payload, false) + Enum.each(events, fn message -> + send_message_and_count(tenant, events_per_second_rate, message, false) end) else case permissions_for_message(tenant, auth_params, topic) do %Policies{broadcast: %BroadcastPolicies{write: true}} -> - Enum.each(events, fn %{topic: sub_topic, payload: payload, event: event} -> - send_message_and_count(tenant, events_per_second_rate, sub_topic, event, payload, false) + Enum.each(events, fn message -> + send_message_and_count(tenant, events_per_second_rate, message, false) end) _ -> @@ -91,15 +93,15 @@ defmodule Realtime.Tenants.BatchBroadcast do def broadcast(_, nil, _, _), do: {:error, :tenant_not_found} - def changeset(payload, attrs) do + defp changeset(payload, attrs) do payload |> cast(attrs, []) |> cast_embed(:messages, required: true, with: &message_changeset/2) end - def message_changeset(message, attrs) do + defp message_changeset(message, attrs) do message - |> cast(attrs, [:topic, :payload, :event, :private]) + |> cast(attrs, [:id, :topic, :payload, :event, :private]) |> 
maybe_put_private_change() |> validate_required([:topic, :payload, :event]) end @@ -112,11 +114,19 @@ defmodule Realtime.Tenants.BatchBroadcast do end @event_type "broadcast" - defp send_message_and_count(tenant, events_per_second_rate, topic, event, payload, public?) do - tenant_topic = Tenants.tenant_topic(tenant, topic, public?) - payload = %{"payload" => payload, "event" => event, "type" => "broadcast"} + defp send_message_and_count(tenant, events_per_second_rate, message, public?) do + tenant_topic = Tenants.tenant_topic(tenant, message.topic, public?) - broadcast = %Phoenix.Socket.Broadcast{topic: topic, event: @event_type, payload: payload} + payload = %{"payload" => message.payload, "event" => message.event, "type" => "broadcast"} + + payload = + if message[:id] do + Map.put(payload, "meta", %{"id" => message.id}) + else + payload + end + + broadcast = %Phoenix.Socket.Broadcast{topic: message.topic, event: @event_type, payload: payload} GenCounter.add(events_per_second_rate.id) TenantBroadcaster.pubsub_broadcast(tenant.external_id, tenant_topic, broadcast, RealtimeChannel.MessageDispatcher) diff --git a/lib/realtime/tenants/migrations.ex b/lib/realtime/tenants/migrations.ex index 04475c2b7..a5fa1eb8b 100644 --- a/lib/realtime/tenants/migrations.ex +++ b/lib/realtime/tenants/migrations.ex @@ -74,7 +74,8 @@ defmodule Realtime.Tenants.Migrations do RealtimeSendSetsTopicConfig, SubscriptionIndexBridgingDisabled, RunSubscriptionIndexBridgingDisabled, - BroadcastSendErrorLogging + BroadcastSendErrorLogging, + CreateMessagesReplayIndex } @migrations [ @@ -140,7 +141,8 @@ defmodule Realtime.Tenants.Migrations do {20_250_128_220_012, RealtimeSendSetsTopicConfig}, {20_250_506_224_012, SubscriptionIndexBridgingDisabled}, {20_250_523_164_012, RunSubscriptionIndexBridgingDisabled}, - {20_250_714_121_412, BroadcastSendErrorLogging} + {20_250_714_121_412, BroadcastSendErrorLogging}, + {20_250_905_041_441, CreateMessagesReplayIndex} ] defstruct [:tenant_external_id, 
:settings] diff --git a/lib/realtime/tenants/replication_connection.ex b/lib/realtime/tenants/replication_connection.ex index 58b1de191..4ebb1f8e8 100644 --- a/lib/realtime/tenants/replication_connection.ex +++ b/lib/realtime/tenants/replication_connection.ex @@ -310,7 +310,13 @@ defmodule Realtime.Tenants.ReplicationConnection do {:ok, topic} <- get_or_error(to_broadcast, "topic", :topic_missing), {:ok, private} <- get_or_error(to_broadcast, "private", :private_missing), %Tenant{} = tenant <- Cache.get_tenant_by_external_id(tenant_id), - broadcast_message = %{topic: topic, event: event, private: private, payload: Map.put_new(payload, "id", id)}, + broadcast_message = %{ + id: id, + topic: topic, + event: event, + private: private, + payload: Map.put_new(payload, "id", id) + }, :ok <- BatchBroadcast.broadcast(nil, tenant, %{messages: [broadcast_message]}, true) do inserted_at = NaiveDateTime.from_iso8601!(inserted_at) latency_inserted_at = NaiveDateTime.utc_now() |> NaiveDateTime.diff(inserted_at) diff --git a/lib/realtime/tenants/repo/migrations/20250905041441_create_messages_replay_index.ex b/lib/realtime/tenants/repo/migrations/20250905041441_create_messages_replay_index.ex new file mode 100644 index 000000000..77afde6e0 --- /dev/null +++ b/lib/realtime/tenants/repo/migrations/20250905041441_create_messages_replay_index.ex @@ -0,0 +1,11 @@ +defmodule Realtime.Tenants.Migrations.CreateMessagesReplayIndex do + @moduledoc false + + use Ecto.Migration + + def change do + create_if_not_exists index(:messages, [{:desc, :inserted_at}, :topic], + where: "extension = 'broadcast' and private IS TRUE" + ) + end +end diff --git a/lib/realtime_web/channels/payloads/broadcast.ex b/lib/realtime_web/channels/payloads/broadcast.ex index 7feddb043..e2881fd54 100644 --- a/lib/realtime_web/channels/payloads/broadcast.ex +++ b/lib/realtime_web/channels/payloads/broadcast.ex @@ -9,9 +9,11 @@ defmodule RealtimeWeb.Channels.Payloads.Broadcast do embedded_schema do field :ack, :boolean, 
default: false field :self, :boolean, default: false + embeds_one :replay, RealtimeWeb.Channels.Payloads.Broadcast.Replay end def changeset(broadcast, attrs) do cast(broadcast, attrs, [:ack, :self], message: &Join.error_message/2) + |> cast_embed(:replay, invalid_message: "unable to parse, expected a map") end end diff --git a/lib/realtime_web/channels/payloads/broadcast/replay.ex b/lib/realtime_web/channels/payloads/broadcast/replay.ex new file mode 100644 index 000000000..b0a5804a2 --- /dev/null +++ b/lib/realtime_web/channels/payloads/broadcast/replay.ex @@ -0,0 +1,17 @@ +defmodule RealtimeWeb.Channels.Payloads.Broadcast.Replay do + @moduledoc """ + Validate broadcast replay field of the join payload. + """ + use Ecto.Schema + import Ecto.Changeset + alias RealtimeWeb.Channels.Payloads.Join + + embedded_schema do + field :limit, :integer, default: 10 + field :since, :integer, default: 0 + end + + def changeset(broadcast, attrs) do + cast(broadcast, attrs, [:limit, :since], message: &Join.error_message/2) + end +end diff --git a/lib/realtime_web/channels/realtime_channel.ex b/lib/realtime_web/channels/realtime_channel.ex index 03bd91347..1d58d9da7 100644 --- a/lib/realtime_web/channels/realtime_channel.ex +++ b/lib/realtime_web/channels/realtime_channel.ex @@ -72,12 +72,21 @@ defmodule RealtimeWeb.RealtimeChannel do {:ok, claims, confirm_token_ref} <- confirm_token(socket), socket = assign_authorization_context(socket, sub_topic, claims), {:ok, db_conn} <- Connect.lookup_or_start_connection(tenant_id), - {:ok, socket} <- maybe_assign_policies(sub_topic, db_conn, socket) do + {:ok, socket} <- maybe_assign_policies(sub_topic, db_conn, socket), + {:ok, replayed_message_ids} <- + maybe_replay_messages(params["config"], sub_topic, db_conn, socket.assigns.private?) do tenant_topic = Tenants.tenant_topic(tenant_id, sub_topic, !socket.assigns.private?) 
# fastlane subscription metadata = - MessageDispatcher.fastlane_metadata(transport_pid, serializer, topic, socket.assigns.log_level, tenant_id) + MessageDispatcher.fastlane_metadata( + transport_pid, + serializer, + topic, + log_level, + tenant_id, + replayed_message_ids + ) RealtimeWeb.Endpoint.subscribe(tenant_topic, metadata: metadata) @@ -198,6 +207,12 @@ defmodule RealtimeWeb.RealtimeChannel do {:error, :shutdown_in_progress} -> log_error(socket, "RealtimeRestarting", "Realtime is restarting, please standby") + {:error, :failed_to_replay_messages} -> + log_error(socket, "UnableToReplayMessages", "Realtime was unable to replay messages") + + {:error, :invalid_replay_params} -> + log_error(socket, "UnableToReplayMessages", "Replay params are not valid") + {:error, error} -> log_error(socket, "UnknownErrorOnChannel", error) {:error, %{reason: "Unknown Error on Channel"}} @@ -205,6 +220,17 @@ defmodule RealtimeWeb.RealtimeChannel do end @impl true + def handle_info({:replay, messages}, socket) do + for message <- messages do + meta = %{"replayed" => true, "id" => message.id} + payload = %{"payload" => message.payload, "event" => message.event, "type" => "broadcast", "meta" => meta} + + push(socket, "broadcast", payload) + end + + {:noreply, socket} + end + def handle_info(:update_rate_counter, socket) do count(socket) @@ -762,4 +788,25 @@ defmodule RealtimeWeb.RealtimeChannel do do: {:error, :private_only}, else: :ok end + + defp maybe_replay_messages(%{"broadcast" => %{"replay" => _}}, _sub_topic, _db_conn, false = _private?) do + {:error, :invalid_replay_params} + end + + defp maybe_replay_messages(%{"broadcast" => %{"replay" => replay_params}}, sub_topic, db_conn, true = _private?) 
+ when is_map(replay_params) do + with {:ok, messages, message_ids} <- + Realtime.Messages.replay( + db_conn, + sub_topic, + replay_params["since"], + replay_params["limit"] || 25 + ) do + # Send to self because we can't write to the socket before finishing the join process + send(self(), {:replay, messages}) + {:ok, message_ids} + end + end + + defp maybe_replay_messages(_, _, _, _), do: {:ok, MapSet.new()} end diff --git a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex index b5db97f95..ef486c4e8 100644 --- a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex +++ b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex @@ -5,12 +5,14 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do require Logger - def fastlane_metadata(fastlane_pid, serializer, topic, :info, tenant_id) do - {:realtime_channel_fastlane, fastlane_pid, serializer, topic, {:log, tenant_id}} + def fastlane_metadata(fastlane_pid, serializer, topic, log_level, tenant_id, replayed_message_ids \\ MapSet.new()) + + def fastlane_metadata(fastlane_pid, serializer, topic, :info, tenant_id, replayed_message_ids) do + {:rc_fastlane, fastlane_pid, serializer, topic, {:log, tenant_id}, replayed_message_ids} end - def fastlane_metadata(fastlane_pid, serializer, topic, _log_level, _tenant_id) do - {:realtime_channel_fastlane, fastlane_pid, serializer, topic} + def fastlane_metadata(fastlane_pid, serializer, topic, _log_level, _tenant_id, replayed_message_ids) do + {:rc_fastlane, fastlane_pid, serializer, topic, replayed_message_ids} end @doc """ @@ -23,22 +25,34 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do # This reduce caches the serialization and bypasses the channel process going straight to the # transport process + message_id = msg.payload["meta"]["id"] + # Credo doesn't like that we don't use the result aggregation _ = Enum.reduce(subscribers, %{}, fn {pid, _}, cache when 
pid == from -> cache - {pid, {:realtime_channel_fastlane, fastlane_pid, serializer, join_topic}}, cache -> - send(pid, :update_rate_counter) - do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, replayed_message_ids}}, cache -> + if already_replayed?(message_id, replayed_message_ids) do + # skip already replayed message + cache + else + send(pid, :update_rate_counter) + do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + end - {pid, {:realtime_channel_fastlane, fastlane_pid, serializer, join_topic, {:log, tenant_id}}}, cache -> - send(pid, :update_rate_counter) - log = "Received message on #{join_topic} with payload: #{inspect(msg, pretty: true)}" - Logger.info(log, external_id: tenant_id, project: tenant_id) + {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, {:log, tenant_id}, replayed_message_ids}}, cache -> + if already_replayed?(message_id, replayed_message_ids) do + # skip already replayed message + cache + else + send(pid, :update_rate_counter) + log = "Received message on #{join_topic} with payload: #{inspect(msg, pretty: true)}" + Logger.info(log, external_id: tenant_id, project: tenant_id) - do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + end {pid, _}, cache -> send(pid, msg) @@ -48,6 +62,9 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do :ok end + defp already_replayed?(nil, _replayed_message_ids), do: false + defp already_replayed?(message_id, replayed_message_ids), do: MapSet.member?(replayed_message_ids, message_id) + defp do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) do case cache do %{^serializer => encoded_msg} -> diff --git a/mix.exs b/mix.exs index 372ff12c4..1e17ec551 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.48.2", + version: "2.49.0", elixir: "~> 1.17.3", 
elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/messages_test.exs b/test/realtime/messages_test.exs index 3bef9a5e0..cca0ce742 100644 --- a/test/realtime/messages_test.exs +++ b/test/realtime/messages_test.exs @@ -16,32 +16,221 @@ defmodule Realtime.MessagesTest do %{conn: conn, tenant: tenant, date_start: date_start, date_end: date_end} end - test "delete_old_messages/1 deletes messages older than 72 hours", %{ - conn: conn, - tenant: tenant, - date_start: date_start, - date_end: date_end - } do - utc_now = NaiveDateTime.utc_now() - limit = NaiveDateTime.add(utc_now, -72, :hour) - - messages = - for date <- Date.range(date_start, date_end) do - inserted_at = date |> NaiveDateTime.new!(Time.new!(0, 0, 0)) - message_fixture(tenant, %{inserted_at: inserted_at}) + describe "replay/5" do + test "invalid replay params" do + assert Messages.replay(self(), "a topic", "not a number", 123) == + {:error, :invalid_replay_params} + + assert Messages.replay(self(), "a topic", 123, "not a number") == + {:error, :invalid_replay_params} + + assert Messages.replay(self(), "a topic", 253_402_300_800_000, 10) == + {:error, :invalid_replay_params} + end + + test "empty replay", %{conn: conn} do + assert Messages.replay(conn, "test", 0, 10) == {:ok, [], MapSet.new()} + end + + test "replay respects limit", %{conn: conn, tenant: tenant} do + m1 = + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "new", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "new"} + }) + + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "old", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "old"} + }) + + assert Messages.replay(conn, "test", 0, 1) == {:ok, [m1], MapSet.new([m1.id])} + end + + test "replay private 
topic only", %{conn: conn, tenant: tenant} do + privatem = + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "new", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "new"} + }) + + message_fixture(tenant, %{ + "private" => false, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "old", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "old"} + }) + + assert Messages.replay(conn, "test", 0, 10) == {:ok, [privatem], MapSet.new([privatem.id])} + end + + test "replay extension=broadcast", %{conn: conn, tenant: tenant} do + privatem = + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "new", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "new"} + }) + + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "old", + "extension" => "presence", + "topic" => "test", + "payload" => %{"value" => "old"} + }) + + assert Messages.replay(conn, "test", 0, 10) == {:ok, [privatem], MapSet.new([privatem.id])} + end + + test "replay respects since", %{conn: conn, tenant: tenant} do + m1 = + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "first", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "first"} + }) + + m2 = + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "second", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "second"} + }) + + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-10, :minute), + "event" 
=> "old", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "old"} + }) + + since = DateTime.utc_now() |> DateTime.add(-3, :minute) |> DateTime.to_unix(:millisecond) + + assert Messages.replay(conn, "test", since, 10) == {:ok, [m1, m2], MapSet.new([m1.id, m2.id])} + end + + test "replay respects hard max limit of 25", %{conn: conn, tenant: tenant} do + for _i <- 1..30 do + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now(), + "event" => "event", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "message"} + }) end - assert length(messages) == 11 + assert {:ok, messages, set} = Messages.replay(conn, "test", 0, 30) + assert length(messages) == 25 + assert MapSet.size(set) == 25 + end + + test "replay respects hard min limit of 1", %{conn: conn, tenant: tenant} do + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now(), + "event" => "event", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "message"} + }) + + assert {:ok, messages, set} = Messages.replay(conn, "test", 0, 0) + assert length(messages) == 1 + assert MapSet.size(set) == 1 + end + + test "distributed replay", %{conn: conn, tenant: tenant} do + m = + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now(), + "event" => "event", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "message"} + }) + + {:ok, node} = Clustered.start() + + # Call remote node passing the database connection that is local to this node + assert :erpc.call(node, Messages, :replay, [conn, "test", 0, 30]) == {:ok, [m], MapSet.new([m.id])} + end + + test "distributed replay error", %{tenant: tenant} do + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now(), + "event" => "event", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => 
"message"} + }) + + {:ok, node} = Clustered.start() + + # Call remote node passing the database connection that is local to this node + pid = spawn(fn -> :ok end) + assert :erpc.call(node, Messages, :replay, [pid, "test", 0, 30]) == {:error, :failed_to_replay_messages} + end + end + + describe "delete_old_messages/1" do + test "delete_old_messages/1 deletes messages older than 72 hours", %{ + conn: conn, + tenant: tenant, + date_start: date_start, + date_end: date_end + } do + utc_now = NaiveDateTime.utc_now() + limit = NaiveDateTime.add(utc_now, -72, :hour) + + messages = + for date <- Date.range(date_start, date_end) do + inserted_at = date |> NaiveDateTime.new!(Time.new!(0, 0, 0)) + message_fixture(tenant, %{inserted_at: inserted_at}) + end + + assert length(messages) == 11 - to_keep = - Enum.reject( - messages, - &(NaiveDateTime.compare(limit, &1.inserted_at) == :gt) - ) + to_keep = + Enum.reject( + messages, + &(NaiveDateTime.compare(NaiveDateTime.beginning_of_day(limit), &1.inserted_at) == :gt) + ) - assert :ok = Messages.delete_old_messages(conn) - {:ok, current} = Repo.all(conn, from(m in Message), Message) + assert :ok = Messages.delete_old_messages(conn) + {:ok, current} = Repo.all(conn, from(m in Message), Message) - assert Enum.sort(current) == Enum.sort(to_keep) + assert Enum.sort(current) == Enum.sort(to_keep) + end end end diff --git a/test/realtime/tenants/janitor/maintenance_task_test.exs b/test/realtime/tenants/janitor/maintenance_task_test.exs index f4c51436e..4c42b7ab3 100644 --- a/test/realtime/tenants/janitor/maintenance_task_test.exs +++ b/test/realtime/tenants/janitor/maintenance_task_test.exs @@ -15,9 +15,15 @@ defmodule Realtime.Tenants.Janitor.MaintenanceTaskTest do end test "cleans messages older than 72 hours and creates partitions", %{tenant: tenant} do + {:ok, conn} = Database.connect(tenant, "realtime_test", :stop) + utc_now = NaiveDateTime.utc_now() limit = NaiveDateTime.add(utc_now, -72, :hour) + date_start = Date.utc_today() |> 
Date.add(-10) + date_end = Date.utc_today() + create_messages_partitions(conn, date_start, date_end) + messages = for days <- -5..0 do inserted_at = NaiveDateTime.add(utc_now, days, :day) @@ -27,12 +33,11 @@ defmodule Realtime.Tenants.Janitor.MaintenanceTaskTest do to_keep = messages - |> Enum.reject(&(NaiveDateTime.compare(limit, &1.inserted_at) == :gt)) + |> Enum.reject(&(NaiveDateTime.compare(NaiveDateTime.beginning_of_day(limit), &1.inserted_at) == :gt)) |> MapSet.new() assert MaintenanceTask.run(tenant.external_id) == :ok - {:ok, conn} = Database.connect(tenant, "realtime_test", :stop) {:ok, res} = Repo.all(conn, from(m in Message), Message) verify_partitions(conn) @@ -80,7 +85,7 @@ defmodule Realtime.Tenants.Janitor.MaintenanceTaskTest do defp verify_partitions(conn) do today = Date.utc_today() - yesterday = Date.add(today, -1) + yesterday = Date.add(today, -3) future = Date.add(today, 3) dates = Date.range(yesterday, future) diff --git a/test/realtime/tenants/janitor_test.exs b/test/realtime/tenants/janitor_test.exs index 4ac1a0eda..fb597a4c4 100644 --- a/test/realtime/tenants/janitor_test.exs +++ b/test/realtime/tenants/janitor_test.exs @@ -31,6 +31,14 @@ defmodule Realtime.Tenants.JanitorTest do end ) + date_start = Date.utc_today() |> Date.add(-10) + date_end = Date.utc_today() + + Enum.map(tenants, fn tenant -> + {:ok, conn} = Database.connect(tenant, "realtime_test", :stop) + create_messages_partitions(conn, date_start, date_end) + end) + start_supervised!( {Task.Supervisor, name: Realtime.Tenants.Janitor.TaskSupervisor, max_children: 5, max_seconds: 500, max_restarts: 1} @@ -62,7 +70,7 @@ defmodule Realtime.Tenants.JanitorTest do to_keep = messages - |> Enum.reject(&(NaiveDateTime.compare(limit, &1.inserted_at) == :gt)) + |> Enum.reject(&(NaiveDateTime.compare(NaiveDateTime.beginning_of_day(limit), &1.inserted_at) == :gt)) |> MapSet.new() start_supervised!(Janitor) @@ -105,7 +113,7 @@ defmodule Realtime.Tenants.JanitorTest do to_keep = messages - |> 
Enum.reject(&(NaiveDateTime.compare(limit, &1.inserted_at) == :gt)) + |> Enum.reject(&(NaiveDateTime.compare(NaiveDateTime.beginning_of_day(limit), &1.inserted_at) == :gt)) |> MapSet.new() start_supervised!(Janitor) @@ -162,7 +170,7 @@ defmodule Realtime.Tenants.JanitorTest do defp verify_partitions(conn) do today = Date.utc_today() - yesterday = Date.add(today, -1) + yesterday = Date.add(today, -3) future = Date.add(today, 3) dates = Date.range(yesterday, future) diff --git a/test/realtime/tenants/replication_connection_test.exs b/test/realtime/tenants/replication_connection_test.exs index 2d367a846..b28a23988 100644 --- a/test/realtime/tenants/replication_connection_test.exs +++ b/test/realtime/tenants/replication_connection_test.exs @@ -98,6 +98,7 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do payload = %{ "event" => "INSERT", + "meta" => %{"id" => row.id}, "payload" => %{ "id" => row.id, "value" => value @@ -139,8 +140,9 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do "event" => "broadcast", "payload" => %{ "event" => "INSERT", + "meta" => %{"id" => id}, "payload" => %{ - "id" => _, + "id" => id, "value" => ^value } }, @@ -222,21 +224,26 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do "payload" => %{"value" => "something"} }) + fixture_id = fixture.id + assert_receive {:socket_push, :text, data}, 500 message = data |> IO.iodata_to_binary() |> Jason.decode!() assert %{ "event" => "broadcast", - "payload" => %{"event" => "INSERT", "payload" => payload, "type" => "broadcast"}, + "payload" => %{ + "event" => "INSERT", + "meta" => %{"id" => ^fixture_id}, + "payload" => payload, + "type" => "broadcast" + }, "ref" => nil, "topic" => ^topic } = message - id = fixture.id - assert payload == %{ "value" => "something", - "id" => id + "id" => fixture_id } end @@ -252,19 +259,25 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do payload = %{"value" => "something", "id" => "123456"} - message_fixture(tenant, %{ - "topic" => topic, - 
"private" => true, - "event" => "INSERT", - "payload" => payload - }) + %{id: fixture_id} = + message_fixture(tenant, %{ + "topic" => topic, + "private" => true, + "event" => "INSERT", + "payload" => payload + }) assert_receive {:socket_push, :text, data}, 500 message = data |> IO.iodata_to_binary() |> Jason.decode!() assert %{ "event" => "broadcast", - "payload" => %{"event" => "INSERT", "payload" => ^payload, "type" => "broadcast"}, + "payload" => %{ + "meta" => %{"id" => ^fixture_id}, + "event" => "INSERT", + "payload" => ^payload, + "type" => "broadcast" + }, "ref" => nil, "topic" => ^topic } = message diff --git a/test/realtime_web/channels/payloads/join_test.exs b/test/realtime_web/channels/payloads/join_test.exs index 32bf1b397..c1ea54a67 100644 --- a/test/realtime_web/channels/payloads/join_test.exs +++ b/test/realtime_web/channels/payloads/join_test.exs @@ -6,6 +6,7 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do alias RealtimeWeb.Channels.Payloads.Join alias RealtimeWeb.Channels.Payloads.Config alias RealtimeWeb.Channels.Payloads.Broadcast + alias RealtimeWeb.Channels.Payloads.Broadcast.Replay alias RealtimeWeb.Channels.Payloads.Presence alias RealtimeWeb.Channels.Payloads.PostgresChange @@ -17,7 +18,7 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do config = %{ "config" => %{ "private" => false, - "broadcast" => %{"ack" => false, "self" => false}, + "broadcast" => %{"ack" => false, "self" => false, "replay" => %{"since" => 1, "limit" => 10}}, "presence" => %{"enabled" => true, "key" => key}, "postgres_changes" => [ %{"event" => "INSERT", "schema" => "public", "table" => "users", "filter" => "id=eq.1"}, @@ -37,8 +38,9 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do postgres_changes: postgres_changes } = config - assert %Broadcast{ack: false, self: false} = broadcast + assert %Broadcast{ack: false, self: false, replay: replay} = broadcast assert %Presence{enabled: true, key: ^key} = presence + assert %Replay{since: 1, limit: 10} = 
replay assert [ %PostgresChange{event: "INSERT", schema: "public", table: "users", filter: "id=eq.1"}, @@ -56,6 +58,17 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do assert is_binary(key) end + test "invalid replay" do + config = %{"config" => %{"broadcast" => %{"replay" => 123}}} + + assert { + :error, + :invalid_join_payload, + %{config: %{broadcast: %{replay: ["unable to parse, expected a map"]}}} + } = + Join.validate(config) + end + test "missing enabled presence defaults to true" do config = %{"config" => %{"presence" => %{}}} diff --git a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs index 7a9e2eb25..91b16c089 100644 --- a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs +++ b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs @@ -16,12 +16,12 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do describe "fastlane_metadata/5" do test "info level" do assert MessageDispatcher.fastlane_metadata(self(), Serializer, "realtime:topic", :info, "tenant_id") == - {:realtime_channel_fastlane, self(), Serializer, "realtime:topic", {:log, "tenant_id"}} + {:rc_fastlane, self(), Serializer, "realtime:topic", {:log, "tenant_id"}, MapSet.new()} end test "non-info level" do assert MessageDispatcher.fastlane_metadata(self(), Serializer, "realtime:topic", :warning, "tenant_id") == - {:realtime_channel_fastlane, self(), Serializer, "realtime:topic"} + {:rc_fastlane, self(), Serializer, "realtime:topic", MapSet.new()} end end @@ -50,12 +50,11 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do from_pid = :erlang.list_to_pid(~c'<0.2.1>') subscribers = [ - {subscriber_pid, {:realtime_channel_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}}}, - {subscriber_pid, {:realtime_channel_fastlane, self(), TestSerializer, "realtime:topic"}} + {subscriber_pid, {:rc_fastlane, self(), 
TestSerializer, "realtime:topic", {:log, "tenant123"}, MapSet.new()}}, + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", MapSet.new()}} ] msg = %Broadcast{topic: "some:other:topic", event: "event", payload: %{data: "test"}} - require Logger log = capture_log(fn -> @@ -75,6 +74,44 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do refute_receive _any end + test "does not dispatch messages to fastlane subscribers if they already replayed it" do + parent = self() + + subscriber_pid = + spawn(fn -> + loop = fn loop -> + receive do + msg -> + send(parent, {:subscriber, msg}) + loop.(loop) + end + end + + loop.(loop) + end) + + from_pid = :erlang.list_to_pid(~c'<0.2.1>') + replaeyd_message_ids = MapSet.new(["123"]) + + subscribers = [ + {subscriber_pid, + {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, replaeyd_message_ids}}, + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", replaeyd_message_ids}} + ] + + msg = %Broadcast{ + topic: "some:other:topic", + event: "event", + payload: %{"data" => "test", "meta" => %{"id" => "123"}} + } + + assert MessageDispatcher.dispatch(subscribers, from_pid, msg) == :ok + + assert Agent.get(TestSerializer, & &1) == 0 + + refute_receive _any + end + test "dispatches messages to non fastlane subscribers" do from_pid = :erlang.list_to_pid(~c'<0.2.1>') diff --git a/test/realtime_web/channels/realtime_channel_test.exs b/test/realtime_web/channels/realtime_channel_test.exs index 2dff83da3..4d90c3588 100644 --- a/test/realtime_web/channels/realtime_channel_test.exs +++ b/test/realtime_web/channels/realtime_channel_test.exs @@ -28,6 +28,168 @@ defmodule RealtimeWeb.RealtimeChannelTest do setup :rls_context + describe "broadcast" do + @describetag policies: [:authenticated_all_topic_read] + + test "wrong replay params", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => 
"warning"}, conn_opts(tenant, jwt)) + + config = %{ + "private" => true, + "broadcast" => %{ + "replay" => %{"limit" => "not a number", "since" => :erlang.system_time(:millisecond) - 5 * 60000} + } + } + + assert {:error, %{reason: "UnableToReplayMessages: Replay params are not valid"}} = + subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + config = %{ + "private" => true, + "broadcast" => %{ + "replay" => %{"limit" => 1, "since" => "not a number"} + } + } + + assert {:error, %{reason: "UnableToReplayMessages: Replay params are not valid"}} = + subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + config = %{ + "private" => true, + "broadcast" => %{ + "replay" => %{} + } + } + + assert {:error, %{reason: "UnableToReplayMessages: Replay params are not valid"}} = + subscribe_and_join(socket, "realtime:test", %{"config" => config}) + end + + test "failure to replay", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => "warning"}, conn_opts(tenant, jwt)) + + config = %{ + "private" => true, + "broadcast" => %{ + "replay" => %{"limit" => 12, "since" => :erlang.system_time(:millisecond) - 5 * 60000} + } + } + + Authorization + |> expect(:get_read_authorizations, fn _, _, _ -> + {:ok, + %Authorization.Policies{ + broadcast: %Authorization.Policies.BroadcastPolicies{read: true, write: nil} + }} + end) + + # Broken database connection + conn = spawn(fn -> :ok end) + Connect.lookup_or_start_connection(tenant.external_id) + {:ok, _} = :syn.update_registry(Connect, tenant.external_id, fn _pid, meta -> %{meta | conn: conn} end) + + assert {:error, %{reason: "UnableToReplayMessages: Realtime was unable to replay messages"}} = + subscribe_and_join(socket, "realtime:test", %{"config" => config}) + end + + test "replay messages on public topic not allowed", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = 
connect(UserSocket, %{"log_level" => "warning"}, conn_opts(tenant, jwt)) + + config = %{ + "presence" => %{"enabled" => false}, + "broadcast" => %{"replay" => %{"limit" => 2, "since" => :erlang.system_time(:millisecond) - 5 * 60000}} + } + + assert { + :error, + %{reason: "UnableToReplayMessages: Replay params are not valid"} + } = subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + refute_receive _any + end + + @tag policies: [:authenticated_all_topic_read] + test "replay messages on private topic", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => "warning"}, conn_opts(tenant, jwt)) + + # Old message + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :day), + "event" => "old", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "old"} + }) + + %{id: message1_id} = + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "first", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "first"} + }) + + %{id: message2_id} = + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "second", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "second"} + }) + + # This one should not be received because of the limit + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-3, :minute), + "event" => "third", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "third"} + }) + + config = %{ + "private" => true, + "presence" => %{"enabled" => false}, + "broadcast" => %{"replay" => %{"limit" => 2, "since" => :erlang.system_time(:millisecond) - 5 * 60000}} + } + + assert {:ok, _, 
%Socket{}} = subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + assert_receive %Socket.Message{ + topic: "realtime:test", + event: "broadcast", + payload: %{ + "event" => "first", + "meta" => %{"id" => ^message1_id, "replayed" => true}, + "payload" => %{"value" => "first"}, + "type" => "broadcast" + } + } + + assert_receive %Socket.Message{ + topic: "realtime:test", + event: "broadcast", + payload: %{ + "event" => "second", + "meta" => %{"id" => ^message2_id, "replayed" => true}, + "payload" => %{"value" => "second"}, + "type" => "broadcast" + } + } + + refute_receive %Socket.Message{} + end + end + describe "presence" do test "events are counted", %{tenant: tenant} do jwt = Generators.generate_jwt_token(tenant) From c4ba2aa63901dd2a48affdf60b2d20b3398b8e55 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Tue, 16 Sep 2025 14:28:57 +1200 Subject: [PATCH 12/34] feat: gen_rpc pub sub adapter (#1529) Add a PubSub adapter that uses gen_rpc to send messages to other nodes. It uses :gen_rpc.abcast/3 instead of :erlang.send/2. The adapter works very similarly to the PG2 adapter. It consists of multiple workers that forward to the local node using PubSub.local_broadcast. The way to choose the worker to be used is based on the sending process, just like the PG2 adapter does. The number of workers is controlled by `:pool_size` or `:broadcast_pool_size`. This distinction exists because Phoenix.PubSub uses `:pool_size` to define how many partitions the PubSub registry will use. 
It's possible to control them separately by using `:broadcast_pool_size` --- README.md | 2 + config/runtime.exs | 4 +- lib/realtime/application.ex | 4 +- lib/realtime/gen_rpc.ex | 16 ++++++ lib/realtime/gen_rpc/pub_sub.ex | 78 ++++++++++++++++++++++++++ lib/realtime_web/tenant_broadcaster.ex | 10 +--- mix.exs | 4 +- mix.lock | 2 +- test/realtime/gen_rpc_pub_sub_test.exs | 2 + test/realtime/gen_rpc_test.exs | 33 +++++++++++ 10 files changed, 142 insertions(+), 13 deletions(-) create mode 100644 lib/realtime/gen_rpc/pub_sub.ex create mode 100644 test/realtime/gen_rpc_pub_sub_test.exs diff --git a/README.md b/README.md index 6a16a79ba..3cbe10ad1 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,8 @@ If you're using the default tenant, the URL is `ws://realtime-dev.localhost:4000 | MAX_GEN_RPC_CLIENTS | number | Max amount of `gen_rpc` TCP connections per node-to-node channel | | REBALANCE_CHECK_INTERVAL_IN_MS | number | Time in ms to check if process is in the right region | | DISCONNECT_SOCKET_ON_NO_CHANNELS_INTERVAL_IN_MS | number | Time in ms to check if a socket has no channels open and if so, disconnect it | +| BROADCAST_POOL_SIZE | number | Number of processes to relay Phoenix.PubSub messages across the cluster | + The OpenTelemetry variables mentioned above are not an exhaustive list of all [supported environment variables](https://opentelemetry.io/docs/languages/sdk-configuration/). 
diff --git a/config/runtime.exs b/config/runtime.exs index ac0a2569b..f20f40ad7 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -67,6 +67,7 @@ janitor_run_after_in_ms = Env.get_integer("JANITOR_RUN_AFTER_IN_MS", :timer.minu janitor_children_timeout = Env.get_integer("JANITOR_CHILDREN_TIMEOUT", :timer.seconds(5)) janitor_schedule_timer = Env.get_integer("JANITOR_SCHEDULE_TIMER_IN_MS", :timer.hours(4)) platform = if System.get_env("AWS_EXECUTION_ENV") == "AWS_ECS_FARGATE", do: :aws, else: :fly +broadcast_pool_size = Env.get_integer("BROADCAST_POOL_SIZE", 10) no_channel_timeout_in_ms = if config_env() == :test, @@ -120,7 +121,8 @@ config :realtime, rpc_timeout: rpc_timeout, max_gen_rpc_clients: max_gen_rpc_clients, no_channel_timeout_in_ms: no_channel_timeout_in_ms, - platform: platform + platform: platform, + broadcast_pool_size: broadcast_pool_size if config_env() != :test && run_janitor? do config :realtime, diff --git a/lib/realtime/application.ex b/lib/realtime/application.ex index 0f4c9ae50..cda853150 100644 --- a/lib/realtime/application.ex +++ b/lib/realtime/application.ex @@ -52,6 +52,7 @@ defmodule Realtime.Application do region = Application.get_env(:realtime, :region) :syn.join(RegionNodes, region, self(), node: node()) + broadcast_pool_size = Application.get_env(:realtime, :broadcast_pool_size, 10) migration_partition_slots = Application.get_env(:realtime, :migration_partition_slots) connect_partition_slots = Application.get_env(:realtime, :connect_partition_slots) no_channel_timeout_in_ms = Application.get_env(:realtime, :no_channel_timeout_in_ms) @@ -65,7 +66,8 @@ defmodule Realtime.Application do Realtime.Repo, RealtimeWeb.Telemetry, {Cluster.Supervisor, [topologies, [name: Realtime.ClusterSupervisor]]}, - {Phoenix.PubSub, name: Realtime.PubSub, pool_size: 10}, + {Phoenix.PubSub, + name: Realtime.PubSub, pool_size: 10, adapter: Realtime.GenRpcPubSub, broadcast_pool_size: broadcast_pool_size}, {Cachex, name: Realtime.RateCounter}, 
Realtime.Tenants.Cache, Realtime.RateCounter.DynamicSupervisor, diff --git a/lib/realtime/gen_rpc.ex b/lib/realtime/gen_rpc.ex index 3487cc933..a7b46a869 100644 --- a/lib/realtime/gen_rpc.ex +++ b/lib/realtime/gen_rpc.ex @@ -10,6 +10,22 @@ defmodule Realtime.GenRpc do @type result :: any | {:error, :rpc_error, reason :: any} + @doc """ + Broadcasts the message `msg` asynchronously to the registered process `name` on the specified `nodes`. + + Options: + + - `:key` - Optional key to consistently select the same gen_rpc clients to guarantee message order between nodes + """ + @spec abcast([node], atom, any, keyword()) :: :ok + def abcast(nodes, name, msg, opts) when is_list(nodes) and is_atom(name) and is_list(opts) do + key = Keyword.get(opts, :key, nil) + nodes = rpc_nodes(nodes, key) + + :gen_rpc.abcast(nodes, name, msg) + :ok + end + @doc """ Fire and forget apply(mod, func, args) on all nodes diff --git a/lib/realtime/gen_rpc/pub_sub.ex b/lib/realtime/gen_rpc/pub_sub.ex new file mode 100644 index 000000000..b2a90b165 --- /dev/null +++ b/lib/realtime/gen_rpc/pub_sub.ex @@ -0,0 +1,78 @@ +defmodule Realtime.GenRpcPubSub do + @moduledoc """ + gen_rpc Phoenix.PubSub adapter + """ + + @behaviour Phoenix.PubSub.Adapter + alias Realtime.GenRpc + use Supervisor + + @impl true + def node_name(_), do: node() + + # Supervisor callbacks + + def start_link(opts) do + adapter_name = Keyword.fetch!(opts, :adapter_name) + name = Keyword.fetch!(opts, :name) + pool_size = Keyword.get(opts, :pool_size, 1) + broadcast_pool_size = Keyword.get(opts, :broadcast_pool_size, pool_size) + + Supervisor.start_link(__MODULE__, {adapter_name, name, broadcast_pool_size}, + name: :"#{name}#{adapter_name}_supervisor" + ) + end + + @impl true + def init({adapter_name, pubsub, pool_size}) do + workers = for number <- 1..pool_size, do: :"#{pubsub}#{adapter_name}_#{number}" + + :persistent_term.put(adapter_name, List.to_tuple(workers)) + + children = + for worker <- workers do + 
Supervisor.child_spec({Realtime.GenRpcPubSub.Worker, {pubsub, worker}}, id: worker) + end + + Supervisor.init(children, strategy: :one_for_one) + end + + defp worker_name(adapter_name, key) do + workers = :persistent_term.get(adapter_name) + elem(workers, :erlang.phash2(key, tuple_size(workers))) + end + + @impl true + def broadcast(adapter_name, topic, message, dispatcher) do + worker = worker_name(adapter_name, self()) + GenRpc.abcast(Node.list(), worker, forward_to_local(topic, message, dispatcher), key: worker) + end + + @impl true + def direct_broadcast(adapter_name, node_name, topic, message, dispatcher) do + worker = worker_name(adapter_name, self()) + GenRpc.abcast([node_name], worker, forward_to_local(topic, message, dispatcher), key: worker) + end + + defp forward_to_local(topic, message, dispatcher), do: {:ftl, topic, message, dispatcher} +end + +defmodule Realtime.GenRpcPubSub.Worker do + @moduledoc false + use GenServer + + @doc false + def start_link({pubsub, worker}), do: GenServer.start_link(__MODULE__, pubsub, name: worker) + + @impl true + def init(pubsub), do: {:ok, pubsub} + + @impl true + def handle_info({:ftl, topic, message, dispatcher}, pubsub) do + Phoenix.PubSub.local_broadcast(pubsub, topic, message, dispatcher) + {:noreply, pubsub} + end + + @impl true + def handle_info(_, pubsub), do: {:noreply, pubsub} +end diff --git a/lib/realtime_web/tenant_broadcaster.ex b/lib/realtime_web/tenant_broadcaster.ex index ee8646614..9995f2f27 100644 --- a/lib/realtime_web/tenant_broadcaster.ex +++ b/lib/realtime_web/tenant_broadcaster.ex @@ -9,7 +9,7 @@ defmodule RealtimeWeb.TenantBroadcaster do def pubsub_broadcast(tenant_id, topic, message, dispatcher) do collect_payload_size(tenant_id, message) - Realtime.GenRpc.multicast(PubSub, :local_broadcast, [Realtime.PubSub, topic, message, dispatcher], key: topic) + PubSub.broadcast(Realtime.PubSub, topic, message, dispatcher) :ok end @@ -25,13 +25,7 @@ defmodule RealtimeWeb.TenantBroadcaster do def 
pubsub_broadcast_from(tenant_id, from, topic, message, dispatcher) do collect_payload_size(tenant_id, message) - Realtime.GenRpc.multicast( - PubSub, - :local_broadcast_from, - [Realtime.PubSub, from, topic, message, dispatcher], - key: topic - ) - + PubSub.broadcast_from(Realtime.PubSub, from, topic, message, dispatcher) :ok end diff --git a/mix.exs b/mix.exs index 1e17ec551..0866b0476 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.49.0", + version: "2.50.0", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, @@ -90,7 +90,7 @@ defmodule Realtime.MixProject do {:opentelemetry_phoenix, "~> 2.0"}, {:opentelemetry_cowboy, "~> 1.0"}, {:opentelemetry_ecto, "~> 1.2"}, - {:gen_rpc, git: "https://github.com/supabase/gen_rpc.git", ref: "d161cf263c661a534eaabf80aac7a34484dac772"}, + {:gen_rpc, git: "https://github.com/supabase/gen_rpc.git", ref: "5aea098b300a0a6ad13533e030230132cbe9ca2c"}, {:mimic, "~> 1.0", only: :test}, {:floki, ">= 0.30.0", only: :test}, {:mint_web_socket, "~> 1.0", only: :test}, diff --git a/mix.lock b/mix.lock index dd95486b6..df5f70f4d 100644 --- a/mix.lock +++ b/mix.lock @@ -29,7 +29,7 @@ "file_system": {:hex, :file_system, "1.1.0", "08d232062284546c6c34426997dd7ef6ec9f8bbd090eb91780283c9016840e8f", [:mix], [], "hexpm", "bfcf81244f416871f2a2e15c1b515287faa5db9c6bcf290222206d120b3d43f6"}, "finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", 
"fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"}, "floki": {:hex, :floki, "0.37.0", "b83e0280bbc6372f2a403b2848013650b16640cd2470aea6701f0632223d719e", [:mix], [], "hexpm", "516a0c15a69f78c47dc8e0b9b3724b29608aa6619379f91b1ffa47109b5d0dd3"}, - "gen_rpc": {:git, "https://github.com/supabase/gen_rpc.git", "d161cf263c661a534eaabf80aac7a34484dac772", [ref: "d161cf263c661a534eaabf80aac7a34484dac772"]}, + "gen_rpc": {:git, "https://github.com/supabase/gen_rpc.git", "5aea098b300a0a6ad13533e030230132cbe9ca2c", [ref: "5aea098b300a0a6ad13533e030230132cbe9ca2c"]}, "gettext": {:hex, :gettext, "0.26.2", "5978aa7b21fada6deabf1f6341ddba50bc69c999e812211903b169799208f2a8", [:mix], [{:expo, "~> 0.5.1 or ~> 1.0", [hex: :expo, repo: "hexpm", optional: false]}], "hexpm", "aa978504bcf76511efdc22d580ba08e2279caab1066b76bb9aa81c4a1e0a32a5"}, "gproc": {:hex, :gproc, "0.9.1", "f1df0364423539cf0b80e8201c8b1839e229e5f9b3ccb944c5834626998f5b8c", [:rebar3], [], "hexpm", "905088e32e72127ed9466f0bac0d8e65704ca5e73ee5a62cb073c3117916d507"}, "grpcbox": {:hex, :grpcbox, "0.17.1", "6e040ab3ef16fe699ffb513b0ef8e2e896da7b18931a1ef817143037c454bcce", [:rebar3], [{:acceptor_pool, "~> 1.0.0", [hex: :acceptor_pool, repo: "hexpm", optional: false]}, {:chatterbox, "~> 0.15.1", [hex: :ts_chatterbox, repo: "hexpm", optional: false]}, {:ctx, "~> 0.6.0", [hex: :ctx, repo: "hexpm", optional: false]}, {:gproc, "~> 0.9.1", [hex: :gproc, repo: "hexpm", optional: false]}], "hexpm", "4a3b5d7111daabc569dc9cbd9b202a3237d81c80bf97212fbc676832cb0ceb17"}, diff --git a/test/realtime/gen_rpc_pub_sub_test.exs b/test/realtime/gen_rpc_pub_sub_test.exs new file mode 100644 index 000000000..0013c2e7b --- /dev/null +++ b/test/realtime/gen_rpc_pub_sub_test.exs @@ -0,0 +1,2 @@ +Application.put_env(:phoenix_pubsub, :test_adapter, {Realtime.GenRpcPubSub, []}) +Code.require_file("../../deps/phoenix_pubsub/test/shared/pubsub_test.exs", __DIR__) diff --git a/test/realtime/gen_rpc_test.exs 
b/test/realtime/gen_rpc_test.exs index e14d2d054..0c41d3ea1 100644 --- a/test/realtime/gen_rpc_test.exs +++ b/test/realtime/gen_rpc_test.exs @@ -186,6 +186,39 @@ defmodule Realtime.GenRpcTest do end end + describe "abcast/4" do + test "abcast to registered process", %{node: node} do + name = + System.unique_integer() + |> to_string() + |> String.to_atom() + + :erlang.register(name, self()) + + # Use erpc to make the other node abcast to this one + :erpc.call(node, GenRpc, :abcast, [[node()], name, "a message", []]) + + assert_receive "a message" + refute_receive _any + end + + @tag extra_config: [{:gen_rpc, :tcp_server_port, 9999}] + test "tcp error" do + Logger.put_process_level(self(), :debug) + + log = + capture_log(fn -> + assert GenRpc.abcast(Node.list(), :some_process_name, "a message", []) == :ok + # We have to wait for gen_rpc logs to show up + Process.sleep(100) + end) + + assert log =~ "[error] event=connect_to_remote_server" + + refute_receive _any + end + end + describe "multicast/4" do test "evals everywhere" do parent = self() From e8a343a9fd899e6e68dcaa3d393575420e910a8a Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Wed, 17 Sep 2025 14:27:57 +1200 Subject: [PATCH 13/34] fix: ensure message id doesn't raise on non-map payloads (#1534) --- .../realtime_channel/message_dispatcher.ex | 5 ++- mix.exs | 2 +- .../message_dispatcher_test.exs | 43 +++++++++++++++++++ .../channels/realtime_channel_test.exs | 40 +++++++++++++++++ 4 files changed, 88 insertions(+), 2 deletions(-) diff --git a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex index ef486c4e8..32e1528f3 100644 --- a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex +++ b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex @@ -25,7 +25,7 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do # This reduce caches the serialization and bypasses the channel process going straight 
to the # transport process - message_id = msg.payload["meta"]["id"] + message_id = message_id(msg.payload) # Credo doesn't like that we don't use the result aggregation _ = @@ -62,6 +62,9 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do :ok end + defp message_id(%{"meta" => %{"id" => id}}), do: id + defp message_id(_), do: nil + defp already_replayed?(nil, _replayed_message_ids), do: false defp already_replayed?(message_id, replayed_message_ids), do: MapSet.member?(replayed_message_ids, message_id) diff --git a/mix.exs b/mix.exs index 0866b0476..893c32f57 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.50.0", + version: "2.50.1", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs index 91b16c089..44ce83b99 100644 --- a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs +++ b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs @@ -112,6 +112,49 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do refute_receive _any end + test "payload is not a map" do + parent = self() + + subscriber_pid = + spawn(fn -> + loop = fn loop -> + receive do + msg -> + send(parent, {:subscriber, msg}) + loop.(loop) + end + end + + loop.(loop) + end) + + from_pid = :erlang.list_to_pid(~c'<0.2.1>') + + subscribers = [ + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, MapSet.new()}}, + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", MapSet.new()}} + ] + + msg = %Broadcast{topic: "some:other:topic", event: "event", payload: "not a map"} + + log = + capture_log(fn -> + assert MessageDispatcher.dispatch(subscribers, from_pid, msg) == :ok + end) + + assert log =~ "Received 
message on realtime:topic with payload: #{inspect(msg, pretty: true)}" + + assert_receive {:encoded, %Broadcast{event: "event", payload: "not a map", topic: "realtime:topic"}} + assert_receive {:encoded, %Broadcast{event: "event", payload: "not a map", topic: "realtime:topic"}} + + assert Agent.get(TestSerializer, & &1) == 1 + + assert_receive {:subscriber, :update_rate_counter} + assert_receive {:subscriber, :update_rate_counter} + + refute_receive _any + end + test "dispatches messages to non fastlane subscribers" do from_pid = :erlang.list_to_pid(~c'<0.2.1>') diff --git a/test/realtime_web/channels/realtime_channel_test.exs b/test/realtime_web/channels/realtime_channel_test.exs index 4d90c3588..5269ff448 100644 --- a/test/realtime_web/channels/realtime_channel_test.exs +++ b/test/realtime_web/channels/realtime_channel_test.exs @@ -31,6 +31,46 @@ defmodule RealtimeWeb.RealtimeChannelTest do describe "broadcast" do @describetag policies: [:authenticated_all_topic_read] + test "broadcast map payload", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{}, conn_opts(tenant, jwt)) + + config = %{ + "presence" => %{"enabled" => false}, + "broadcast" => %{"self" => true} + } + + assert {:ok, _, socket} = subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + push(socket, "broadcast", %{"event" => "my_event", "payload" => %{"hello" => "world"}}) + + assert_receive %Phoenix.Socket.Message{ + topic: "realtime:test", + event: "broadcast", + payload: %{"event" => "my_event", "payload" => %{"hello" => "world"}} + } + end + + test "broadcast non-map payload", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{}, conn_opts(tenant, jwt)) + + config = %{ + "presence" => %{"enabled" => false}, + "broadcast" => %{"self" => true} + } + + assert {:ok, _, socket} = subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + 
push(socket, "broadcast", "not a map") + + assert_receive %Phoenix.Socket.Message{ + topic: "realtime:test", + event: "broadcast", + payload: "not a map" + } + end + test "wrong replay params", %{tenant: tenant} do jwt = Generators.generate_jwt_token(tenant) {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => "warning"}, conn_opts(tenant, jwt)) From 380b882fd963cb058717d8dfea62b3253ab40c19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Thu, 18 Sep 2025 22:54:42 +0100 Subject: [PATCH 14/34] fix: match error on Connect (#1536) --------- Co-authored-by: Eduardo Gurgel Pinho --- lib/realtime/tenants/connect.ex | 7 +- mix.exs | 2 +- test/realtime/tenants/connect_test.exs | 100 +++++++++++++++++++------ 3 files changed, 83 insertions(+), 26 deletions(-) diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex index 920205e95..3d8f39833 100644 --- a/lib/realtime/tenants/connect.ex +++ b/lib/realtime/tenants/connect.ex @@ -55,6 +55,7 @@ defmodule Realtime.Tenants.Connect do | {:error, :tenant_database_unavailable} | {:error, :initializing} | {:error, :tenant_database_connection_initializing} + | {:error, :tenant_db_too_many_connections} | {:error, :rpc_error, term()} def lookup_or_start_connection(tenant_id, opts \\ []) when is_binary(tenant_id) do case get_status(tenant_id) do @@ -62,13 +63,16 @@ defmodule Realtime.Tenants.Connect do {:ok, conn} {:error, :tenant_database_unavailable} -> - call_external_node(tenant_id, opts) + {:error, :tenant_database_unavailable} {:error, :tenant_database_connection_initializing} -> call_external_node(tenant_id, opts) {:error, :initializing} -> {:error, :tenant_database_unavailable} + + {:error, :tenant_db_too_many_connections} -> + {:error, :tenant_db_too_many_connections} end end @@ -80,6 +84,7 @@ defmodule Realtime.Tenants.Connect do | {:error, :tenant_database_unavailable} | {:error, :initializing} | {:error, :tenant_database_connection_initializing} + | {:error, 
:tenant_db_too_many_connections} def get_status(tenant_id) do case :syn.lookup(__MODULE__, tenant_id) do {pid, %{conn: nil}} -> diff --git a/mix.exs b/mix.exs index 893c32f57..5ea9c627f 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.50.1", + version: "2.50.2", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index fdc3d6385..8ba462b27 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -100,6 +100,54 @@ defmodule Realtime.Tenants.ConnectTest do # This one will succeed {:ok, _pid} = Connect.lookup_or_start_connection(tenant.external_id) end + + test "too many db connections", %{tenant: tenant} do + extension = %{ + "type" => "postgres_cdc_rls", + "settings" => %{ + "db_host" => "127.0.0.1", + "db_name" => "postgres", + "db_user" => "supabase_admin", + "db_password" => "postgres", + "poll_interval" => 100, + "poll_max_changes" => 100, + "poll_max_record_bytes" => 1_048_576, + "region" => "us-east-1", + "ssl_enforced" => false, + "db_pool" => 100, + "subcriber_pool_size" => 100, + "subs_pool_size" => 100 + } + } + + {:ok, tenant} = update_extension(tenant, extension) + + parent = self() + + # Let's slow down Connect starting + expect(Database, :check_tenant_connection, fn t -> + :timer.sleep(1000) + call_original(Database, :check_tenant_connection, [t]) + end) + + connect = fn -> send(parent, Connect.lookup_or_start_connection(tenant.external_id)) end + + # Start an early connect + spawn(connect) + :timer.sleep(100) + + # Start others + spawn(connect) + spawn(connect) + + # This one should block and wait for the first Connect + {:error, :tenant_db_too_many_connections} = Connect.lookup_or_start_connection(tenant.external_id) + + assert_receive {:error, :tenant_db_too_many_connections} + 
assert_receive {:error, :tenant_db_too_many_connections} + assert_receive {:error, :tenant_db_too_many_connections} + refute_receive _any + end end describe "region rebalancing" do @@ -263,6 +311,34 @@ defmodule Realtime.Tenants.ConnectTest do assert {:error, :tenant_suspended} = Connect.lookup_or_start_connection(tenant.external_id) end + test "tenant not able to connect if database has not enough connections", %{ + tenant: tenant + } do + extension = %{ + "type" => "postgres_cdc_rls", + "settings" => %{ + "db_host" => "127.0.0.1", + "db_name" => "postgres", + "db_user" => "supabase_admin", + "db_password" => "postgres", + "poll_interval" => 100, + "poll_max_changes" => 100, + "poll_max_record_bytes" => 1_048_576, + "region" => "us-east-1", + "ssl_enforced" => false, + "db_pool" => 100, + "subcriber_pool_size" => 100, + "subs_pool_size" => 100 + } + } + + {:ok, tenant} = update_extension(tenant, extension) + + assert capture_log(fn -> + assert {:error, :tenant_db_too_many_connections} = Connect.lookup_or_start_connection(tenant.external_id) + end) =~ ~r/Only \d+ available connections\. 
At least \d+ connections are required/ + end + test "handles tenant suspension and unsuspension in a reactive way", %{tenant: tenant} do assert {:ok, db_conn} = Connect.lookup_or_start_connection(tenant.external_id) assert Connect.ready?(tenant.external_id) @@ -459,30 +535,6 @@ defmodule Realtime.Tenants.ConnectTest do test "if tenant does not exist, does nothing" do assert :ok = Connect.shutdown("none") end - - test "tenant not able to connect if database has not enough connections", %{tenant: tenant} do - extension = %{ - "type" => "postgres_cdc_rls", - "settings" => %{ - "db_host" => "127.0.0.1", - "db_name" => "postgres", - "db_user" => "supabase_admin", - "db_password" => "postgres", - "poll_interval" => 100, - "poll_max_changes" => 100, - "poll_max_record_bytes" => 1_048_576, - "region" => "us-east-1", - "ssl_enforced" => false, - "db_pool" => 100, - "subcriber_pool_size" => 100, - "subs_pool_size" => 100 - } - } - - {:ok, tenant} = update_extension(tenant, extension) - - assert {:error, :tenant_db_too_many_connections} = Connect.lookup_or_start_connection(tenant.external_id) - end end describe "registers into local registry" do From 4ba956fc5eee91b484e030ccb6066dad2e68b0c6 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Mon, 22 Sep 2025 12:58:18 +1200 Subject: [PATCH 15/34] feat: websocket max heap size configuration (#1538) * fix: set max process heap size to 500MB instead of 8GB * feat: set websocket transport max heap size WEBSOCKET_MAX_HEAP_SIZE can be used to configure it --- README.md | 1 + config/runtime.exs | 2 ++ lib/realtime_web/channels/user_socket.ex | 10 ++++++++++ mix.exs | 2 +- rel/vm.args.eex | 6 +++--- test/realtime_web/channels/realtime_channel_test.exs | 8 ++++++++ 6 files changed, 25 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3cbe10ad1..4e13e44df 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,7 @@ If you're using the default tenant, the URL is `ws://realtime-dev.localhost:4000 | 
CONNECT_PARTITION_SLOTS | number | Number of dynamic supervisor partitions used by the Connect, ReplicationConnect processes | | METRICS_CLEANER_SCHEDULE_TIMER_IN_MS | number | Time in ms to run the Metric Cleaner task | | METRICS_RPC_TIMEOUT_IN_MS | number | Time in ms to wait for RPC call to fetch Metric per node | +| WEBSOCKET_MAX_HEAP_SIZE | number | Max number of bytes to be allocated as heap for the WebSocket transport process. If the limit is reached the process is brutally killed. Defaults to 50MB. | | REQUEST_ID_BAGGAGE_KEY | string | OTEL Baggage key to be used as request id | | OTEL_SDK_DISABLED | boolean | Disable OpenTelemetry tracing completely when 'true' | | OTEL_TRACES_EXPORTER | string | Possible values: `otlp` or `none`. See [https://github.com/open-telemetry/opentelemetry-erlang/tree/v1.4.0/apps#os-environment] for more details on how to configure the traces exporter. | diff --git a/config/runtime.exs b/config/runtime.exs index f20f40ad7..39a69135a 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -68,6 +68,7 @@ janitor_children_timeout = Env.get_integer("JANITOR_CHILDREN_TIMEOUT", :timer.se janitor_schedule_timer = Env.get_integer("JANITOR_SCHEDULE_TIMER_IN_MS", :timer.hours(4)) platform = if System.get_env("AWS_EXECUTION_ENV") == "AWS_ECS_FARGATE", do: :aws, else: :fly broadcast_pool_size = Env.get_integer("BROADCAST_POOL_SIZE", 10) +websocket_max_heap_size = div(Env.get_integer("WEBSOCKET_MAX_HEAP_SIZE", 50_000_000), :erlang.system_info(:wordsize)) no_channel_timeout_in_ms = if config_env() == :test, @@ -107,6 +108,7 @@ config :realtime, Realtime.Repo, ssl: ssl_opts config :realtime, + websocket_max_heap_size: websocket_max_heap_size, migration_partition_slots: migration_partition_slots, connect_partition_slots: connect_partition_slots, rebalance_check_interval_in_ms: rebalance_check_interval_in_ms, diff --git a/lib/realtime_web/channels/user_socket.ex b/lib/realtime_web/channels/user_socket.ex index 09dd15906..849aa052d 100644 --- 
a/lib/realtime_web/channels/user_socket.ex +++ b/lib/realtime_web/channels/user_socket.ex @@ -1,4 +1,12 @@ defmodule RealtimeWeb.UserSocket do + # This is defined up here before `use Phoenix.Socket` is called so that we can define `Phoenix.Socket.init/1` + # It has to be overridden because we need to set the `max_heap_size` flag from the transport process context + @impl true + def init(state) when is_tuple(state) do + Process.flag(:max_heap_size, max_heap_size()) + Phoenix.Socket.__init__(state) + end + use Phoenix.Socket use Realtime.Logs @@ -122,4 +130,6 @@ defmodule RealtimeWeb.UserSocket do _ -> @default_log_level end end + + defp max_heap_size(), do: Application.fetch_env!(:realtime, :websocket_max_heap_size) end diff --git a/mix.exs b/mix.exs index 5ea9c627f..170e161a7 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.50.2", + version: "2.51.0", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/rel/vm.args.eex b/rel/vm.args.eex index 278da5524..9de4e952f 100644 --- a/rel/vm.args.eex +++ b/rel/vm.args.eex @@ -10,8 +10,8 @@ ## Tweak GC to run more often ##-env ERL_FULLSWEEP_AFTER 10 -## Limit process heap for all procs to 1000 MB -+hmax 1000000000 +## Limit process heap for all procs to 500 MB. 
The number here is the number of words ++hmax <%= div(500_000_000, :erlang.system_info(:wordsize)) %> ## Set distribution buffer busy limit (default is 1024) +zdbbl 100000 @@ -19,4 +19,4 @@ ## Disable Busy Wait +sbwt none +sbwtdio none -+sbwtdcpu none \ No newline at end of file ++sbwtdcpu none diff --git a/test/realtime_web/channels/realtime_channel_test.exs b/test/realtime_web/channels/realtime_channel_test.exs index 5269ff448..0a0d8aca9 100644 --- a/test/realtime_web/channels/realtime_channel_test.exs +++ b/test/realtime_web/channels/realtime_channel_test.exs @@ -28,6 +28,14 @@ defmodule RealtimeWeb.RealtimeChannelTest do setup :rls_context + test "max heap size is set", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{}, conn_opts(tenant, jwt)) + + assert Process.info(socket.transport_pid, :max_heap_size) == + {:max_heap_size, %{error_logger: true, include_shared_binaries: false, kill: true, size: 6_250_000}} + end + describe "broadcast" do @describetag policies: [:authenticated_all_topic_read] From 1df809e1aa9f4167bfe7fc7a5cfd38d44b4da8ff Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Mon, 22 Sep 2025 19:15:23 +1200 Subject: [PATCH 16/34] fix: update gen_rpc to fix gen_rpc_dispatcher issues (#1537) Issues: * Single gen_rpc_dispatcher that can be a bottleneck if the connecting takes some time * Many calls can land on the dispatcher but the node might be gone already. If we don't validate the node it might keep trying to connect until it times out instead of quickly giving up due to not being an actively connected node. 
--- mix.exs | 4 ++-- mix.lock | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mix.exs b/mix.exs index 170e161a7..b4f626b9f 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.0", + version: "2.51.1", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, @@ -90,7 +90,7 @@ defmodule Realtime.MixProject do {:opentelemetry_phoenix, "~> 2.0"}, {:opentelemetry_cowboy, "~> 1.0"}, {:opentelemetry_ecto, "~> 1.2"}, - {:gen_rpc, git: "https://github.com/supabase/gen_rpc.git", ref: "5aea098b300a0a6ad13533e030230132cbe9ca2c"}, + {:gen_rpc, git: "https://github.com/supabase/gen_rpc.git", ref: "901aada9adb307ff89a8be197a5d384e69dd57d6"}, {:mimic, "~> 1.0", only: :test}, {:floki, ">= 0.30.0", only: :test}, {:mint_web_socket, "~> 1.0", only: :test}, diff --git a/mix.lock b/mix.lock index df5f70f4d..c5fce6022 100644 --- a/mix.lock +++ b/mix.lock @@ -29,7 +29,7 @@ "file_system": {:hex, :file_system, "1.1.0", "08d232062284546c6c34426997dd7ef6ec9f8bbd090eb91780283c9016840e8f", [:mix], [], "hexpm", "bfcf81244f416871f2a2e15c1b515287faa5db9c6bcf290222206d120b3d43f6"}, "finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"}, "floki": {:hex, :floki, "0.37.0", "b83e0280bbc6372f2a403b2848013650b16640cd2470aea6701f0632223d719e", [:mix], [], "hexpm", 
"516a0c15a69f78c47dc8e0b9b3724b29608aa6619379f91b1ffa47109b5d0dd3"}, - "gen_rpc": {:git, "https://github.com/supabase/gen_rpc.git", "5aea098b300a0a6ad13533e030230132cbe9ca2c", [ref: "5aea098b300a0a6ad13533e030230132cbe9ca2c"]}, + "gen_rpc": {:git, "https://github.com/supabase/gen_rpc.git", "901aada9adb307ff89a8be197a5d384e69dd57d6", [ref: "901aada9adb307ff89a8be197a5d384e69dd57d6"]}, "gettext": {:hex, :gettext, "0.26.2", "5978aa7b21fada6deabf1f6341ddba50bc69c999e812211903b169799208f2a8", [:mix], [{:expo, "~> 0.5.1 or ~> 1.0", [hex: :expo, repo: "hexpm", optional: false]}], "hexpm", "aa978504bcf76511efdc22d580ba08e2279caab1066b76bb9aa81c4a1e0a32a5"}, "gproc": {:hex, :gproc, "0.9.1", "f1df0364423539cf0b80e8201c8b1839e229e5f9b3ccb944c5834626998f5b8c", [:rebar3], [], "hexpm", "905088e32e72127ed9466f0bac0d8e65704ca5e73ee5a62cb073c3117916d507"}, "grpcbox": {:hex, :grpcbox, "0.17.1", "6e040ab3ef16fe699ffb513b0ef8e2e896da7b18931a1ef817143037c454bcce", [:rebar3], [{:acceptor_pool, "~> 1.0.0", [hex: :acceptor_pool, repo: "hexpm", optional: false]}, {:chatterbox, "~> 0.15.1", [hex: :ts_chatterbox, repo: "hexpm", optional: false]}, {:ctx, "~> 0.6.0", [hex: :ctx, repo: "hexpm", optional: false]}, {:gproc, "~> 0.9.1", [hex: :gproc, repo: "hexpm", optional: false]}], "hexpm", "4a3b5d7111daabc569dc9cbd9b202a3237d81c80bf97212fbc676832cb0ceb17"}, From 9a21897acd7aa789cab5372311765f337c7c29e2 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Mon, 22 Sep 2025 20:27:33 +1200 Subject: [PATCH 17/34] fix: improve ErlSysMon logging for processes (#1540) Include initial_call, ancestors, registered_name, message_queue_len and total_heap_size Also bump long_schedule and long_gc --- lib/realtime/monitoring/erl_sys_mon.ex | 34 +++++++++++++++++-- mix.exs | 2 +- test/realtime/monitoring/erl_sys_mon_test.exs | 27 ++++++++++----- 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/lib/realtime/monitoring/erl_sys_mon.ex b/lib/realtime/monitoring/erl_sys_mon.ex index 
32a4f857b..3278886d6 100644 --- a/lib/realtime/monitoring/erl_sys_mon.ex +++ b/lib/realtime/monitoring/erl_sys_mon.ex @@ -10,8 +10,8 @@ defmodule Realtime.ErlSysMon do @defaults [ :busy_dist_port, :busy_port, - {:long_gc, 250}, - {:long_schedule, 100}, + {:long_gc, 500}, + {:long_schedule, 500}, {:long_message_queue, {0, 1_000}} ] @@ -24,8 +24,36 @@ defmodule Realtime.ErlSysMon do {:ok, []} end + def handle_info({:monitor, pid, _type, _meta} = msg, state) when is_pid(pid) do + log_process_info(msg, pid) + {:noreply, state} + end + def handle_info(msg, state) do - Logger.error("#{__MODULE__} message: " <> inspect(msg)) + Logger.warning("#{__MODULE__} message: " <> inspect(msg)) {:noreply, state} end + + defp log_process_info(msg, pid) do + pid_info = + pid + |> Process.info(:dictionary) + |> case do + {:dictionary, dict} when is_list(dict) -> + {List.keyfind(dict, :"$initial_call", 0), List.keyfind(dict, :"$ancestors", 0)} + + other -> + other + end + + extra_info = Process.info(pid, [:registered_name, :message_queue_len, :total_heap_size]) + + Logger.warning( + "#{__MODULE__} message: " <> + inspect(msg) <> "|\n process info: #{inspect(pid_info)} #{inspect(extra_info)}" + ) + rescue + _ -> + Logger.warning("#{__MODULE__} message: " <> inspect(msg)) + end end diff --git a/mix.exs b/mix.exs index b4f626b9f..95e8393b3 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.1", + version: "2.51.2", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/monitoring/erl_sys_mon_test.exs b/test/realtime/monitoring/erl_sys_mon_test.exs index b1e122d58..e9c7b87b7 100644 --- a/test/realtime/monitoring/erl_sys_mon_test.exs +++ b/test/realtime/monitoring/erl_sys_mon_test.exs @@ -5,16 +5,25 @@ defmodule Realtime.Monitoring.ErlSysMonTest do describe "system monitoring" do test "logs system monitor events" do - 
start_supervised!({ErlSysMon, config: [{:long_message_queue, {1, 10}}]}) + start_supervised!({ErlSysMon, config: [{:long_message_queue, {1, 100}}]}) - assert capture_log(fn -> - Task.async(fn -> - Enum.map(1..1000, &send(self(), &1)) - # Wait for ErlSysMon to notice - Process.sleep(4000) - end) - |> Task.await() - end) =~ "Realtime.ErlSysMon message:" + log = + capture_log(fn -> + Task.async(fn -> + Process.register(self(), TestProcess) + Enum.map(1..1000, &send(self(), &1)) + # Wait for ErlSysMon to notice + Process.sleep(4000) + end) + |> Task.await() + end) + + assert log =~ "Realtime.ErlSysMon message:" + assert log =~ "$initial_call\", {Realtime.Monitoring.ErlSysMonTest" + assert log =~ "ancestors\", [#{inspect(self())}]" + assert log =~ "registered_name: TestProcess" + assert log =~ "message_queue_len: " + assert log =~ "total_heap_size: " end end end From 54cd3f763a817a703a2eee9c7f8c8ea5d29b684a Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Mon, 22 Sep 2025 20:48:12 +1200 Subject: [PATCH 18/34] fix: make pubsub adapter configurable (#1539) --- config/runtime.exs | 2 + lib/realtime/application.ex | 10 +- lib/realtime_web/tenant_broadcaster.ex | 22 ++- mix.exs | 2 +- test/realtime_web/tenant_broadcaster_test.exs | 140 ++++++++++-------- 5 files changed, 108 insertions(+), 68 deletions(-) diff --git a/config/runtime.exs b/config/runtime.exs index 39a69135a..47961f98a 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -68,6 +68,7 @@ janitor_children_timeout = Env.get_integer("JANITOR_CHILDREN_TIMEOUT", :timer.se janitor_schedule_timer = Env.get_integer("JANITOR_SCHEDULE_TIMER_IN_MS", :timer.hours(4)) platform = if System.get_env("AWS_EXECUTION_ENV") == "AWS_ECS_FARGATE", do: :aws, else: :fly broadcast_pool_size = Env.get_integer("BROADCAST_POOL_SIZE", 10) +pubsub_adapter = System.get_env("PUBSUB_ADAPTER", "pg2") |> String.to_atom() websocket_max_heap_size = div(Env.get_integer("WEBSOCKET_MAX_HEAP_SIZE", 50_000_000), :erlang.system_info(:wordsize)) 
no_channel_timeout_in_ms = @@ -124,6 +125,7 @@ config :realtime, max_gen_rpc_clients: max_gen_rpc_clients, no_channel_timeout_in_ms: no_channel_timeout_in_ms, platform: platform, + pubsub_adapter: pubsub_adapter, broadcast_pool_size: broadcast_pool_size if config_env() != :test && run_janitor? do diff --git a/lib/realtime/application.ex b/lib/realtime/application.ex index cda853150..99096edfb 100644 --- a/lib/realtime/application.ex +++ b/lib/realtime/application.ex @@ -67,7 +67,7 @@ defmodule Realtime.Application do RealtimeWeb.Telemetry, {Cluster.Supervisor, [topologies, [name: Realtime.ClusterSupervisor]]}, {Phoenix.PubSub, - name: Realtime.PubSub, pool_size: 10, adapter: Realtime.GenRpcPubSub, broadcast_pool_size: broadcast_pool_size}, + name: Realtime.PubSub, pool_size: 10, adapter: pubsub_adapter(), broadcast_pool_size: broadcast_pool_size}, {Cachex, name: Realtime.RateCounter}, Realtime.Tenants.Cache, Realtime.RateCounter.DynamicSupervisor, @@ -154,4 +154,12 @@ defmodule Realtime.Application do OpentelemetryPhoenix.setup(adapter: :cowboy2) OpentelemetryEcto.setup([:realtime, :repo], db_statement: :enabled) end + + defp pubsub_adapter do + if Application.fetch_env!(:realtime, :pubsub_adapter) == :gen_rpc do + Realtime.GenRpcPubSub + else + Phoenix.PubSub.PG2 + end + end end diff --git a/lib/realtime_web/tenant_broadcaster.ex b/lib/realtime_web/tenant_broadcaster.ex index 9995f2f27..da02df79e 100644 --- a/lib/realtime_web/tenant_broadcaster.ex +++ b/lib/realtime_web/tenant_broadcaster.ex @@ -9,7 +9,11 @@ defmodule RealtimeWeb.TenantBroadcaster do def pubsub_broadcast(tenant_id, topic, message, dispatcher) do collect_payload_size(tenant_id, message) - PubSub.broadcast(Realtime.PubSub, topic, message, dispatcher) + if pubsub_adapter() == :gen_rpc do + PubSub.broadcast(Realtime.PubSub, topic, message, dispatcher) + else + Realtime.GenRpc.multicast(PubSub, :local_broadcast, [Realtime.PubSub, topic, message, dispatcher], key: topic) + end :ok end @@ -25,7 +29,17 @@ 
defmodule RealtimeWeb.TenantBroadcaster do def pubsub_broadcast_from(tenant_id, from, topic, message, dispatcher) do collect_payload_size(tenant_id, message) - PubSub.broadcast_from(Realtime.PubSub, from, topic, message, dispatcher) + if pubsub_adapter() == :gen_rpc do + PubSub.broadcast_from(Realtime.PubSub, from, topic, message, dispatcher) + else + Realtime.GenRpc.multicast( + PubSub, + :local_broadcast_from, + [Realtime.PubSub, from, topic, message, dispatcher], + key: topic + ) + end + :ok end @@ -39,4 +53,8 @@ defmodule RealtimeWeb.TenantBroadcaster do defp collect_payload_size(tenant_id, payload) do :telemetry.execute(@payload_size_event, %{size: :erlang.external_size(payload)}, %{tenant: tenant_id}) end + + defp pubsub_adapter do + Application.fetch_env!(:realtime, :pubsub_adapter) + end end diff --git a/mix.exs b/mix.exs index 95e8393b3..9c66b3dde 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.2", + version: "2.51.3", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime_web/tenant_broadcaster_test.exs b/test/realtime_web/tenant_broadcaster_test.exs index d9afbf641..ddda381a1 100644 --- a/test/realtime_web/tenant_broadcaster_test.exs +++ b/test/realtime_web/tenant_broadcaster_test.exs @@ -1,5 +1,5 @@ defmodule RealtimeWeb.TenantBroadcasterTest do - # Usage of Clustered + # Usage of Clustered and changing Application env use Realtime.DataCase, async: false alias Phoenix.Socket.Broadcast @@ -47,95 +47,107 @@ defmodule RealtimeWeb.TenantBroadcasterTest do pid: self() ) + original = Application.fetch_env!(:realtime, :pubsub_adapter) + on_exit(fn -> Application.put_env(:realtime, :pubsub_adapter, original) end) + Application.put_env(:realtime, :pubsub_adapter, context.pubsub_adapter) + :ok end - describe "pubsub_broadcast/4" do - test "pubsub_broadcast", %{node: node} do - message = %Broadcast{topic: 
@topic, event: "an event", payload: %{"a" => "b"}} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + for pubsub_adapter <- [:gen_rpc, :pg2] do + describe "pubsub_broadcast/4 #{pubsub_adapter}" do + @describetag pubsub_adapter: pubsub_adapter - assert_receive ^message + test "pubsub_broadcast", %{node: node} do + message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) - # Remote node received the broadcast - assert_receive {:relay, ^node, ^message} + assert_receive ^message - assert_receive { - :telemetry, - [:realtime, :tenants, :payload, :size], - %{size: 114}, - %{tenant: "realtime-dev"} - } - end + # Remote node received the broadcast + assert_receive {:relay, ^node, ^message} - test "pubsub_broadcast list payload", %{node: node} do - message = %Broadcast{topic: @topic, event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 114}, + %{tenant: "realtime-dev"} + } + end - assert_receive ^message + test "pubsub_broadcast list payload", %{node: node} do + message = %Broadcast{topic: @topic, event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) - # Remote node received the broadcast - assert_receive {:relay, ^node, ^message} + assert_receive ^message - assert_receive { - :telemetry, - [:realtime, :tenants, :payload, :size], - %{size: 130}, - %{tenant: "realtime-dev"} - } - end + # Remote node received the broadcast + assert_receive {:relay, ^node, ^message} - test "pubsub_broadcast string payload", %{node: node} do - message = %Broadcast{topic: @topic, event: "an event", payload: "some text payload"} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, 
message, Phoenix.PubSub) + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 130}, + %{tenant: "realtime-dev"} + } + end - assert_receive ^message + test "pubsub_broadcast string payload", %{node: node} do + message = %Broadcast{topic: @topic, event: "an event", payload: "some text payload"} + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) - # Remote node received the broadcast - assert_receive {:relay, ^node, ^message} + assert_receive ^message - assert_receive { - :telemetry, - [:realtime, :tenants, :payload, :size], - %{size: 119}, - %{tenant: "realtime-dev"} - } + # Remote node received the broadcast + assert_receive {:relay, ^node, ^message} + + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 119}, + %{tenant: "realtime-dev"} + } + end end end - describe "pubsub_broadcast_from/5" do - test "pubsub_broadcast_from", %{node: node} do - parent = self() + for pubsub_adapter <- [:gen_rpc, :pg2] do + describe "pubsub_broadcast_from/5 #{pubsub_adapter}" do + @describetag pubsub_adapter: pubsub_adapter + + test "pubsub_broadcast_from", %{node: node} do + parent = self() - spawn_link(fn -> - Endpoint.subscribe(@topic) - send(parent, :ready) + spawn_link(fn -> + Endpoint.subscribe(@topic) + send(parent, :ready) - receive do - msg -> send(parent, {:other_process, msg}) - end - end) + receive do + msg -> send(parent, {:other_process, msg}) + end + end) - assert_receive :ready + assert_receive :ready - message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} + message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} - TenantBroadcaster.pubsub_broadcast_from("realtime-dev", self(), @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast_from("realtime-dev", self(), @topic, message, Phoenix.PubSub) - assert_receive {:other_process, ^message} + assert_receive {:other_process, ^message} - # Remote node received the 
broadcast - assert_receive {:relay, ^node, ^message} + # Remote node received the broadcast + assert_receive {:relay, ^node, ^message} - assert_receive { - :telemetry, - [:realtime, :tenants, :payload, :size], - %{size: 114}, - %{tenant: "realtime-dev"} - } + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 114}, + %{tenant: "realtime-dev"} + } - # This process does not receive the message - refute_receive _any + # This process does not receive the message + refute_receive _any + end end end From e4ee7c83d619383ddc6291183cf609355afbe3b9 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Thu, 25 Sep 2025 12:38:26 +1200 Subject: [PATCH 19/34] fix: specify that only private channels are allowed when replaying (#1543) messages --- lib/realtime_web/channels/realtime_channel.ex | 5 ++++- mix.exs | 2 +- test/realtime_web/channels/realtime_channel_test.exs | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/realtime_web/channels/realtime_channel.ex b/lib/realtime_web/channels/realtime_channel.ex index 1d58d9da7..63be07f03 100644 --- a/lib/realtime_web/channels/realtime_channel.ex +++ b/lib/realtime_web/channels/realtime_channel.ex @@ -213,6 +213,9 @@ defmodule RealtimeWeb.RealtimeChannel do {:error, :invalid_replay_params} -> log_error(socket, "UnableToReplayMessages", "Replay params are not valid") + {:error, :invalid_replay_channel} -> + log_error(socket, "UnableToReplayMessages", "Replay is not allowed for public channels") + {:error, error} -> log_error(socket, "UnknownErrorOnChannel", error) {:error, %{reason: "Unknown Error on Channel"}} @@ -790,7 +793,7 @@ defmodule RealtimeWeb.RealtimeChannel do end defp maybe_replay_messages(%{"broadcast" => %{"replay" => _}}, _sub_topic, _db_conn, false = _private?) do - {:error, :invalid_replay_params} + {:error, :invalid_replay_channel} end defp maybe_replay_messages(%{"broadcast" => %{"replay" => replay_params}}, sub_topic, db_conn, true = _private?) 
diff --git a/mix.exs b/mix.exs index 9c66b3dde..139e862fc 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.3", + version: "2.51.4", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime_web/channels/realtime_channel_test.exs b/test/realtime_web/channels/realtime_channel_test.exs index 0a0d8aca9..ae6c1734a 100644 --- a/test/realtime_web/channels/realtime_channel_test.exs +++ b/test/realtime_web/channels/realtime_channel_test.exs @@ -153,7 +153,7 @@ defmodule RealtimeWeb.RealtimeChannelTest do assert { :error, - %{reason: "UnableToReplayMessages: Replay params are not valid"} + %{reason: "UnableToReplayMessages: Replay is not allowed for public channels"} } = subscribe_and_join(socket, "realtime:test", %{"config" => config}) refute_receive _any From d4565dfc53996cd94f5a11ba514f0b32808ce759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Fri, 26 Sep 2025 11:46:48 +0100 Subject: [PATCH 20/34] fix: rate limit connect module (#1541) On bad connection, we rate limit the Connect module so we prevent abuses and too much logging of errors --- README.md | 1 + lib/realtime/tenants.ex | 26 ++++++++++ lib/realtime/tenants/connect.ex | 32 +++++++------ lib/realtime_web/channels/realtime_channel.ex | 4 ++ mix.exs | 2 +- test/realtime/tenants/connect_test.exs | 47 +++++++++++++++++++ .../controllers/broadcast_controller_test.exs | 18 +++++-- 7 files changed, 111 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 4e13e44df..7dd223bf3 100644 --- a/README.md +++ b/README.md @@ -243,6 +243,7 @@ This is the list of operational codes that can help you understand your deployme | ChannelRateLimitReached | The number of channels you can create has reached its limit | | ConnectionRateLimitReached | The number of connected clients as reached its limit | | ClientJoinRateLimitReached | The rate of 
joins per second from your clients has reached the channel limits | +| DatabaseConnectionRateLimitReached | The rate of attempts to connect to tenants database has reached the limit | | MessagePerSecondRateLimitReached | The rate of messages per second from your clients has reached the channel limits | | RealtimeDisabledForTenant | Realtime has been disabled for the tenant | | UnableToConnectToTenantDatabase | Realtime was not able to connect to the tenant's database | diff --git a/lib/realtime/tenants.ex b/lib/realtime/tenants.ex index 63965abea..db2a02cc4 100644 --- a/lib/realtime/tenants.ex +++ b/lib/realtime/tenants.ex @@ -328,6 +328,32 @@ defmodule Realtime.Tenants do %RateCounter.Args{id: {:channel, :authorization_errors, external_id}, opts: opts} end + @connect_per_second_default 10 + @doc "RateCounter arguments for counting connect per second." + @spec connect_per_second_rate(Tenant.t() | String.t()) :: RateCounter.Args.t() + def connect_per_second_rate(%Tenant{external_id: external_id}) do + connect_per_second_rate(external_id) + end + + def connect_per_second_rate(tenant_id) do + opts = [ + max_bucket_len: 10, + limit: [ + value: @connect_per_second_default, + measurement: :sum, + log_fn: fn -> + Logger.critical( + "DatabaseConnectionRateLimitReached: Too many connection attempts against the tenant database", + external_id: tenant_id, + project: tenant_id + ) + end + ] + ] + + %RateCounter.Args{id: {:database, :connect, tenant_id}, opts: opts} + end + defp pool_size(%{extensions: [%{settings: settings} | _]}) do Database.pool_size_by_application_name("realtime_connect", settings) end diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex index 3d8f39833..0ee43f161 100644 --- a/lib/realtime/tenants/connect.ex +++ b/lib/realtime/tenants/connect.ex @@ -11,8 +11,9 @@ defmodule Realtime.Tenants.Connect do use Realtime.Logs - alias Realtime.Tenants.Rebalancer alias Realtime.Api.Tenant + alias Realtime.GenCounter + alias 
Realtime.RateCounter alias Realtime.Rpc alias Realtime.Tenants alias Realtime.Tenants.Connect.CheckConnection @@ -20,6 +21,7 @@ defmodule Realtime.Tenants.Connect do alias Realtime.Tenants.Connect.Piper alias Realtime.Tenants.Connect.RegisterProcess alias Realtime.Tenants.Migrations + alias Realtime.Tenants.Rebalancer alias Realtime.Tenants.ReplicationConnection alias Realtime.UsersCounter @@ -39,11 +41,8 @@ defmodule Realtime.Tenants.Connect do @doc "Check if Connect has finished setting up connections" def ready?(tenant_id) do case whereis(tenant_id) do - pid when is_pid(pid) -> - GenServer.call(pid, :ready?) - - _ -> - false + pid when is_pid(pid) -> GenServer.call(pid, :ready?) + _ -> false end end @@ -55,24 +54,29 @@ defmodule Realtime.Tenants.Connect do | {:error, :tenant_database_unavailable} | {:error, :initializing} | {:error, :tenant_database_connection_initializing} - | {:error, :tenant_db_too_many_connections} + | {:error, :connect_rate_limit_reached} | {:error, :rpc_error, term()} def lookup_or_start_connection(tenant_id, opts \\ []) when is_binary(tenant_id) do - case get_status(tenant_id) do - {:ok, conn} -> - {:ok, conn} + rate_args = Tenants.connect_per_second_rate(tenant_id) + RateCounter.new(rate_args) - {:error, :tenant_database_unavailable} -> - {:error, :tenant_database_unavailable} + with {:ok, %{limit: %{triggered: false}}} <- RateCounter.get(rate_args), + {:ok, conn} <- get_status(tenant_id) do + {:ok, conn} + else + {:ok, %{limit: %{triggered: true}}} -> + {:error, :connect_rate_limit_reached} {:error, :tenant_database_connection_initializing} -> + GenCounter.add(rate_args.id) call_external_node(tenant_id, opts) {:error, :initializing} -> {:error, :tenant_database_unavailable} - {:error, :tenant_db_too_many_connections} -> - {:error, :tenant_db_too_many_connections} + {:error, reason} -> + GenCounter.add(rate_args.id) + {:error, reason} end end diff --git a/lib/realtime_web/channels/realtime_channel.ex 
b/lib/realtime_web/channels/realtime_channel.ex index 63be07f03..91a417c21 100644 --- a/lib/realtime_web/channels/realtime_channel.ex +++ b/lib/realtime_web/channels/realtime_channel.ex @@ -167,6 +167,10 @@ defmodule RealtimeWeb.RealtimeChannel do msg = "Database can't accept more connections, Realtime won't connect" log_error(socket, "DatabaseLackOfConnections", msg) + {:error, :connect_rate_limit_reached} -> + msg = "Too many database connections attempts per second" + log_error(socket, "DatabaseConnectionRateLimitReached", msg) + {:error, :unable_to_set_policies, error} -> log_error(socket, "UnableToSetPolicies", error) {:error, %{reason: "Realtime was unable to connect to the project database"}} diff --git a/mix.exs b/mix.exs index 139e862fc..4b0b1f40c 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.4", + version: "2.51.5", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index 8ba462b27..a52973d53 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -515,6 +515,53 @@ defmodule Realtime.Tenants.ConnectTest do assert capture_log(fn -> assert {:error, :rpc_error, _} = Connect.lookup_or_start_connection("tenant") end) =~ "project=tenant external_id=tenant [error] ErrorOnRpcCall" end + + test "rate limit connect when too many connections against bad database", %{tenant: tenant} do + extension = %{ + "type" => "postgres_cdc_rls", + "settings" => %{ + "db_host" => "127.0.0.1", + "db_name" => "postgres", + "db_user" => "supabase_admin", + "db_password" => "postgres", + "poll_interval" => 100, + "poll_max_changes" => 100, + "poll_max_record_bytes" => 1_048_576, + "region" => "us-east-1", + "ssl_enforced" => true + } + } + + {:ok, tenant} = update_extension(tenant, extension) + + log = + 
capture_log(fn -> + res = + for _ <- 1..50 do + Process.sleep(200) + Connect.lookup_or_start_connection(tenant.external_id) + end + + assert Enum.any?(res, fn {_, res} -> res == :connect_rate_limit_reached end) + end) + + assert log =~ "DatabaseConnectionRateLimitReached: Too many connection attempts against the tenant database" + end + + test "rate limit connect will not trigger if connection is successful", %{tenant: tenant} do + log = + capture_log(fn -> + res = + for _ <- 1..20 do + Process.sleep(500) + Connect.lookup_or_start_connection(tenant.external_id) + end + + refute Enum.any?(res, fn {_, res} -> res == :tenant_db_too_many_connections end) + end) + + refute log =~ "DatabaseConnectionRateLimitReached: Too many connection attempts against the tenant database" + end end describe "shutdown/1" do diff --git a/test/realtime_web/controllers/broadcast_controller_test.exs b/test/realtime_web/controllers/broadcast_controller_test.exs index 9c38d58bd..7bd426353 100644 --- a/test/realtime_web/controllers/broadcast_controller_test.exs +++ b/test/realtime_web/controllers/broadcast_controller_test.exs @@ -272,6 +272,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) + connect_events_key = Tenants.connect_per_second_rate(tenant).id expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) messages_to_send = @@ -290,7 +291,10 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) - |> expect(:add, length(messages), fn ^broadcast_events_key -> :ok end) + |> expect(:add, length(messages), fn + ^broadcast_events_key -> :ok + ^connect_events_key -> :ok + end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) @@ -326,6 +330,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = 
Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) + connect_events_key = Tenants.connect_per_second_rate(tenant).id expect(TenantBroadcaster, :pubsub_broadcast, 6, fn _, _, _, _ -> :ok end) channels = @@ -354,7 +359,10 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) - |> expect(:add, length(messages), fn ^broadcast_events_key -> :ok end) + |> expect(:add, length(messages), fn + ^broadcast_events_key -> :ok + ^connect_events_key -> :ok + end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) @@ -408,6 +416,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) + connect_events_key = Tenants.connect_per_second_rate(tenant).id expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) messages_to_send = @@ -428,7 +437,9 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) - |> expect(:add, length(messages_to_send), fn ^broadcast_events_key -> :ok end) + # remove the one message that won't be broadcasted for this user + |> expect(:add, 1, fn ^connect_events_key -> :ok end) + |> expect(:add, length(messages) - 1, fn ^broadcast_events_key -> :ok end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) @@ -482,7 +493,6 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> 1 end) - |> reject(:add, 1) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) From d309c55bfb60c8377eb7cb4b240f2ecf7b2d6962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Gr=C3=BCneberg?= Date: Sat, 27 Sep 2025 16:43:37 +0800 Subject: [PATCH 21/34] build: automatically cancel old tests/build on new push (#1545) Currently, whenever you 
push any commit to your branch, the old builds are still running and a new build is started. Once a new commit is added, the old test results no longer matter and it's just a waste of CI resources. Also reduces confusion with multiple builds running in parallel for the same branch/possibly blocking any merges. With this little change, we ensure that whenever a new commit is added, the previous build is immediately canceled/stopped and only the build (latest commit) runs. --- .github/workflows/tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5d3818814..c9c2a73fa 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,6 +16,10 @@ on: branches: - main +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: tests: name: Tests From a72a8353cf6eb9bd7e3549422c5b5c3e70bcef3d Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Wed, 1 Oct 2025 11:53:59 +1300 Subject: [PATCH 22/34] fix: move message queue data to off-heap for gen_rpc pub sub workers (#1548) --- config/runtime.exs | 2 +- lib/realtime/gen_rpc/pub_sub.ex | 5 ++++- mix.exs | 2 +- test/realtime/gen_rpc_pub_sub_test.exs | 10 ++++++++++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/config/runtime.exs b/config/runtime.exs index 47961f98a..447934b65 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -68,7 +68,7 @@ janitor_children_timeout = Env.get_integer("JANITOR_CHILDREN_TIMEOUT", :timer.se janitor_schedule_timer = Env.get_integer("JANITOR_SCHEDULE_TIMER_IN_MS", :timer.hours(4)) platform = if System.get_env("AWS_EXECUTION_ENV") == "AWS_ECS_FARGATE", do: :aws, else: :fly broadcast_pool_size = Env.get_integer("BROADCAST_POOL_SIZE", 10) -pubsub_adapter = System.get_env("PUBSUB_ADAPTER", "pg2") |> String.to_atom() +pubsub_adapter = System.get_env("PUBSUB_ADAPTER", "gen_rpc") |> String.to_atom() 
websocket_max_heap_size = div(Env.get_integer("WEBSOCKET_MAX_HEAP_SIZE", 50_000_000), :erlang.system_info(:wordsize)) no_channel_timeout_in_ms = diff --git a/lib/realtime/gen_rpc/pub_sub.ex b/lib/realtime/gen_rpc/pub_sub.ex index b2a90b165..c8ddf5568 100644 --- a/lib/realtime/gen_rpc/pub_sub.ex +++ b/lib/realtime/gen_rpc/pub_sub.ex @@ -65,7 +65,10 @@ defmodule Realtime.GenRpcPubSub.Worker do def start_link({pubsub, worker}), do: GenServer.start_link(__MODULE__, pubsub, name: worker) @impl true - def init(pubsub), do: {:ok, pubsub} + def init(pubsub) do + Process.flag(:message_queue_data, :off_heap) + {:ok, pubsub} + end @impl true def handle_info({:ftl, topic, message, dispatcher}, pubsub) do diff --git a/mix.exs b/mix.exs index 4b0b1f40c..e093db4bf 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.5", + version: "2.51.6", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/gen_rpc_pub_sub_test.exs b/test/realtime/gen_rpc_pub_sub_test.exs index 0013c2e7b..5e7a1f14b 100644 --- a/test/realtime/gen_rpc_pub_sub_test.exs +++ b/test/realtime/gen_rpc_pub_sub_test.exs @@ -1,2 +1,12 @@ Application.put_env(:phoenix_pubsub, :test_adapter, {Realtime.GenRpcPubSub, []}) Code.require_file("../../deps/phoenix_pubsub/test/shared/pubsub_test.exs", __DIR__) + +defmodule Realtime.GenRpcPubSubTest do + use ExUnit.Case, async: true + + test "it sets off_heap message_queue_data flag on the workers" do + assert Realtime.PubSubElixir.Realtime.PubSub.Adapter_1 + |> Process.whereis() + |> Process.info(:message_queue_data) == {:message_queue_data, :off_heap} + end +end From 353c14230ccc45278154e488ac990c93f2cdf33b Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Wed, 1 Oct 2025 13:47:00 +1300 Subject: [PATCH 23/34] fix: rate limit Connect.lookup_or_start_connection on error only (#1549) --- lib/realtime/tenants.ex | 12 ++++---- 
lib/realtime/tenants/connect.ex | 12 ++++++-- mix.exs | 2 +- test/realtime/tenants/connect_test.exs | 30 +++++++++++++++++++ .../controllers/broadcast_controller_test.exs | 14 ++------- 5 files changed, 48 insertions(+), 22 deletions(-) diff --git a/lib/realtime/tenants.ex b/lib/realtime/tenants.ex index db2a02cc4..019a87e99 100644 --- a/lib/realtime/tenants.ex +++ b/lib/realtime/tenants.ex @@ -328,18 +328,18 @@ defmodule Realtime.Tenants do %RateCounter.Args{id: {:channel, :authorization_errors, external_id}, opts: opts} end - @connect_per_second_default 10 + @connect_errors_per_second_default 10 @doc "RateCounter arguments for counting connect per second." - @spec connect_per_second_rate(Tenant.t() | String.t()) :: RateCounter.Args.t() - def connect_per_second_rate(%Tenant{external_id: external_id}) do - connect_per_second_rate(external_id) + @spec connect_errors_per_second_rate(Tenant.t() | String.t()) :: RateCounter.Args.t() + def connect_errors_per_second_rate(%Tenant{external_id: external_id}) do + connect_errors_per_second_rate(external_id) end - def connect_per_second_rate(tenant_id) do + def connect_errors_per_second_rate(tenant_id) do opts = [ max_bucket_len: 10, limit: [ - value: @connect_per_second_default, + value: @connect_errors_per_second_default, measurement: :sum, log_fn: fn -> Logger.critical( diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex index 0ee43f161..caf49cc57 100644 --- a/lib/realtime/tenants/connect.ex +++ b/lib/realtime/tenants/connect.ex @@ -57,7 +57,7 @@ defmodule Realtime.Tenants.Connect do | {:error, :connect_rate_limit_reached} | {:error, :rpc_error, term()} def lookup_or_start_connection(tenant_id, opts \\ []) when is_binary(tenant_id) do - rate_args = Tenants.connect_per_second_rate(tenant_id) + rate_args = Tenants.connect_errors_per_second_rate(tenant_id) RateCounter.new(rate_args) with {:ok, %{limit: %{triggered: false}}} <- RateCounter.get(rate_args), @@ -68,8 +68,14 @@ defmodule 
Realtime.Tenants.Connect do {:error, :connect_rate_limit_reached} {:error, :tenant_database_connection_initializing} -> - GenCounter.add(rate_args.id) - call_external_node(tenant_id, opts) + case call_external_node(tenant_id, opts) do + {:ok, pid} -> + {:ok, pid} + + error -> + GenCounter.add(rate_args.id) + error + end {:error, :initializing} -> {:error, :tenant_database_unavailable} diff --git a/mix.exs b/mix.exs index e093db4bf..4e5bf5852 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.6", + version: "2.51.7", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index a52973d53..741f6ecf7 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -51,6 +51,36 @@ defmodule Realtime.Tenants.ConnectTest do end describe "handle cold start" do + test "multiple processes connecting calling Connect.connect", %{tenant: tenant} do + parent = self() + + # Let's slow down Connect.connect so that multiple RPC calls are executed + stub(Connect, :connect, fn x, y, z -> + :timer.sleep(1000) + call_original(Connect, :connect, [x, y, z]) + end) + + connect = fn -> send(parent, Connect.lookup_or_start_connection(tenant.external_id)) end + # Let's call enough times to potentially trigger the Connect RateCounter + + for _ <- 1..50, do: spawn(connect) + + assert_receive({:ok, pid}, 1100) + + for _ <- 1..49, do: assert_receive({:ok, ^pid}) + + # Does not trigger rate limit as connections eventually succeeded + + {:ok, rate_counter} = + tenant.external_id + |> Tenants.connect_errors_per_second_rate() + |> Realtime.RateCounter.get() + + assert rate_counter.sum == 0 + assert rate_counter.avg == 0.0 + assert rate_counter.limit.triggered == false + end + test "multiple proccesses succeed together", %{tenant: tenant} do parent = 
self() diff --git a/test/realtime_web/controllers/broadcast_controller_test.exs b/test/realtime_web/controllers/broadcast_controller_test.exs index 7bd426353..d42466722 100644 --- a/test/realtime_web/controllers/broadcast_controller_test.exs +++ b/test/realtime_web/controllers/broadcast_controller_test.exs @@ -272,7 +272,6 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) - connect_events_key = Tenants.connect_per_second_rate(tenant).id expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) messages_to_send = @@ -291,10 +290,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) - |> expect(:add, length(messages), fn - ^broadcast_events_key -> :ok - ^connect_events_key -> :ok - end) + |> expect(:add, length(messages), fn ^broadcast_events_key -> :ok end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) @@ -330,7 +326,6 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) - connect_events_key = Tenants.connect_per_second_rate(tenant).id expect(TenantBroadcaster, :pubsub_broadcast, 6, fn _, _, _, _ -> :ok end) channels = @@ -359,10 +354,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) - |> expect(:add, length(messages), fn - ^broadcast_events_key -> :ok - ^connect_events_key -> :ok - end) + |> expect(:add, length(messages), fn ^broadcast_events_key -> :ok end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) @@ -416,7 +408,6 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) 
- connect_events_key = Tenants.connect_per_second_rate(tenant).id expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) messages_to_send = @@ -438,7 +429,6 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) # remove the one message that won't be broadcasted for this user - |> expect(:add, 1, fn ^connect_events_key -> :ok end) |> expect(:add, length(messages) - 1, fn ^broadcast_events_key -> :ok end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) From 748398ccf0e81b433afa61eb0200507251349aa4 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Wed, 1 Oct 2025 20:32:49 +1300 Subject: [PATCH 24/34] fix: increase connect error rate window to 30 seconds (#1550) --- lib/realtime/tenants.ex | 2 +- mix.exs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/realtime/tenants.ex b/lib/realtime/tenants.ex index 019a87e99..efd2397ac 100644 --- a/lib/realtime/tenants.ex +++ b/lib/realtime/tenants.ex @@ -337,7 +337,7 @@ defmodule Realtime.Tenants do def connect_errors_per_second_rate(tenant_id) do opts = [ - max_bucket_len: 10, + max_bucket_len: 30, limit: [ value: @connect_errors_per_second_default, measurement: :sum, diff --git a/mix.exs b/mix.exs index 4e5bf5852..8618d067f 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.7", + version: "2.51.8", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, From 92e7b5999bc3a9617fc5cf891b326b92790b2ec0 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Thu, 2 Oct 2025 09:29:07 +1300 Subject: [PATCH 25/34] fix: set a lower fullsweep_after flag for GenRpcPubSub workers (#1551) --- lib/realtime/gen_rpc/pub_sub.ex | 1 + mix.exs | 2 +- test/realtime/gen_rpc_pub_sub_test.exs | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git 
a/lib/realtime/gen_rpc/pub_sub.ex b/lib/realtime/gen_rpc/pub_sub.ex index c8ddf5568..63fff145c 100644 --- a/lib/realtime/gen_rpc/pub_sub.ex +++ b/lib/realtime/gen_rpc/pub_sub.ex @@ -67,6 +67,7 @@ defmodule Realtime.GenRpcPubSub.Worker do @impl true def init(pubsub) do Process.flag(:message_queue_data, :off_heap) + Process.flag(:fullsweep_after, 1000) {:ok, pubsub} end diff --git a/mix.exs b/mix.exs index 8618d067f..e3123d41c 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.8", + version: "2.51.9", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/gen_rpc_pub_sub_test.exs b/test/realtime/gen_rpc_pub_sub_test.exs index 5e7a1f14b..f94bc5f89 100644 --- a/test/realtime/gen_rpc_pub_sub_test.exs +++ b/test/realtime/gen_rpc_pub_sub_test.exs @@ -9,4 +9,10 @@ defmodule Realtime.GenRpcPubSubTest do |> Process.whereis() |> Process.info(:message_queue_data) == {:message_queue_data, :off_heap} end + + test "it sets fullsweep_after flag on the workers" do + assert Realtime.PubSubElixir.Realtime.PubSub.Adapter_1 + |> Process.whereis() + |> Process.info(:fullsweep_after) == {:fullsweep_after, 1000} + end end From 6248e2b19a1eb5116c308757da5c1ea33ab2b2e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Thu, 2 Oct 2025 04:54:05 +0100 Subject: [PATCH 26/34] fix: hardcode presence limit (#1552) --- .../channels/realtime_channel/presence_handler.ex | 5 +++-- mix.exs | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/realtime_web/channels/realtime_channel/presence_handler.ex b/lib/realtime_web/channels/realtime_channel/presence_handler.ex index 9dc23d219..be3d8593d 100644 --- a/lib/realtime_web/channels/realtime_channel/presence_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/presence_handler.ex @@ -138,13 +138,14 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do |> 
Phoenix.Presence.group() end + @presence_limit 1000 defp limit_presence_event(socket) do %{assigns: %{presence_rate_counter: presence_counter, tenant: tenant_id}} = socket {:ok, rate_counter} = RateCounter.get(presence_counter) - tenant = Tenants.Cache.get_tenant_by_external_id(tenant_id) + # tenant = Tenants.Cache.get_tenant_by_external_id(tenant_id) - if rate_counter.avg > tenant.max_presence_events_per_second do + if rate_counter.avg > @presence_limit do {:error, :rate_limit_exceeded} else GenCounter.add(presence_counter.id) diff --git a/mix.exs b/mix.exs index e3123d41c..55bf9022d 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.9", + version: "2.51.10", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, From e84ac08ca378be05ad9ef366b1903d5d3f8195ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Thu, 2 Oct 2025 05:33:08 +0100 Subject: [PATCH 27/34] fix: further decrease limit on presence events (#1553) --- .../channels/realtime_channel/presence_handler.ex | 4 ++-- mix.exs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/realtime_web/channels/realtime_channel/presence_handler.ex b/lib/realtime_web/channels/realtime_channel/presence_handler.ex index be3d8593d..e081fdffb 100644 --- a/lib/realtime_web/channels/realtime_channel/presence_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/presence_handler.ex @@ -138,9 +138,9 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do |> Phoenix.Presence.group() end - @presence_limit 1000 + @presence_limit 500 defp limit_presence_event(socket) do - %{assigns: %{presence_rate_counter: presence_counter, tenant: tenant_id}} = socket + %{assigns: %{presence_rate_counter: presence_counter, tenant: _tenant_id}} = socket {:ok, rate_counter} = RateCounter.get(presence_counter) # tenant = Tenants.Cache.get_tenant_by_external_id(tenant_id) 
diff --git a/mix.exs b/mix.exs index 55bf9022d..958bf7ed2 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.10", + version: "2.51.11", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, From 13052aa822139d764b3a3f9aa2063077fd325ff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Thu, 2 Oct 2025 06:45:58 +0100 Subject: [PATCH 28/34] fix: bump up realtime (#1554) --- mix.exs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mix.exs b/mix.exs index 958bf7ed2..fe2f8978c 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.11", + version: "2.51.12", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, From 6e650f0a7306fd7022c698c5a474c65e5c2b8331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Thu, 2 Oct 2025 16:33:58 +0100 Subject: [PATCH 29/34] fix: lower rate limit to 100 events per second (#1556) --- lib/realtime_web/channels/realtime_channel/presence_handler.ex | 2 +- mix.exs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/realtime_web/channels/realtime_channel/presence_handler.ex b/lib/realtime_web/channels/realtime_channel/presence_handler.ex index e081fdffb..29ae9294e 100644 --- a/lib/realtime_web/channels/realtime_channel/presence_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/presence_handler.ex @@ -138,7 +138,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do |> Phoenix.Presence.group() end - @presence_limit 500 + @presence_limit 100 defp limit_presence_event(socket) do %{assigns: %{presence_rate_counter: presence_counter, tenant: _tenant_id}} = socket {:ok, rate_counter} = RateCounter.get(presence_counter) diff --git a/mix.exs b/mix.exs index fe2f8978c..2f31c5ada 100644 --- a/mix.exs +++ b/mix.exs 
@@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.12", + version: "2.51.13", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, From 05ac93eabdd764153995f4f999dcef2da9c686fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Thu, 2 Oct 2025 22:00:42 +0100 Subject: [PATCH 30/34] fix: move connect rate limit to socket (#1555) * fix: reduce max_frame_size to 5MB * fix: fullsweep_after=100 on gen rpc pub sub workers --------- Co-authored-by: Eduardo Gurgel Pinho --- lib/realtime/gen_rpc/pub_sub.ex | 2 +- .../realtime_channel/presence_handler.ex | 2 +- .../channels/tenant_rate_limiters.ex | 43 +++++++++++++++++++ lib/realtime_web/channels/user_socket.ex | 12 ++++++ lib/realtime_web/endpoint.ex | 2 +- mix.exs | 2 +- test/realtime/gen_rpc_pub_sub_test.exs | 2 +- .../presence_handler_test.exs | 1 + .../channels/tenant_rate_limiters_test.exs | 31 +++++++++++++ 9 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 lib/realtime_web/channels/tenant_rate_limiters.ex create mode 100644 test/realtime_web/channels/tenant_rate_limiters_test.exs diff --git a/lib/realtime/gen_rpc/pub_sub.ex b/lib/realtime/gen_rpc/pub_sub.ex index 63fff145c..3ba9e053a 100644 --- a/lib/realtime/gen_rpc/pub_sub.ex +++ b/lib/realtime/gen_rpc/pub_sub.ex @@ -67,7 +67,7 @@ defmodule Realtime.GenRpcPubSub.Worker do @impl true def init(pubsub) do Process.flag(:message_queue_data, :off_heap) - Process.flag(:fullsweep_after, 1000) + Process.flag(:fullsweep_after, 100) {:ok, pubsub} end diff --git a/lib/realtime_web/channels/realtime_channel/presence_handler.ex b/lib/realtime_web/channels/realtime_channel/presence_handler.ex index 29ae9294e..1af26c528 100644 --- a/lib/realtime_web/channels/realtime_channel/presence_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/presence_handler.ex @@ -11,7 +11,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do alias 
Phoenix.Tracker.Shard alias Realtime.GenCounter alias Realtime.RateCounter - alias Realtime.Tenants + # alias Realtime.Tenants alias Realtime.Tenants.Authorization alias RealtimeWeb.Presence alias RealtimeWeb.RealtimeChannel.Logging diff --git a/lib/realtime_web/channels/tenant_rate_limiters.ex b/lib/realtime_web/channels/tenant_rate_limiters.ex new file mode 100644 index 000000000..2101ac945 --- /dev/null +++ b/lib/realtime_web/channels/tenant_rate_limiters.ex @@ -0,0 +1,43 @@ +defmodule RealtimeWeb.TenantRateLimiters do + @moduledoc """ + Rate limiters for tenants. + """ + require Logger + alias Realtime.UsersCounter + alias Realtime.Tenants + alias Realtime.RateCounter + alias Realtime.Api.Tenant + + @spec check_tenant(Realtime.Api.Tenant.t()) :: :ok | {:error, :too_many_connections | :too_many_joins} + def check_tenant(tenant) do + with :ok <- max_concurrent_users_check(tenant) do + max_joins_per_second_check(tenant) + end + end + + defp max_concurrent_users_check(%Tenant{max_concurrent_users: max_conn_users, external_id: external_id}) do + total_conn_users = UsersCounter.tenant_users(external_id) + + if total_conn_users < max_conn_users, + do: :ok, + else: {:error, :too_many_connections} + end + + defp max_joins_per_second_check(%Tenant{max_joins_per_second: max_joins_per_second} = tenant) do + rate_args = Tenants.joins_per_second_rate(tenant.external_id, max_joins_per_second) + + RateCounter.new(rate_args) + + case RateCounter.get(rate_args) do + {:ok, %{limit: %{triggered: false}}} -> + :ok + + {:ok, %{limit: %{triggered: true}}} -> + {:error, :too_many_joins} + + error -> + Logger.error("UnknownErrorOnCounter: #{inspect(error)}") + {:error, error} + end + end +end diff --git a/lib/realtime_web/channels/user_socket.ex b/lib/realtime_web/channels/user_socket.ex index 849aa052d..6d4bf9017 100644 --- a/lib/realtime_web/channels/user_socket.ex +++ b/lib/realtime_web/channels/user_socket.ex @@ -16,6 +16,7 @@ defmodule RealtimeWeb.UserSocket do alias 
Realtime.PostgresCdc alias Realtime.Tenants + alias RealtimeWeb.TenantRateLimiters alias RealtimeWeb.ChannelsAuthorization alias RealtimeWeb.RealtimeChannel alias RealtimeWeb.RealtimeChannel.Logging @@ -56,6 +57,7 @@ defmodule RealtimeWeb.UserSocket do token when is_binary(token) <- token, jwt_secret_dec <- Crypto.decrypt!(jwt_secret), {:ok, claims} <- ChannelsAuthorization.authorize_conn(token, jwt_secret_dec, jwt_jwks), + :ok <- TenantRateLimiters.check_tenant(tenant), {:ok, postgres_cdc_module} <- PostgresCdc.driver(postgres_cdc_default) do %Tenant{ extensions: extensions, @@ -111,6 +113,16 @@ defmodule RealtimeWeb.UserSocket do log_error("MalformedJWT", "The token provided is not a valid JWT") {:error, :token_malformed} + {:error, :too_many_connections} -> + msg = "Too many connected users" + Logging.log_error(socket, "ConnectionRateLimitReached", msg) + {:error, :too_many_connections} + + {:error, :too_many_joins} -> + msg = "Too many joins per second" + Logging.log_error(socket, "JoinsRateLimitReached", msg) + {:error, :too_many_joins} + error -> log_error("ErrorConnectingToWebsocket", error) error diff --git a/lib/realtime_web/endpoint.ex b/lib/realtime_web/endpoint.ex index 190e1a917..894911803 100644 --- a/lib/realtime_web/endpoint.ex +++ b/lib/realtime_web/endpoint.ex @@ -15,7 +15,7 @@ defmodule RealtimeWeb.Endpoint do websocket: [ connect_info: [:peer_data, :uri, :x_headers], fullsweep_after: 20, - max_frame_size: 8_000_000, + max_frame_size: 5_000_000, # https://github.com/ninenines/cowboy/blob/24d32de931a0c985ff7939077463fc8be939f0e9/doc/src/manual/cowboy_websocket.asciidoc#L228 # active_n: The number of packets Cowboy will request from the socket at once. # This can be used to tweak the performance of the server. 
Higher values reduce diff --git a/mix.exs b/mix.exs index 2f31c5ada..4cf76563f 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.13", + version: "2.51.14", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/gen_rpc_pub_sub_test.exs b/test/realtime/gen_rpc_pub_sub_test.exs index f94bc5f89..517c6c369 100644 --- a/test/realtime/gen_rpc_pub_sub_test.exs +++ b/test/realtime/gen_rpc_pub_sub_test.exs @@ -13,6 +13,6 @@ defmodule Realtime.GenRpcPubSubTest do test "it sets fullsweep_after flag on the workers" do assert Realtime.PubSubElixir.Realtime.PubSub.Adapter_1 |> Process.whereis() - |> Process.info(:fullsweep_after) == {:fullsweep_after, 1000} + |> Process.info(:fullsweep_after) == {:fullsweep_after, 100} end end diff --git a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs index 0cdf422e2..4891e4187 100644 --- a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs +++ b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs @@ -434,6 +434,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do assert log =~ "PresenceRateLimitReached" end + @tag :skip @tag policies: [:authenticated_read_broadcast_and_presence, :authenticated_write_broadcast_and_presence] test "respects rate limits on private channels", %{tenant: tenant, topic: topic, db_conn: db_conn} do key = random_string() diff --git a/test/realtime_web/channels/tenant_rate_limiters_test.exs b/test/realtime_web/channels/tenant_rate_limiters_test.exs new file mode 100644 index 000000000..05d56ec82 --- /dev/null +++ b/test/realtime_web/channels/tenant_rate_limiters_test.exs @@ -0,0 +1,31 @@ +defmodule RealtimeWeb.TenantRateLimitersTest do + use Realtime.DataCase, async: true + + use Mimic + alias RealtimeWeb.TenantRateLimiters + 
alias Realtime.Api.Tenant + + setup do + tenant = %Tenant{external_id: random_string(), max_concurrent_users: 1, max_joins_per_second: 1} + + %{tenant: tenant} + end + + describe "check_tenant/1" do + test "rate is not exceeded", %{tenant: tenant} do + assert TenantRateLimiters.check_tenant(tenant) == :ok + end + + test "max concurrent users is exceeded", %{tenant: tenant} do + Realtime.UsersCounter.add(self(), tenant.external_id) + + assert TenantRateLimiters.check_tenant(tenant) == {:error, :too_many_connections} + end + + test "max joins is exceeded", %{tenant: tenant} do + expect(Realtime.RateCounter, :get, fn _ -> {:ok, %{limit: %{triggered: true}}} end) + + assert TenantRateLimiters.check_tenant(tenant) == {:error, :too_many_joins} + end + end +end From e9eaf9f117dc97a42b66f28aa3afd0005e79a301 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Fri, 3 Oct 2025 11:17:13 +1300 Subject: [PATCH 31/34] fix: collect global metrics without tenant tagging (#1557) --- .../monitoring/prom_ex/plugins/tenant.ex | 18 ++++++++++ .../monitoring/prom_ex/plugins/tenants.ex | 9 +++++ mix.exs | 2 +- .../prom_ex/plugins/tenant_test.exs | 31 +++++++++++++++++ .../prom_ex/plugins/tenants_test.exs | 33 +++++++++++++++++++ 5 files changed, 92 insertions(+), 1 deletion(-) diff --git a/lib/realtime/monitoring/prom_ex/plugins/tenant.ex b/lib/realtime/monitoring/prom_ex/plugins/tenant.ex index 1bd324624..bf9d850ee 100644 --- a/lib/realtime/monitoring/prom_ex/plugins/tenant.ex +++ b/lib/realtime/monitoring/prom_ex/plugins/tenant.ex @@ -157,6 +157,12 @@ defmodule Realtime.PromEx.Plugins.Tenant do description: "Sum of messages sent on a Realtime Channel.", tags: [:tenant] ), + sum( + [:realtime, :channel, :global, :events], + event_name: [:realtime, :rate_counter, :channel, :events], + measurement: :sum, + description: "Global sum of messages sent on a Realtime Channel." 
+ ), sum( [:realtime, :channel, :presence_events], event_name: [:realtime, :rate_counter, :channel, :presence_events], @@ -164,6 +170,12 @@ defmodule Realtime.PromEx.Plugins.Tenant do description: "Sum of presence messages sent on a Realtime Channel.", tags: [:tenant] ), + sum( + [:realtime, :channel, :global, :presence_events], + event_name: [:realtime, :rate_counter, :channel, :presence_events], + measurement: :sum, + description: "Global sum of presence messages sent on a Realtime Channel." + ), sum( [:realtime, :channel, :db_events], event_name: [:realtime, :rate_counter, :channel, :db_events], @@ -171,6 +183,12 @@ defmodule Realtime.PromEx.Plugins.Tenant do description: "Sum of db messages sent on a Realtime Channel.", tags: [:tenant] ), + sum( + [:realtime, :channel, :global, :db_events], + event_name: [:realtime, :rate_counter, :channel, :db_events], + measurement: :sum, + description: "Global sum of db messages sent on a Realtime Channel." + ), sum( [:realtime, :channel, :joins], event_name: [:realtime, :rate_counter, :channel, :joins], diff --git a/lib/realtime/monitoring/prom_ex/plugins/tenants.ex b/lib/realtime/monitoring/prom_ex/plugins/tenants.ex index 0035e9594..e8106df58 100644 --- a/lib/realtime/monitoring/prom_ex/plugins/tenants.ex +++ b/lib/realtime/monitoring/prom_ex/plugins/tenants.ex @@ -21,6 +21,15 @@ defmodule Realtime.PromEx.Plugins.Tenants do unit: {:microsecond, :millisecond}, tags: [:success, :tenant, :mechanism], reporter_options: [buckets: [10, 250, 5000, 15_000]] + ), + distribution( + [:realtime, :global, :rpc], + event_name: [:realtime, :rpc], + description: "Global Latency of rpc calls", + measurement: :latency, + unit: {:microsecond, :millisecond}, + tags: [:success, :mechanism], + reporter_options: [buckets: [10, 250, 5000, 15_000]] ) ]) end diff --git a/mix.exs b/mix.exs index 4cf76563f..bc5fccc8b 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: 
"2.51.14", + version: "2.51.15", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs b/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs index 164c8d2eb..cfa727cfa 100644 --- a/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs +++ b/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs @@ -129,6 +129,17 @@ defmodule Realtime.PromEx.Plugins.TenantTest do assert metric_value(pattern) == metric_value + 1 end + test "global event exists after counter added", %{tenant: %{external_id: external_id}} do + pattern = + ~r/realtime_channel_global_events\s(?\d+)/ + + metric_value = metric_value(pattern) + FakeUserCounter.fake_event(external_id) + + Process.sleep(200) + assert metric_value(pattern) == metric_value + 1 + end + test "db_event exists after counter added", %{tenant: %{external_id: external_id}} do pattern = ~r/realtime_channel_db_events{tenant="#{external_id}"}\s(?\d+)/ @@ -139,6 +150,16 @@ defmodule Realtime.PromEx.Plugins.TenantTest do assert metric_value(pattern) == metric_value + 1 end + test "global db_event exists after counter added", %{tenant: %{external_id: external_id}} do + pattern = + ~r/realtime_channel_global_db_events\s(?\d+)/ + + metric_value = metric_value(pattern) + FakeUserCounter.fake_db_event(external_id) + Process.sleep(200) + assert metric_value(pattern) == metric_value + 1 + end + test "presence_event exists after counter added", %{tenant: %{external_id: external_id}} do pattern = ~r/realtime_channel_presence_events{tenant="#{external_id}"}\s(?\d+)/ @@ -149,6 +170,16 @@ defmodule Realtime.PromEx.Plugins.TenantTest do assert metric_value(pattern) == metric_value + 1 end + test "global presence_event exists after counter added", %{tenant: %{external_id: external_id}} do + pattern = + ~r/realtime_channel_global_presence_events\s(?\d+)/ + + metric_value = metric_value(pattern) + 
FakeUserCounter.fake_presence_event(external_id) + Process.sleep(200) + assert metric_value(pattern) == metric_value + 1 + end + test "metric read_authorization_check exists after check", context do pattern = ~r/realtime_tenants_read_authorization_check_count{tenant="#{context.tenant.external_id}"}\s(?\d+)/ diff --git a/test/realtime/monitoring/prom_ex/plugins/tenants_test.exs b/test/realtime/monitoring/prom_ex/plugins/tenants_test.exs index 080fd3cfb..ded087c74 100644 --- a/test/realtime/monitoring/prom_ex/plugins/tenants_test.exs +++ b/test/realtime/monitoring/prom_ex/plugins/tenants_test.exs @@ -37,6 +37,16 @@ defmodule Realtime.PromEx.Plugins.TenantsTest do assert metric_value(pattern) == previous_value + 1 end + test "global success" do + pattern = ~r/realtime_global_rpc_count{mechanism=\"erpc\",success="true"}\s(?\d+)/ + # Enough time for the poll rate to be triggered at least once + Process.sleep(200) + previous_value = metric_value(pattern) + assert {:ok, "success"} = Rpc.enhanced_call(node(), Test, :success, [], tenant_id: "123") + Process.sleep(200) + assert metric_value(pattern) == previous_value + 1 + end + test "failure" do pattern = ~r/realtime_rpc_count{mechanism=\"erpc\",success="false",tenant="123"}\s(?\d+)/ # Enough time for the poll rate to be triggered at least once @@ -47,6 +57,16 @@ defmodule Realtime.PromEx.Plugins.TenantsTest do assert metric_value(pattern) == previous_value + 1 end + test "global failure" do + pattern = ~r/realtime_global_rpc_count{mechanism=\"erpc\",success="false"}\s(?\d+)/ + # Enough time for the poll rate to be triggered at least once + Process.sleep(200) + previous_value = metric_value(pattern) + assert {:error, "failure"} = Rpc.enhanced_call(node(), Test, :failure, [], tenant_id: "123") + Process.sleep(200) + assert metric_value(pattern) == previous_value + 1 + end + test "exception" do pattern = ~r/realtime_rpc_count{mechanism=\"erpc\",success="false",tenant="123"}\s(?\d+)/ # Enough time for the poll rate to be 
triggered at least once @@ -59,6 +79,19 @@ defmodule Realtime.PromEx.Plugins.TenantsTest do Process.sleep(200) assert metric_value(pattern) == previous_value + 1 end + + test "global exception" do + pattern = ~r/realtime_global_rpc_count{mechanism=\"erpc\",success="false"}\s(?\d+)/ + # Enough time for the poll rate to be triggered at least once + Process.sleep(200) + previous_value = metric_value(pattern) + + assert {:error, :rpc_error, %RuntimeError{message: "runtime error"}} = + Rpc.enhanced_call(node(), Test, :exception, [], tenant_id: "123") + + Process.sleep(200) + assert metric_value(pattern) == previous_value + 1 + end end test "event_metrics rpc" do From 16bd44d17a9dda973eec1f0a5b0198e9d565ce15 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Mon, 6 Oct 2025 09:40:25 +1300 Subject: [PATCH 32/34] feat: presence payload size (#1559) * Also tweak buckets to account all the way to 3000KB * Start tagging the payload size metrics with message_type. message_type can be presence, broadcast or postgres_changes --- .../postgres_cdc_rls/replication_poller.ex | 2 +- .../monitoring/prom_ex/plugins/tenant.ex | 7 +-- lib/realtime/tenants/batch_broadcast.ex | 9 +++- .../realtime_channel/broadcast_handler.ex | 11 ++++- .../realtime_channel/presence_handler.ex | 1 + lib/realtime_web/tenant_broadcaster.ex | 32 +++++++----- mix.exs | 2 +- .../extensions/cdc_rls/cdc_rls_test.exs | 18 +++++++ .../prom_ex/plugins/tenant_test.exs | 12 ++--- .../presence_handler_test.exs | 31 +++++++++++- .../controllers/broadcast_controller_test.exs | 22 ++++----- test/realtime_web/tenant_broadcaster_test.exs | 49 ++++++++++++++++--- 12 files changed, 149 insertions(+), 47 deletions(-) diff --git a/lib/extensions/postgres_cdc_rls/replication_poller.ex b/lib/extensions/postgres_cdc_rls/replication_poller.ex index 65f4a33f1..85466ebe9 100644 --- a/lib/extensions/postgres_cdc_rls/replication_poller.ex +++ b/lib/extensions/postgres_cdc_rls/replication_poller.ex @@ -183,7 +183,7 @@ defmodule 
Extensions.PostgresCdcRls.ReplicationPoller do change <- columns |> Enum.zip(row) |> generate_record() |> List.wrap() do topic = "realtime:postgres:" <> tenant_id - RealtimeWeb.TenantBroadcaster.pubsub_broadcast(tenant_id, topic, change, MessageDispatcher) + RealtimeWeb.TenantBroadcaster.pubsub_broadcast(tenant_id, topic, change, MessageDispatcher, :postgres_changes) end {:ok, rows_count} diff --git a/lib/realtime/monitoring/prom_ex/plugins/tenant.ex b/lib/realtime/monitoring/prom_ex/plugins/tenant.ex index bf9d850ee..a3019a68a 100644 --- a/lib/realtime/monitoring/prom_ex/plugins/tenant.ex +++ b/lib/realtime/monitoring/prom_ex/plugins/tenant.ex @@ -36,10 +36,10 @@ defmodule Realtime.PromEx.Plugins.Tenant do event_name: [:realtime, :tenants, :payload, :size], measurement: :size, description: "Tenant payload size", - tags: [:tenant], + tags: [:tenant, :message_type], unit: :byte, reporter_options: [ - buckets: [100, 250, 500, 1000, 2000, 3000, 5000, 10_000, 25_000] + buckets: [250, 500, 1000, 3000, 5000, 10_000, 25_000, 100_000, 500_000, 1_000_000, 3_000_000] ] ), distribution( @@ -47,9 +47,10 @@ defmodule Realtime.PromEx.Plugins.Tenant do event_name: [:realtime, :tenants, :payload, :size], measurement: :size, description: "Payload size", + tags: [:message_type], unit: :byte, reporter_options: [ - buckets: [100, 250, 500, 1000, 2000, 3000, 5000, 10_000, 25_000] + buckets: [250, 500, 1000, 3000, 5000, 10_000, 25_000, 100_000, 500_000, 1_000_000, 3_000_000] ] ) ] diff --git a/lib/realtime/tenants/batch_broadcast.ex b/lib/realtime/tenants/batch_broadcast.ex index 98427621b..9e4ed4c3c 100644 --- a/lib/realtime/tenants/batch_broadcast.ex +++ b/lib/realtime/tenants/batch_broadcast.ex @@ -129,7 +129,14 @@ defmodule Realtime.Tenants.BatchBroadcast do broadcast = %Phoenix.Socket.Broadcast{topic: message.topic, event: @event_type, payload: payload} GenCounter.add(events_per_second_rate.id) - TenantBroadcaster.pubsub_broadcast(tenant.external_id, tenant_topic, broadcast, 
RealtimeChannel.MessageDispatcher) + + TenantBroadcaster.pubsub_broadcast( + tenant.external_id, + tenant_topic, + broadcast, + RealtimeChannel.MessageDispatcher, + :broadcast + ) end defp permissions_for_message(_, nil, _), do: nil diff --git a/lib/realtime_web/channels/realtime_channel/broadcast_handler.ex b/lib/realtime_web/channels/realtime_channel/broadcast_handler.ex index f8e736c2e..036ad9159 100644 --- a/lib/realtime_web/channels/realtime_channel/broadcast_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/broadcast_handler.ex @@ -76,14 +76,21 @@ defmodule RealtimeWeb.RealtimeChannel.BroadcastHandler do broadcast = %Phoenix.Socket.Broadcast{topic: tenant_topic, event: @event_type, payload: payload} if self_broadcast do - TenantBroadcaster.pubsub_broadcast(tenant_id, tenant_topic, broadcast, RealtimeChannel.MessageDispatcher) + TenantBroadcaster.pubsub_broadcast( + tenant_id, + tenant_topic, + broadcast, + RealtimeChannel.MessageDispatcher, + :broadcast + ) else TenantBroadcaster.pubsub_broadcast_from( tenant_id, self(), tenant_topic, broadcast, - RealtimeChannel.MessageDispatcher + RealtimeChannel.MessageDispatcher, + :broadcast ) end end diff --git a/lib/realtime_web/channels/realtime_channel/presence_handler.ex b/lib/realtime_web/channels/realtime_channel/presence_handler.ex index 1af26c528..ec16c7b16 100644 --- a/lib/realtime_web/channels/realtime_channel/presence_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/presence_handler.ex @@ -109,6 +109,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do %{assigns: %{presence_key: presence_key, tenant_topic: tenant_topic}} = socket payload = Map.get(payload, "payload", %{}) + RealtimeWeb.TenantBroadcaster.collect_payload_size(socket.assigns.tenant, payload, :presence) with :ok <- limit_presence_event(socket), {:ok, _} <- Presence.track(self(), tenant_topic, presence_key, payload) do diff --git a/lib/realtime_web/tenant_broadcaster.ex b/lib/realtime_web/tenant_broadcaster.ex index 
da02df79e..f8b739a0b 100644 --- a/lib/realtime_web/tenant_broadcaster.ex +++ b/lib/realtime_web/tenant_broadcaster.ex @@ -5,9 +5,12 @@ defmodule RealtimeWeb.TenantBroadcaster do alias Phoenix.PubSub - @spec pubsub_broadcast(tenant_id :: String.t(), PubSub.topic(), PubSub.message(), PubSub.dispatcher()) :: :ok - def pubsub_broadcast(tenant_id, topic, message, dispatcher) do - collect_payload_size(tenant_id, message) + @type message_type :: :broadcast | :presence | :postgres_changes + + @spec pubsub_broadcast(tenant_id :: String.t(), PubSub.topic(), PubSub.message(), PubSub.dispatcher(), message_type) :: + :ok + def pubsub_broadcast(tenant_id, topic, message, dispatcher, message_type) do + collect_payload_size(tenant_id, message, message_type) if pubsub_adapter() == :gen_rpc do PubSub.broadcast(Realtime.PubSub, topic, message, dispatcher) @@ -23,11 +26,12 @@ defmodule RealtimeWeb.TenantBroadcaster do from :: pid, PubSub.topic(), PubSub.message(), - PubSub.dispatcher() + PubSub.dispatcher(), + message_type ) :: :ok - def pubsub_broadcast_from(tenant_id, from, topic, message, dispatcher) do - collect_payload_size(tenant_id, message) + def pubsub_broadcast_from(tenant_id, from, topic, message, dispatcher, message_type) do + collect_payload_size(tenant_id, message, message_type) if pubsub_adapter() == :gen_rpc do PubSub.broadcast_from(Realtime.PubSub, from, topic, message, dispatcher) @@ -45,16 +49,18 @@ defmodule RealtimeWeb.TenantBroadcaster do @payload_size_event [:realtime, :tenants, :payload, :size] - defp collect_payload_size(tenant_id, payload) when is_struct(payload) do + @spec collect_payload_size(tenant_id :: String.t(), payload :: term, message_type :: message_type) :: :ok + def collect_payload_size(tenant_id, payload, message_type) when is_struct(payload) do # Extracting from struct so the __struct__ bit is not calculated as part of the payload - collect_payload_size(tenant_id, Map.from_struct(payload)) + collect_payload_size(tenant_id, 
Map.from_struct(payload), message_type) end - defp collect_payload_size(tenant_id, payload) do - :telemetry.execute(@payload_size_event, %{size: :erlang.external_size(payload)}, %{tenant: tenant_id}) + def collect_payload_size(tenant_id, payload, message_type) do + :telemetry.execute(@payload_size_event, %{size: :erlang.external_size(payload)}, %{ + tenant: tenant_id, + message_type: message_type + }) end - defp pubsub_adapter do - Application.fetch_env!(:realtime, :pubsub_adapter) - end + defp pubsub_adapter, do: Application.fetch_env!(:realtime, :pubsub_adapter) end diff --git a/mix.exs b/mix.exs index bc5fccc8b..cb6633281 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.15", + version: "2.52.0", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/extensions/cdc_rls/cdc_rls_test.exs b/test/realtime/extensions/cdc_rls/cdc_rls_test.exs index 5f341c134..d12c0ba73 100644 --- a/test/realtime/extensions/cdc_rls/cdc_rls_test.exs +++ b/test/realtime/extensions/cdc_rls/cdc_rls_test.exs @@ -236,6 +236,15 @@ defmodule Realtime.Extensions.CdcRlsTest do RateCounter.stop(tenant.external_id) + on_exit(fn -> :telemetry.detach(__MODULE__) end) + + :telemetry.attach( + __MODULE__, + [:realtime, :tenants, :payload, :size], + &__MODULE__.handle_telemetry/4, + pid: self() + ) + %{tenant: tenant, conn: conn} end @@ -317,6 +326,13 @@ defmodule Realtime.Extensions.CdcRlsTest do assert {:ok, %RateCounter{id: {:channel, :db_events, "dev_tenant"}, bucket: bucket}} = RateCounter.get(rate) assert 1 in bucket + + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 341}, + %{tenant: "dev_tenant", message_type: :postgres_changes} + } end @aux_mod (quote do @@ -414,4 +430,6 @@ defmodule Realtime.Extensions.CdcRlsTest do :erpc.call(node, PostgresCdcRls, :handle_stop, [tenant.external_id, 10_000]) end end + + 
def handle_telemetry(event, measures, metadata, pid: pid), do: send(pid, {:telemetry, event, measures, metadata}) end diff --git a/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs b/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs index cfa727cfa..77c1dc7cf 100644 --- a/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs +++ b/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs @@ -262,18 +262,18 @@ defmodule Realtime.PromEx.Plugins.TenantTest do external_id = context.tenant.external_id pattern = - ~r/realtime_tenants_payload_size_count{tenant="#{external_id}"}\s(?\d+)/ + ~r/realtime_tenants_payload_size_count{message_type=\"presence\",tenant="#{external_id}"}\s(?\d+)/ metric_value = metric_value(pattern) message = %{topic: "a topic", event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} - RealtimeWeb.TenantBroadcaster.pubsub_broadcast(external_id, "a topic", message, Phoenix.PubSub) + RealtimeWeb.TenantBroadcaster.pubsub_broadcast(external_id, "a topic", message, Phoenix.PubSub, :presence) Process.sleep(200) assert metric_value(pattern) == metric_value + 1 bucket_pattern = - ~r/realtime_tenants_payload_size_bucket{tenant="#{external_id}",le="100"}\s(?\d+)/ + ~r/realtime_tenants_payload_size_bucket{message_type=\"presence\",tenant="#{external_id}",le="250"}\s(?\d+)/ assert metric_value(bucket_pattern) > 0 end @@ -281,17 +281,17 @@ defmodule Realtime.PromEx.Plugins.TenantTest do test "global metric payload size", context do external_id = context.tenant.external_id - pattern = ~r/realtime_payload_size_count\s(?\d+)/ + pattern = ~r/realtime_payload_size_count{message_type=\"broadcast\"}\s(?\d+)/ metric_value = metric_value(pattern) message = %{topic: "a topic", event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} - RealtimeWeb.TenantBroadcaster.pubsub_broadcast(external_id, "a topic", message, Phoenix.PubSub) + RealtimeWeb.TenantBroadcaster.pubsub_broadcast(external_id, "a topic", message, Phoenix.PubSub, :broadcast) Process.sleep(200) 
assert metric_value(pattern) == metric_value + 1 - bucket_pattern = ~r/realtime_payload_size_bucket{le="100"}\s(?\d+)/ + bucket_pattern = ~r/realtime_payload_size_bucket{message_type=\"broadcast\",le="250"}\s(?\d+)/ assert metric_value(bucket_pattern) > 0 end diff --git a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs index 4891e4187..219f13e55 100644 --- a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs +++ b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs @@ -100,25 +100,41 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do end describe "handle/3" do + setup do + on_exit(fn -> :telemetry.detach(__MODULE__) end) + + :telemetry.attach( + __MODULE__, + [:realtime, :tenants, :payload, :size], + &__MODULE__.handle_telemetry/4, + pid: self() + ) + end + test "with true policy and is private, user can track their presence and changes", %{ tenant: tenant, topic: topic, db_conn: db_conn } do + external_id = tenant.external_id key = random_string() policies = %Policies{presence: %PresencePolicies{read: true, write: true}} socket = socket_fixture(tenant, topic, key, policies: policies) - PresenceHandler.handle(%{"event" => "track"}, db_conn, socket) + PresenceHandler.handle(%{"event" => "track", "payload" => %{"A" => "b", "c" => "b"}}, db_conn, socket) topic = socket.assigns.tenant_topic assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} assert Map.has_key?(joins, key) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 30}, + %{tenant: ^external_id, message_type: :presence}} end test "when tracking already existing user, metadata updated", %{tenant: tenant, topic: topic, db_conn: db_conn} do + external_id = tenant.external_id key = random_string() policies = %Policies{presence: %PresencePolicies{read: true, write: true}} socket = 
socket_fixture(tenant, topic, key, policies: policies) @@ -134,10 +150,18 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} assert Map.has_key?(joins, key) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 6}, + %{tenant: ^external_id, message_type: :presence}} + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 55}, + %{tenant: ^external_id, message_type: :presence}} + refute_receive :_ end test "with false policy and is public, user can track their presence and changes", %{tenant: tenant, topic: topic} do + external_id = tenant.external_id key = random_string() policies = %Policies{presence: %PresencePolicies{read: false, write: false}} socket = socket_fixture(tenant, topic, key, policies: policies, private?: false) @@ -147,6 +171,9 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do topic = socket.assigns.tenant_topic assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} assert Map.has_key?(joins, key) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 6}, + %{tenant: ^external_id, message_type: :presence}} end test "user can untrack when they want", %{tenant: tenant, topic: topic, db_conn: db_conn} do @@ -518,4 +545,6 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do } } end + + def handle_telemetry(event, measures, metadata, pid: pid), do: send(pid, {:telemetry, event, measures, metadata}) end diff --git a/test/realtime_web/controllers/broadcast_controller_test.exs b/test/realtime_web/controllers/broadcast_controller_test.exs index d42466722..209c405de 100644 --- a/test/realtime_web/controllers/broadcast_controller_test.exs +++ b/test/realtime_web/controllers/broadcast_controller_test.exs @@ -272,7 +272,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = 
Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) - expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) + expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _, _ -> :ok end) messages_to_send = Stream.repeatedly(fn -> generate_message_with_policies(db_conn, tenant) end) @@ -294,7 +294,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) - broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/4) + broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/5) Enum.each(messages_to_send, fn %{topic: topic} -> broadcast_topic = Tenants.tenant_topic(tenant, topic, false) @@ -310,7 +310,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } assert Enum.any?(broadcast_calls, fn - [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher] -> true + [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher, :broadcast] -> true _ -> false end) end) @@ -326,7 +326,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) - expect(TenantBroadcaster, :pubsub_broadcast, 6, fn _, _, _, _ -> :ok end) + expect(TenantBroadcaster, :pubsub_broadcast, 6, fn _, _, _, _, _ -> :ok end) channels = Stream.repeatedly(fn -> generate_message_with_policies(db_conn, tenant) end) @@ -358,7 +358,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) - broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/4) + broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/5) Enum.each(channels, fn %{topic: topic} -> broadcast_topic = Tenants.tenant_topic(tenant, topic, false) @@ -374,7 +374,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } assert Enum.count(broadcast_calls, fn - [_, 
^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher] -> true + [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher, :broadcast] -> true _ -> false end) == 1 end) @@ -393,7 +393,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do open_channel_topic = Tenants.tenant_topic(tenant, "open_channel", true) assert Enum.count(broadcast_calls, fn - [_, ^open_channel_topic, ^message, RealtimeChannel.MessageDispatcher] -> true + [_, ^open_channel_topic, ^message, RealtimeChannel.MessageDispatcher, :broadcast] -> true _ -> false end) == 1 @@ -408,7 +408,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) - expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) + expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _, _ -> :ok end) messages_to_send = Stream.repeatedly(fn -> generate_message_with_policies(db_conn, tenant) end) @@ -433,7 +433,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) - broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/4) + broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/5) Enum.each(messages_to_send, fn %{topic: topic} -> broadcast_topic = Tenants.tenant_topic(tenant, topic, false) @@ -449,7 +449,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } assert Enum.count(broadcast_calls, fn - [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher] -> true + [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher, :broadcast] -> true _ -> false end) == 1 end) @@ -462,7 +462,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do @tag role: "anon" test "user without permission won't broadcast", %{conn: conn, db_conn: db_conn, tenant: tenant} do request_events_key = Tenants.requests_per_second_key(tenant) - reject(&TenantBroadcaster.pubsub_broadcast/4) + 
reject(&TenantBroadcaster.pubsub_broadcast/5) messages = Stream.repeatedly(fn -> generate_message_with_policies(db_conn, tenant) end) diff --git a/test/realtime_web/tenant_broadcaster_test.exs b/test/realtime_web/tenant_broadcaster_test.exs index ddda381a1..bc3b4f90a 100644 --- a/test/realtime_web/tenant_broadcaster_test.exs +++ b/test/realtime_web/tenant_broadcaster_test.exs @@ -60,7 +60,7 @@ defmodule RealtimeWeb.TenantBroadcasterTest do test "pubsub_broadcast", %{node: node} do message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub, :broadcast) assert_receive ^message @@ -71,13 +71,13 @@ defmodule RealtimeWeb.TenantBroadcasterTest do :telemetry, [:realtime, :tenants, :payload, :size], %{size: 114}, - %{tenant: "realtime-dev"} + %{tenant: "realtime-dev", message_type: :broadcast} } end test "pubsub_broadcast list payload", %{node: node} do message = %Broadcast{topic: @topic, event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub, :broadcast) assert_receive ^message @@ -88,13 +88,13 @@ defmodule RealtimeWeb.TenantBroadcasterTest do :telemetry, [:realtime, :tenants, :payload, :size], %{size: 130}, - %{tenant: "realtime-dev"} + %{tenant: "realtime-dev", message_type: :broadcast} } end test "pubsub_broadcast string payload", %{node: node} do message = %Broadcast{topic: @topic, event: "an event", payload: "some text payload"} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub, :broadcast) assert_receive ^message @@ -105,7 +105,7 @@ defmodule RealtimeWeb.TenantBroadcasterTest do :telemetry, 
[:realtime, :tenants, :payload, :size], %{size: 119}, - %{tenant: "realtime-dev"} + %{tenant: "realtime-dev", message_type: :broadcast} } end end @@ -131,7 +131,7 @@ defmodule RealtimeWeb.TenantBroadcasterTest do message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} - TenantBroadcaster.pubsub_broadcast_from("realtime-dev", self(), @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast_from("realtime-dev", self(), @topic, message, Phoenix.PubSub, :broadcast) assert_receive {:other_process, ^message} @@ -142,7 +142,7 @@ defmodule RealtimeWeb.TenantBroadcasterTest do :telemetry, [:realtime, :tenants, :payload, :size], %{size: 114}, - %{tenant: "realtime-dev"} + %{tenant: "realtime-dev", message_type: :broadcast} } # This process does not receive the message @@ -151,5 +151,38 @@ defmodule RealtimeWeb.TenantBroadcasterTest do end end + describe "collect_payload_size/3" do + @describetag pubsub_adapter: :gen_rpc + + test "emit telemetry for struct" do + TenantBroadcaster.collect_payload_size( + "realtime-dev", + %Phoenix.Socket.Broadcast{event: "broadcast", payload: %{"a" => "b"}}, + :broadcast + ) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 65}, + %{tenant: "realtime-dev", message_type: :broadcast}} + end + + test "emit telemetry for map" do + TenantBroadcaster.collect_payload_size( + "realtime-dev", + %{event: "broadcast", payload: %{"a" => "b"}}, + :postgres_changes + ) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 53}, + %{tenant: "realtime-dev", message_type: :postgres_changes}} + end + + test "emit telemetry for non-map" do + TenantBroadcaster.collect_payload_size("realtime-dev", "some blob", :presence) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 15}, + %{tenant: "realtime-dev", message_type: :presence}} + end + end + def handle_telemetry(event, measures, metadata, pid: pid), do: send(pid, {:telemetry, event, measures, 
metadata}) end From 07de6656527b69b11aa08dfed46ff55fdb635f51 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Mon, 6 Oct 2025 21:06:02 +1300 Subject: [PATCH 33/34] fix: use GenRpc for Realtime.Latency pings (#1560) --- Makefile | 4 ++-- lib/realtime/monitoring/latency.ex | 8 ++++---- mix.exs | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index fd7f0f7fd..1259a1335 100644 --- a/Makefile +++ b/Makefile @@ -9,10 +9,10 @@ PORT ?= 4000 # Common commands dev: ## Start a dev server - ELIXIR_ERL_OPTIONS="+hmax 1000000000" SLOT_NAME_SUFFIX=some_sha PORT=$(PORT) MIX_ENV=dev SECURE_CHANNELS=true API_JWT_SECRET=dev METRICS_JWT_SECRET=dev REGION=fra DB_ENC_KEY="1234567890123456" CLUSTER_STRATEGIES=$(CLUSTER_STRATEGIES) ERL_AFLAGS="-kernel shell_history enabled" GEN_RPC_TCP_SERVER_PORT=5369 GEN_RPC_TCP_CLIENT_PORT=5469 iex --name $(NODE_NAME)@127.0.0.1 --cookie cookie -S mix phx.server + ELIXIR_ERL_OPTIONS="+hmax 1000000000" SLOT_NAME_SUFFIX=some_sha PORT=$(PORT) MIX_ENV=dev SECURE_CHANNELS=true API_JWT_SECRET=dev METRICS_JWT_SECRET=dev REGION=us-east-1 DB_ENC_KEY="1234567890123456" CLUSTER_STRATEGIES=$(CLUSTER_STRATEGIES) ERL_AFLAGS="-kernel shell_history enabled" GEN_RPC_TCP_SERVER_PORT=5369 GEN_RPC_TCP_CLIENT_PORT=5469 iex --name $(NODE_NAME)@127.0.0.1 --cookie cookie -S mix phx.server dev.orange: ## Start another dev server (orange) on port 4001 - ELIXIR_ERL_OPTIONS="+hmax 1000000000" SLOT_NAME_SUFFIX=some_sha PORT=4001 MIX_ENV=dev SECURE_CHANNELS=true API_JWT_SECRET=dev METRICS_JWT_SECRET=dev DB_ENC_KEY="1234567890123456" CLUSTER_STRATEGIES=$(CLUSTER_STRATEGIES) ERL_AFLAGS="-kernel shell_history enabled" GEN_RPC_TCP_SERVER_PORT=5469 GEN_RPC_TCP_CLIENT_PORT=5369 iex --name orange@127.0.0.1 --cookie cookie -S mix phx.server + ELIXIR_ERL_OPTIONS="+hmax 1000000000" SLOT_NAME_SUFFIX=some_sha PORT=4001 MIX_ENV=dev SECURE_CHANNELS=true API_JWT_SECRET=dev METRICS_JWT_SECRET=dev REGION=eu-west-1 DB_ENC_KEY="1234567890123456" 
CLUSTER_STRATEGIES=$(CLUSTER_STRATEGIES) ERL_AFLAGS="-kernel shell_history enabled" GEN_RPC_TCP_SERVER_PORT=5469 GEN_RPC_TCP_CLIENT_PORT=5369 iex --name orange@127.0.0.1 --cookie cookie -S mix phx.server seed: ## Seed the database DB_ENC_KEY="1234567890123456" FLY_ALLOC_ID=123e4567-e89b-12d3-a456-426614174000 mix run priv/repo/dev_seeds.exs diff --git a/lib/realtime/monitoring/latency.ex b/lib/realtime/monitoring/latency.ex index 52c46adb4..d9ddd0d9a 100644 --- a/lib/realtime/monitoring/latency.ex +++ b/lib/realtime/monitoring/latency.ex @@ -7,7 +7,7 @@ defmodule Realtime.Latency do use Realtime.Logs alias Realtime.Nodes - alias Realtime.Rpc + alias Realtime.GenRpc defmodule Payload do @moduledoc false @@ -33,7 +33,7 @@ defmodule Realtime.Latency do } end - @every 5_000 + @every 15_000 def start_link(args) do GenServer.start_link(__MODULE__, args, name: __MODULE__) end @@ -76,7 +76,7 @@ defmodule Realtime.Latency do Task.Supervisor.async(Realtime.TaskSupervisor, fn -> {latency, response} = :timer.tc(fn -> - Rpc.call(n, __MODULE__, :pong, [pong_timeout], timeout: timer_timeout) + GenRpc.call(n, __MODULE__, :pong, [pong_timeout], timeout: timer_timeout) end) latency_ms = latency / 1_000 @@ -85,7 +85,7 @@ defmodule Realtime.Latency do from_node = Nodes.short_node_id_from_name(Node.self()) case response do - {:badrpc, reason} -> + {:error, :rpc_error, reason} -> log_error( "RealtimeNodeDisconnected", "Unable to connect to #{short_name} from #{region}: #{reason}" diff --git a/mix.exs b/mix.exs index cb6633281..72ae7f630 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.52.0", + version: "2.52.1", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, From ecac071eded1a3ce7f3f4a2bf24dfff535c3f802 Mon Sep 17 00:00:00 2001 From: Eduardo Gurgel Date: Tue, 7 Oct 2025 12:21:22 +1300 Subject: [PATCH 34/34] Fastlane for phoenix presence_diff (#1558) It 
uses a fork of Phoenix for time being * fix: count presence_diff events on MessageDispatcher * fix: remove traces from console during development --- config/dev.exs | 4 +- lib/realtime_web/channels/presence.ex | 1 + lib/realtime_web/channels/realtime_channel.ex | 17 ----- .../realtime_channel/message_dispatcher.ex | 70 ++++++++++-------- mix.exs | 4 +- mix.lock | 2 +- .../message_dispatcher_test.exs | 71 ++++++++++++++++--- .../channels/realtime_channel_test.exs | 15 +--- 8 files changed, 113 insertions(+), 71 deletions(-) diff --git a/config/dev.exs b/config/dev.exs index a438f8ea4..0eff300d8 100644 --- a/config/dev.exs +++ b/config/dev.exs @@ -97,6 +97,8 @@ config :phoenix, :plug_init_mode, :runtime # Disable caching to ensure the rendered spec is refreshed config :open_api_spex, :cache_adapter, OpenApiSpex.Plug.NoneCache -config :opentelemetry, traces_exporter: {:otel_exporter_stdout, []} +# Disabled but can print to stdout with: +# config :opentelemetry, traces_exporter: {:otel_exporter_stdout, []} +config :opentelemetry, traces_exporter: :none config :mix_test_watch, clear: true diff --git a/lib/realtime_web/channels/presence.ex b/lib/realtime_web/channels/presence.ex index f4d378b92..9e173febe 100644 --- a/lib/realtime_web/channels/presence.ex +++ b/lib/realtime_web/channels/presence.ex @@ -8,5 +8,6 @@ defmodule RealtimeWeb.Presence do use Phoenix.Presence, otp_app: :realtime, pubsub_server: Realtime.PubSub, + dispatcher: RealtimeWeb.RealtimeChannel.MessageDispatcher, pool_size: 10 end diff --git a/lib/realtime_web/channels/realtime_channel.ex b/lib/realtime_web/channels/realtime_channel.ex index 91a417c21..104d9a077 100644 --- a/lib/realtime_web/channels/realtime_channel.ex +++ b/lib/realtime_web/channels/realtime_channel.ex @@ -18,7 +18,6 @@ defmodule RealtimeWeb.RealtimeChannel do alias Realtime.Tenants.Authorization alias Realtime.Tenants.Authorization.Policies alias Realtime.Tenants.Authorization.Policies.BroadcastPolicies - alias 
Realtime.Tenants.Authorization.Policies.PresencePolicies alias Realtime.Tenants.Connect alias RealtimeWeb.Channels.Payloads.Join @@ -259,27 +258,11 @@ defmodule RealtimeWeb.RealtimeChannel do {:noreply, assign(socket, %{pg_sub_ref: pg_sub_ref})} end - def handle_info( - %{event: "presence_diff"}, - %{assigns: %{policies: %Policies{presence: %PresencePolicies{read: false}}}} = socket - ) do - Logger.warning("Presence message ignored") - {:noreply, socket} - end - def handle_info(_msg, %{assigns: %{policies: %Policies{broadcast: %BroadcastPolicies{read: false}}}} = socket) do Logger.warning("Broadcast message ignored") {:noreply, socket} end - def handle_info(%{event: "presence_diff", payload: payload} = msg, socket) do - %{presence_rate_counter: presence_rate_counter} = socket.assigns - GenCounter.add(presence_rate_counter.id) - maybe_log_info(socket, msg) - push(socket, "presence_diff", payload) - {:noreply, socket} - end - def handle_info(%{event: type, payload: payload} = msg, socket) do count(socket) maybe_log_info(socket, msg) diff --git a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex index 32e1528f3..6604eb2bd 100644 --- a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex +++ b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex @@ -5,14 +5,8 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do require Logger - def fastlane_metadata(fastlane_pid, serializer, topic, log_level, tenant_id, replayed_message_ids \\ MapSet.new()) - - def fastlane_metadata(fastlane_pid, serializer, topic, :info, tenant_id, replayed_message_ids) do - {:rc_fastlane, fastlane_pid, serializer, topic, {:log, tenant_id}, replayed_message_ids} - end - - def fastlane_metadata(fastlane_pid, serializer, topic, _log_level, _tenant_id, replayed_message_ids) do - {:rc_fastlane, fastlane_pid, serializer, topic, replayed_message_ids} + def fastlane_metadata(fastlane_pid, 
serializer, topic, log_level, tenant_id, replayed_message_ids \\ MapSet.new()) do + {:rc_fastlane, fastlane_pid, serializer, topic, log_level, tenant_id, replayed_message_ids} end @doc """ @@ -20,48 +14,58 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do It also sends an :update_rate_counter to the subscriber and it can conditionally log """ @spec dispatch(list, pid, Phoenix.Socket.Broadcast.t()) :: :ok - def dispatch(subscribers, from, %Phoenix.Socket.Broadcast{} = msg) do + def dispatch(subscribers, from, %Phoenix.Socket.Broadcast{event: event} = msg) do # fastlane_pid is the actual socket transport pid # This reduce caches the serialization and bypasses the channel process going straight to the # transport process message_id = message_id(msg.payload) - # Credo doesn't like that we don't use the result aggregation - _ = - Enum.reduce(subscribers, %{}, fn - {pid, _}, cache when pid == from -> - cache + {_cache, count} = + Enum.reduce(subscribers, {%{}, 0}, fn + {pid, _}, {cache, count} when pid == from -> + {cache, count} - {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, replayed_message_ids}}, cache -> + {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, log_level, tenant_id, replayed_message_ids}}, + {cache, count} -> if already_replayed?(message_id, replayed_message_ids) do # skip already replayed message - cache + {cache, count} else - send(pid, :update_rate_counter) - do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) - end + if event != "presence_diff", do: send(pid, :update_rate_counter) - {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, {:log, tenant_id}, replayed_message_ids}}, cache -> - if already_replayed?(message_id, replayed_message_ids) do - # skip already replayed message - cache - else - send(pid, :update_rate_counter) - log = "Received message on #{join_topic} with payload: #{inspect(msg, pretty: true)}" - Logger.info(log, external_id: tenant_id, project: tenant_id) + maybe_log(log_level, 
join_topic, msg, tenant_id) - do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + cache = do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + {cache, count + 1} end - {pid, _}, cache -> + {pid, _}, {cache, count} -> send(pid, msg) - cache + {cache, count} end) + tenant_id = tenant_id(subscribers) + increment_presence_counter(tenant_id, event, count) + :ok end + defp increment_presence_counter(tenant_id, "presence_diff", count) when is_binary(tenant_id) do + tenant_id + |> Realtime.Tenants.presence_events_per_second_key() + |> Realtime.GenCounter.add(count) + end + + defp increment_presence_counter(_tenant_id, _event, _count), do: :ok + + defp maybe_log(:info, join_topic, msg, tenant_id) do + log = "Received message on #{join_topic} with payload: #{inspect(msg, pretty: true)}" + Logger.info(log, external_id: tenant_id, project: tenant_id) + end + + defp maybe_log(_level, _join_topic, _msg, _tenant_id), do: :ok + defp message_id(%{"meta" => %{"id" => id}}), do: id defp message_id(_), do: nil @@ -82,4 +86,10 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do Map.put(cache, serializer, encoded_msg) end end + + defp tenant_id([{_pid, {:rc_fastlane, _, _, _, _, tenant_id, _}} | _]) do + tenant_id + end + + defp tenant_id(_), do: nil end diff --git a/mix.exs b/mix.exs index 72ae7f630..d0e42bf11 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.52.1", + version: "2.53.0", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, @@ -53,7 +53,7 @@ defmodule Realtime.MixProject do # Type `mix help deps` for examples and options. 
defp deps do [ - {:phoenix, "~> 1.7.0"}, + {:phoenix, override: true, github: "supabase/phoenix", branch: "feat/presence-custom-dispatcher-1.7.19"}, {:phoenix_ecto, "~> 4.4.0"}, {:ecto_sql, "~> 3.11"}, {:ecto_psql_extras, "~> 0.8"}, diff --git a/mix.lock b/mix.lock index c5fce6022..ba6f47328 100644 --- a/mix.lock +++ b/mix.lock @@ -66,7 +66,7 @@ "opentelemetry_semantic_conventions": {:hex, :opentelemetry_semantic_conventions, "1.27.0", "acd0194a94a1e57d63da982ee9f4a9f88834ae0b31b0bd850815fe9be4bbb45f", [:mix, :rebar3], [], "hexpm", "9681ccaa24fd3d810b4461581717661fd85ff7019b082c2dff89c7d5b1fc2864"}, "opentelemetry_telemetry": {:hex, :opentelemetry_telemetry, "1.1.2", "410ab4d76b0921f42dbccbe5a7c831b8125282850be649ee1f70050d3961118a", [:mix, :rebar3], [{:opentelemetry_api, "~> 1.3", [hex: :opentelemetry_api, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.1", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "641ab469deb181957ac6d59bce6e1321d5fe2a56df444fc9c19afcad623ab253"}, "otel_http": {:hex, :otel_http, "0.2.0", "b17385986c7f1b862f5d577f72614ecaa29de40392b7618869999326b9a61d8a", [:rebar3], [], "hexpm", "f2beadf922c8cfeb0965488dd736c95cc6ea8b9efce89466b3904d317d7cc717"}, - "phoenix": {:hex, :phoenix, "1.7.19", "36617efe5afbd821099a8b994ff4618a340a5bfb25531a1802c4d4c634017a57", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:phoenix_template, "~> 1.0", [hex: :phoenix_template, repo: "hexpm", optional: false]}, {:phoenix_view, "~> 2.0", [hex: :phoenix_view, repo: "hexpm", optional: true]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.7", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: 
:telemetry, repo: "hexpm", optional: false]}, {:websock_adapter, "~> 0.5.3", [hex: :websock_adapter, repo: "hexpm", optional: false]}], "hexpm", "ba4dc14458278773f905f8ae6c2ec743d52c3a35b6b353733f64f02dfe096cd6"}, + "phoenix": {:git, "https://github.com/supabase/phoenix.git", "7b884cc0cc1a49ad2bc272acda2e622b3e11c139", [branch: "feat/presence-custom-dispatcher-1.7.19"]}, "phoenix_ecto": {:hex, :phoenix_ecto, "4.4.3", "86e9878f833829c3f66da03d75254c155d91d72a201eb56ae83482328dc7ca93", [:mix], [{:ecto, "~> 3.5", [hex: :ecto, repo: "hexpm", optional: false]}, {:phoenix_html, "~> 2.14.2 or ~> 3.0 or ~> 4.0", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:plug, "~> 1.9", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "d36c401206f3011fefd63d04e8ef626ec8791975d9d107f9a0817d426f61ac07"}, "phoenix_html": {:hex, :phoenix_html, "3.3.4", "42a09fc443bbc1da37e372a5c8e6755d046f22b9b11343bf885067357da21cb3", [:mix], [{:plug, "~> 1.5", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0249d3abec3714aff3415e7ee3d9786cb325be3151e6c4b3021502c585bf53fb"}, "phoenix_live_dashboard": {:hex, :phoenix_live_dashboard, "0.8.6", "7b1f0327f54c9eb69845fd09a77accf922f488c549a7e7b8618775eb603a62c7", [:mix], [{:ecto, "~> 3.6.2 or ~> 3.7", [hex: :ecto, repo: "hexpm", optional: true]}, {:ecto_mysql_extras, "~> 0.5", [hex: :ecto_mysql_extras, repo: "hexpm", optional: true]}, {:ecto_psql_extras, "~> 0.7", [hex: :ecto_psql_extras, repo: "hexpm", optional: true]}, {:ecto_sqlite3_extras, "~> 1.1.7 or ~> 1.2.0", [hex: :ecto_sqlite3_extras, repo: "hexpm", optional: true]}, {:mime, "~> 1.6 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:phoenix_live_view, "~> 0.19 or ~> 1.0", [hex: :phoenix_live_view, repo: "hexpm", optional: false]}, {:telemetry_metrics, "~> 0.6 or ~> 1.0", [hex: :telemetry_metrics, repo: "hexpm", optional: false]}], "hexpm", "1681ab813ec26ca6915beb3414aa138f298e17721dc6a2bde9e6eb8a62360ff6"}, diff --git 
a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs index 44ce83b99..53be2e51f 100644 --- a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs +++ b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs @@ -16,12 +16,24 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do describe "fastlane_metadata/5" do test "info level" do assert MessageDispatcher.fastlane_metadata(self(), Serializer, "realtime:topic", :info, "tenant_id") == - {:rc_fastlane, self(), Serializer, "realtime:topic", {:log, "tenant_id"}, MapSet.new()} + {:rc_fastlane, self(), Serializer, "realtime:topic", :info, "tenant_id", MapSet.new()} end test "non-info level" do assert MessageDispatcher.fastlane_metadata(self(), Serializer, "realtime:topic", :warning, "tenant_id") == - {:rc_fastlane, self(), Serializer, "realtime:topic", MapSet.new()} + {:rc_fastlane, self(), Serializer, "realtime:topic", :warning, "tenant_id", MapSet.new()} + end + + test "replayed message ids" do + assert MessageDispatcher.fastlane_metadata( + self(), + Serializer, + "realtime:topic", + :warning, + "tenant_id", + MapSet.new([1]) + ) == + {:rc_fastlane, self(), Serializer, "realtime:topic", :warning, "tenant_id", MapSet.new([1])} end end @@ -50,8 +62,8 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do from_pid = :erlang.list_to_pid(~c'<0.2.1>') subscribers = [ - {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, MapSet.new()}}, - {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", MapSet.new()}} + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :info, "tenant123", MapSet.new()}}, + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :warning, "tenant123", MapSet.new()}} ] msg = %Broadcast{topic: "some:other:topic", event: "event", payload: %{data: 
"test"}} @@ -74,6 +86,48 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do refute_receive _any end + test "dispatches 'presence_diff' messages to fastlane subscribers" do + parent = self() + + subscriber_pid = + spawn(fn -> + loop = fn loop -> + receive do + msg -> + send(parent, {:subscriber, msg}) + loop.(loop) + end + end + + loop.(loop) + end) + + from_pid = :erlang.list_to_pid(~c'<0.2.1>') + + subscribers = [ + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :info, "tenant456", MapSet.new()}}, + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :warning, "tenant456", MapSet.new()}} + ] + + msg = %Broadcast{topic: "some:other:topic", event: "presence_diff", payload: %{data: "test"}} + + log = + capture_log(fn -> + assert MessageDispatcher.dispatch(subscribers, from_pid, msg) == :ok + end) + + assert log =~ "Received message on realtime:topic with payload: #{inspect(msg, pretty: true)}" + + assert_receive {:encoded, %Broadcast{event: "presence_diff", payload: %{data: "test"}, topic: "realtime:topic"}} + assert_receive {:encoded, %Broadcast{event: "presence_diff", payload: %{data: "test"}, topic: "realtime:topic"}} + + assert Agent.get(TestSerializer, & &1) == 1 + + assert Realtime.GenCounter.get(Realtime.Tenants.presence_events_per_second_key("tenant456")) == 2 + + refute_receive _any + end + test "does not dispatch messages to fastlane subscribers if they already replayed it" do parent = self() @@ -95,8 +149,9 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do subscribers = [ {subscriber_pid, - {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, replaeyd_message_ids}}, - {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", replaeyd_message_ids}} + {:rc_fastlane, self(), TestSerializer, "realtime:topic", :info, "tenant123", replaeyd_message_ids}}, + {subscriber_pid, + {:rc_fastlane, self(), TestSerializer, "realtime:topic", :warning, 
"tenant123", replaeyd_message_ids}} ] msg = %Broadcast{ @@ -131,8 +186,8 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do from_pid = :erlang.list_to_pid(~c'<0.2.1>') subscribers = [ - {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, MapSet.new()}}, - {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", MapSet.new()}} + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :info, "tenant123", MapSet.new()}}, + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :warning, "tenant123", MapSet.new()}} ] msg = %Broadcast{topic: "some:other:topic", event: "event", payload: "not a map"} diff --git a/test/realtime_web/channels/realtime_channel_test.exs b/test/realtime_web/channels/realtime_channel_test.exs index ae6c1734a..8022d6ebd 100644 --- a/test/realtime_web/channels/realtime_channel_test.exs +++ b/test/realtime_web/channels/realtime_channel_test.exs @@ -239,23 +239,14 @@ defmodule RealtimeWeb.RealtimeChannelTest do end describe "presence" do - test "events are counted", %{tenant: tenant} do + test "presence state event is counted", %{tenant: tenant} do jwt = Generators.generate_jwt_token(tenant) {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => "warning"}, conn_opts(tenant, jwt)) assert {:ok, _, %Socket{} = socket} = subscribe_and_join(socket, "realtime:test", %{}) - presence_diff = %Socket.Broadcast{event: "presence_diff", payload: %{joins: %{}, leaves: %{}}} - send(socket.channel_pid, presence_diff) - assert_receive %Socket.Message{topic: "realtime:test", event: "presence_state", payload: %{}} - assert_receive %Socket.Message{ - topic: "realtime:test", - event: "presence_diff", - payload: %{joins: %{}, leaves: %{}} - } - tenant_id = tenant.external_id # Wait for RateCounter to tick @@ -264,8 +255,8 @@ defmodule RealtimeWeb.RealtimeChannelTest do assert {:ok, %RateCounter{id: {:channel, :presence_events, ^tenant_id}, bucket: 
bucket}} = RateCounter.get(socket.assigns.presence_rate_counter) - # presence_state + presence_diff - assert 2 in bucket + # presence_state + assert Enum.sum(bucket) == 1 end end