From 7db2586d292fa7a538d41cf3921e5f258d094cbd Mon Sep 17 00:00:00 2001 From: longsizhuo Date: Sat, 25 Apr 2026 07:47:03 +0000 Subject: [PATCH 1/3] =?UTF-8?q?fix(listener):=20=E6=8B=A6=E8=B4=B4?= =?UTF-8?q?=E7=BA=B8/GIF=E8=81=9A=E5=90=88=E7=AB=99=20+=20=E8=A3=B8?= =?UTF-8?q?=E5=AA=92=E4=BD=93=E6=96=87=E4=BB=B6=EF=BC=8C=E9=81=BF=E5=85=8D?= =?UTF-8?q?=20Discord=20=E8=A1=A8=E6=83=85=E5=8C=85=E8=AF=AF=E5=85=A5?= =?UTF-8?q?=E5=88=86=E4=BA=AB=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 事故:用户 yhn 在分享频道发了一个 Discord 贴纸(klipy GIF),message.content 里就是裸 https://klipy.com/gifs/... URL,listener 当成正常分享走完 OG fetch + 分类,被打成 APPROVED 上架成 #18。 原 _SKIP_HOSTS 只拦了 discord.com / cdn.discordapp.com 等 Discord 自家域,没考虑贴纸面板默认走 tenor / klipy / giphy。同类问题:mmbiz.qpic.cn 这类纯图片直链(#5)也不该入库。 改法两层:(1) _SKIP_HOSTS 加入 tenor / klipy / giphy 全套;(2) 兜底在 path 上做媒体扩展名(.gif/.png/.jpg/.mp4/...)匹配,host 永远穷举不完。匹配只看 path,query 里出现 .jpg 不算(避免误伤带 ?file=foo.jpg 的正常 API 链接)。+19 个测试 case 覆盖。 --- src/chat_bot/cogs/listener.py | 59 +++++++++++++++++++++++++++++------ tests/test_listener_skip.py | 53 +++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 9 deletions(-) diff --git a/src/chat_bot/cogs/listener.py b/src/chat_bot/cogs/listener.py index a05acea..221768a 100644 --- a/src/chat_bot/cogs/listener.py +++ b/src/chat_bot/cogs/listener.py @@ -28,31 +28,72 @@ _URL_RE = re.compile(r"https?://[^\s<>\"'\]\)]+", re.IGNORECASE) -# 跳过 Discord 自身的各种链接:用户经常复制错(比如右键"复制消息链接"会粘 -# discord.com/channels/.../... 出来,这不该被当作"分享"入库)。静默忽略,不 -# 回复也不提交,像 bot 没看到一样。 +# 跳过的链接源。两层: +# 1. Discord 自身(消息链接 / 附件 CDN)—— 用户复制消息链接时常误粘 +# 2. 
贴纸 / GIF / meme 聚合站 —— Discord 内置贴纸面板会发 tenor/klipy/giphy +# 链接出来,message.content 里就是裸 URL。这些不是"分享资源",不该入库 +# 静默忽略,不回复不提交,像 bot 没看到一样。 _SKIP_HOSTS = frozenset({ - # 主站 + # Discord 主站 "discord.com", "www.discord.com", "canary.discord.com", "ptb.discord.com", - # 邀请短链 + # Discord 邀请短链 "discord.gg", - # 附件 / CDN + # Discord 附件 / CDN "discordapp.com", "cdn.discordapp.com", "media.discordapp.net", + # 贴纸 / GIF 聚合(Discord 贴纸面板默认走这些) + "tenor.com", + "media.tenor.com", + "c.tenor.com", + "giphy.com", + "media.giphy.com", + "media0.giphy.com", + "media1.giphy.com", + "media2.giphy.com", + "media3.giphy.com", + "media4.giphy.com", + "klipy.com", + "media.klipy.com", }) +# 兜底:只指向静态媒体文件的 URL(路径以这些扩展名结尾)一律跳过——常见于 +# WeChat / 各种图床的裸图片链接,非分享资源。把扩展名匹配做在 path 上避免误伤 +# 带 query 的正常链接(query 里出现 .jpg 不算)。 +_MEDIA_EXTENSIONS = ( + ".gif", + ".png", + ".jpg", + ".jpeg", + ".webp", + ".bmp", + ".svg", + ".ico", + ".mp4", + ".webm", + ".mov", + ".m4v", + ".mp3", + ".wav", + ".ogg", + ".flac", +) + def _should_skip(url: str) -> bool: - """URL 是否属于需要跳过的源(当前只屏蔽 Discord 自身域名)。""" + """URL 是否属于需要跳过的源:Discord 域、贴纸聚合、或裸媒体文件。""" try: - host = urlparse(url).netloc.lower().split(":")[0] + parsed = urlparse(url) except Exception: return False - return host in _SKIP_HOSTS + host = parsed.netloc.lower().split(":")[0] + if host in _SKIP_HOSTS: + return True + # path 走小写匹配,跟 query 解耦:?foo=bar.jpg 不会误命中 + return parsed.path.lower().endswith(_MEDIA_EXTENSIONS) # 轮询最终状态的参数:每 2s 查一次,最多 30s _POLL_INTERVAL_SEC = 2.0 diff --git a/tests/test_listener_skip.py b/tests/test_listener_skip.py index 489762c..052213c 100644 --- a/tests/test_listener_skip.py +++ b/tests/test_listener_skip.py @@ -31,6 +31,59 @@ def test_should_skip_discord_urls(url: str) -> None: assert _should_skip(url) is True +@pytest.mark.parametrize( + "url", + [ + # 这次实事故的 klipy GIF + "https://klipy.com/gifs/hello-8126--k01KQ1SBY07FP9N8QRABJGVNGQC", + # Tenor(Discord 贴纸面板默认) + "https://tenor.com/view/cat-cute-gif-1234567", + 
"https://media.tenor.com/AbCdEfGhIj/cat.gif", + # Giphy(也常见) + "https://giphy.com/gifs/cat-cute-AbCdEfGhIj", + "https://media2.giphy.com/media/AbCdEfGhIj/giphy.gif", + # Klipy CDN + "https://media.klipy.com/some.gif", + ], +) +def test_should_skip_sticker_gif_aggregators(url: str) -> None: + assert _should_skip(url) is True + + +@pytest.mark.parametrize( + "url", + [ + # 裸图片(WeChat 图床、随便哪个 host 的图片直链) + "https://mmbiz.qpic.cn/mmbiz_jpg/abc/640.jpg", + "https://example.com/path/photo.PNG", + "https://i.example.com/cat.gif", + "https://example.com/foo.webp", + # 视频/音频直链 + "https://example.com/clip.mp4", + "https://example.com/audio.mp3", + # SVG(即便 host 不在黑名单也拦,配合服务端 SVG 上传黑名单) + "https://example.com/icon.svg", + ], +) +def test_should_skip_bare_media_files(url: str) -> None: + assert _should_skip(url) is True + + +@pytest.mark.parametrize( + "url", + [ + # path 不带媒体扩展,但 query 里出现 .jpg —— 不该误命中 + "https://example.com/api?file=foo.jpg", + # 微信公众号文章 URL(典型分享) + "https://mp.weixin.qq.com/s/abc", + # 小红书帖子(path 没扩展名) + "https://www.xiaohongshu.com/explore/abc123", + ], +) +def test_should_not_skip_normal_articles_with_media_query(url: str) -> None: + assert _should_skip(url) is False + + @pytest.mark.parametrize( "url", [ From 6ab2fb6c6301ddec46f8f3c1d49949781b5f15f5 Mon Sep 17 00:00:00 2001 From: longsizhuo Date: Sat, 25 Apr 2026 07:48:59 +0000 Subject: [PATCH 2/3] =?UTF-8?q?fix(replies):=20=E5=8F=8D=E9=A6=88=E9=93=BE?= =?UTF-8?q?=E6=8E=A5=E4=BB=8E=20/share=20=E6=94=B9=E6=88=90=20/feed?= =?UTF-8?q?=EF=BC=8C'=E5=B7=B2=E4=B8=8A=E6=9E=B6'=20=E5=BA=94=E8=B7=B3?= =?UTF-8?q?=E5=88=97=E8=A1=A8=E9=A1=B5=E8=80=8C=E9=9D=9E=E6=8F=90=E4=BA=A4?= =?UTF-8?q?=E8=A1=A8=E5=8D=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /share 是单页提交入口(带 ?url=... 
预填,给 bookmarklet 用),/feed 才是已审核通过的展示墙。Bot 在 listener.py(首条 reply + APPROVED 终态 reply)和 commands.py(/share 斜杠命令成功回执)三处都把 '点此查看 / 已收录到内卷地狱分享库' 链接指向 /share——结果用户点过去看到的是空提交表单,不是自己刚分享的内容。 --- src/chat_bot/cogs/commands.py | 2 +- src/chat_bot/cogs/listener.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/chat_bot/cogs/commands.py b/src/chat_bot/cogs/commands.py index d5b5e98..368ff33 100644 --- a/src/chat_bot/cogs/commands.py +++ b/src/chat_bot/cogs/commands.py @@ -86,7 +86,7 @@ async def share( # 小字 caption 用 `-# ...` 语法(Discord 的 subtext 行,显示为灰色细字) content = ( f"{url}\n" - f"-# ✅ 已收录到 [内卷地狱分享库](https://involutionhell.com/share) " + f"-# ✅ 已收录到 [内卷地狱分享库](https://involutionhell.com/feed) " f"· `#{result.link_id}` · by {interaction.user.display_name}" ) if recommendation: diff --git a/src/chat_bot/cogs/listener.py b/src/chat_bot/cogs/listener.py index 221768a..9fe4d1d 100644 --- a/src/chat_bot/cogs/listener.py +++ b/src/chat_bot/cogs/listener.py @@ -191,7 +191,7 @@ async def _handle_one_url(self, message: discord.Message, url: str) -> None: await self._safe_reply( message, f"感谢 {message.author.mention} 大佬分享!正在过审核," - f"通过后会上架 [内卷地狱分享库]() #{result.link_id}", + f"通过后会上架 [内卷地狱分享库]() #{result.link_id}", ) # 后台轮询拿最终状态,拿到了再发第二条 @@ -238,7 +238,7 @@ async def _send_status_update( await self._safe_reply( message, f"🎉 {user} 已上架 · #{link_id} " - f"[点此查看]()", + f"[点此查看]()", ) return From 220de9fb7a3021da25652996b5d7c5b56bc9ba95 Mon Sep 17 00:00:00 2001 From: longsizhuo Date: Sat, 25 Apr 2026 08:01:02 +0000 Subject: [PATCH 3/3] =?UTF-8?q?fix(listener):=20=E8=87=AA=E5=AE=B6=20GitHu?= =?UTF-8?q?b=20=E4=BB=93=E5=BA=93=20PR/issue/commit=20=E7=AD=89=20dev=20?= =?UTF-8?q?=E5=AD=90=E8=B7=AF=E5=BE=84=E4=B8=8D=E5=85=A5=E5=88=86=E4=BA=AB?= =?UTF-8?q?=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 用户在分享频道贴自己 PR (https://github.com/InvolutionHell/ChatBot/pull/2) 通告,bot 把它当 '社区分享' 收成 #19。同类还会有 
issue/commit/compare/actions/releases/discussions/blob/tree 等 dev 子路径。 策略:path 至少 3 段(/<org>/<repo>/<sub>)且 org=involutionhell 时 skip,仓库主页和第三方仓库全放行。这是 dev 自循环噪声专杀,不影响合法分享。+11 测试 case。 --- src/chat_bot/cogs/listener.py | 35 ++++++++++++++++++++++++++++++++++- tests/test_listener_skip.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/chat_bot/cogs/listener.py b/src/chat_bot/cogs/listener.py index 9fe4d1d..b62aa31 100644 --- a/src/chat_bot/cogs/listener.py +++ b/src/chat_bot/cogs/listener.py @@ -83,8 +83,39 @@ ) +_INTERNAL_GITHUB_ORG = "involutionhell" # GitHub 路径不区分大小写,统一比小写 + + +def _is_self_org_github_chatter(parsed) -> bool: + """github.com/InvolutionHell/<repo>/<sub> 视为内部 dev 讨论,不入分享库。 + + 放行 case: + - github.com/InvolutionHell/<repo> 仓库主页("安利自家工具"这种正常分享) + - github.com/InvolutionHell/<repo>/ 同上,带尾斜杠 + - github.com/InvolutionHell org 主页(罕见,但放行) + - github.com/<其它 org>/... 第三方仓库的任何路径 + + 拦截 case: + - github.com/InvolutionHell/<repo>/pull/N + - github.com/InvolutionHell/<repo>/issues/N + - github.com/InvolutionHell/<repo>/commit/<sha> + - github.com/InvolutionHell/<repo>/blob/... + - github.com/InvolutionHell/<repo>/tree/... + - github.com/InvolutionHell/<repo>/actions/... + - github.com/InvolutionHell/<repo>/discussions/... + - github.com/InvolutionHell/<repo>/releases/tag/... 
+ —— 这些是 PR/issue 自动通知或 dev 联调时贴的,不是给社区"上架"的资源 + """ + host = parsed.netloc.lower().split(":")[0] + if host not in {"github.com", "www.github.com"}: + return False + segs = [s for s in parsed.path.split("/") if s] + # /<org>/<repo>/<sub> (>= 3 段才算 dev 子路径) + return len(segs) >= 3 and segs[0].lower() == _INTERNAL_GITHUB_ORG + + def _should_skip(url: str) -> bool: - """URL 是否属于需要跳过的源:Discord 域、贴纸聚合、或裸媒体文件。""" + """URL 是否属于需要跳过的源:Discord 域、贴纸聚合、自家 GitHub dev 子路径、或裸媒体文件。""" try: parsed = urlparse(url) except Exception: @@ -92,6 +123,8 @@ def _should_skip(url: str) -> bool: host = parsed.netloc.lower().split(":")[0] if host in _SKIP_HOSTS: return True + if _is_self_org_github_chatter(parsed): + return True # path 走小写匹配,跟 query 解耦:?foo=bar.jpg 不会误命中 return parsed.path.lower().endswith(_MEDIA_EXTENSIONS) diff --git a/tests/test_listener_skip.py b/tests/test_listener_skip.py index 052213c..3772eeb 100644 --- a/tests/test_listener_skip.py +++ b/tests/test_listener_skip.py @@ -89,7 +89,12 @@ def test_should_not_skip_normal_articles_with_media_query(url: str) -> None: [ "https://arxiv.org/abs/2501.00001", "https://mp.weixin.qq.com/s/abc", + # 自家仓库主页是合法分享("看看我们的新工具"),允许 "https://github.com/InvolutionHell/ChatBot", + "https://github.com/InvolutionHell/ChatBot/", + # 第三方仓库的任何路径都允许 + "https://github.com/torvalds/linux/commit/abc123", + "https://github.com/openai/openai-python/pull/42", "https://scholar.google.com/scholar?q=rag", # 只有 host 相似但不完全匹配就不该 skip(防范未来新域名放行策略) "https://not-discord.com/x", @@ -100,6 +105,29 @@ def test_should_not_skip_other_urls(url: str) -> None: assert _should_skip(url) is False +@pytest.mark.parametrize( + "url", + [ + # 实际事故:bot 自己的 PR 通告被自己捕获成 #19 + "https://github.com/InvolutionHell/ChatBot/pull/2", + # 各种 dev 子路径都该 skip + "https://github.com/InvolutionHell/ChatBot/issues/5", + "https://github.com/InvolutionHell/ChatBot/commit/abc123", + "https://github.com/InvolutionHell/ChatBot/compare/main...feature", 
"https://github.com/InvolutionHell/ChatBot/actions/runs/123", + "https://github.com/InvolutionHell/ChatBot/releases/tag/v1.0", + "https://github.com/InvolutionHell/ChatBot/discussions/10", + "https://github.com/InvolutionHell/ChatBot/blob/main/README.md", + "https://github.com/InvolutionHell/ChatBot/tree/main/src", + # 大小写漂移也要拦 + "https://github.com/INVOLUTIONHELL/ChatBot/pull/2", + "https://www.github.com/InvolutionHell/involutionhell/pull/320", + ], +) +def test_should_skip_self_org_github_dev_chatter(url: str) -> None: + assert _should_skip(url) is True + + def test_should_skip_handles_bad_url_gracefully() -> None: # 坏 URL 不应抛异常;当前 urlparse 对大多数输入都不抛,兜底返回 False assert _should_skip("not-a-url") is False