_URL_RE = re.compile(r"https?://[^\s<>\"'\]\)]+", re.IGNORECASE)


# Link sources that are silently skipped: no reply and no submission, as if
# the bot never saw the message. Two groups:
#   1. Discord itself (message links / attachment CDN) -- users often paste
#      these by mistake when copying a message link.
#   2. Sticker / GIF / meme aggregators -- Discord's built-in sticker panel
#      posts tenor/klipy/giphy links, which show up as bare URLs in
#      message.content. They are not "shared resources" and must not be
#      stored.
# Matching is exact on the host name; look-alike domains are not skipped.
_SKIP_HOSTS = frozenset({
    # Discord main site
    "discord.com",
    "www.discord.com",
    "canary.discord.com",
    "ptb.discord.com",
    # Discord invite short links
    "discord.gg",
    # Discord attachments / CDN
    "discordapp.com",
    "cdn.discordapp.com",
    "media.discordapp.net",
    # Sticker / GIF aggregators (used by Discord's sticker panel)
    "tenor.com",
    "media.tenor.com",
    "c.tenor.com",
    "giphy.com",
    "media.giphy.com",
    "media0.giphy.com",
    "media1.giphy.com",
    "media2.giphy.com",
    "media3.giphy.com",
    "media4.giphy.com",
    "klipy.com",
    "media.klipy.com",
})

# Fallback: any URL whose *path* ends with one of these extensions points at a
# bare static media file (typical for WeChat and other image hosts) and is not
# a shared resource. The check is done on the path only, so a normal link that
# merely carries ".jpg" inside its query string is not affected.
_MEDIA_EXTENSIONS = (
    ".gif",
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
    ".bmp",
    ".svg",
    ".ico",
    ".mp4",
    ".webm",
    ".mov",
    ".m4v",
    ".mp3",
    ".wav",
    ".ogg",
    ".flac",
)


# GitHub paths are case-insensitive, so the org is always compared lowercase.
_INTERNAL_GITHUB_ORG = "involutionhell"


def _normalized_host(parsed) -> str:
    """Return the bare, lowercase host name of an already-parsed URL.

    ``parsed.hostname`` strips any ``user:password@`` prefix, the port and the
    brackets of an IPv6 literal, and lowercases the result. A trailing root
    dot (``example.com.``) is removed as well so that fully-qualified spellings
    compare equal to the plain host. Returns ``""`` when the URL has no host.
    """
    return (parsed.hostname or "").rstrip(".")


def _is_self_org_github_chatter(parsed) -> bool:
    """Tell whether a parsed URL is an internal dev link of our own GitHub org.

    ``github.com/InvolutionHell/<repo>/<sub-path...>`` is treated as internal
    development chatter and is kept out of the share library.

    Allowed (returns ``False``):
      - ``github.com/InvolutionHell/<repo>``   repository home page
      - ``github.com/InvolutionHell/<repo>/``  same, with a trailing slash
      - ``github.com/InvolutionHell``          org home page
      - ``github.com/<other org>/...``         any path of a third-party repo

    Blocked (returns ``True``) -- anything below a repository of our org, e.g.
    ``/pull/N``, ``/issues/N``, ``/commit/<sha>``, ``/blob/...``, ``/tree/...``,
    ``/actions/...``, ``/discussions/...``, ``/releases/tag/...``. These come
    from PR/issue notifications or dev debugging, not from community sharing.

    Args:
        parsed: Result of ``urllib.parse.urlparse``.

    Returns:
        ``True`` when the host is ``github.com`` / ``www.github.com``, the first
        path segment equals the internal org (case-insensitively) and the path
        has at least three non-empty segments.
    """
    if _normalized_host(parsed) not in {"github.com", "www.github.com"}:
        return False
    segments = [seg for seg in parsed.path.split("/") if seg]
    # /<org>/<repo>/<sub>: only three or more segments count as a dev sub-path.
    return len(segments) >= 3 and segments[0].lower() == _INTERNAL_GITHUB_ORG


def _should_skip(url: str) -> bool:
    """Decide whether a URL found in a message must be ignored.

    A URL is skipped when its host is listed in ``_SKIP_HOSTS`` (Discord and
    sticker/GIF aggregators), when it is a dev sub-path of our own GitHub org,
    or when its path ends with a static media extension.

    Args:
        url: Raw URL text.

    Returns:
        ``True`` to skip the URL, ``False`` to process it. A URL that cannot be
        parsed is never skipped here (returns ``False``).
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse reports malformed input (e.g. an unbalanced IPv6 bracket)
        # with ValueError; treat such text as "not one of the skipped sources".
        return False
    if _normalized_host(parsed) in _SKIP_HOSTS:
        return True
    if _is_self_org_github_chatter(parsed):
        return True
    # Lowercase the path only: "?foo=bar.jpg" in the query must not match.
    return parsed.path.lower().endswith(_MEDIA_EXTENSIONS)


# Parameters for polling the final status: check every 2 s, for at most 30 s.
_POLL_INTERVAL_SEC = 2.0
a/tests/test_listener_skip.py b/tests/test_listener_skip.py index 489762c..3772eeb 100644 --- a/tests/test_listener_skip.py +++ b/tests/test_listener_skip.py @@ -31,12 +31,70 @@ def test_should_skip_discord_urls(url: str) -> None: assert _should_skip(url) is True +@pytest.mark.parametrize( + "url", + [ + # 这次实事故的 klipy GIF + "https://klipy.com/gifs/hello-8126--k01KQ1SBY07FP9N8QRABJGVNGQC", + # Tenor(Discord 贴纸面板默认) + "https://tenor.com/view/cat-cute-gif-1234567", + "https://media.tenor.com/AbCdEfGhIj/cat.gif", + # Giphy(也常见) + "https://giphy.com/gifs/cat-cute-AbCdEfGhIj", + "https://media2.giphy.com/media/AbCdEfGhIj/giphy.gif", + # Klipy CDN + "https://media.klipy.com/some.gif", + ], +) +def test_should_skip_sticker_gif_aggregators(url: str) -> None: + assert _should_skip(url) is True + + +@pytest.mark.parametrize( + "url", + [ + # 裸图片(WeChat 图床、随便哪个 host 的图片直链) + "https://mmbiz.qpic.cn/mmbiz_jpg/abc/640.jpg", + "https://example.com/path/photo.PNG", + "https://i.example.com/cat.gif", + "https://example.com/foo.webp", + # 视频/音频直链 + "https://example.com/clip.mp4", + "https://example.com/audio.mp3", + # SVG(即便 host 不在黑名单也拦,配合服务端 SVG 上传黑名单) + "https://example.com/icon.svg", + ], +) +def test_should_skip_bare_media_files(url: str) -> None: + assert _should_skip(url) is True + + +@pytest.mark.parametrize( + "url", + [ + # path 不带媒体扩展,但 query 里出现 .jpg —— 不该误命中 + "https://example.com/api?file=foo.jpg", + # 微信公众号文章 URL(典型分享) + "https://mp.weixin.qq.com/s/abc", + # 小红书帖子(path 没扩展名) + "https://www.xiaohongshu.com/explore/abc123", + ], +) +def test_should_not_skip_normal_articles_with_media_query(url: str) -> None: + assert _should_skip(url) is False + + @pytest.mark.parametrize( "url", [ "https://arxiv.org/abs/2501.00001", "https://mp.weixin.qq.com/s/abc", + # 自家仓库主页是合法分享("看看我们的新工具"),允许 "https://github.com/InvolutionHell/ChatBot", + "https://github.com/InvolutionHell/ChatBot/", + # 第三方仓库的任何路径都允许 + "https://github.com/torvalds/linux/commit/abc123", + 
"https://github.com/openai/openai-python/pull/42", "https://scholar.google.com/scholar?q=rag", # 只有 host 相似但不完全匹配就不该 skip(防范未来新域名放行策略) "https://not-discord.com/x", @@ -47,6 +105,29 @@ def test_should_not_skip_other_urls(url: str) -> None: assert _should_skip(url) is False +@pytest.mark.parametrize( + "url", + [ + # 实际事故:bot 自己的 PR 通告被自己捕获成 #19 + "https://github.com/InvolutionHell/ChatBot/pull/2", + # 各种 dev 子路径都该 skip + "https://github.com/InvolutionHell/ChatBot/issues/5", + "https://github.com/InvolutionHell/ChatBot/commit/abc123", + "https://github.com/InvolutionHell/ChatBot/compare/main...feature", + "https://github.com/InvolutionHell/ChatBot/actions/runs/123", + "https://github.com/InvolutionHell/ChatBot/releases/tag/v1.0", + "https://github.com/InvolutionHell/ChatBot/discussions/10", + "https://github.com/InvolutionHell/ChatBot/blob/main/README.md", + "https://github.com/InvolutionHell/ChatBot/tree/main/src", + # 大小写漂移也要拦 + "https://github.com/INVOLUTIONHELL/ChatBot/pull/2", + "https://www.github.com/InvolutionHell/involutionhell/pull/320", + ], +) +def test_should_skip_self_org_github_dev_chatter(url: str) -> None: + assert _should_skip(url) is True + + def test_should_skip_handles_bad_url_gracefully() -> None: # 坏 URL 不应抛异常;当前 urlparse 对大多数输入都不抛,兜底返回 False assert _should_skip("not-a-url") is False