-
Notifications
You must be signed in to change notification settings - Fork 0
fix(listener): 拦贴纸/GIF聚合站 + 裸媒体文件,避免 Discord 表情包误入分享库 #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,31 +28,105 @@ | |
|
|
||
| _URL_RE = re.compile(r"https?://[^\s<>\"'\]\)]+", re.IGNORECASE) | ||
|
|
||
| # 跳过 Discord 自身的各种链接:用户经常复制错(比如右键"复制消息链接"会粘 | ||
| # discord.com/channels/.../... 出来,这不该被当作"分享"入库)。静默忽略,不 | ||
| # 回复也不提交,像 bot 没看到一样。 | ||
| # 跳过的链接源。两层: | ||
| # 1. Discord 自身(消息链接 / 附件 CDN)—— 用户复制消息链接时常误粘 | ||
| # 2. 贴纸 / GIF / meme 聚合站 —— Discord 内置贴纸面板会发 tenor/klipy/giphy | ||
| # 链接出来,message.content 里就是裸 URL。这些不是"分享资源",不该入库 | ||
| # 静默忽略,不回复不提交,像 bot 没看到一样。 | ||
| _SKIP_HOSTS = frozenset({ | ||
| # 主站 | ||
| # Discord 主站 | ||
| "discord.com", | ||
| "www.discord.com", | ||
| "canary.discord.com", | ||
| "ptb.discord.com", | ||
| # 邀请短链 | ||
| # Discord 邀请短链 | ||
| "discord.gg", | ||
| # 附件 / CDN | ||
| # Discord 附件 / CDN | ||
| "discordapp.com", | ||
| "cdn.discordapp.com", | ||
| "media.discordapp.net", | ||
| # 贴纸 / GIF 聚合(Discord 贴纸面板默认走这些) | ||
| "tenor.com", | ||
| "media.tenor.com", | ||
| "c.tenor.com", | ||
| "giphy.com", | ||
| "media.giphy.com", | ||
| "media0.giphy.com", | ||
| "media1.giphy.com", | ||
| "media2.giphy.com", | ||
| "media3.giphy.com", | ||
| "media4.giphy.com", | ||
| "klipy.com", | ||
| "media.klipy.com", | ||
| }) | ||
|
|
||
| # 兜底:只指向静态媒体文件的 URL(路径以这些扩展名结尾)一律跳过——常见于 | ||
| # WeChat / 各种图床的裸图片链接,非分享资源。把扩展名匹配做在 path 上避免误伤 | ||
| # 带 query 的正常链接(query 里出现 .jpg 不算)。 | ||
| _MEDIA_EXTENSIONS = ( | ||
| ".gif", | ||
| ".png", | ||
| ".jpg", | ||
| ".jpeg", | ||
| ".webp", | ||
| ".bmp", | ||
| ".svg", | ||
| ".ico", | ||
| ".mp4", | ||
| ".webm", | ||
| ".mov", | ||
| ".m4v", | ||
| ".mp3", | ||
| ".wav", | ||
| ".ogg", | ||
| ".flac", | ||
| ) | ||
|
|
||
|
|
||
| _INTERNAL_GITHUB_ORG = "involutionhell" # GitHub 路径不区分大小写,统一比小写 | ||
|
|
||
|
|
||
| def _is_self_org_github_chatter(parsed) -> bool: | ||
| """github.com/InvolutionHell/<repo>/<sub-path> 视为内部 dev 讨论,不入分享库。 | ||
|
|
||
| 放行 case: | ||
| - github.com/InvolutionHell/<repo> 仓库主页("安利自家工具"这种正常分享) | ||
| - github.com/InvolutionHell/<repo>/ 同上,带尾斜杠 | ||
| - github.com/InvolutionHell org 主页(罕见,但放行) | ||
| - github.com/<其它 org>/... 第三方仓库的任何路径 | ||
|
|
||
| 拦截 case: | ||
| - github.com/InvolutionHell/<repo>/pull/N | ||
| - github.com/InvolutionHell/<repo>/issues/N | ||
| - github.com/InvolutionHell/<repo>/commit/<sha> | ||
| - github.com/InvolutionHell/<repo>/blob/... | ||
| - github.com/InvolutionHell/<repo>/tree/... | ||
| - github.com/InvolutionHell/<repo>/actions/... | ||
| - github.com/InvolutionHell/<repo>/discussions/... | ||
| - github.com/InvolutionHell/<repo>/releases/tag/... | ||
| —— 这些是 PR/issue 自动通知或 dev 联调时贴的,不是给社区"上架"的资源 | ||
| """ | ||
| host = parsed.netloc.lower().split(":")[0] | ||
| if host not in {"github.com", "www.github.com"}: | ||
| return False | ||
| segs = [s for s in parsed.path.split("/") if s] | ||
| # /<org>/<repo>/<sub-path...> (>= 3 段才算 dev 子路径) | ||
| return len(segs) >= 3 and segs[0].lower() == _INTERNAL_GITHUB_ORG | ||
|
|
||
|
|
||
| def _should_skip(url: str) -> bool: | ||
| """URL 是否属于需要跳过的源(当前只屏蔽 Discord 自身域名)。""" | ||
| """URL 是否属于需要跳过的源:Discord 域、贴纸聚合、自家 GitHub dev 子路径、或裸媒体文件。""" | ||
| try: | ||
| host = urlparse(url).netloc.lower().split(":")[0] | ||
| parsed = urlparse(url) | ||
| except Exception: | ||
| return False | ||
| return host in _SKIP_HOSTS | ||
| host = parsed.netloc.lower().split(":")[0] | ||
| if host in _SKIP_HOSTS: | ||
| return True | ||
| if _is_self_org_github_chatter(parsed): | ||
| return True | ||
| # path 走小写匹配,跟 query 解耦:?foo=bar.jpg 不会误命中 | ||
| return parsed.path.lower().endswith(_MEDIA_EXTENSIONS) | ||
|
Comment on lines
+128
to
+129
|
||
|
|
||
| # 轮询最终状态的参数:每 2s 查一次,最多 30s | ||
| _POLL_INTERVAL_SEC = 2.0 | ||
|
|
@@ -150,7 +224,7 @@ async def _handle_one_url(self, message: discord.Message, url: str) -> None: | |
| await self._safe_reply( | ||
| message, | ||
| f"感谢 {message.author.mention} 大佬分享!正在过审核," | ||
| f"通过后会上架 [内卷地狱分享库](<https://involutionhell.com/share>) #{result.link_id}", | ||
| f"通过后会上架 [内卷地狱分享库](<https://involutionhell.com/feed>) #{result.link_id}", | ||
| ) | ||
|
|
||
| # 后台轮询拿最终状态,拿到了再发第二条 | ||
|
|
@@ -197,7 +271,7 @@ async def _send_status_update( | |
| await self._safe_reply( | ||
| message, | ||
| f"🎉 {user} 已上架 · #{link_id} " | ||
| f"[点此查看](<https://involutionhell.com/share>)", | ||
| f"[点此查看](<https://involutionhell.com/feed>)", | ||
| ) | ||
| return | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,12 +31,70 @@ def test_should_skip_discord_urls(url: str) -> None: | |
| assert _should_skip(url) is True | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "url", | ||
| [ | ||
| # 这次实际事故的 klipy GIF | ||
| "https://klipy.com/gifs/hello-8126--k01KQ1SBY07FP9N8QRABJGVNGQC", | ||
| # Tenor(Discord 贴纸面板默认) | ||
| "https://tenor.com/view/cat-cute-gif-1234567", | ||
| "https://media.tenor.com/AbCdEfGhIj/cat.gif", | ||
| # Giphy(也常见) | ||
| "https://giphy.com/gifs/cat-cute-AbCdEfGhIj", | ||
| "https://media2.giphy.com/media/AbCdEfGhIj/giphy.gif", | ||
| # Klipy CDN | ||
| "https://media.klipy.com/some.gif", | ||
| ], | ||
| ) | ||
| def test_should_skip_sticker_gif_aggregators(url: str) -> None: | ||
| assert _should_skip(url) is True | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "url", | ||
| [ | ||
| # 裸图片(WeChat 图床、随便哪个 host 的图片直链) | ||
| "https://mmbiz.qpic.cn/mmbiz_jpg/abc/640.jpg", | ||
| "https://example.com/path/photo.PNG", | ||
| "https://i.example.com/cat.gif", | ||
| "https://example.com/foo.webp", | ||
| # 视频/音频直链 | ||
| "https://example.com/clip.mp4", | ||
| "https://example.com/audio.mp3", | ||
| # SVG(即便 host 不在黑名单也拦,配合服务端 SVG 上传黑名单) | ||
| "https://example.com/icon.svg", | ||
| ], | ||
|
Comment on lines
+53
to
+66
|
||
| ) | ||
| def test_should_skip_bare_media_files(url: str) -> None: | ||
| assert _should_skip(url) is True | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "url", | ||
| [ | ||
| # path 不带媒体扩展,但 query 里出现 .jpg —— 不该误命中 | ||
| "https://example.com/api?file=foo.jpg", | ||
| # 微信公众号文章 URL(典型分享) | ||
| "https://mp.weixin.qq.com/s/abc", | ||
| # 小红书帖子(path 没扩展名) | ||
| "https://www.xiaohongshu.com/explore/abc123", | ||
| ], | ||
| ) | ||
| def test_should_not_skip_normal_articles_with_media_query(url: str) -> None: | ||
| assert _should_skip(url) is False | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "url", | ||
| [ | ||
| "https://arxiv.org/abs/2501.00001", | ||
| "https://mp.weixin.qq.com/s/abc", | ||
| # 自家仓库主页是合法分享("看看我们的新工具"),允许 | ||
| "https://github.com/InvolutionHell/ChatBot", | ||
| "https://github.com/InvolutionHell/ChatBot/", | ||
| # 第三方仓库的任何路径都允许 | ||
| "https://github.com/torvalds/linux/commit/abc123", | ||
| "https://github.com/openai/openai-python/pull/42", | ||
| "https://scholar.google.com/scholar?q=rag", | ||
| # 只有 host 相似但不完全匹配就不该 skip(防范未来新域名放行策略) | ||
| "https://not-discord.com/x", | ||
|
|
@@ -47,6 +105,29 @@ def test_should_not_skip_other_urls(url: str) -> None: | |
| assert _should_skip(url) is False | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "url", | ||
| [ | ||
| # 实际事故:bot 自己的 PR 通告被自己捕获成 #19 | ||
| "https://github.com/InvolutionHell/ChatBot/pull/2", | ||
| # 各种 dev 子路径都该 skip | ||
| "https://github.com/InvolutionHell/ChatBot/issues/5", | ||
| "https://github.com/InvolutionHell/ChatBot/commit/abc123", | ||
| "https://github.com/InvolutionHell/ChatBot/compare/main...feature", | ||
| "https://github.com/InvolutionHell/ChatBot/actions/runs/123", | ||
| "https://github.com/InvolutionHell/ChatBot/releases/tag/v1.0", | ||
| "https://github.com/InvolutionHell/ChatBot/discussions/10", | ||
| "https://github.com/InvolutionHell/ChatBot/blob/main/README.md", | ||
| "https://github.com/InvolutionHell/ChatBot/tree/main/src", | ||
| # 大小写漂移也要拦 | ||
| "https://github.com/INVOLUTIONHELL/ChatBot/pull/2", | ||
| "https://www.github.com/InvolutionHell/involutionhell/pull/320", | ||
| ], | ||
| ) | ||
| def test_should_skip_self_org_github_dev_chatter(url: str) -> None: | ||
| assert _should_skip(url) is True | ||
|
|
||
|
|
||
| def test_should_skip_handles_bad_url_gracefully() -> None: | ||
| # 坏 URL 不应抛异常;当前 urlparse 对大多数输入都不抛,兜底返回 False | ||
| assert _should_skip("not-a-url") is False | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`host = parsed.netloc.lower().split(":")[0]` is a bit fragile (doesn’t handle IPv6 literals like `[::1]:443` and can be confused by userinfo in the URL). Prefer `parsed.hostname` (already lowercased by `urlparse`) and then compare against `_SKIP_HOSTS`.