# 方案A Demo：本地生成 feed.xml 并用 raw 链接给 foLo 订阅

- 本 Notebook 演示如何聚合 RSS、筛选并在仓库根目录生成 `feed.xml`。
- 推送到远端后，可用如下 raw 链接在 foLo/Feedly 订阅：

``
https://raw.githubusercontent.com/JedimasterLu/acafeed/feedemit/feed.xml
``

你也可以把分支名改为你的默认分支（例如 main）。

In [1]:
import feedparser
from feedgen.feed import FeedGenerator
from datetime import datetime, timedelta, timezone
import re, os
from typing import Any, Dict, List, Optional

In [2]:
# 配置：源、过滤规则、元信息与输出位置

SOURCES = [
    {"name": "Nature Materials", "url": "http://www.nature.com/nmat/current_issue/rss/"},
    # 在这里添加更多源...
]

FILTERS = {
    "include": [],   # ["ai", "ml"] 仅包含包含任意关键词的条目
    "exclude": [],   # ["ad", "sponsor"] 排除包含任意关键词的条目
    "since_days": 7, # 仅保留最近 N 天；设为 None/删除以关闭
    "max_entries": 100, # 输出条目上限
}

META = {
    "title": "AcaFeed Combined Feed",
    "description": "Aggregated feed generated locally",
    "home_page": "https://github.com/JedimasterLu/acafeed",
    # 可选：如果已确定最终订阅URL，可填入作为 rel=self
    # "feed_link": "https://raw.githubusercontent.com/JedimasterLu/acafeed/feedemit/feed.xml"
}

OUTPUT_PATH = os.path.abspath(os.path.join(os.getcwd(), "feed.xml"))
print("Output ->", OUTPUT_PATH)

Output -> /Users/jylu/Projects/acafeed/feed.xml


In [3]:
# 工具函数：标准化文本 / 解析时间 / 去重键

from typing import Tuple

def normalize_text(s: Optional[str]) -> str:
    if not s:
        return ""
    s = re.sub(r"<[^>]+>", " ", s)
    return s.strip().lower()


def parse_dt(entry: Dict[str, Any]) -> datetime:
    for field in ("published_parsed", "updated_parsed"):
        dt_struct = entry.get(field)
        if dt_struct is not None:
            try:
                return datetime(*dt_struct[:6], tzinfo=timezone.utc)
            except Exception:
                pass
    return datetime.now(timezone.utc)


def entry_key(entry: Dict[str, Any]) -> str:
    return entry.get("link") or entry.get("id") or normalize_text(entry.get("title") or "")


def fetch_entries(sources: List[Dict[str, str]]) -> List[Dict[str, Any]]:
    all_entries = []
    for src in sources:
        url = src.get("url") or src.get("link")
        if not url:
            continue
        parsed = feedparser.parse(url)
        for e in getattr(parsed, "entries", []) or []:
            e["_source_name"] = src.get("name") or url
            all_entries.append(e)
    return all_entries


def apply_filters(entries: List[Dict[str, Any]], filters: Dict[str, Any]) -> List[Dict[str, Any]]:
    include = [x.lower() for x in (filters.get("include") or []) if x]
    exclude = [x.lower() for x in (filters.get("exclude") or []) if x]
    since_days = filters.get("since_days")
    cutoff = None
    if isinstance(since_days, int) and since_days > 0:
        cutoff = datetime.now(timezone.utc) - timedelta(days=since_days)

    def ok(e: Dict[str, Any]) -> bool:
        text = " ".join([normalize_text(e.get("title")), normalize_text(e.get("summary")), normalize_text(e.get("description"))])
        if include and not any(k in text for k in include):
            return False
        if exclude and any(k in text for k in exclude):
            return False
        if cutoff is not None and parse_dt(e) < cutoff:
            return False
        return True

    return [e for e in entries if ok(e)]

In [4]:
# 生成 feed.xml

# 1) 拉取与筛选
entries = fetch_entries(SOURCES)
filtered = apply_filters(entries, FILTERS)

# 2) 去重与排序
seen = set()
unique = []
for e in filtered:
    k = entry_key(e)
    if not k or k in seen:
        continue
    seen.add(k)
    unique.append(e)
unique.sort(key=parse_dt, reverse=True)

# 3) 裁剪数量
max_entries = FILTERS.get("max_entries")
if isinstance(max_entries, int) and max_entries > 0:
    unique = unique[:max_entries]

# 4) 构建 RSS 并写入
fg = FeedGenerator()
fg.id(META.get("home_page") or "https://github.com/JedimasterLu/acafeed")
fg.title(META.get("title") or "AcaFeed")
fg.link(href=META.get("home_page") or "https://github.com/JedimasterLu/acafeed", rel="alternate")
if META.get("feed_link"):
    fg.link(href=META["feed_link"], rel="self")
fg.description(META.get("description") or META.get("title") or "AcaFeed")
fg.language("en")

for e in unique:
    fe = fg.add_entry()
    fe.id(entry_key(e))
    fe.title(e.get("title") or (normalize_text(e.get("summary"))[:140] or "Untitled"))
    if e.get("link"):
        fe.link(href=e["link"])
    if e.get("summary") or e.get("description"):
        fe.description(e.get("summary") or e.get("description"))
    fe.published(parse_dt(e))
    if e.get("_source_name"):
        fe.category(term=e.get("_source_name"))

fg.rss_file(OUTPUT_PATH, pretty=True)
print(f"Wrote {OUTPUT_PATH} with {len(unique)} entries")

Wrote /Users/jylu/Projects/acafeed/feed.xml with 6 entries


In [5]:
# 查看 feed.xml 前若干行

with open(OUTPUT_PATH, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        print(line.rstrip())
        if i > 40:
            break

<?xml version='1.0' encoding='UTF-8'?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
  <channel>
    <title>AcaFeed Combined Feed</title>
    <link>https://github.com/JedimasterLu/acafeed</link>
    <description>Aggregated feed generated locally</description>
    <docs>http://www.rssboard.org/rss-specification</docs>
    <generator>python-feedgen</generator>
    <language>en</language>
    <lastBuildDate>Tue, 11 Nov 2025 21:25:48 +0000</lastBuildDate>
    <item>
      <title>Mechano-induced patterned domain formation by monocytes</title>
      <link>https://www.nature.com/articles/s41563-025-02397-2</link>
      <description>&lt;p&gt;Nature Materials, Published online: 05 November 2025; &lt;a href="https://www.nature.com/articles/s41563-025-02397-2"&gt;doi:10.1038/s41563-025-02397-2&lt;/a&gt;&lt;/p&gt;Human primary monocytes reversibly phase separate into regular, multicellular, multilayered domains on soft matrice

## 推送与订阅

1. 在终端提交并推送：

```fish
git add feed.xml
git commit -m "chore: update feed.xml"
git push origin feedemit
```

2. 在 foLo 的新增订阅里，粘贴这个 URL（当前分支 `feedemit`）：

``
https://raw.githubusercontent.com/JedimasterLu/acafeed/feedemit/feed.xml
```

若你用的是其他默认分支（如 `main`），把 URL 中的分支名替换为 `main` 即可。