In [37]:
%pip install --upgrade jupyter requests tqdm ipywidgets pandas

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   -------------------- ------------------- 5.8/11.5 MB 35.3 MB/s eta 0:00:01
   ---------------------------------------- 11.5/11.5 MB 45.0 MB/s eta 0:00:00
Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.2.3 tzdata-2025.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
!jupyter labextension enable widgetsnbextension

In [14]:
import requests
from tqdm.notebook import tqdm
import xml.etree.ElementTree as ET
import html
import time

In [15]:
def fetch_mementos_for_feed(feed_url, limit=10, retries=3, backoff=10, timeout=10):
    """
    Query the Wayback Machine TimeMap Link API for a given feed URL,
    parse the returned Link-format lines, and return up to `limit` memento URIs.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; ArchiveFetcher/1.0; +https://example.com)"
    }
    timemap_url = f"https://web.archive.org/web/timemap/link/{feed_url}"
    for attempt in range(retries):
        try:
            resp = requests.get(timemap_url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            lines = resp.text.splitlines()
            mementos = []
            for line in lines:
                parts = line.split(';', 1)
                if not parts or not parts[0].startswith('<') or '>' not in parts[0]:
                    continue
                uri = parts[0].strip()[1:-1]
                mementos.append(uri)
                if len(mementos) >= limit:
                    break
            return mementos
        except requests.RequestException as e:
            print(f"[ERROR] Attempt {attempt + 1} failed for {timemap_url}: {e}")
            if attempt < retries - 1:
                sleep_time = backoff**(attempt + 1)
                print(f"[INFO] Retrying in {sleep_time}s...")
                time.sleep(sleep_time)
            else:
                print(f"[ERROR] All attempts failed for {timemap_url}")
                return None

In [16]:
def fetch_xml_from_memento(memento_url, retries=3, backoff=10, timeout=10):
    """
    Query the Memento URL, parse it as XML. Retries on failure with backoff.
    Returns an ElementTree root element, or None on failure.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; ArchiveFetcher/1.0; +https://example.com)"
    }

    for attempt in range(retries):
        try:
            resp = requests.get(memento_url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            try:
                root = ET.fromstring(resp.content)
                return root
            except ET.ParseError as e:
                print(f"[ERROR] Failed to parse XML from memento {memento_url}: {e}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"[ERROR] Attempt {attempt + 1} failed for {memento_url}: {e}")
            if attempt < retries - 1:
                sleep_time = backoff**(attempt + 1)
                print(f"[INFO] Retrying in {sleep_time}s...")
                time.sleep(sleep_time)
            else:
                print(f"[ERROR] All attempts failed for {memento_url}")
                return None

In [17]:
def extract_memento_data(source_url, content):
    """
    Extracts relevant data from the XML content of a Memento response.
    """
    # Get title and description of channel
    channel = content.find("channel")
    if channel is not None:
        channel_title = channel.find("title").text if channel.find("title") is not None else None
        channel_description = (
            channel.find("description").text
            if channel.find("description") is not None
            else None
        )
    else:
        channel_title = None
        channel_description = None
    
    # Parse the XML content
    items = content.findall(".//item")

    memento_data = []

    for item in items:
        # Extract required fields from each item
        title = item.find("title").text if item.find("title") is not None else None
        description = (
            item.find("description").text
            if item.find("description") is not None
            else None
        )
        link = item.find("link").text if item.find("link") is not None else None
        pubDate = (
            item.find("pubDate").text if item.find("pubDate") is not None else None
        )

        author_tags = ["author", "{http://purl.org/dc/elements/1.1/}creator", "dc:creator"]
        author = None
        for tag in author_tags:
            elem = item.find(tag)
            if elem is not None and elem.text:
                author = elem.text
                break

        memento_data.append(
            {
                "source": source_url,
                "channel_title": channel_title,
                "channel_description": channel_description,
                "title": title,
                "description": description,
                "link": link,
                "pubDate": pubDate,
                "author": author,
            }
        )

    return memento_data

In [18]:
feed_urls = [
    # For enthusiasts
    "https://machinelearningmastery.com/feed",
    "https://transferlab.ai/index.xml",
    # "http://feeds.feedburner.com/FeaturedBlogPosts-DataScienceCentral?format=xml", <--- Give error, not format as xml RSS
    # For company
    "https://eng.uber.com/tag/machine-learning/feed",
    "https://aws.amazon.com/blogs/machine-learning/feed",
    "http://news.mit.edu/rss/topic/artificial-intelligence2",
    "http://feeds.feedburner.com/nvidiablog",
    "https://openai.com/news/rss.xml",
    "http://feeds.feedburner.com/blogspot/gJZg",
    # For researchers
    "http://arxiv.org/rss/cs.LG",
    "http://arxiv.org/rss/stat.ML",
    "https://distill.pub/rss.xml",
    "https://bair.berkeley.edu/blog/feed.xml",
    "https://becominghuman.ai/feed",
    "https://www.microsoft.com/en-us/research/feed",
]

# Max number of mementos per feed
limit_per_feed = 20

In [19]:
# Create a dictionary to store the feed URLs and their corresponding memento URIs
feed_mementos = {}

In [20]:
for feed in tqdm(feed_urls, desc="Processing feeds", unit="feed"):
    # Skip if already processed
    if feed in feed_mementos and len(feed_mementos[feed]) == limit_per_feed:
        print(f"[INFO] Already processed {feed}")
        continue
    try:
        mementos = fetch_mementos_for_feed(feed, limit=limit_per_feed + 2)
    except requests.HTTPError as e:
        print(f"[ERROR] Failed to fetch for {feed}: {e}")
        continue
    
    # Remove the first two mementos (if they exist)
    # Which is the original feed URL and the timemap URL
    if len(mementos) > 2:
        mementos = mementos[2:]

    # Store the memento URIs in the dictionary
    feed_mementos[feed] = mementos
    print(f"[INFO] Found {len(mementos)} mementos for {feed}")

Processing feeds:   0%|          | 0/14 [00:00<?, ?feed/s]

[INFO] Found 20 mementos for https://machinelearningmastery.com/feed
[INFO] Found 13 mementos for https://transferlab.ai/index.xml
[INFO] Found 15 mementos for https://eng.uber.com/tag/machine-learning/feed
[INFO] Found 20 mementos for https://aws.amazon.com/blogs/machine-learning/feed
[INFO] Found 20 mementos for http://news.mit.edu/rss/topic/artificial-intelligence2
[INFO] Found 20 mementos for http://feeds.feedburner.com/nvidiablog
[INFO] Found 9 mementos for https://openai.com/news/rss.xml
[INFO] Found 20 mementos for http://feeds.feedburner.com/blogspot/gJZg
[INFO] Found 20 mementos for http://arxiv.org/rss/cs.LG
[INFO] Found 20 mementos for http://arxiv.org/rss/stat.ML
[INFO] Found 20 mementos for https://distill.pub/rss.xml
[INFO] Found 20 mementos for https://bair.berkeley.edu/blog/feed.xml
[INFO] Found 20 mementos for https://becominghuman.ai/feed
[INFO] Found 20 mementos for https://www.microsoft.com/en-us/research/feed


In [21]:
# Print the first 5 memento URIs for each feed
for feed, mementos in feed_mementos.items():
    print(f"\nFeed: {feed}")
    for memento in mementos[:5]:
        print(f"- {memento}")


Feed: https://machinelearningmastery.com/feed
- https://web.archive.org/web/https://machinelearningmastery.com/feed
- https://web.archive.org/web/20150912134829/http://machinelearningmastery.com/feed/
- https://web.archive.org/web/20160315034406/http://machinelearningmastery.com/feed/
- https://web.archive.org/web/20170606124339/http://machinelearningmastery.com/feed/
- https://web.archive.org/web/20171021225645/http://machinelearningmastery.com/feed/

Feed: https://transferlab.ai/index.xml
- https://web.archive.org/web/https://transferlab.ai/index.xml
- https://web.archive.org/web/20240212185349/https://transferlab.ai/index.xml
- https://web.archive.org/web/20240421141756/https://transferlab.ai/index.xml
- https://web.archive.org/web/20240725170059/https://transferlab.ai/index.xml
- https://web.archive.org/web/20240910065617/https://transferlab.ai/index.xml

Feed: https://eng.uber.com/tag/machine-learning/feed
- https://web.archive.org/web/https://eng.uber.com/tag/machine-learning/fe

In [22]:
# Print the first momento content for each feed
for feed, mementos in feed_mementos.items():
    print(f"\nFeed: {feed}")
    memento = mementos[0]
    print(f"Fetching memento: {memento}")
    content = fetch_xml_from_memento(memento)
    if content is None:
        print(f"[ERROR] Failed to fetch memento {memento}")
        continue
    memento_data = extract_memento_data(feed, content)
    print(f"Found {len(memento_data)} items in memento {memento}")
    for item in memento_data[:1]:
        print(f"- Title: {item['title']}")
        print(f"  Description: {item['description']}")
        print(f"  Link: {item['link']}")
        print(f"  PubDate: {item['pubDate']}")
        print(f"  Author: {item['author']}")
        print()


Feed: https://machinelearningmastery.com/feed
Fetching memento: https://web.archive.org/web/https://machinelearningmastery.com/feed
Found 10 items in memento https://web.archive.org/web/https://machinelearningmastery.com/feed
- Title: 3 Ways Vibe Coding and AI-Assisted Development Are 2 Different Things
  Description: Vibe coding and AI-assisted development are two trendy terms in today's tech jargon.
  Link: https://machinelearningmastery.com/3-ways-vibe-coding-and-ai-assisted-development-are-2-different-things/
  PubDate: Mon, 31 Mar 2025 11:00:41 +0000
  Author: Iván Palomares Carrascosa


Feed: https://transferlab.ai/index.xml
Fetching memento: https://web.archive.org/web/https://transferlab.ai/index.xml
[ERROR] Attempt 1 failed for https://web.archive.org/web/https://transferlab.ai/index.xml: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/https://transferlab.ai/index.xml (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection

In [25]:
feed_datas = {}

In [29]:
for feed, mementos in feed_mementos.items():
    for memento in tqdm(mementos, desc=f"Processing mementos for {feed}", unit="memento"):
        if memento in feed_datas:
            continue
        content = fetch_xml_from_memento(memento)
        if content is None:
            continue
        data = extract_memento_data(feed, content)
        if data is None:
            continue
        feed_datas[memento] = data

Processing mementos for https://machinelearningmastery.com/feed:   0%|          | 0/20 [00:00<?, ?memento/s]

Processing mementos for https://transferlab.ai/index.xml:   0%|          | 0/13 [00:00<?, ?memento/s]

Processing mementos for https://eng.uber.com/tag/machine-learning/feed:   0%|          | 0/15 [00:00<?, ?memen…

Processing mementos for https://aws.amazon.com/blogs/machine-learning/feed:   0%|          | 0/20 [00:00<?, ?m…

Processing mementos for http://news.mit.edu/rss/topic/artificial-intelligence2:   0%|          | 0/20 [00:00<?…

Processing mementos for http://feeds.feedburner.com/nvidiablog:   0%|          | 0/20 [00:00<?, ?memento/s]

Processing mementos for https://openai.com/news/rss.xml:   0%|          | 0/9 [00:00<?, ?memento/s]

Processing mementos for http://feeds.feedburner.com/blogspot/gJZg:   0%|          | 0/20 [00:00<?, ?memento/s]

[ERROR] Attempt 1 failed for https://web.archive.org/web/20080724154728/http://feeds.feedburner.com/blogspot/gJZg: ('Received response with content-encoding: gzip, but failed to decode it.', error('Error -3 while decompressing data: incorrect header check'))
[INFO] Retrying in 10s...
[ERROR] Attempt 2 failed for https://web.archive.org/web/20080724154728/http://feeds.feedburner.com/blogspot/gJZg: ('Received response with content-encoding: gzip, but failed to decode it.', error('Error -3 while decompressing data: incorrect header check'))
[INFO] Retrying in 100s...


KeyboardInterrupt: 

In [31]:
# Print the first memento data
for memento, data in list(feed_datas.items()):
    print(f"\nMemento: {memento}")
    for item in data[:1]:
        print(f"- Source: {item['source']}")
        print(f"  Channel Title: {item['channel_title']}")
        print(f"  Channel Description: {item['channel_description']}")
        print(f"- Title: {item['title']}")
        print(f"  Description: {item['description']}")
        print(f"  Link: {item['link']}")
        print(f"  PubDate: {item['pubDate']}")
        print(f"  Author: {item['author']}")


Memento: https://web.archive.org/web/https://machinelearningmastery.com/feed
- Source: https://machinelearningmastery.com/feed
  Channel Title: MachineLearningMastery.com
  Channel Description: Making developers awesome at machine learning
- Title: 3 Ways Vibe Coding and AI-Assisted Development Are 2 Different Things
  Description: Vibe coding and AI-assisted development are two trendy terms in today's tech jargon.
  Link: https://machinelearningmastery.com/3-ways-vibe-coding-and-ai-assisted-development-are-2-different-things/
  PubDate: Mon, 31 Mar 2025 11:00:41 +0000
  Author: Iván Palomares Carrascosa

Memento: https://web.archive.org/web/20150912134829/http://machinelearningmastery.com/feed/
- Source: https://machinelearningmastery.com/feed
  Channel Title: Machine Learning Mastery
  Channel Description: Making programmers awesome at machine learning
- Title: How Do I Get Started In Machine Learning? (the short version)
  Description: <p>I get daily emails asking the question: How

In [33]:
# Special code to retrive link for http://proceedings.mlr.press//feed.xml
feed = "http://proceedings.mlr.press//feed.xml"
content = fetch_xml_from_memento(feed)
memento_data = extract_memento_data(feed, content)
for item in tqdm(memento_data, desc="Processing memento data", unit="item"):
    # Example link: https://proceedings.mlr.press//v1/assets/rss/feed.xml
    memento = f"https://proceedings.mlr.press//{item['title']}/assets/rss/feed.xml"
    content = fetch_xml_from_memento(memento)
    if content is None:
        continue
    data = extract_memento_data(feed, content)
    if data is None:
        continue
    feed_datas[memento] = data

Processing memento data:   0%|          | 0/257 [00:00<?, ?item/s]

[ERROR] Failed to parse XML from memento https://proceedings.mlr.press//v263/assets/rss/feed.xml: not well-formed (invalid token): line 36, column 19
[ERROR] Failed to parse XML from memento https://proceedings.mlr.press//v250/assets/rss/feed.xml: not well-formed (invalid token): line 901, column 291


In [34]:
# Flatten all items from all mementos into one list
all_items = []
seen_keys = set()

for memento_data in feed_datas.values():
    for item in memento_data:
        key = (item["title"], item["link"], item["pubDate"], item["author"])
        if key not in seen_keys:
            seen_keys.add(key)
            all_items.append(item)

In [35]:
# Compare the number of unique items with the original list
print(f"\nTotal items fetched: {sum(len(data) for data in feed_datas.values())}")
print(f"\nTotal unique items: {len(all_items)}")



Total items fetched: 36040

Total unique items: 26650


In [38]:
# Save the unique items to a CSV file
import pandas as pd
import os
from datetime import datetime
output_dir = "memento_data"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"memento_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
df = pd.DataFrame(all_items)
df.to_csv(output_file, index=False)
print(f"\nSaved unique items to {output_file}")


Saved unique items to memento_data\memento_data_20250419_004034.csv
