In [4]:
import ast
from datetime import timezone
from email.utils import format_datetime
from time import sleep
from typing import List
from urllib.parse import urlsplit
from xml.sax.saxutils import escape

import duckdb
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel
from tqdm import tqdm

In [2]:
class FeedItem(BaseModel):
    """One scraped story, holding the values later substituted into an RSS ``<item>``."""

    # pd.Timestamp is not a pydantic-native type, so arbitrary types must be allowed.
    model_config = {"arbitrary_types_allowed": True}

    # Text fields of the story.
    title: str
    link: str
    guid: str
    # One entry per tag link found on the page.
    categories: List[str]
    dc_creator: str
    # Publication / update times as pandas timestamps.
    pub_date: pd.Timestamp
    atom_updated: pd.Timestamp
    # Full article text.
    content_encoded: str


In [3]:
def cdata_wrapper(x) -> str:
    """Wrap ``x`` in an XML CDATA section.

    A literal ``]]>`` inside the payload would terminate the section early and
    produce malformed XML, so every occurrence is split across two adjacent
    CDATA sections (``]]`` ends the first, ``>`` starts the second).

    Args:
        x: Value to wrap; it is converted with ``str()``.

    Returns:
        The text enclosed in ``<![CDATA[`` ... ``]]>``.
    """
    text = str(x).replace("]]>", "]]]]><![CDATA[>")
    return f"<![CDATA[{text}]]>"


def get_elements(url: str) -> FeedItem:
    """Scrape one story page and return its feed fields as a ``FeedItem``.

    Args:
        url: Address of the story page to download.

    Returns:
        A ``FeedItem`` whose text fields are CDATA-wrapped via ``cdata_wrapper``;
        ``atom_updated`` is set to the same value as ``pub_date``.

    Raises:
        requests.RequestException: On network failure, timeout, or a 4xx/5xx response.
        ValueError: If a required element is missing from the page.
    """
    # A timeout keeps a single stalled connection from hanging the whole run.
    response = requests.get(url, timeout=30)
    # Fail on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    def _require_element(name, attrs=None):
        """Return the first matching element or raise a descriptive ValueError."""
        element = soup.find(name, attrs or {})
        if element is None:
            raise ValueError(f"Element <{name}> {attrs or ''} not found in {url}")
        return element

    title = cdata_wrapper(_require_element("h1").text.strip())
    link = url

    # The post id is the text after the last hyphen of the final path segment.
    # Working on the path only keeps query strings, fragments, and hyphens in the
    # host name out of the guid.
    slug = urlsplit(url).path.rstrip("/").rsplit("/", 1)[-1]
    post_id = slug.rsplit("-", 1)[-1]
    guid = f"https://medium.com/p/{post_id}"

    tag_links = soup.find_all("a", href=lambda x: x and "medium.com/tag/" in x)
    categories = [cdata_wrapper(tag.text.strip()) for tag in tag_links]

    author = _require_element("a", {"data-testid": "authorName"})
    dc_creator = cdata_wrapper(author.text.strip())

    published = _require_element("span", {"data-testid": "storyPublishDate"})
    pub_date = pd.to_datetime(published.text.strip())
    # The page is only queried for a publish date, so it doubles as the update time.
    atom_updated = pub_date

    article = _require_element("article")
    content = cdata_wrapper(article.get_text(separator="\n").strip())

    return FeedItem(
        title=title,
        link=link,
        guid=guid,
        categories=categories,
        dc_creator=dc_creator,
        pub_date=pub_date,
        atom_updated=atom_updated,
        content_encoded=content,
    )

In [18]:
# Read one URL per line. Blank lines and surrounding whitespace are dropped so
# they are not passed to the scraper as empty or malformed URLs.
with open("./data/backfill_urls.txt", "r", encoding="utf-8") as f:
    urls = [line.strip() for line in f if line.strip()]

In [19]:
# Scrape every URL on a best-effort basis: a failure on one page is recorded and
# reported, but does not stop the run.
results = []
failed_urls = []
for url in tqdm(urls, desc="Processing URLs"):
    try:
        results.append(get_elements(url))
    except Exception as e:
        failed_urls.append(url)
        # tqdm.write prints without breaking the progress bar rendering.
        tqdm.write(f"Error processing {url}: {e}")
    sleep(1)  # To avoid hitting the server too hard

df = pd.DataFrame([item.model_dump() for item in results])
if results:
    df.to_csv("./data/medium_feed.csv", index=False)
    print("Data extraction complete. Saved to ./data/medium_feed.csv")
else:
    # An empty frame would overwrite the CSV with a file pd.read_csv cannot load.
    print("No items were extracted; ./data/medium_feed.csv was not written.")

if failed_urls:
    print(f"{len(failed_urls)} of {len(urls)} URLs failed:")
    for failed_url in failed_urls:
        print(f"  {failed_url}")

Processing URLs:   0%|          | 0/84 [00:00<?, ?it/s]

Processing URLs: 100%|██████████| 84/84 [02:41<00:00,  1.92s/it]

Data extraction complete. Saved to ./data/medium_feed.csv





In [7]:
# Channel-level RSS 2.0 skeleton. `{items}` is the only format placeholder; it
# receives the concatenated <item> elements when the feed is rendered.
# NOTE(review): lastBuildDate is a fixed literal and the atom:link href is
# empty — confirm both are intentional.
feed_frame = """<rss xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:cc="http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html" version="2.0">
<channel>
<title>
<![CDATA[ Generative AI in the Newsroom - Medium ]]>
</title>
<description>
<![CDATA[ The Generative AI in the Newsroom project is an effort to collaboratively figure out how and when (or when not) to use generative AI in news production. - Medium ]]>
</description>
<link>https://generative-ai-newsroom.com?source=rss----df04cefca135---4</link>
<image>
<url>https://cdn-images-1.medium.com/proxy/1*TGH72Nnw24QL3iV9IOm4VA.png</url>
<title>Generative AI in the Newsroom - Medium</title>
<link>https://generative-ai-newsroom.com?source=rss----df04cefca135---4</link>
</image>
<generator>GAIN GH Action</generator>
<lastBuildDate>Thu, 29 May 2025 18:16:28 GMT</lastBuildDate>
<atom:link href="" rel="self" type="application/rss+xml"/>
<webMaster>
<![CDATA[ nicholas.hagar@northwestern.edu ]]>
</webMaster>
{items}
</channel>
</rss>
"""

# Template for a single <item>; every placeholder is filled by make_item().
item_frame = """<item>
<title>{title}</title>
<link>{link}</link>
<guid isPermaLink="false">{guid}</guid>
{categories}
<dc:creator>{dc_creator}</dc:creator>
<pubDate>{pub_date}</pubDate>
<atom:updated>{atom_updated}</atom:updated>
<content:encoded>{content_encoded}</content:encoded>
</item>"""

# Template for one <category> element inside an item.
categories_frame = """<category>{cat}</category>"""


def _parse_categories(raw) -> list:
    """Return the categories as a list, whether given a list or its string repr."""
    if raw is None or (isinstance(raw, float) and pd.isna(raw)):
        return []
    if isinstance(raw, str):
        # Rows read back from CSV carry the list as its Python literal text.
        raw = ast.literal_eval(raw)
    return list(raw)


def _as_utc_datetime(value):
    """Parse ``value`` into a UTC-aware ``datetime``; return ``None`` if unparseable."""
    parsed = pd.to_datetime(value, errors="coerce")
    if pd.isna(parsed):
        return None
    moment = parsed.to_pydatetime()
    # Naive timestamps carry no zone information; they are treated as UTC here.
    if moment.tzinfo is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)


def make_item(row) -> str:
    """Render one feed row as an RSS ``<item>`` element.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys ``title``,
            ``link``, ``guid``, ``categories``, ``dc_creator``, ``pub_date``,
            ``atom_updated`` and ``content_encoded``. ``categories`` may be a
            list or the string repr of a list.

    Returns:
        The filled-in ``item_frame`` string. ``link`` and ``guid`` are
        XML-escaped; ``pub_date`` is written in RFC 822 form and
        ``atom_updated`` in RFC 3339 form. A date that cannot be parsed is
        emitted unchanged. The remaining text fields are inserted as given.
    """
    categories = "".join(
        categories_frame.format(cat=cat) for cat in _parse_categories(row["categories"])
    )

    published = _as_utc_datetime(row["pub_date"])
    updated = _as_utc_datetime(row["atom_updated"])
    # format_datetime is locale-independent, unlike strftime("%a ... %b").
    pub_date = (
        format_datetime(published, usegmt=True) if published is not None else row["pub_date"]
    )
    atom_updated = (
        updated.strftime("%Y-%m-%dT%H:%M:%SZ") if updated is not None else row["atom_updated"]
    )

    return item_frame.format(
        title=row["title"],
        # URLs are not CDATA-wrapped, so characters such as "&" must be escaped.
        link=escape(str(row["link"])),
        guid=escape(str(row["guid"])),
        categories=categories,
        dc_creator=row["dc_creator"],
        pub_date=pub_date,
        atom_updated=atom_updated,
        content_encoded=row["content_encoded"],
    )

def make_feed(df: pd.DataFrame) -> str:
    """Build the full RSS document for the rows of ``df``.

    Each row is rendered with ``make_item``; the results are concatenated in
    row order and substituted into ``feed_frame``.
    """
    rendered_items = [make_item(record) for _index, record in df.iterrows()]
    return feed_frame.format(items="".join(rendered_items))

In [8]:
# Reload the scraped items from the CSV written by the scraping cell.
df = pd.read_csv("./data/medium_feed.csv")

In [9]:
# Render the complete RSS document as a single string.
feed = make_feed(df)

In [10]:
with open("./data/gain_feed.xml", "w") as f:
    f.write(feed)
print("RSS feed generated and saved to ./data/gain_feed.xml")

RSS feed generated and saved to ./data/gain_feed.xml


In [11]:
# Convert the CSV to a DuckDB table.
# NOTE(review): CREATE TABLE IF NOT EXISTS leaves an existing table untouched, so
# re-running after the CSV changes does not refresh it — confirm that is intended.
con = duckdb.connect(database="./data/gain_feed.duckdb", read_only=False)
try:
    con.execute("CREATE TABLE IF NOT EXISTS medium_feed AS SELECT * FROM read_csv_auto('./data/medium_feed.csv')")
finally:
    # Always release the database file, even if the statement fails.
    con.close()