In [21]:
%run 00_utils.ipynb

In [22]:
RSS_FEEDS = {
    "SMH": [
        'https://www.smh.com.au/rss/feed.xml',
        'https://www.smh.com.au/rss/sport.xml',
        'https://www.smh.com.au/rss/business.xml',
        'https://www.smh.com.au/rss/lifestyle.xml',
        'https://www.smh.com.au/rss/culture.xml',

    ],
    "SBS": [
        'https://www.sbs.com.au/news/feed',
        'https://www.sbs.com.au/news/topic/australia/feed',
        'https://www.sbs.com.au/news/topic/latest/feed',
        'https://www.sbs.com.au/news/topic/australia/feed',
    ],
    "The Guardian": [
        'https://www.theguardian.com/australia-news/rss'
        'https://www.theguardian.com/au/sport/rss',
        'https://www.theguardian.com/au/culture/rss',
        'https://www.theguardian.com/au/lifeandstyle/rss',
        'https://www.theguardian.com/au/business/rss'

    ],
    "ESPN": [
        'https://www.espn.com.au/espn/rss/news',
    ],
    "ABC": [
        'https://www.abc.net.au/news/feed/10719986/rss.xml'
        'https://www.abc.net.au/news/feed/51120/rss.xml',
        'https://www.abc.net.au/news/feed/103728564/rss.xml',
        'https://www.abc.net.au/news/feed/103728568/rss.xml',
        'https://www.abc.net.au/news/feed/103728570/rss.xml'
    ],
    "Canberra Times": [
        'https://www.canberratimes.com.au/rss.xml'
    ]
    # "News.com.au": [
    #     'https://www.news.com.au/content-feeds/latest-news-national/',
    #     'https://www.news.com.au/content-feeds/latest-news-lifestyle/',
    #     'https://www.news.com.au/content-feeds/latest-news-sport/',
    #     'https://www.news.com.au/content-feeds/latest-news-entertainment/',
    #     'https://www.news.com.au/content-feeds/latest-news-finance/'
    # ],
    # "Businessnews.com.au": [
    #     'https://www.businessnews.com.au/rssfeed/latest.rss'
    # ],
    # "The Age": [
    #         'https://www.theage.com.au/rss/feed.xml',
    #         'https://www.theage.com.au/rss/sport.xml',
    #         'https://www.theage.com.au/rss/culture.xml'
    #         'https://www.theage.com.au/rss/business.xml',
    #         'https://www.theage.com.au/rss/lifestyle.xml'
    # ]
}

In [23]:
import feedparser
import pandas as pd
import time
from datetime import datetime, timezone
import re
import unicodedata
from bs4 import BeautifulSoup


def clean_text(txt: str | None) -> str | None:
    """
    Cleans the given HTML or plain text input by removing HTML tags,
    invisible characters, and diacritics. This can be used to sanitize
    and normalize text content extracted from various sources.

    Args:
        txt: The input text which may contain HTML tags.
    Returns:
        The cleaned text
    """
    if not txt:
        return None

    text = BeautifulSoup(txt, "html.parser").get_text(" ", strip=True)

    # Remove invisible chars in RSS text
    regex = re.compile(r"[\ufeff\u200b\u200c\u200d]")
    text = regex.sub("", text)

    # decompose then drop diacritics/combining marks (cafÃ© -> cafe)
    text = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in text if not unicodedata.combining(ch))

    return text


def clean_url(url: str | None) -> str | None:
    if not url:
        return url
    return url.split("?", 1)[0].split("#", 1)[0]


def parse_date(st: time.struct_time | None) -> datetime | None:
    if not st:
        return None
    else:
        return datetime(*st[:6], tzinfo=timezone.utc)


def make_summary(title: str, description: str) -> str:
    sep = "" if title.endswith((".", "!", "?")) else "."
    return f"{title}{sep} {description}"


In [24]:
def fetch_au_news(
    feeds: dict[str, list[str]],
    limit_per_feed: int = 40,
) -> pd.DataFrame:
    """
    Fetches and processes Australian news articles from specified RSS feeds.

    Args:
        feeds: A dictionary where keys are source names, and
            values are lists of RSS feed URLs associated with those sources.
        limit_per_feed : The maximum number of entries to fetch
            from each feed. Defaults to 40.

    Returns:
        A DataFrame containing the processed news articles
    """

    records = []

    for source, feed_urls in feeds.items():
        for feed_url in feed_urls:
            feed = feedparser.parse(feed_url)
            for entry in feed.entries[:limit_per_feed]:
                records.append({
                    "url": entry.get("link"),
                    "source": source,
                    "title": entry.get("title"),
                    "description": entry.get("description"),
                    "author": entry.get("author") or source,
                    "published": parse_date(entry.get("published_parsed"))
                })

    df = pd.DataFrame(records)

    # Filter out records with null title or description
    df = df[(df["title"] != 'null')&(df["description"] != 'null')]
    df.dropna(subset=["title", "description"], inplace=True, ignore_index=True)

    # Clean Text
    df["title"] = df["title"].apply(clean_text)
    df["description"] = df["description"].apply(clean_text)

    # Create a summary from title and description
    df["summary"] = df.apply(lambda row: make_summary(row["title"], row["description"]), axis=1)

    # Clean urls
    df["url"] = df["url"].apply(clean_url)

    # Deduplicate across categories
    df = df.drop_duplicates(subset=["url"])

    # Sort by published date descending
    df = df.sort_values("published", ascending=False).reset_index(drop=True)

    return df

In [25]:
# Fetch news data
df = fetch_au_news(RSS_FEEDS)
df.head()

Unnamed: 0,url,source,title,description,author,published,summary
0,https://www.espn.com.au/tennis/story/_/id/4767...,ESPN,ðŸŽ¾AO live: Wawrinka stuns in five-set marathon,Novak Djokovic and Jannik Sinner headline a bu...,ESPN,2026-01-23 04:25:42+00:00,ðŸŽ¾AO live: Wawrinka stuns in five-set marathon....
1,https://www.espn.com.au/tennis/story/_/id/4769...,ESPN,'What was that for?' Osaka asks of terse Cirstea,Naomi Osaka received a cool response from Sora...,ESPN,2026-01-23 04:25:42+00:00,'What was that for?' Osaka asks of terse Cirst...
2,https://www.espn.com.au/golf/story/_/id/476898...,ESPN,"McIlroy to Rahm, Hatton: Pay fines, play Ryder...",Rory McIlroy wants Ryder Cup team-mates Jon Ra...,PA,2026-01-23 04:25:42+00:00,"McIlroy to Rahm, Hatton: Pay fines, play Ryder..."
3,https://www.espn.com.au/nba/story/_/id/4768975...,ESPN,"Giannis cites chemistry, selfish play after routs",Giannis Antetokounmpo says chemistry issues mi...,ESPN,2026-01-23 04:25:42+00:00,"Giannis cites chemistry, selfish play after ro..."
4,https://www.espn.com.au/afl/story/_/id/4768933...,ESPN,Hawks break tradition and appoint co-captains,Midfielder Jai Newcombe has been elevated to s...,ESPN,2026-01-23 04:25:42+00:00,Hawks break tradition and appoint co-captains....


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   url          307 non-null    object             
 1   source       307 non-null    object             
 2   title        307 non-null    object             
 3   description  307 non-null    object             
 4   author       307 non-null    object             
 5   published    307 non-null    datetime64[ns, UTC]
 6   summary      307 non-null    object             
dtypes: datetime64[ns, UTC](1), object(6)
memory usage: 16.9+ KB


In [27]:
df['source'].value_counts()

source
The Guardian      108
SMH                90
SBS                45
ESPN               38
Canberra Times     14
ABC                12
Name: count, dtype: int64

In [28]:
# Export to sheets
df_to_sheets(df, "data_feed", SHEET_URL)

"Data uploaded successfully to sheet: 'data_feed'"