In [None]:
%pip install feedparser

In [1]:
# rss_reuters.py
# parses Reuters RSS and extracts basic fields
import feedparser
from bs4 import BeautifulSoup  # for cleaning HTML in summary
import datetime

RSS_URL = "https://feeds.reuters.com/reuters/topNews"  # example feed

def parse_feed(url=RSS_URL, max_items=10):
    feed = feedparser.parse(url)
    items = []
    for entry in feed.entries[:max_items]:
        # common fields
        title = entry.get("title")
        link = entry.get("link")
        published = entry.get("published") or entry.get("updated")
        summary_html = entry.get("summary", "")
        summary_text = BeautifulSoup(summary_html, "html.parser").get_text(strip=True)

        items.append({
            "title": title,
            "link": link,
            "published": published,
            "summary": summary_text
        })
    return items

if __name__ == "__main__":
    for item in parse_feed():
        print(item["published"], item["title"])
        print(item["link"])
        print(item["summary"][:200], "...\n")


In [2]:
import feedparser
from bs4 import BeautifulSoup

RSS_URL = "http://feeds.bbci.co.uk/news/rss.xml"

feed = feedparser.parse(RSS_URL)

print("Feed title:", feed.feed.get("title"))
print("Number of entries:", len(feed.entries))

for entry in feed.entries[:5]:
    title = entry.get("title")
    link = entry.get("link")
    summary_html = entry.get("summary", "")
    summary_text = BeautifulSoup(summary_html, "html.parser").get_text(strip=True)

    print("----")
    print("Title:", title)
    print("Link:", link)
    print("Summary:", summary_text)


Feed title: BBC News
Number of entries: 36
----
Title: Katie Razzall: A seismic moment that shows rift at top of BBC
Link: https://www.bbc.com/news/articles/c07m2v1z4evo?at_medium=RSS&at_campaign=rss
Summary: There may be more to this than meets the eye, says the BBC's culture and media editor.
----
Title: US Senate passes deal aimed at ending government shutdown
Link: https://www.bbc.com/news/articles/cpd2p2eddnzo?at_medium=RSS&at_campaign=rss
Summary: The measure signals a major breakthrough but the deal still needs to get over more hurdles in Congress.
----
Title: Jailed hacking kingpin tells how his gang stole millions
Link: https://www.bbc.com/news/articles/cm2w0pvg4wko?at_medium=RSS&at_campaign=rss
Summary: One of the world's most prominent cyber-criminals speaks to the BBC in an exclusive interview.
----
Title: 'Killed because they are Alawites': Fear among Syria's minorities after the fall of Assad
Link: https://www.bbc.com/news/articles/crex1zp3213o?at_medium=RSS&at_campaign=r

In [None]:
import sys
sys.path.append("D:/DH/Senior/Paperboy") 

import feedparser
import requests
from bs4 import BeautifulSoup
import pickle
import uuid
import time
import random
import datetime
from src.api.embedding import embed_bgem3


articles = []

RSS_URL = [
    "http://feeds.bbci.co.uk/news/rss.xml",
    "http://feeds.bbci.co.uk/news/uk/rss.xml",
    "http://feeds.bbci.co.uk/news/technology/rss.xml",
    "http://feeds.bbci.co.uk/news/business/rss.xml"
]
HEADERS = {"User-Agent": "Mozilla/5.0"}

def get_article_text(url: str) -> str:
    """Fetch full BBC article text with retry backoff."""
    for attempt in range(20):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            paragraphs = soup.select("article div[data-component='text-block']")
            text = "\n".join(p.get_text(strip=True) for p in paragraphs)
            if text:
                return text
        except requests.RequestException as e:
            wait = 5 ** attempt + random.random()
            print(f"[Retry {attempt+1}] Error fetching {url}: {e}. Waiting {wait:.1f}s")
            time.sleep(wait)
    return "[Error fetching after 3 attempts]"


def process_article(article: dict) -> dict:
    """Fetch full text and embedding for one article."""
    text = get_article_text(article["url"])
    try:
        embedding = embed_bgem3(text)
    except Exception as e:
        embedding = f"[Embedding error: {e}]"
    article["body"] = text
    article["embedding"] = embedding
    return article

if __name__ == "__main__":
    for url in RSS_URL:
        feed = feedparser.parse(url)
        print("Feed title:", feed.feed.get("title"))
        print("Entries:", len(feed.entries))

        for entry in feed.entries:  # first 3 articles
            id = str(uuid.uuid4())
            title = entry.get("title")
            published_at = (
                datetime.datetime(*entry.published_parsed[:6]).isoformat()
                if "published_parsed" in entry
                else None
            )
            url = entry.get("link")
            author = entry.get("author", "BBC")
            tags = entry.get("tags")
            section = entry.tags[0].term if "tags" in entry and entry.tags else "general"
            description = BeautifulSoup(entry.get("summary", ""), "html.parser").get_text(strip=True)
            text = title + "\n\n" + description
            thumbnail = None
            if "media_content" in entry:
                thumbnail = entry.media_content[0].get("url")
            elif "media_thumbnail" in entry:
                thumbnail = entry.media_thumbnail[0].get("url")

            try:
                full_text = get_article_text(url)
            except Exception as e:
                continue

            article = {
                "id": id,
                "title": title,
                "headline": "",
                "url": url,
                "thumbnail": thumbnail,
                "section": section,
                "published_at": published_at,
                "description": description,
                "full_text": full_text, 
                "author": author,
            }
            
            article = process_article(article)
            articles.append(article)
            
            time.sleep(5)

        


Feed title: BBC News
Entries: 37
[Retry 1] Error fetching https://www.bbc.com/news/articles/c07m2v1z4evo?at_medium=RSS&at_campaign=rss: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)). Waiting 1.4s
[Retry 2] Error fetching https://www.bbc.com/news/articles/c07m2v1z4evo?at_medium=RSS&at_campaign=rss: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20). Waiting 5.6s
[Retry 1] Error fetching https://www.bbc.com/news/articles/c07m2v1z4evo?at_medium=RSS&at_campaign=rss: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)). Waiting 2.0s
[Retry 1] Error fetching https://www.bbc.com/news/articles/cpd2p2eddnzo?at_medium=RSS&at_campaign=rss: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20). Waiting 1.1s
[Retry 1] Error fetching https://www.bbc.com/news/artic

In [3]:
print(len(articles))

with open(f"D:/DH/Senior/Paperboy/src/pickled_data/raw_crawl_bbc.pkl", "wb") as f:
    pickle.dump(articles, f)  
print(f"Pickled {len(articles)} stories.")

0
Pickled 0 stories.


In [None]:
import sys, time, random
sys.path.append("D:/DH/Senior/Paperboy")  
from src.api.embedding import embed_bgem3

count = 0
failed = []

for i, doc in enumerate(articles):
    text = f"{doc['title']}\n\n{doc['description']}"
    if len(text) > 4000:
        text = text[:4000]

    while True:  # keep retrying until success
        try:
            embedded = embed_bgem3(text)
            data = embedded.get("data", None)

            if data:
                doc['embedding'] = data[0]  # add embedding
                print(f"✅ Embedded article {i+1}/{len(articles)}")
                break  # success, move on
            else:
                print(f"⚠️ Empty response for article {i}, retrying...")
            
        except Exception as e:
            print(f"❌ Error on article {i}: {e}. Retrying...")

        # exponential backoff with jitter
        sleep_time = min(60, 2 ** min(6, count)) + random.random()
        time.sleep(sleep_time)

    count += 1

print(f"\nFinished. Embedded {count} articles. Failed: {failed}")



In [None]:
with open(f"D:/DH/Senior/Paperboy/src/pickled_data/embedded_bbc.pkl", "wb") as f:
    pickle.dump(articles, f)  
print(f"Pickled {len(articles)} stories.")

In [None]:
import sys
sys.path.append("D:/DH/Senior/Paperboy") 

import feedparser
import requests
from bs4 import BeautifulSoup
from src.api.embedding import embed_bgem3
import pickle
import uuid
import time
import random
import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

HEADERS = {"User-Agent": "Mozilla/5.0"}
MAX_ARTICLES = 20
CONCURRENT_REQUESTS = 1
SAVE_INTERVAL = 20

RSS_FEEDS = [
    "http://feeds.bbci.co.uk/news/world/rss.xml",
    "http://feeds.bbci.co.uk/news/uk/rss.xml",
    "http://feeds.bbci.co.uk/news/technology/rss.xml",
    "http://feeds.bbci.co.uk/news/business/rss.xml",
]


def get_article_text(url: str) -> str:
    """Fetch full BBC article text with retry backoff."""
    for attempt in range(20):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            paragraphs = soup.select("article div[data-component='text-block']")
            text = "\n".join(p.get_text(strip=True) for p in paragraphs)
            if text:
                return text
        except requests.RequestException as e:
            wait = 5 ** attempt + random.random()
            print(f"[Retry {attempt+1}] Error fetching {url}: {e}. Waiting {wait:.1f}s")
            time.sleep(wait)
    return "[Error fetching after 3 attempts]"


def process_article(article: dict) -> dict:
    """Fetch full text and embedding for one article."""
    text = get_article_text(article["url"])
    try:
        embedding = embed_bgem3(text)
    except Exception as e:
        embedding = f"[Embedding error: {e}]"
    article["body"] = text
    article["embedding"] = embedding
    return article


def collect_feed_articles(feed_url: str, limit: int = 5) -> list:
    """Parse RSS feed into article dicts."""
    feed = feedparser.parse(feed_url)
    articles = []

    for entry in feed.entries[:limit]:
        id = str(uuid.uuid4())
        title = entry.get("title", "")
        url = entry.get("link", "")
        published_at = (
            datetime.datetime(*entry.published_parsed[:6]).isoformat()
            if "published_parsed" in entry
            else None
        )
        author = entry.get("author", "BBC")
        section = entry.tags[0].term if getattr(entry, "tags", None) else "general"
        description = BeautifulSoup(entry.get("summary", ""), "html.parser").get_text(strip=True)
        
        thumbnail = None
        if hasattr(entry, "media_content"):
            thumbnail = entry.media_content[0].get("url")
        elif hasattr(entry, "media_thumbnail"):
            thumbnail = entry.media_thumbnail[0].get("url")

        articles.append({
            "id": id,
            "title": title,
            "headline": "",
            "url": url,
            "thumbnail": thumbnail,
            "section": section,
            "published_at": published_at,
            "description": description,
            "author": author,
        })

    return articles


if __name__ == "__main__":
    articles = []

    # Step 1 — Collect URLs
    articles = []
    for feed_url in RSS_FEEDS:
        print(f"Fetching feed: {feed_url}")
        articles.extend(collect_feed_articles(feed_url))
        time.sleep(1)

    print(f"Total articles collected: {len(articles)}")
    articles = articles[:MAX_ARTICLES]

    # Step 2 — Fetch full content concurrently
    with ThreadPoolExecutor(max_workers=CONCURRENT_REQUESTS) as executor:
        futures = {executor.submit(process_article, a): a for a in articles}

        for i, future in enumerate(as_completed(futures), 1):
            result = future.result()
            articles.append(result)

            if i % SAVE_INTERVAL == 0:
                with open(f"D:/DH/Senior/Paperboy/src/pickled_data/embedded_bbc.pkl", "wb") as f:
                    pickle.dump(articles, f)
                print(f"Progress: {i}/{len(articles)} articles saved.")

            time.sleep(2)

    # Step 3 — Final save
    with open(f"D:/DH/Senior/Paperboy/src/pickled_data/embedded_bbc.pkl", "wb") as f:
        pickle.dump(articles, f)  
    print(f"Pickled {len(articles)} stories.")


Fetching feed: http://feeds.bbci.co.uk/news/world/rss.xml
Fetching feed: http://feeds.bbci.co.uk/news/uk/rss.xml
Fetching feed: http://feeds.bbci.co.uk/news/technology/rss.xml
Fetching feed: http://feeds.bbci.co.uk/news/business/rss.xml
Total articles collected: 20
[Retry 1] Error fetching https://www.bbc.com/news/videos/c20p2m83p5go?at_medium=RSS&at_campaign=rss: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)). Waiting 1.2s
[Retry 2] Error fetching https://www.bbc.com/news/videos/c20p2m83p5go?at_medium=RSS&at_campaign=rss: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20). Waiting 5.7s
[Retry 4] Error fetching https://www.bbc.com/news/videos/c20p2m83p5go?at_medium=RSS&at_campaign=rss: HTTPSConnectionPool(host='www.bbc.com', port=443): Read timed out. (read timeout=20). Waiting 125.9s


In [3]:
print(all_articles[150])

{'link': 'https://www.bbc.com/news/articles/cqx2jz1l477o?at_medium=RSS&at_campaign=rss', 'text': 'For someone pushing his company to break new ground, Ilkka Paananen appears relaxed.Not wearing shoes, like everyone else in the office - it\'s a Finnish thing I\'m told - he tells me the mobile gaming industry needs shaking up."We need to take bigger risks," says Mr Paananen the chief executive of Finland\'s Supercell - a giant  in the world of mobile games."We have to create new kinds of game experiences," he says.The company already has some of the most successful mobile games ever released; last year Clash of Clans and Brawl Stars generatedmore than a billion dollarsbetween them.Nonetheless, over the past couple of years Mr Paananen has "significantly" increased investment in new games, hired more staff and set up new game studios."We have a lot of very, very talented, ambitious teams who are trying to reimagine what mobile games might look like in, say, 2030, and I wish I had the answ

In [9]:
empty_text_count = 0
empty_full_text_count = 0

for article in articles:
    if not article.get("text", "").strip():
        empty_text_count += 1
    if not article.get("full_text", "").strip():
        empty_full_text_count += 1

total = len(articles)
print(f"Total articles: {total}")
print(f"Articles with empty 'text': {empty_text_count} ({empty_text_count/total*100:.2f}%)")
print(f"Articles with empty 'full_text': {empty_full_text_count} ({empty_full_text_count/total*100:.2f}%)")

Total articles: 189
Articles with empty 'text': 0 (0.00%)
Articles with empty 'full_text': 30 (15.87%)
