In [None]:
# pip install beautifulsoup4 lxml

In [1]:
from bs4 import BeautifulSoup
import html as htmllib
import re

def extract_plain_text(fields):
    """
    fields = article.get("fields", {}) from Guardian API result.
    Returns plain cleaned text or None.
    """
    if not fields:
        return None


    # fallback to HTML body
    body_html = fields.get("body")
    if not body_html:
        return None

    soup = BeautifulSoup(body_html, "lxml")

    # remove unwanted tags (figures, scripts, asides, ads, nav, footer)
    for tag in soup(["script", "style", "aside", "figure", "iframe", "header", "footer", "nav"]):
        tag.decompose()


    # get text with paragraph spacing
    text = soup.get_text(separator=" ")
    text = htmllib.unescape(text)

    # collapse multiple newlines and whitespace
    text = re.sub(
    r"(?im)^\s*(Updated\s+at\s+)?\d{1,2}\.\d{2}\s*(am|pm)\s+BST\s*$", 
    "", 
    text)
    text = re.sub(r'\n\s*\n+', '\n\n', text)   # keep paragraph breaks
    lines = [ln.strip() for ln in text.splitlines()]
    clean = "\n".join([ln for ln in lines if ln])

    # optional: trim extremely long articles (e.g., keep first N chars)
    # clean = clean[:20000]

    return clean


In [2]:
import requests
import csv
import uuid
from time import sleep

API_KEY = "022783ad-6e46-4ba2-af9a-3a7464467d02"
BASE_URL = "https://content.guardianapis.com/search"
PAGE_SIZE = 200  # max 200

def crawl_guardian(pages, output_file="guardian_articles.csv"):
    all_articles = []

    # open CSV for writing
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)  # quote all fields
        writer.writerow([
            "id", "title", "headline", "url", "thumbnail",
            "section", "published_at", "trail_text", "body", "byline"
        ])

        for page in range(pages, pages + 5):
            params = {
                "api-key": API_KEY,
                "page": page,
                "page-size": PAGE_SIZE,
                "show-fields": "trailText,body,thumbnail,headline,byline",
                "order-by": "newest"
            }
            resp = requests.get(BASE_URL, params=params)
            data = resp.json()

            results = data.get("response", {}).get("results", [])
            if not results:
                break

            for article in results:
                id = str(uuid.uuid4())
                fields = article.get("fields", {})
                thumbnail = fields.get("thumbnail","")
                headline = fields.get("headline", "")
                author = fields.get("byline", "")
                
                body = extract_plain_text(fields)
                row = [
                    id,
                    article.get("webTitle", ""),
                    headline,
                    article.get("webUrl", ""),
                    thumbnail,
                    article.get("sectionName", ""),
                    article.get("webPublicationDate", ""),
                    fields.get("trailText", ""),
                    body,
                    author
                ]
                writer.writerow(row)
                all_articles.append(row)

            print(f"Page {page} done, got {len(results)} articles.")
            sleep(1)  # polite delay

    print(f"Saved {len(all_articles)} articles to {output_file}")
    return all_articles


In [3]:
pages = 1
all_articles = crawl_guardian(pages)  # adjust number of pages
cols = ["id","title", "headline", "url", "thumbnail", "section","published_at","description","body", "author"]
all_articles = [dict(zip(cols, row)) for row in all_articles]

import pickle
with open(f"D:/DH/Senior/Paperboy/src/pickled_data/raw_crawl_{pages}.pkl", "wb") as f:
    pickle.dump(all_articles, f)  
print(f"Pickled {len(all_articles)} stories.")

Page 1 done, got 200 articles.
Page 2 done, got 200 articles.
Page 3 done, got 200 articles.
Page 4 done, got 200 articles.
Page 5 done, got 200 articles.
Saved 1000 articles to guardian_articles.csv
Pickled 1000 stories.


In [4]:
print(all_articles[0])

{'id': '3bc950a0-d96c-4f6d-9908-cec33dd45c26', 'title': 'Minister says MoD looking at security risk from Chinese cars, with staff told to avoid secret talk in vehicles – UK politics live', 'headline': 'Minister says MoD looking at security risk from Chinese cars, with staff told to avoid secret talk in vehicles – UK politics live', 'url': 'https://www.theguardian.com/politics/live/2025/nov/19/pmqs-uk-politics-latest-news-john-healey-rachel-reeves-keir-starmer-labour', 'thumbnail': 'https://media.guim.co.uk/1d1bcd43c8e82c398cf9d963d4cff7e9676f2ad1/0_0_3750_3000/500.jpg', 'section': 'Politics', 'published_at': '2025-11-19T10:23:25Z', 'description': 'Luke Pollard, a defence minister, was asked about a claim that MoD officials have been told not to discuss secrets in Chinese cars', 'body': '10.23am  GMT\nThis is  what   Dan Sabbagh , the Guardian’s defence and security editor, posted during John Healey’s speech.\nWatching a rare speech from def sec John  Healey  in 9 Downing St. A special 

In [6]:
import sys, time, random
sys.path.append("D:/DH/Senior/Paperboy")  
from src.api.embedding.embedding import embed_bgem3

count = 0
failed = []

for i, doc in enumerate(all_articles):
    text = f"{doc['title']}\n\n{doc['description']}"
    if len(text) > 4000:
        text = text[:4000]

    while True:  # keep retrying until success
        try:
            embedded = embed_bgem3(text)
            data = embedded.get("data", None)

            if data:
                doc['embedding'] = data[0]  # add embedding
                print(f"✅ Embedded article {i+1}/{len(all_articles)}")
                break  # success, move on
            else:
                print(f"⚠️ Empty response for article {i}, retrying...")
            
        except Exception as e:
            print(f"❌ Error on article {i}: {e}. Retrying...")

        # exponential backoff with jitter
        sleep_time = min(60, 2 ** min(6, count)) + random.random()
        time.sleep(sleep_time)

    count += 1

print(f"\nFinished. Embedded {count} articles. Failed: {failed}")



✅ Embedded article 1/1000
✅ Embedded article 2/1000
✅ Embedded article 3/1000
✅ Embedded article 4/1000
✅ Embedded article 5/1000
✅ Embedded article 6/1000
✅ Embedded article 7/1000
✅ Embedded article 8/1000
✅ Embedded article 9/1000
✅ Embedded article 10/1000
✅ Embedded article 11/1000
✅ Embedded article 12/1000
✅ Embedded article 13/1000
✅ Embedded article 14/1000
✅ Embedded article 15/1000
✅ Embedded article 16/1000
✅ Embedded article 17/1000
✅ Embedded article 18/1000
✅ Embedded article 19/1000
✅ Embedded article 20/1000
✅ Embedded article 21/1000
✅ Embedded article 22/1000
✅ Embedded article 23/1000
✅ Embedded article 24/1000
✅ Embedded article 25/1000
✅ Embedded article 26/1000
✅ Embedded article 27/1000
✅ Embedded article 28/1000
✅ Embedded article 29/1000
✅ Embedded article 30/1000
✅ Embedded article 31/1000
✅ Embedded article 32/1000
✅ Embedded article 33/1000
✅ Embedded article 34/1000
✅ Embedded article 35/1000
✅ Embedded article 36/1000
✅ Embedded article 37/1000
✅ Embedded

In [7]:
from pathlib import Path
import pickle

ROOT = Path("d:/DH/Senior/Paperboy/src")  # your project root
file_path = ROOT / "pickled_data" / f"articles_{pages}.pkl"

with open(file_path, "wb") as f:
    pickle.dump(all_articles, f)

print("Loaded:", len(all_articles))

Loaded: 1000


In [None]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pandas as pd

# Show first 5 rows
row = all_articles[666]
# print(f"Article {i+1}")
print("Title:", row['title'])
print("Headline:", row['headline'])
print("URL:", row['url'])
print("Thumbnail:", row['thumbnail'])
print("Section:", row['section'])
print("Published at:", row['published_at'])
print("Trail text:", row['description'])
print("body:", row['body'])
print("Author:", row['author'])
print("embedding:", row['embedding'])

Title: Jannik Sinner and Carlos Alcaraz win to set up clash in ATP Finals’ climax
Headline: Jannik Sinner and Carlos Alcaraz win to set up clash in ATP Finals’ climax
URL: https://www.theguardian.com/sport/2025/nov/15/sinner-extends-indoor-dominance-with-win-over-de-minaur-to-reach-atp-world-tour-final
Thumbnail: https://media.guim.co.uk/87766dc48281311cdf5727c46c6c520176d9bdcc/318_0_2562_2050/500.jpg
Section: Sport
Published at: 2025-11-15T16:22:59Z
Trail text: The world’s top two players, Carlos Alcaraz and Jannik Sinner, will face one another in the climax of the ATP Finals in Turin
body: The 2025 men’s tennis season will conclude with a final showdown between the two best players in the world after Jannik Sinner and Carlos Alcaraz reached the final of the ATP Finals.   Sinner continued his total dominance of the indoor season as the Italian held off an admirable early challenge from Alex de Minaur before bulldozing his path into the tournament’s final for a third consecutive year w

In [9]:
import psycopg2

def get_connection():
    return psycopg2.connect(
        host="localhost",
        port=5436,
        dbname="paperboy",
        user="minhhieu",
        password="minhhieu888"
    )

# UPSERT function
def upsert_articles(all_articles):
    sql = """
    INSERT INTO articles (id, title, headline, url, thumbnail, section, published_at, description, body, author, embedding)
    VALUES (%(id)s, %(title)s, %(headline)s, %(url)s, %(thumbnail)s, %(section)s, %(published_at)s, %(description)s, %(body)s, %(author)s, %(embedding)s)
    ON CONFLICT (id) DO NOTHING;
    """


    with get_connection() as conn:
        with conn.cursor() as cur:
            cur.executemany(sql, all_articles)  # batch insert
        conn.commit()
    print(f"Upserted {len(all_articles)} articles.")


In [None]:
empty_count = sum(1 for a in all_articles if not a.get("author"))
print(empty_count)

31


In [None]:
# for i, article in enumerate(all_articles):
#     try:
#         if article['embedding'] == None:
#             text = f"{article['title']}\n\n{article['description']}"
#             if len(text) > 4000:
#                 text = text[:4000]
#             embedded = embed_bgem3(text)
#             data = embedded.get("data", None)
#             if data:
#                 article['embedding'] = data[0]  # add embedding
#             print(f"Embedded article {article['title']}")
#         else:
#             continue
#     except:
#         text = f"{article['title']}\n\n{article['description']}"
#         if len(text) > 4000:
#             text = text[:4000]
#         embedded = embed_bgem3(text)
#         data = embedded.get("data", None)
#         if data:
#             article['embedding'] = data[0]  # add embedding
#             print(f"Embedded article {article['title']}")

In [10]:
upsert_articles(all_articles)

Upserted 1000 articles.


In [None]:
empty_text_count = 0
empty_full_text_count = 0

for article in all_articles:
    if article['embedding'] == None:
        empty_text_count += 1
    if article['title'] == "":
        empty_full_text_count += 1

total = len(all_articles)
print(f"Total articles: {total}")
print(f"Articles with empty 'text': {empty_text_count} ({empty_text_count/total*100:.2f}%)")
print(f"Articles with empty 'full_text': {empty_full_text_count} ({empty_full_text_count/total*100:.2f}%)")

Total articles: 1000
Articles with empty 'text': 0 (0.00%)
Articles with empty 'full_text': 0 (0.00%)
