In [1]:
# pip install beautifulsoup4 lxml

In [2]:
from bs4 import BeautifulSoup
import html as htmllib
import re

# Matches a line that consists only of a clock time such as "10.30am BST" or
# "Updated at 4.05pm GMT" (case-insensitive, surrounding whitespace allowed).
_TIMESTAMP_LINE_RE = re.compile(
    r"(?im)^\s*(Updated\s+at\s+)?\d{1,2}\.\d{2}\s*(am|pm)\s+(BST|GMT)\s*$"
)

# Elements whose content is removed before the text is extracted.
_UNWANTED_TAGS = ["script", "style", "aside", "figure", "iframe", "header", "footer", "nav"]


def extract_plain_text(fields):
    """
    Convert the HTML ``body`` of a Guardian API article into plain text.

    fields = article.get("fields", {}) from Guardian API result.

    Returns the cleaned text (stripped, non-empty lines joined with "\\n"),
    or None when ``fields`` is empty, has no ``body``, or nothing is left
    after cleaning.
    """
    if not fields:
        return None

    body_html = fields.get("body")
    if not body_html:
        return None

    soup = BeautifulSoup(body_html, "lxml")

    # Remove non-article markup so its text does not leak into the result.
    for tag in soup(_UNWANTED_TAGS):
        tag.decompose()

    # Text nodes are joined with a single space, so the only line breaks in
    # `text` are the ones already present inside the HTML text itself.
    text = soup.get_text(separator=" ")
    # NOTE(review): get_text() normally returns entities already decoded; this
    # extra pass presumably targets double-escaped content - confirm it is needed.
    text = htmllib.unescape(text)

    # Drop standalone timestamp lines (BST in summer, GMT in winter).
    text = _TIMESTAMP_LINE_RE.sub("", text)

    # Strip every line and discard the empty ones; this also collapses any run
    # of blank lines, so no separate newline-normalisation step is required.
    stripped_lines = (ln.strip() for ln in text.splitlines())
    clean = "\n".join(ln for ln in stripped_lines if ln)

    # Honour the documented contract: "nothing usable" is None, not "".
    return clean or None


In [3]:
import requests
import csv
import uuid
from time import sleep

import os

# SECURITY: an API key is committed in this notebook. Supply the key through the
# GUARDIAN_API_KEY environment variable instead; the literal below is kept only
# as a fallback so existing runs keep working. Rotate it and delete the fallback.
API_KEY = os.environ.get("GUARDIAN_API_KEY") or "022783ad-6e46-4ba2-af9a-3a7464467d02"
BASE_URL = "https://content.guardianapis.com/search"
PAGE_SIZE = 200  # max 200

def _stable_article_id(article):
    """Return a UUID string that is the same every time the same article is crawled."""
    key = article.get("webUrl") or article.get("id")
    if key:
        # uuid5 is a pure function of its inputs, so re-crawling an article
        # yields the same id and a later ON CONFLICT (id) can deduplicate it.
        return str(uuid.uuid5(uuid.NAMESPACE_URL, key))
    # Nothing stable to derive an id from: fall back to a random one.
    return str(uuid.uuid4())


def crawl_guardian(pages, output_file="guardian_articles.csv", num_pages=2, timeout=30):
    """
    Fetch consecutive result pages from the Guardian search API, write every
    article to a CSV file and return the rows.

    Args:
        pages: page number of the FIRST page to request. (The plural name is
            kept for backward compatibility; it is not a page count.)
        output_file: path of the CSV file to (over)write.
        num_pages: how many consecutive pages to request, starting at `pages`.
            Defaults to 2, the value that used to be hard-coded.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A list of rows (lists) in CSV column order: id, title, headline, url,
        thumbnail, section, published_at, trail_text, body, byline.

    Crawling stops early when a page comes back empty or with an HTTP error;
    rows collected up to that point are still written and returned.
    """
    all_articles = []

    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)  # quote all fields
        writer.writerow([
            "id", "title", "headline", "url", "thumbnail",
            "section", "published_at", "trail_text", "body", "byline"
        ])

        for page in range(pages, pages + num_pages):
            params = {
                "api-key": API_KEY,
                "page": page,
                "page-size": PAGE_SIZE,
                "show-fields": "trailText,body,thumbnail,headline,byline",
                "order-by": "newest"
            }
            # Without a timeout a stalled connection would block forever.
            resp = requests.get(BASE_URL, params=params, timeout=timeout)
            if not resp.ok:
                # Make API failures (bad key, rate limit, ...) visible instead
                # of letting them look like "no more results".
                print(f"Page {page} failed with HTTP {resp.status_code}; stopping.")
                break
            data = resp.json()

            results = data.get("response", {}).get("results", [])
            if not results:
                break

            for article in results:
                fields = article.get("fields") or {}
                row = [
                    _stable_article_id(article),
                    article.get("webTitle", ""),
                    fields.get("headline", ""),
                    article.get("webUrl", ""),
                    fields.get("thumbnail", ""),
                    article.get("sectionName", ""),
                    article.get("webPublicationDate", ""),
                    fields.get("trailText", ""),
                    extract_plain_text(fields),
                    fields.get("byline", ""),
                ]
                writer.writerow(row)
                all_articles.append(row)

            print(f"Page {page} done, got {len(results)} articles.")
            sleep(1)  # polite delay

    print(f"Saved {len(all_articles)} articles to {output_file}")
    return all_articles


In [4]:
from pathlib import Path
import pickle

# Run parameters for this crawl.
pages = 1            # passed to crawl_guardian as its `pages` argument
marker = "09_12_25"  # label used in the output folder and file names

# Column names, in the same order as the positional rows crawl_guardian returns.
cols = [
    "id", "title", "headline", "url", "thumbnail",
    "section", "published_at", "description", "body", "author",
]

# Crawl, then convert every row into a dict keyed by column name.
all_articles = []
for row in crawl_guardian(pages):
    all_articles.append(dict(zip(cols, row)))

# Store the raw crawl under <ROOT>/pickled_data/<marker>/.
ROOT = Path("d:/DH/Senior/Paperboy/src")  # your project root
marker_dir = ROOT.joinpath("pickled_data", marker)
marker_dir.mkdir(parents=True, exist_ok=True)

file_path = marker_dir.joinpath(f"raw_crawl_{marker}.pkl")
with file_path.open("wb") as f:
    pickle.dump(all_articles, f)
print(f"Pickled {len(all_articles)} stories.")

Page 1 done, got 200 articles.
Page 2 done, got 200 articles.
Saved 400 articles to guardian_articles.csv
Pickled 400 stories.


In [5]:
# Sanity check: print the first element of all_articles in full.
print(all_articles[0])



In [6]:
import sys, time, random
sys.path.append("D:/DH/Senior/Paperboy")  
from src.api.embedding.embedding import embed_bgem3

MAX_ATTEMPTS = 8       # tries per article before it is recorded as failed
MAX_TEXT_CHARS = 4000  # title + description text is truncated to this length

count = 0    # number of articles embedded successfully
failed = []  # 0-based indices into all_articles that never produced an embedding

for i, doc in enumerate(all_articles):
    # Only the title and description are embedded; slicing is a no-op for
    # shorter strings, so no length check is needed.
    text = f"{doc['title']}\n\n{doc['description']}"[:MAX_TEXT_CHARS]

    for attempt in range(MAX_ATTEMPTS):
        data = None
        try:
            embedded = embed_bgem3(text)
            data = embedded.get("data", None)
            if not data:
                print(f"⚠️ Empty response for article {i}, retrying...")
        except Exception as e:
            print(f"❌ Error on article {i}: {e}. Retrying...")

        if data:
            # assumes the first element of "data" is the embedding - TODO confirm
            doc['embedding'] = data[0]  # add embedding
            print(f"✅ Embedded article {i+1}/{len(all_articles)}")
            count += 1
            break  # success, move on

        if attempt < MAX_ATTEMPTS - 1:
            # Exponential backoff with jitter, driven by the number of failed
            # attempts for THIS article (1s, 2s, 4s, ... capped at 60s).
            sleep_time = min(60, 2 ** attempt) + random.random()
            time.sleep(sleep_time)
    else:
        # Every attempt failed: record it instead of retrying forever, and make
        # sure the key exists so later cells can test it for None.
        doc.setdefault('embedding', None)
        failed.append(i)
        print(f"🛑 Giving up on article {i} after {MAX_ATTEMPTS} attempts.")

print(f"\nFinished. Embedded {count} articles. Failed: {failed}")



✅ Embedded article 1/400
✅ Embedded article 2/400
✅ Embedded article 3/400
✅ Embedded article 4/400
✅ Embedded article 5/400
✅ Embedded article 6/400
✅ Embedded article 7/400
✅ Embedded article 8/400
✅ Embedded article 9/400
✅ Embedded article 10/400
✅ Embedded article 11/400
✅ Embedded article 12/400
✅ Embedded article 13/400
✅ Embedded article 14/400
✅ Embedded article 15/400
✅ Embedded article 16/400
✅ Embedded article 17/400
✅ Embedded article 18/400
✅ Embedded article 19/400
✅ Embedded article 20/400
✅ Embedded article 21/400
✅ Embedded article 22/400
✅ Embedded article 23/400
✅ Embedded article 24/400
✅ Embedded article 25/400
✅ Embedded article 26/400
✅ Embedded article 27/400
✅ Embedded article 28/400
✅ Embedded article 29/400
✅ Embedded article 30/400
✅ Embedded article 31/400
✅ Embedded article 32/400
✅ Embedded article 33/400
✅ Embedded article 34/400
✅ Embedded article 35/400
✅ Embedded article 36/400
✅ Embedded article 37/400
✅ Embedded article 38/400
✅ Embedded article 39

In [12]:
from pathlib import Path
import pickle

# Save the current all_articles list (presumably including the embeddings added
# by the embedding cell) under <ROOT>/pickled_data/<marker>/.
# Relies on `marker` and `all_articles` from earlier cells.
ROOT = Path("d:/DH/Senior/Paperboy/src")  # your project root
marker_dir = ROOT / "pickled_data" / marker
marker_dir.mkdir(parents=True, exist_ok=True)

file_path = marker_dir / f"articles_{marker}.pkl"

with open(file_path, "wb") as f:
    pickle.dump(all_articles, f)

# This cell writes the pickle, so report a save (the old message said "Loaded").
print(f"Saved {len(all_articles)} articles to {file_path}")

Loaded: 400


In [8]:
%pip install pandas




In [14]:
import pandas as pd

# Inspect a single article (index 66 of all_articles), one field per line.
row = all_articles[66]
for label, key in [
    ("Title:", 'title'),
    ("Headline:", 'headline'),
    ("URL:", 'url'),
    ("Thumbnail:", 'thumbnail'),
    ("Section:", 'section'),
    ("Published at:", 'published_at'),
    ("Trail text:", 'description'),
    ("body:", 'body'),
    ("Author:", 'author'),
    ("embedding:", 'embedding'),
]:
    print(label, row[key])

Title: Sydney Sweeney, Richard Linklater and Emma Thompson are up for most egregious snub in the 2026 Golden Globe nominations
Headline: Sydney Sweeney, Richard Linklater and Emma Thompson are up for most egregious snub in the 2026 Golden Globe nominations
URL: https://www.theguardian.com/film/2025/dec/08/golden-globe-nominations-2026-snub-richard-linklater-emma-thompson-paul-thomas-anderson
Thumbnail: https://media.guim.co.uk/397c5ccd3bb27be9e9579eb2d16b17e87d280503/2748_1201_3665_2932/500.jpg
Section: Film
Published at: 2025-12-08T17:43:30Z
Trail text: Linklater is missing from the best director list despite having two nominated films, and actors including Sydney Sweeney and Josh O’Connor are nowhere to be seen. It looks like Paul Thomas Anderson’s year
body: It’s become traditional to look for the snubs in any award list – and heaven help anyone whose job it is to curate the “in memoriam” montage on the night and then the next morning apologise for the inevitable hurtful omissions. 

In [11]:
# Data-quality check: how many articles lack an embedding or a title.
missing_embedding_count = 0
missing_title_count = 0

for article in all_articles:
    # Identity test rather than `== None`: equality would be evaluated
    # element-wise if the embedding is an array-like object.
    if article.get('embedding') is None:
        missing_embedding_count += 1
    # Counts both a missing/None title and an empty string.
    if not article.get('title'):
        missing_title_count += 1

total = len(all_articles)
print(f"Total articles: {total}")
if total:
    print(f"Articles with missing 'embedding': {missing_embedding_count} ({missing_embedding_count/total*100:.2f}%)")
    print(f"Articles with empty 'title': {missing_title_count} ({missing_title_count/total*100:.2f}%)")
else:
    # Avoid dividing by zero when nothing was crawled.
    print("No articles to check.")

Total articles: 400
Articles with empty 'text': 0 (0.00%)
Articles with empty 'full_text': 0 (0.00%)


In [10]:
import psycopg2

def get_connection():
    """
    Open a new psycopg2 connection to the Paperboy Postgres database.

    Every setting can be overridden with an environment variable
    (PAPERBOY_DB_HOST, PAPERBOY_DB_PORT, PAPERBOY_DB_NAME, PAPERBOY_DB_USER,
    PAPERBOY_DB_PASSWORD); when a variable is unset the previously hard-coded
    value is used, so existing runs behave as before.

    Returns:
        The object returned by ``psycopg2.connect``. The caller is responsible
        for closing it.
    """
    import os  # local import keeps this cell self-contained

    # SECURITY: the fallback credentials below are committed in the notebook.
    # Set the environment variables instead, then change the password and
    # remove the literals.
    env = os.environ
    return psycopg2.connect(
        host=env.get("PAPERBOY_DB_HOST", "localhost"),
        port=int(env.get("PAPERBOY_DB_PORT", 5436)),
        dbname=env.get("PAPERBOY_DB_NAME", "paperboy"),
        user=env.get("PAPERBOY_DB_USER", "minhhieu"),
        password=env.get("PAPERBOY_DB_PASSWORD", "minhhieu888"),
    )

# Insert-if-absent helper (kept under its original "upsert" name)
def upsert_articles(all_articles):
    """
    Insert articles into the ``articles`` table, skipping ids that already exist.

    Args:
        all_articles: sequence of dicts providing the keys id, title, headline,
            url, thumbnail, section, published_at, description, body, author
            and embedding (the named parameters of the INSERT statement).

    Despite the name, existing rows are never updated: ON CONFLICT (id) DO
    NOTHING leaves them untouched. The printed figure is the number of articles
    submitted, not the number of rows actually inserted.
    """
    sql = """
    INSERT INTO articles (id, title, headline, url, thumbnail, section, published_at, description, body, author, embedding)
    VALUES (%(id)s, %(title)s, %(headline)s, %(url)s, %(thumbnail)s, %(section)s, %(published_at)s, %(description)s, %(body)s, %(author)s, %(embedding)s)
    ON CONFLICT (id) DO NOTHING;
    """

    conn = get_connection()
    try:
        # With psycopg2, `with conn` only scopes the transaction (commit on
        # success, rollback on error); it does NOT close the connection, hence
        # the explicit close() in `finally`.
        with conn:
            with conn.cursor() as cur:
                cur.executemany(sql, all_articles)  # batch insert
    finally:
        conn.close()
    print(f"Upserted {len(all_articles)} articles.")
    
    
# Send the in-memory all_articles list to the database via upsert_articles.
upsert_articles(all_articles)

Upserted 400 articles.
