In [None]:
!pip install requests beautifulsoup4 pandas



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

In [None]:
def get_guardian_articles(base_url, max_articles=20, timeout=10):
    """Scrape article links from a listing page and enrich each with metadata.

    Fetches ``base_url``, selects every ``<a data-link-name='article'>``
    anchor, and for each distinct link downloads the article page to read its
    ``description`` and ``article:published_time`` ``<meta>`` tags.

    Args:
        base_url: URL of the listing page; relative hrefs are resolved
            against it.
        max_articles: Maximum number of articles to return. Values <= 0
            return an empty list without performing any network request.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``description`` and
        ``date`` (the first 10 characters of the published timestamp).
        A missing meta tag yields ``"No description"`` / ``"No date"``; a
        failed article download yields ``"Error"`` for both fields.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched or answers with an HTTP error status.
    """
    # Nothing requested: avoid hitting the network at all.
    if max_articles <= 0:
        return []

    response = requests.get(base_url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into zero results.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = []
    seen = set()

    for link in soup.select("a[data-link-name='article']"):
        href = link.get("href")
        title = link.get_text(strip=True)

        # urljoin(base_url, None) returns base_url itself, so anchors without
        # an href must be rejected *before* joining.
        if not href or not title:
            continue

        full_url = urljoin(base_url, href)
        if full_url in seen:
            continue
        seen.add(full_url)

        article = {
            'title': title,
            'url': full_url
        }

        try:
            article_page = requests.get(full_url, timeout=timeout)
            article_page.raise_for_status()
            article_soup = BeautifulSoup(article_page.text, 'html.parser')

            # Description
            meta = article_soup.find("meta", {"name": "description"})
            description = meta.get("content") if meta else None
            article["description"] = description.strip() if description else "No description"

            # Date
            date_tag = article_soup.find("meta", {"property": "article:published_time"})
            published = date_tag.get("content") if date_tag else None
            article["date"] = published[:10] if published else "No date"

        except requests.RequestException:
            # Best effort: one unreachable article must not abort the crawl.
            article["description"] = "Error"
            article["date"] = "Error"

        articles.append(article)

        if len(articles) >= max_articles:
            break

    return articles

In [None]:
# Listing page to crawl; relative article links are resolved against it.
BASE_URL = "https://www.theguardian.com/international"

# Collect up to 20 article records (title, url, description, date).
data = get_guardian_articles(base_url=BASE_URL, max_articles=20)

# Tabulate the records and persist them without the positional index column.
df = pd.DataFrame(data)
df.to_csv("guardian_articles.csv", index=False)

# Report how many records were gathered, then preview the first rows.
print(f"Total articles scraped: {len(data)}")
df.head()

Total articles scraped: 20


Unnamed: 0,title,url,description,date
0,"Israel committing genocide in Gaza, say Israel...",https://www.theguardian.com/world/2025/jul/28/...,Reports detailing intentional targeting of Pal...,2025-07-28
1,Australia won’t receive Aukus nuclear submarin...,https://www.theguardian.com/world/2025/jul/28/...,Former prime minister Malcolm Turnbull says th...,2025-07-28
2,Mastercard and Visa face backlash after hundre...,https://www.theguardian.com/world/2025/jul/29/...,Payment platforms demand services remove NSFW ...,2025-07-28
3,‘There’s an arrogance to the way they move aro...,https://www.theguardian.com/world/2025/jul/27/...,"Like so many others, I moved from London to Po...",2025-07-27
4,Von der Leyen ducks Trump’s trade blitz – but ...,https://www.theguardian.com/world/2025/jul/28/...,"Europe may have staved off an economic clash, ...",2025-07-28


In [None]:
# Offer the CSV as a browser download when running on Google Colab.
# The import is guarded so the notebook still runs end-to-end on a
# non-Colab kernel, where the file simply stays on local disk.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("guardian_articles.csv")
else:
    print("Not running in Google Colab; CSV saved locally as guardian_articles.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>