Get URLs from Sitemap

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Get URLs from sitemap

sitemap_url = 'https://www.delfi.ee/sitemap/news-1.xml'

# Bound the wait and fail loudly on an HTTP error, so an error page is never
# parsed into an empty (but plausible-looking) spreadsheet.
response = requests.get(sitemap_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml')

url_tags = soup.find_all("url")

# Each <url> entry carries its address in a <loc> child; entries without one
# are skipped instead of raising AttributeError.
urls = [url.loc.text.strip() for url in url_tags if url.loc is not None]

df = pd.DataFrame(urls, columns=["URL"])
df.to_excel("Delfi_URLs.xlsx", index=False)
# Call the method: a bare `df.head` only displays the bound method object.
df.head()

Get URLs from sections

In [None]:
import os
import requests
import pandas as pd
from tqdm import tqdm
import json

# Base API URL for Delfi (GraphQL endpoint queried with GET requests by fetch_articles)
API_URL = "https://content.api.delfi.ee/content/v3/graphql"

# Query parameters for fetching articles.
# fetch_articles sends "operationName" as-is and "extensions" JSON-encoded.
# NOTE(review): the sha256Hash presumably identifies a query stored on the
# server side; confirm it is still current if the API starts rejecting requests.
QUERY_PARAMS = {
    "operationName": "portal_root_getCategories",
    "extensions": {
        "persistedQuery": {
            "version": 1,
            "sha256Hash": "7bf55f21032140dc9aab84161289df72486f220c5a7d46084c77405c1cda83ed"
        }
    }
}

# Function to fetch only the `id` and `url`
def fetch_articles(section_id, limit, offset):
    """Fetch one page of article stubs for a section.

    Args:
        section_id: Section identifier, sent as the ``id`` query variable.
        limit: Number of articles requested for this page.
        offset: Number of articles to skip before this page.

    Returns:
        A list of ``{"id": ..., "url": ...}`` dicts. The list is empty when
        the request fails, the status is not 200, or the payload does not
        have the expected structure; this function never raises for those.
    """
    variables = {
        "getCount": True,
        "issueOnly": "false",
        "offset": offset,
        "limit": limit,
        "id": section_id,
        "domain": "www.delfi.ee",
    }
    try:
        response = requests.get(
            API_URL,
            params={
                "operationName": QUERY_PARAMS["operationName"],
                "variables": json.dumps(variables),
                "extensions": json.dumps(QUERY_PARAMS["extensions"]),
            },
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Error fetching articles: {e}")
        return []

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return []

    try:
        data = response.json()
        # dict.get() only falls back to its default when the key is missing,
        # not when the value is null, hence the explicit `or` fallbacks.
        headlines = (data.get("data") or {}).get("headlines") or {}
        items = headlines.get("items") or []
        return [
            {"id": article["id"], "url": f"https://www.delfi.ee/{article['id']}"}
            for article in items
        ]
    except (ValueError, KeyError, TypeError, AttributeError) as e:
        # Invalid JSON (ValueError) or an unexpected payload shape.
        print(f"Error fetching articles: {e}")
        return []

# Main function to collect articles
def collect_articles(section_name, section_id, max_articles=None):
    """Collect up to ``max_articles`` article stubs for one section.

    Pages through ``fetch_articles`` until enough articles are collected, a
    page comes back empty, or the offset reaches the API limit of 10,000.

    Args:
        section_name: Label used in the progress-bar description.
        section_id: Section identifier passed on to ``fetch_articles``.
        max_articles: Maximum number of articles to collect. When omitted,
            the module-level ``max_articles`` is used if it exists (this is
            how existing two-argument callers configure the cap), otherwise
            10,000.

    Returns:
        A list of the dicts returned by ``fetch_articles``, at most
        ``max_articles`` long as long as each page honours the requested limit.
    """
    if max_articles is None:
        # Backward compatibility: this function used to read the global
        # directly, which raised NameError whenever the global was not set.
        max_articles = globals().get("max_articles", 10_000)

    collected_articles = []
    limit = 41  # Delfi API limit
    max_offset = 10000  # API limit on the offset
    offset = 0

    with tqdm(total=max_articles, desc=f"Scraping {section_name}") as pbar:
        while len(collected_articles) < max_articles:
            # Stop if we're about to exceed the API limit of 10,000
            if offset >= max_offset:
                break

            # Shrink the last batch so that neither the API limit nor the
            # requested total is exceeded
            remaining = max_articles - len(collected_articles)
            batch_limit = min(limit, max_offset - offset, remaining)

            articles = fetch_articles(section_id, batch_limit, offset)
            if not articles:
                break

            collected_articles.extend(articles)
            offset += batch_limit
            pbar.update(len(articles))

    return collected_articles

# Write one section's articles to delfi_sections/<section_name>.csv
def save_to_csv(section_name, articles):
    """Save ``articles`` as a CSV file named after the section.

    The ``delfi_sections`` directory is created when missing. The file is
    written without the DataFrame index, and a one-line summary is printed.
    """
    output_dir = "delfi_sections"
    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, f"{section_name}.csv")
    frame = pd.DataFrame(articles)
    frame.to_csv(file_path, index=False)

    print(f"Saved {len(articles)} articles to {file_path}")

if __name__ == "__main__":
    # Sections to scrape, as {section name: section ID}.
    # Uncomment the entries you want to collect.
    sections = {
        #"eesti": 120,
        #"maailm": 123,
        #"jalgpall": 24314751,
        #"arvamus": 67583634,
        #"kultuur": 67583652,
        #"majandus": 96669082,
        #"krimi": 1727246,
        #"tehnoloogia": 92429137,
        #"korvpall": 24172545,
        #"digi": 19375415,
        #"teadus": 19375436,
    }

    # Cap on the number of articles per section; collect_articles reads this
    # module-level name.
    max_articles = 10_000

    # Scrape each section in turn and store whatever was found
    for section_name, section_id in sections.items():
        print(f"Starting scrape for section: {section_name}")
        articles = collect_articles(section_name, section_id)
        if not articles:
            print(f"No articles found for section: {section_name}")
            continue
        save_to_csv(section_name, articles)


Get URLs from topics

In [None]:
import os
import requests
import pandas as pd
from tqdm import tqdm
import json
import time

# Timeout duration in seconds (passed as `timeout` to every request in fetch_articles_by_topic)
TIMEOUT = 30

# Base API URL for Delfi (GraphQL endpoint queried with GET requests)
API_URL = "https://content.api.delfi.ee/content/v3/graphql"

# Query parameters for topic-based fetching.
# fetch_articles_by_topic sends "operationName" as-is and "extensions" JSON-encoded.
# NOTE(review): the sha256Hash presumably identifies a query stored on the
# server side; confirm it is still current if the API starts rejecting requests.
QUERY_PARAMS = {
    "operationName": "portal_root_getTags",
    "extensions": {
        "persistedQuery": {
            "version": 1,
            "sha256Hash": "176980ee3361fa76a9757213443e0f8e4f0d62864af7bf97550d248a5f9243a1"
        }
    }
}

# Function to fetch only the `id` and `url` for a topic
def fetch_articles_by_topic(topic_id, limit, offset, retries=3):
    """Fetch one page of article stubs for a topic, retrying transient failures.

    Args:
        topic_id: Topic identifier, sent as the ``id`` query variable.
        limit: Number of articles requested for this page.
        offset: Number of articles to skip before this page.
        retries: Maximum number of attempts. Network errors and non-200
            responses are retried, with a 5 second pause before each retry.

    Returns:
        A list of ``{"id": ..., "url": ...}`` dicts. The list is empty when
        every attempt failed or when a 200 response carried a payload that
        could not be parsed; this function never raises for those.
    """
    variables = {
        "getCount": True,
        "issueOnly": "false",
        "offset": offset,
        "limit": limit,
        "id": topic_id,
        "authorLanguage": "ET",
        "channelLanguage": "ET",
    }
    # The request parameters are identical for every attempt, so build them once.
    params = {
        "operationName": QUERY_PARAMS["operationName"],
        "variables": json.dumps(variables),
        "extensions": json.dumps(QUERY_PARAMS["extensions"]),
    }

    for attempt in range(retries):
        if attempt:
            # Pause before a retry only; never after the final attempt.
            time.sleep(5)

        try:
            response = requests.get(API_URL, params=params, timeout=TIMEOUT)
        except requests.RequestException as e:
            # Timeouts and connection errors are transient: try again.
            print(f"Error fetching articles for topic {topic_id}: {e}")
            continue

        if response.status_code != 200:
            print(f"Request failed with status code {response.status_code}")
            continue

        try:
            data = response.json()
            # dict.get() only falls back to its default when the key is
            # missing, not when the value is null.
            headlines = (data.get("data") or {}).get("headlines") or {}
            items = headlines.get("items") or []
            return [
                {"id": article["id"], "url": f"https://www.delfi.ee/{article['id']}"}
                for article in items
            ]
        except (ValueError, KeyError, TypeError, AttributeError) as e:
            # A malformed payload will not improve on retry.
            print(f"Error fetching articles for topic {topic_id}: {e}")
            return []

    print(f"Failed to fetch articles for topic {topic_id} after {retries} attempts.")
    return []

# Main function to collect articles by topic with a progress bar
def collect_articles_by_topic(topic_name, topic_id, max_articles):
    """Collect up to ``max_articles`` article stubs for one topic.

    Pages through ``fetch_articles_by_topic`` in batches of at most 41 until
    ``max_articles`` have been requested or a page comes back empty.

    Args:
        topic_name: Label used in the progress-bar description.
        topic_id: Topic identifier passed on to ``fetch_articles_by_topic``.
        max_articles: Maximum number of articles to collect.

    Returns:
        A list of the dicts returned by ``fetch_articles_by_topic``, at most
        ``max_articles`` long.
    """
    collected_articles = []
    limit = 41
    offset = 0

    with tqdm(total=max_articles, desc=f"Scraping {topic_name}") as pbar:
        while len(collected_articles) < max_articles and offset < max_articles:
            # Shrink the final batch instead of skipping it. No request ever
            # reaches past max_articles, and a max_articles smaller than one
            # full page still yields results.
            batch_limit = min(limit, max_articles - offset)

            articles = fetch_articles_by_topic(topic_id, batch_limit, offset)
            if not articles:
                break
            collected_articles.extend(articles)
            offset += batch_limit
            pbar.update(len(articles))

    return collected_articles[:max_articles]

# Write one topic's articles to delfi_topics/<topic_name>.csv
def save_to_csv(topic_name, articles):
    """Save ``articles`` as a CSV file named after the topic.

    The ``delfi_topics`` directory is created when missing. The file is
    written without the DataFrame index, and a one-line summary is printed.
    """
    output_dir = "delfi_topics"
    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, f"{topic_name}.csv")
    frame = pd.DataFrame(articles)
    frame.to_csv(file_path, index=False)

    print(f"Saved {len(articles)} articles to {file_path}")

if __name__ == "__main__":
    # Topics to scrape, as {topic name: topic ID}.
    # Uncomment or add the entries you want to collect.
    topics = {
        #"koroona": 88761335,
        #"etc": 123
    }

    # Upper bound on the number of articles collected per topic
    max_articles = 10000

    # Scrape each topic in turn and store whatever was found
    for topic_name, topic_id in topics.items():
        print(f"Starting scrape for topic: {topic_name}")
        articles = collect_articles_by_topic(topic_name, topic_id, max_articles)
        if not articles:
            print(f"No articles found for topic: {topic_name}")
            continue
        save_to_csv(topic_name, articles)


Fetch And Update Articles

In [None]:
import requests
import json
import pandas as pd
from tqdm import tqdm
import time
from bs4 import BeautifulSoup

# Base API URL for Delfi (GraphQL endpoint; fetch_article_content sends GET requests to it)
API_URL = "https://content.api.delfi.ee/content/v3/graphql"

def clean_html(html_content):
    """Return the text of an HTML fragment with all markup removed.

    Link text is kept (only the ``<a>`` tags themselves disappear), and the
    whitespace between text and inline tags is preserved so neighbouring
    words are not glued together.

    Args:
        html_content: HTML source as a string; ``None`` or ``""`` is accepted.

    Returns:
        The fragment's text with leading and trailing whitespace removed,
        or ``""`` for empty input.
    """
    if not html_content:
        return ""
    soup = BeautifulSoup(html_content, "html.parser")
    # get_text(strip=True) would strip every text node separately and join
    # them with an empty separator ("Read <a>this</a> now" -> "Readthisnow"),
    # so only the final string is stripped here.
    return soup.get_text().strip()

# Collect the texts contributed by one "sidebar" fragment: title first, then body
def _sidebar_texts(fragment):
    attrs = fragment.get("attrs", {})
    texts = [clean_html(attrs.get("title", {}).get("text", "")).replace("\n", " ")]

    body_attrs = attrs.get("body", {})
    if 'text' in body_attrs:
        texts.append(clean_html(body_attrs.get("text", "")).replace("\n", " "))
    else:
        texts.extend(
            clean_html(content['html']).replace("\n", " ")
            for content in body_attrs.get("content", [])
            if 'html' in content
        )
    return texts


# Turn an article "content" dict into (space-joined text, list of link targets)
def _extract_text_and_hrefs(article):
    lead_content = article.get("lead", {}).get("content", [])
    lead_html = lead_content[0].get("html", "") if lead_content else ""

    # The lead always comes first; every fragment text is appended after it.
    parts = [clean_html(lead_html)]
    hrefs = []

    for fragment in article.get("body", {}).get("content", []):
        fragment_type = fragment.get("type")
        if fragment_type in ("paragraph", "heading", "span"):
            parts.append(clean_html(fragment.get("html", "")))
        elif fragment_type == "pullout":
            parts.append(clean_html(fragment.get("attrs", {}).get("text", {}).get("html", "")))
        elif fragment_type == "sidebar":
            parts.extend(_sidebar_texts(fragment))

        # Extract hrefs if present (from the fragment's own html, whatever its type)
        fragment_html = fragment.get("html", "")
        if fragment_html:
            soup = BeautifulSoup(fragment_html, "html.parser")
            hrefs.extend(a['href'] for a in soup.find_all('a', href=True))

    return " ".join(parts), hrefs


# Validate a decoded API response and extract the article text from it
def _parse_article_response(response_json, article_id):
    # dict.get() only falls back to its default when the key is missing,
    # not when the value is null, hence the explicit `or` fallbacks.
    article_node = (response_json.get("data") or {}).get("article") or {}
    article_data = article_node.get("data") or []
    if not article_data:
        print(f"No valid article data found for ID {article_id}.")
        return None

    article = article_data[0].get("content", {})  # Use the first element of the list

    # Check paywall access
    paywall_access = article.get("paywall", {}).get("access", True)
    if not paywall_access:
        print(f"Article ID {article_id} is behind a paywall and cannot be accessed.")
        return None

    return _extract_text_and_hrefs(article)


# Fetch article content using the API structure
def fetch_article_content(session, article_id, retries=2, timeout=10):
    """Fetch one article and return its plain text and the links it contains.

    Args:
        session: ``requests.Session`` used for the request.
        article_id: Article identifier, sent as the ``id`` query variable.
        retries: Maximum number of attempts. Network errors and HTTP 503
            are retried, with a 1 second pause before each retry.
        timeout: Per-request timeout in seconds.

    Returns:
        ``(text, hrefs)`` where ``text`` is the lead and body text joined by
        single spaces and ``hrefs`` is a list of link targets found in the
        body fragments. ``None`` when the article is missing, is behind a
        paywall, or could not be fetched or parsed.
    """
    variables = {"id": article_id}
    extensions = {
        "persistedQuery": {
            "version": 1,
            "sha256Hash": "805a5fb102a1fa46e23693593ca6994fe556d7e087edc3281cf8c940840e1daf"
        }
    }
    # The request parameters are identical for every attempt, so build them once.
    params = {
        "operationName": "portal_root_getArticleBodyByID",
        "variables": json.dumps(variables),
        "extensions": json.dumps(extensions)
    }

    for attempt in range(retries):
        if attempt:
            # Pause before a retry only; never after the final attempt.
            time.sleep(1)

        try:
            response = session.get(API_URL, params=params, timeout=timeout)
        except requests.RequestException as e:
            # Timeouts and connection errors are transient: try again.
            print(f"Error fetching article body for ID {article_id}: {e}")
            continue

        if response.status_code == 503:
            print(f"Server unavailable (Attempt {attempt + 1} of {retries})")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch article body for ID {article_id}. Status code: {response.status_code}")
            return None

        try:
            return _parse_article_response(response.json(), article_id)
        except Exception as e:
            # Deliberately broad: this is the per-article boundary of a long
            # scrape, and one malformed payload must not abort the whole run.
            print(f"Error fetching article body for ID {article_id}: {e}")
            return None

    print("Failed after maximum retries")
    return None

# Update the CSV with the fetched article body
def update_csv_with_article_content(session, file_path, delay):
    """Fill the ``content`` and ``hrefs`` columns of an article CSV in place.

    Rows that already have a ``content`` value are skipped, so an interrupted
    or partially failed run can be resumed by calling this function again.

    Args:
        session: ``requests.Session`` passed on to ``fetch_article_content``.
        file_path: Path of a CSV file with an ``id`` column; it is overwritten
            with the updated table.
        delay: Seconds to sleep after each fetched article.

    Returns:
        None. Prints a message and returns early when ``file_path`` is empty
        or does not exist.
    """
    if not file_path:
        print(f"File not found: {file_path}")
        return

    # Read the existing CSV
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return

    for column in ("content", "hrefs"):
        # Create the column only when it is missing, so results from an
        # earlier run survive and the skip check below can take effect.
        if column not in df.columns:
            df[column] = None
        # A column read back as all-NaN has a float dtype; object dtype is
        # needed to store strings and lists in individual cells.
        df[column] = df[column].astype(object)

    # Process each article ID
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Updating {file_path}"):
        if not pd.isnull(row["content"]):
            continue

        article_id = row["id"]
        result = fetch_article_content(session, article_id)
        if result is not None:
            content, hrefs = result
            df.at[index, "content"] = content
            df.at[index, "hrefs"] = hrefs
        # Delay between requests
        time.sleep(delay)

    # Save the updated CSV
    df.to_csv(file_path, index=False)
    print(f"Updated file saved to {file_path}")

# Main entry point
if __name__ == "__main__":
    # Define the sections to update (CSV file names without the extension)
    sections = ["soda"]
    #base_path = "delfi_sections"
    base_path = "delfi_topics"

    # The context manager closes the session's pooled connections once all
    # files are processed, even if an update raises.
    with requests.Session() as session:
        # ADD YOUR BEARER TOKEN HERE
        session.headers.update({
            "Authorization": ""
        })

        for section_name in sections:
            csv_file = f"{base_path}/{section_name}.csv"
            print(f"Processing {csv_file}...")
            update_csv_with_article_content(session, csv_file, delay=0)
