Get URLs from Sitemap

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Get URLs from sitemap

sitemap_url = 'https://www.postimees.ee/sitemap/news'

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as a sitemap.
response = requests.get(sitemap_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml')

url_tags = soup.find_all("url")

# Skip <url> entries that have no <loc> child instead of failing on None.text
urls = [url.loc.text for url in url_tags if url.loc is not None]

# One row per sitemap URL
df = pd.DataFrame(urls, columns=["URL"])
#df.to_excel("Postimees_URLs.xlsx", index = False)
df.head()
print(len(df))

100


Collect URLs from Specified Subsections

In [None]:
import os
import requests
import pandas as pd
from tqdm import tqdm

# Timeout duration in seconds; passed as the `timeout` argument of the
# requests.get() call in fetch_articles
TIMEOUT = 30

# Base API URL for article fetching; the "{section}" placeholder is filled in
# with the section ID via str.format() before each request
API_URL = "https://services.postimees.ee/rest/v1/sections/{section}/articles"

# Helper: join one field of a list of dicts into a comma-separated string
def _join_values(items, key):
    """Return the non-None ``key`` values of ``items`` joined with ", "."""
    return ", ".join(
        str(item[key]) for item in items if item.get(key) is not None
    )


# Function to fetch articles and their metadata from the API
def fetch_articles(section_id, limit, offset):
    """Fetch one page of article metadata for a section.

    Args:
        section_id: Section identifier substituted into ``API_URL``.
        limit: Page size, sent as the ``limit`` query parameter.
        offset: Start position, sent as the ``offset`` query parameter.

    Returns:
        A list of flat dicts (one per article). An empty list is returned
        when the request fails, the status code is not 200, or the response
        body is not a JSON list.
    """
    params = {
        "limit": limit,
        "offset": offset,
        "filterSectionFeedArticles": "true"
    }

    # Only the network call lives in this try block, so that data-handling
    # errors further down are not mistaken for request failures.
    try:
        response = requests.get(API_URL.format(section=section_id), params=params, timeout=TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(f"Request failed for section {section_id} at offset {offset}: {e}")
        return []

    if response.status_code != 200:
        # Report the status instead of silently looking like "no more articles"
        print(f"Unexpected status {response.status_code} for section {section_id} at offset {offset}")
        return []

    try:
        data = response.json()
    except ValueError as e:
        # Covers both json.JSONDecodeError and requests' own JSONDecodeError
        print(f"Invalid JSON for section {section_id} at offset {offset}: {e}")
        return []

    if not isinstance(data, list):
        print(f"Unexpected payload for section {section_id} at offset {offset}")
        return []

    articles = []
    for article in data:
        # `or` guards against keys that are present but explicitly null
        meta = article.get("meta") or {}
        authors = article.get("authors") or []
        terms = article.get("terms") or []
        sections = article.get("sections") or []

        articles.append({
            "id": article["id"],
            "url": f"https://www.postimees.ee/{article['id']}",
            "slug": article.get("slug"),
            "headline": article.get("headline"),
            "dateCreated": article.get("dateCreated"),
            "dateModified": article.get("dateModified"),
            "datePublished": article.get("datePublished"),
            "isPremium": article.get("isPremium", False),
            "relatedArticleCount": meta.get("relatedArticleCount"),
            "wordCount": meta.get("wordCount"),
            "authors": ", ".join(
                f"{author.get('name')} ({author.get('authorType')})"
                for author in authors
            ),
            # A null term/section name would make str.join raise TypeError,
            # so None values are skipped.
            "terms": _join_values(terms, "term"),
            "sections": _join_values(sections, "name"),
        })
    return articles

# Main function to collect articles
def collect_articles(section_name, section_id, max_articles):
    """Page through a section until enough articles have been gathered.

    Pages of 100 articles are requested through ``fetch_articles`` while fewer
    than ``max_articles`` have been collected and ``offset + 100`` does not
    exceed 10000. An empty page ends the loop early. ``section_name`` is used
    only for the progress-bar label.

    Returns:
        The collected article dicts, truncated to ``max_articles`` entries.
    """
    page_size = 100
    gathered = []

    with tqdm(total=max_articles, desc=f"Scraping {section_name}") as progress:
        # Walk the offsets so that offset + page_size never exceeds 10000
        for start in range(0, 10000 - page_size + 1, page_size):
            if len(gathered) >= max_articles:
                break

            page = fetch_articles(section_id, page_size, start)
            if not page:
                break

            gathered += page
            progress.update(len(page))

    return gathered[:max_articles]

# Save articles to CSV
def save_to_csv(section_name, articles):
    """Write a section's article records to ``postimees_sections/<section_name>.csv``.

    The output directory is created on demand, the CSV is written without the
    DataFrame index, and a one-line summary is printed afterwards.
    """
    output_dir = "postimees_sections"
    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, f"{section_name}.csv")
    frame = pd.DataFrame(articles)
    frame.to_csv(file_path, index=False)

    print(f"Saved {len(articles)} articles to {file_path}")

if __name__ == "__main__":
    # Sections to scrape: section name -> section ID handed to collect_articles
    sections = {
        #"eesti": 122,
        "majandus": 517,
        "maailm": 123,
        "sport": 124,
        "arvamus": 127,
        "kultuur": 2187,
        "haridus": 5304,
        "teadus": 3371,
        "soda": 5821,
        "koroona": 5246
    }

    # Per-section article cap
    max_articles = 10_000  # 10,000 is the maximum the API allows

    # Scrape each section in turn and write one CSV per section
    for section_name, section_id in sections.items():
        print(f"Starting scrape for section: {section_name}")
        articles = collect_articles(section_name, section_id, max_articles)

        if not articles:
            print(f"No articles found for section: {section_name}")
            continue

        save_to_csv(section_name, articles)


Without Selenium (Fetch and Update Articles using requests)

In [None]:
import requests
import json
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import time

# Load cookies from a JSON file
def load_cookies(file_path):
    """Read a JSON cookie export and return it as a ``{name: value}`` dict.

    Args:
        file_path: Path to a JSON file holding a list of cookie objects, each
            with at least ``"name"`` and ``"value"`` keys.

    Returns:
        A dict mapping cookie names to values, in the shape accepted by
        ``requests`` cookie jars.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a cookie entry lacks ``"name"`` or ``"value"``.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, which would mis-read non-ASCII cookie values.
    with open(file_path, "r", encoding="utf-8") as file:
        cookies = json.load(file)
    # Convert cookies to a format compatible with requests
    return {cookie["name"]: cookie["value"] for cookie in cookies}

# Fetch article content using requests
def fetch_article_content(session, article_url, article_id):
    """Download an article page and extract its body text and link targets.

    Args:
        session: Object with a ``get(url, timeout=...)`` method, such as a
            ``requests.Session``.
        article_url: URL of the article page.
        article_id: Value expected in the ``data-article-id`` attribute of the
            page's ``<article>`` element.

    Returns:
        A ``(content, hrefs)`` tuple. ``content`` is the text of all ``<p>``
        and ``<h2>`` elements joined by single spaces, and ``hrefs`` is the
        ``href`` values of all ``<a>`` elements joined by ", ". Either item is
        ``None`` when nothing was found, and both are ``None`` on any failure.
    """
    try:
        response = session.get(article_url, timeout=10)  # Adjust timeout as needed
        if response.status_code != 200:
            print(f"Failed to fetch {article_url}. Status code: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.text, "html.parser")

        # Locate the correct article by matching data-article-id
        article_selector = f'article[data-article-id="{article_id}"]'
        article_element = soup.select_one(article_selector)
        if article_element is None:
            print(f"Article with id {article_id} not found in {article_url}")
            return None, None

        # Locate the body containers by their CSS class
        content_items = article_element.find_all(
            "div", class_="article-body__item--htmlElement"
        )
        if not content_items:
            print(f"No content containers found for {article_id} in {article_url}")
            return None, None

        # Extract text from <p> and <h2> and hrefs from <a> tags within each container
        content = []
        hrefs = []
        for container in content_items:
            for element in container.find_all(["p", "h2"]):
                # get_text(strip=True) strips every text fragment and joins
                # them with no separator, gluing words around inline tags
                # ("see <a>this</a> now" -> "seethisnow"). Collapsing
                # whitespace on the raw text keeps the spacing intact.
                text = " ".join(element.get_text().split())
                if text:
                    content.append(text)
            hrefs.extend(link["href"] for link in container.find_all("a", href=True))

        # Build the results once, after every container has been processed
        content_text = " ".join(content) if content else None
        hrefs_list = ", ".join(hrefs) if hrefs else None
        return content_text, hrefs_list
    except Exception as e:
        # Deliberately broad: one bad article is reported and skipped rather
        # than aborting a long scraping run.
        print(f"Error fetching {article_url}: {e}")
        return None, None

def update_csv_with_content(session, file_path, delay):
    """Fill the ``content`` and ``hrefs`` columns of a section CSV in place.

    Rows that already have both values are skipped. Every other row is
    fetched through ``fetch_article_content`` using its ``url`` and ``id``
    columns, followed by a pause of ``delay`` seconds.

    Args:
        session: Session object forwarded to ``fetch_article_content``.
        file_path: Path of the CSV file to read and overwrite.
        delay: Seconds to sleep after each fetch.
    """
    # Imported locally because os is not among this section's imports
    import os

    # Check that the file really exists; testing only for an empty path let
    # a missing file fall through to pd.read_csv and raise there.
    if not file_path or not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        return

    # Read the existing CSV
    df = pd.read_csv(file_path)

    for column in ("content", "hrefs"):
        if column not in df.columns:
            df[column] = None
        # A column that is entirely empty in the CSV is read back as float64;
        # cast to object so that string values can be assigned into it.
        df[column] = df[column].astype(object)

    try:
        for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Updating {file_path}"):
            if not pd.isnull(row["content"]) and not pd.isnull(row["hrefs"]):
                continue
            content, hrefs = fetch_article_content(session, row["url"], row["id"])
            if content:
                df.at[index, "content"] = content
            if hrefs:
                df.at[index, "hrefs"] = hrefs
            time.sleep(delay)
    finally:
        # Save even when the loop is interrupted or fails, so that already
        # fetched articles are not lost.
        df.to_csv(file_path, index=False)
        print(f"Updated file saved to {file_path}")

if __name__ == "__main__":
    # Cookies come from a JSON export on disk
    cookies_file = "www.postimees.ee_cookies.json"  # Update with actual path
    cookies = load_cookies(cookies_file)

    # One shared requests session carrying those cookies
    session = requests.Session()
    session.cookies.update(cookies)

    # Sections whose CSV files should be filled in
    # (other candidates: "sport", "arvamus", "kultuur", "haridus", "teadus", "soda")
    sections = ["koroona"]
    base_path = "postimees_sections"

    for section_name in sections:
        csv_file = f"{base_path}/{section_name}.csv"
        print(f"Processing {csv_file}...")
        update_csv_with_content(session, csv_file, delay=1)


Selenium

In [None]:
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import json
import pandas as pd
import time
from tqdm import tqdm

def check_and_refresh_cookies(cookie_file):
    """Load cookies from a JSON file and report the ones that have expired.

    A cookie counts as expired when it has an ``expirationDate`` value that
    is earlier than the current ``time.time()``. Cookies without that key are
    left alone. Nothing is refreshed yet (see the TODO below).

    Args:
        cookie_file: Path to a JSON file containing a list of cookie dicts.

    Returns:
        The list of cookie dicts exactly as loaded from the file.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding
    with open(cookie_file, "r", encoding="utf-8") as file:
        cookies = json.load(file)

    now = time.time()
    for cookie in cookies:
        expiration = cookie.get("expirationDate")
        # A missing or null expirationDate is skipped rather than compared
        if expiration is not None and now > expiration:
            print(f"Cookie {cookie['name']} has expired.")
            # TODO: Handle relogin or cookie update

    return cookies

#check_and_refresh_cookies("www.postimees.ee_cookies.json")

def load_cookies(file_path):
    """Return the parsed contents of a JSON cookie file.

    Args:
        file_path: Path to the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file. The caller in this
        section iterates it as a list of cookie dicts.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
    
def add_cookies_to_driver(driver, cookies, base_url):
    """Open ``base_url`` in the driver, then add every cookie to it.

    Each cookie dict must provide ``name``, ``value``, ``domain`` and
    ``path``. ``secure`` and ``httpOnly`` default to ``False`` when absent.
    """
    # Navigate first, then pause two seconds to give the page time to load
    driver.get(base_url)
    time.sleep(2)

    for entry in cookies:
        payload = {
            "name": entry["name"],
            "value": entry["value"],
            "domain": entry["domain"],
            "path": entry["path"],
            "secure": entry.get("secure", False),
            "httpOnly": entry.get("httpOnly", False),
        }
        driver.add_cookie(payload)

def setup_driver():
    """Create a Chrome WebDriver that uses the bundled chromedriver binary.

    Chrome is started with ``--no-sandbox``. Headless mode is currently
    disabled.
    """
    options = Options()
    # Headless mode is switched off: options.add_argument("--headless")
    options.add_argument("--no-sandbox")

    service = Service("chromedriver-win64/chromedriver.exe")
    return webdriver.Chrome(service=service, options=options)

def fetch_article_content(driver, article_url, article_id):
    """Open an article in the browser and return its body text.

    Args:
        driver: Selenium WebDriver used to load the page.
        article_url: URL of the article page.
        article_id: Value expected in the ``data-article-id`` attribute of the
            page's ``<article>`` element.

    Returns:
        The non-empty texts of all ``<p>`` and ``<h2>`` elements inside the
        article's ``article-body-content`` containers, joined by single
        spaces. ``None`` is returned when the article or its content cannot
        be found, or on any error.
    """
    try:
        driver.get(article_url)
        time.sleep(2)  # Wait for the page to load

        # Locate the correct article by matching data-article-id.
        # find_elements returns an empty list when nothing matches, whereas
        # find_element raises NoSuchElementException, so a falsy check on its
        # result could never trigger.
        article_selector = f'article[data-article-id="{article_id}"]'
        matches = driver.find_elements(By.CSS_SELECTOR, article_selector)
        if not matches:
            print(f"Article with id {article_id} not found on {article_url}")
            return None
        article_element = matches[0]

        # Locate all content containers with relevant classes
        content_containers = article_element.find_elements(By.XPATH, ".//div[contains(@class, 'article-body-content')]")
        if not content_containers:
            print(f"No content containers found for article {article_id} on {article_url}")
            return None

        # Extract <p> and <h2> elements within each container
        content = []
        for container in content_containers:
            for element in container.find_elements(By.XPATH, ".//p | .//h2"):
                # Read .text once per element; each access queries the browser
                text = element.text.strip()
                if text:
                    content.append(text)

        return " ".join(content).strip() if content else None
    except Exception as e:
        # Deliberately broad: one failing page is reported and skipped rather
        # than aborting the whole run.
        print(f"Failed to fetch content for {article_url}: {e}")
        return None

def update_csv_with_content(driver, file_path):
    """Fill the ``content`` column of a section CSV in place using Selenium.

    Rows that already have content are skipped. Every other row is fetched
    through ``fetch_article_content`` using its ``url`` and ``id`` columns.

    Args:
        driver: Selenium WebDriver forwarded to ``fetch_article_content``.
        file_path: Path of the CSV file to read and overwrite.
    """
    # Imported locally because os is not among this section's imports
    import os

    # Check that the file really exists; testing only for an empty path let
    # a missing file fall through to pd.read_csv and raise there.
    if not file_path or not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        return

    # Read the existing CSV
    df = pd.read_csv(file_path)

    if "content" not in df.columns:
        df["content"] = None
    # A column that is entirely empty in the CSV is read back as float64;
    # cast to object so that string values can be assigned into it.
    df["content"] = df["content"].astype(object)

    try:
        for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Updating {file_path}"):
            if not pd.isnull(row["content"]):
                continue  # Skip articles that already have content
            content = fetch_article_content(driver, row["url"], str(row["id"]))
            if content:
                df.at[index, "content"] = content
    finally:
        # Save even when the loop is interrupted or fails, so that already
        # fetched articles are not lost.
        df.to_csv(file_path, index=False)
        print(f"Updated file saved to {file_path}")


if __name__ == "__main__":
    # Cookies come from a JSON export on disk
    cookies_file = "www.postimees.ee_cookies.json"
    cookies = load_cookies(cookies_file)

    # Start the browser; it is always shut down in the finally clause below
    driver = setup_driver()

    try:
        # Add the cookies to the browser after opening the base URL
        base_url = "https://www.postimees.ee/"
        add_cookies_to_driver(driver, cookies, base_url)

        # Sections whose CSV files should be filled in
        sections = ["koroona"]
        base_path = "postimees_sections"

        for section_name in sections:
            csv_file = f"{base_path}/{section_name}.csv"
            print(f"Processing {csv_file}...")
            update_csv_with_content(driver, csv_file)
    finally:
        driver.quit()

Cookie check

In [None]:
import json
import time

def check_and_refresh_cookies(cookie_file):
    """Load cookies from a JSON file and report the ones that have expired.

    A cookie counts as expired when it has an ``expirationDate`` value that
    is earlier than the current ``time.time()``. Cookies without that key are
    left alone. Nothing is refreshed yet (see the TODO below).

    Args:
        cookie_file: Path to a JSON file containing a list of cookie dicts.

    Returns:
        The list of cookie dicts exactly as loaded from the file.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding
    with open(cookie_file, "r", encoding="utf-8") as file:
        cookies = json.load(file)

    now = time.time()
    for cookie in cookies:
        expiration = cookie.get("expirationDate")
        # A missing or null expirationDate is skipped rather than compared
        if expiration is not None and now > expiration:
            print(f"Cookie {cookie['name']} has expired.")
            # TODO: Handle relogin or cookie update

    return cookies

# An empty path always raises FileNotFoundError; point at the cookie export
# used by the other sections of this notebook.
check_and_refresh_cookies("www.postimees.ee_cookies.json")