In [1]:
import json

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

import time
from bs4 import BeautifulSoup

import locale
from datetime import datetime

In [None]:
def scrap_articles(soup):

    articles = soup.find_all('div', class_='post__text')
    results = []
    for i, article in enumerate(articles):
        try:
            # Check if title starts with "Réservé aux abonnés"
            title = article.find('h2', class_='post__title').text.strip() if article.find('h2', class_='post__title') else "No title"
            reserved = title.startswith("Réservé aux abonnés\n\n\n")

            # Replace the string if present
            title = title.replace("Réservé aux abonnés\n\n\n", "")

            link = f"https://www.letemps.ch/{article.find('a')['href']}" if article.find('a') else "No link"
            abstract = article.find('div', class_='post__lead').text.strip() if article.find('div', class_='post__lead') else "No abstract"
            author = article.find('div', class_="post__author").text.strip() if article.find('div', class_="post__author") else "No author"
            date = article.find('time', class_="post__publication-date").text if article.find('time', class_="post__publication-date") else "No date"

            # Append the result including the new boolean variable
            results.append({
                "title": title,
                "link": link,
                "abstract": abstract,
                "author": author,
                "date": date,
                "reserved": reserved  # Boolean for "Réservé aux abonnés"
            })
        except AttributeError as e:
            print(f"Error in article {i}: {e}")
            print(f"Article content: {article}")

    return results

def convert_timestamp(date_string):
    # Set the locale to French
    locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')  # For Unix-like systems

    # Split the string to extract the two timestamps
    parts = date_string.split(". / Modifié le ")
    date_created_str = parts[0].replace("Publié le ", "").strip()
    date_modified_str = parts[1].strip() if len(parts) > 1 else None

    # Convert the "date_created" string to a datetime object
    date_created = datetime.strptime(date_created_str, '%d %B %Y à %H:%M')

    # Convert the "date_modified" string to a datetime object if it exists
    if date_modified_str:
        date_modified = datetime.strptime(date_modified_str, '%d %B %Y à %H:%M')
    else:
        date_modified = None

    return {
        "date_created": date_created.isoformat(),
        "date_modified": date_modified.isoformat() if date_modified else None
    }

def paginate():
    try:
        # Wait for the 'Next' button (which contains the span text) to be clickable
        time.sleep(2)
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//span[text()='Articles suivants']"))
        )
        # Click the 'Next' button
        next_button.click()

        # Wait for the new page to load (check staleness of the previous button)
        WebDriverWait(driver, 10).until(
            EC.staleness_of(next_button)
        )

    except Exception as e:
        print(f"Error during pagination: {e}")


def run_scraping(topic):

    scraped_articles = []

    url = f"https://www.letemps.ch/articles?query={topic}&button="

    driver.get(url)

    while True:
        try:

            time.sleep(3)
            soup = BeautifulSoup(driver.page_source)

            articles = scrap_articles(soup)
            scraped_articles.extend(articles)

            # save articles
            with open(f"../data/scraping/letemps_{topic}.json", "w", encoding="utf-8") as fp:
                json.dump(scraped_articles, fp, ensure_ascii=False)

            paginate()


        except Exception as e:
            print("No more pages or error occurred:", e)
            driver.quit()
            break


In [3]:
driver = webdriver.Firefox()

In [4]:
topic = "palestine"

run_scraping(topic)

Error during pagination: Message: 

Error during pagination: Message: 

Error during pagination: Message: 

Error during pagination: Message: 

Error during pagination: Message: 

Error during pagination: Message: 

Error during pagination: Message: 



KeyboardInterrupt: 

In [None]:
url = f"https://www.letemps.ch/?s={topic}"

driver.get(url)

soup = BeautifulSoup(driver.page_source)

articles = soup.find_all('div', class_='post__text')

results = []
for i, article in enumerate(articles):
    try:
        # Check if title starts with "Réservé aux abonnés"
        title = article.find('h2', class_='post__title').text.strip() if article.find('h2', class_='post__title') else "No title"
        reserved = title.startswith("Réservé aux abonnés\n\n\n")

        # Replace the string if present
        title = title.replace("Réservé aux abonnés\n\n\n", "")

        link = f"https://www.letemps.ch/{article.find('a')['href']}" if article.find('a') else "No link"
        abstract = article.find('div', class_='post__lead').text.strip() if article.find('div', class_='post__lead') else "No abstract"
        author = article.find('div', class_="post__author").text.strip() if article.find('div', class_="post__author") else "No author"
        date = article.find('time', class_="post__publication-date").text if article.find('time', class_="post__publication-date") else "No date"

        # Append the result including the new boolean variable
        results.append({
            "title": title,
            "link": link,
            "abstract": abstract,
            "author": author,
            "date": date,
            "reserved": reserved  # Boolean for "Réservé aux abonnés"
        })
    except AttributeError as e:
        print(f"Error in article {i}: {e}")
        print(f"Article content: {article}")

# Output the results
results


[{'title': 'Malgré les soupçons, la Croix-Rouge russe est la bienvenue à Genève',
  'link': 'https://www.letemps.ch//monde/malgre-les-soupcons-la-croix-rouge-russe-est-la-bienvenue-a-geneve',
  'abstract': 'Un rapport interne de la Fédération internationale des sociétés de la Croix-Rouge et du Croissant-Rouge épargne la branche russe. Des révélations de la presse et l’Ukraine l’accusent pourtant de participer au transfert illégal d’enfants ukrainiens',
  'author': 'Frédéric Koller',
  'date': 'Publié le 27 septembre 2024 à 15:55. / Modifié le 27 septembre 2024 à 16:19.',
  'reserved': True},
 {'title': 'Au Japon, Shigeru Ishiba prend la tête du Parti libéral-démocrate au pouvoir et sera le prochain premier ministre',
  'link': 'https://www.letemps.ch//monde/asie-oceanie/shigeru-ishiba-prend-la-tete-de-son-parti-et-sera-le-prochain-premier-ministre-du-japon',
  'abstract': 'Sa cinquième tentative a été la bonne. Longtemps considéré comme l’éternel numéro deux de son parti, l’homme de 67