In [33]:

import json

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

import time
from bs4 import BeautifulSoup

import locale
from datetime import datetime

import tqdm

In [8]:
def scrap_articles(soup):
    
    articles = soup.find_all('div', class_='post__text')
    results = []
    for i, article in enumerate(articles):
        try:
            # Check if title starts with "Réservé aux abonnés"
            title = article.find('h2', class_='post__title').text.strip() if article.find('h2', class_='post__title') else "No title"
            reserved = title.startswith("Réservé aux abonnés\n\n\n")
            
            # Replace the string if present
            title = title.replace("Réservé aux abonnés\n\n\n", "")
            
            link = f"https://www.letemps.ch/{article.find('a')['href']}" if article.find('a') else "No link"
            abstract = article.find('div', class_='post__lead').text.strip() if article.find('div', class_='post__lead') else "No abstract"
            author = article.find('div', class_="post__author").text.strip() if article.find('div', class_="post__author") else "No author"
            date = article.find('time', class_="post__publication-date").text if article.find('time', class_="post__publication-date") else "No date"
            
            # Append the result including the new boolean variable
            results.append({
                "title": title,
                "link": link,
                "abstract": abstract,
                "author": author,
                "date": date,
                "reserved": reserved  # Boolean for "Réservé aux abonnés"
            })
        except AttributeError as e:
            print(f"Error in article {i}: {e}")
            print(f"Article content: {article}")

    return results

def convert_timestamp(date_string):
    # Set the locale to French
    locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')  # For Unix-like systems

    # Split the string to extract the two timestamps
    parts = date_string.split(". / Modifié le ")
    date_created_str = parts[0].replace("Publié le ", "").strip()
    date_modified_str = parts[1].strip() if len(parts) > 1 else None

    # Convert the "date_created" string to a datetime object
    date_created = datetime.strptime(date_created_str, '%d %B %Y à %H:%M')

    # Convert the "date_modified" string to a datetime object if it exists
    if date_modified_str:
        date_modified = datetime.strptime(date_modified_str, '%d %B %Y à %H:%M')
    else:
        date_modified = None

    return {
        "date_created": date_created.isoformat(),
        "date_modified": date_modified.isoformat() if date_modified else None
    }

def paginate():
    try:
        # Wait for the 'Next' button (which contains the span text) to be clickable
        time.sleep(2)
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//span[text()='Articles suivants']"))
        )
        # Click the 'Next' button
        next_button.click()

        # Wait for the new page to load (check staleness of the previous button)
        WebDriverWait(driver, 10).until(
            EC.staleness_of(next_button)
        )
        
    except Exception as e:
        print(f"Error during pagination: {e}")
        

def run_scraping(topic):
    
    scraped_articles = []

    url = f"https://www.letemps.ch/articles?query={topic}&button="

    driver.get(url)

    while True:
        try:

            time.sleep(3)
            soup = BeautifulSoup(driver.page_source)
            
            articles = scrap_articles(soup)
            scraped_articles.extend(articles)

            # save articles
            with open(f"../data/scraping/letemps_{topic}.json", "w", encoding="utf-8") as fp:
                json.dump(scraped_articles, fp, ensure_ascii=False)
            
            paginate()
                
            
        except Exception as e:
            print("No more pages or error occurred:", e)
            driver.quit()
            break


In [9]:
driver = webdriver.Firefox()

# Run the scraping of article references for topic "palestine"


In [11]:
topic = "palestine"

run_scraping(topic)

Error during pagination: Message: Element <span> could not be scrolled into view
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
ElementNotInteractableError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:353:5
webdriverClickElement@chrome://remote/content/marionette/interaction.sys.mjs:167:11
interaction.clickElement@chrome://remote/content/marionette/interaction.sys.mjs:136:11
clickElement@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:205:29
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:85:31

Error during pagination: Message: Element <span> could not be scrolled into view
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
ElementNotInteractableError@chrome://remote/content/shared/webdriver/Errors

KeyboardInterrupt: 

# Scrap individual articles (only not reserved)

In [None]:
# with open('./lecourrier_cookies.json', 'r') as file:
#     cookies = json.load(file)

In [None]:
# def authenticate(url, cookies):

#     driver.get(url)
    
#     for cookie in cookies:
#         driver.add_cookie(cookie)

# def scrap_article(soup):

#     return soup.find_all("article")[0].text

In [12]:
with open('../data/scraping/letemps_palestine.json', 'r', encoding="utf-8") as file:
    article_list = json.load(file)

In [57]:
#filter out free articles (around 1/5)
free_article_list = [article for article in article_list if article['reserved'] == False]
print(len(article_list)),
print(len(free_article_list))


2600
522


In [71]:
# for some reasons, some articles are duplicated
free_article_list_unique = list({v['link']:v for v in free_article_list}.values())
len(free_article_list_unique)

170

In [None]:
#driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))
driver = webdriver.Firefox()

#authenticate("https://lecourrier.ch/")

The geckodriver version (0.30.0) detected in PATH at /usr/local/bin/geckodriver might not be compatible with the detected firefox version (130.0.1); currently, geckodriver 0.35.0 is recommended for firefox 130.*, so it is advised to delete the driver in PATH and retry


In [72]:
scraped_articles = []

for article in tqdm.tqdm(free_article_list_unique):

    time.sleep(2)
    
    url = article["link"]
    driver.get(url)    
    soup = BeautifulSoup(html_content, 'html.parser')
    
    html_content = driver.page_source

    soup = BeautifulSoup(html_content, 'html.parser')

    full_html = str(soup.prettify())

    scraped_articles.append({article["link"]: full_html})

    # save articles
    with open(f"../data/scraping/letemps_palestine_articles_full_html_1.json", "w", encoding="utf-8") as fp:
        json.dump(scraped_articles, fp, ensure_ascii=False)

 61%|██████    | 103/170 [46:17<30:06, 26.97s/it]


TimeoutException: Message: Navigation timed out after 300000 ms
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
TimeoutError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:740:5
bail@chrome://remote/content/marionette/sync.sys.mjs:211:19
