In [5]:
import requests
import json
import time
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
from selenium import webdriver
from selenium.webdriver.common.by import By


In [6]:
def read_urls(file_path):
    with open(file_path, 'r') as file:
        return [line.strip() for line in file if line.strip()]

In [7]:
def scrape_article(url):
    """
    Cette fonction extrait des informations d'un article web à partir de l'URL fournie.
    
    Arguments:
    url (str): L'URL de l'article web à scraper.
    
    Retourne:
    dict: Un dictionnaire contenant l'URL, le titre, le thème, le contenu et les topics de l'article.
    None: Si une erreur se produit lors du scraping.
    
    Le dictionnaire retourné a la structure suivante:
    {
        "url": str,    # L'URL de l'article
        "title": str,  # Le titre de l'article
        "theme": str,  # Le thème de l'article
        "content": str,  # Le contenu complet de l'article
        "topic": list  # Une liste de topics associés à l'article
    }
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        
        soup = BeautifulSoup(response.text, 'html.parser')
        # titre
        title = soup.find('h1', class_="content__title")
        title = title.get_text(strip=True)
        
        # contenu
        article_content = soup.find_all('p')
        content = '\n'.join([p.get_text(strip=True) for p in article_content])

        # thème
        theme = soup.find('a', class_="is-topic")
        theme = theme.get_text(strip=True)

        # topic
        tags_ul = soup.find('ul', class_='content__tags')
        tags = []
        if tags_ul:
            for li in tags_ul.find_all('li'):
                # trouver  <a> dans <li>
                a_tag = li.find('a')
                if a_tag:
                    tags.append(a_tag.get_text(strip=True))
  
        return {"url": url, "title": title, "theme": theme, "content": content, "topic": tags}
        
    except Exception as e:
        print(f"Failed to scrape {url}: {str(e)}")
        return None

In [12]:
def main():
    file_path = '../src/urls_en.txt'
    urls = read_urls(file_path)
    articles = []
    for url in urls:
        article = scrape_article(url)
        if article:
            articles.append(article)
        time.sleep(0.1)  
    with open('../data/raw/scraped_data.json', 'w', encoding='utf-8') as json_file:
        json.dump(articles, json_file, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    main()

Failed to scrape https://www.cbsnews.com/video/how-ai-powered-robots-are-helping-small-farms-fight-labor-shortages/: 'NoneType' object has no attribute 'get_text'
