In [1]:
import threading
import requests
from bs4 import BeautifulSoup
from queue import Queue
import time
import csv
import urllib.robotparser

# Crawler settings: worker-pool size, CSV report path, and the seed pages.
NUM_THREADS = 5
output_file = 'crawl_results.csv'
urls_to_crawl = ['https://www.youtube.com', 'https://www.wikipedia.org', 'https://www.reddit.com']

# Queue the worker threads pull URLs from.
url_queue = Queue()

# Create (or truncate) the CSV report and write its header row.
with open(output_file, 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([['URL', 'Title']])

def can_fetch(url):
    """Report whether the site's robots.txt lets a generic crawler fetch `url`.

    The robots.txt location is derived from the scheme and host of `url`
    (``<scheme>://<host>/robots.txt``). Appending ``/robots.txt`` to the full
    URL would request the wrong document whenever `url` has a path or a
    query string.

    Args:
        url: Absolute URL of the page about to be requested.

    Returns:
        True when the rules for user agent ``*`` allow `url`; False when they
        forbid it or when robots.txt could not be read.
    """
    from urllib.parse import urlsplit, urlunsplit

    try:
        parts = urlsplit(url)
        robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch('*', url)
    except Exception as e:
        # Fail closed: an unreadable robots.txt must not crash the calling
        # worker thread, and the page is treated as off-limits.
        print(f'Error reading robots.txt for {url}: {e}')
        return False

# One lock shared by every thread that calls crawl(); it serializes appends to
# the CSV report. (A lock created inside the function would be private to each
# call and would protect nothing.)
_csv_write_lock = threading.Lock()


def crawl(url):
    """Fetch `url`, extract its <title>, and append the pair to the CSV report.

    The page is skipped when can_fetch() rejects it. Request failures
    (connection errors, timeouts, HTTP error statuses) are reported on stdout
    and otherwise ignored.

    Args:
        url: Absolute URL of the page to download.
    """
    if not can_fetch(url):
        print(f'Crawling disallowed by robots.txt for {url}')
        return

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # .string is None when <title> is empty or wraps more than one node;
        # fall back to the same placeholder used for a missing <title>.
        title = soup.title.string if soup.title and soup.title.string else 'No title'
        with _csv_write_lock:
            with open(output_file, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerow([url, title])
        print(f'Title of {url}: {title}')
    except requests.RequestException as e:
        print(f'Error crawling {url}: {e}')

def worker():
    """Thread body: crawl URLs taken from `url_queue` until a None sentinel arrives.

    Every item taken from the queue is acknowledged with task_done(), even
    when crawl() raises, so that url_queue.join() cannot block forever on a
    failed URL. An unexpected exception is reported and the worker moves on
    to the next URL instead of terminating.
    """
    while True:
        url = url_queue.get()
        if url is None:
            url_queue.task_done()
            break
        try:
            crawl(url)
            time.sleep(1)  # one-second pause between requests
        except Exception as exc:
            print(f'Unexpected error while crawling {url}: {exc}')
        finally:
            url_queue.task_done()

# Runs the whole crawl and reports how long it took.
def main():
    """Start the worker pool, crawl every seed URL, and print the elapsed time."""
    began = time.time()

    # Build the pool first, then launch every thread.
    pool = [threading.Thread(target=worker) for _ in range(NUM_THREADS)]
    for thread in pool:
        thread.start()

    for seed in urls_to_crawl:
        url_queue.put(seed)

    # Block until every queued URL has been acknowledged by a worker.
    url_queue.join()

    # One None sentinel per thread makes each worker leave its loop.
    for _ in pool:
        url_queue.put(None)
    for thread in pool:
        thread.join()

    elapsed_time = time.time() - began
    print(f'Total execution time: {elapsed_time:.2f} seconds')

# Entry point: run the crawl when this code is executed directly.
if '__main__' == __name__:
    main()


Title of https://www.wikipedia.org: Wikipedia
Title of https://www.youtube.com: YouTube
Title of https://www.reddit.com: Reddit - Dive into anything
Total execution time: 2.50 seconds


In [10]:
import threading
import requests
from bs4 import BeautifulSoup
from queue import Queue
import time
import csv
import urllib.robotparser

# Crawler settings: worker-pool size, crawl budget, and the CSV report path.
NUM_THREADS = 5
MAX_URLS = 150  # raised to crawl a larger number of URLs
output_file = 'crawl_results.csv'

# Seed pages the crawl starts from.
initial_urls = [
    'https://www.wikipedia.org', 'https://www.reddit.com',
    'https://www.bbc.com', 'https://www.cnn.com',
    'https://www.github.com', 'https://www.stackoverflow.com',
    'https://www.medium.com', 'https://www.quora.com',
    'https://www.nytimes.com', 'https://www.theguardian.com',
]

# Shared crawl state: the queue workers pull from and the URLs already queued.
url_queue = Queue()
visited_urls = set()

# Create (or truncate) the CSV report and write its header row.
with open(output_file, 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([['URL', 'Title']])

def can_fetch(url):
    """Report whether the site's robots.txt lets a generic crawler fetch `url`.

    The robots.txt location is derived from the scheme and host of `url`
    (``<scheme>://<host>/robots.txt``). Links discovered while crawling carry
    paths and query strings, so appending ``/robots.txt`` to the full URL
    would request a document that is not the site's robots.txt.

    Args:
        url: Absolute URL of the page about to be requested.

    Returns:
        True when the rules for user agent ``*`` allow `url`; False when they
        forbid it or when robots.txt could not be read.
    """
    from urllib.parse import urlsplit, urlunsplit

    try:
        parts = urlsplit(url)
        robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch('*', url)
    except Exception as e:
        # Fail closed: an unreadable robots.txt means the page is skipped.
        print(f'Error reading robots.txt for {url}: {e}')
        return False

# Locks shared by every thread that calls crawl(): one serializes appends to
# the CSV report, the other makes the check-then-add on `visited_urls` atomic.
# (A lock created inside the function would be private to each call and would
# protect nothing.)
_csv_write_lock = threading.Lock()
_visited_lock = threading.Lock()


def crawl(url):
    """Fetch `url`, record its <title> in the CSV report, and queue its links.

    The page is skipped when can_fetch() rejects it. Every absolute http(s)
    link found on the page, whether same-site or not, is added to `url_queue`
    unless it was queued before or `visited_urls` has already reached
    MAX_URLS entries. Request failures (connection errors, timeouts, HTTP
    error statuses) are reported on stdout and otherwise ignored.

    Args:
        url: Absolute URL of the page to download.
    """
    if not can_fetch(url):
        print(f'Crawling disallowed by robots.txt for {url}')
        return

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # .string is None when <title> is empty or wraps more than one node;
        # fall back to the same placeholder used for a missing <title>.
        title = soup.title.string if soup.title and soup.title.string else 'No title'
        with _csv_write_lock:
            with open(output_file, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerow([url, title])
        print(f'Title of {url}: {title}')

        # Queue the absolute links found on this page.
        for link in soup.find_all('a', href=True):
            href = link['href']
            if not href.startswith(('http://', 'https://')):
                continue
            with _visited_lock:
                if len(visited_urls) >= MAX_URLS:
                    # Budget exhausted; the set never shrinks, so stop scanning.
                    break
                if href in visited_urls:
                    continue
                visited_urls.add(href)
            url_queue.put(href)
    except requests.RequestException as e:
        print(f'Error crawling {url}: {e}')

def worker():
    """Thread body: crawl URLs taken from `url_queue` until a None sentinel arrives.

    Every item taken from the queue is acknowledged with task_done(), even
    when crawl() raises, so that url_queue.join() cannot block forever on a
    failed URL. An unexpected exception is reported and the worker moves on
    to the next URL instead of terminating.
    """
    while True:
        url = url_queue.get()
        if url is None:
            url_queue.task_done()
            break
        try:
            crawl(url)
            time.sleep(1)  # delay between requests, set to one second
        except Exception as exc:
            print(f'Unexpected error while crawling {url}: {exc}')
        finally:
            url_queue.task_done()

def main():
    """Start the worker pool, crawl from the seed URLs, and print the elapsed time.

    Workers are started first, then each entry of `initial_urls` is recorded
    in `visited_urls` and queued. After the queue has drained, one None
    sentinel per thread is queued so that every worker exits, and the threads
    are joined.
    """
    start_time = time.time()

    threads = []
    for _ in range(NUM_THREADS):
        t = threading.Thread(target=worker)
        t.start()
        threads.append(t)

    for url in initial_urls:
        # Record the seed before queueing it: the workers are already running,
        # and a page crawled in between could otherwise re-queue this URL.
        visited_urls.add(url)
        url_queue.put(url)

    # Block until every queued URL has been acknowledged by a worker.
    url_queue.join()

    for _ in range(NUM_THREADS):
        url_queue.put(None)
    for t in threads:
        t.join()

    end_time = time.time()
    elapsed_time = end_time - start_time

    minutes, seconds = divmod(elapsed_time, 60)
    print(f'Total execution time: {int(minutes)} minutes and {seconds:.2f} seconds')

# Entry point: run the crawl when this code is executed directly.
if '__main__' == __name__:
    main()


Title of https://www.bbc.com: BBC Home - Breaking News, World News, US News, Sports, Business, Innovation, Climate, Culture, Travel, Video & Audio
Title of https://www.wikipedia.org: Wikipedia
Title of https://www.reddit.com: Reddit - Dive into anything
Title of https://www.cnn.com: Breaking News, Latest News and Videos | CNNCrawling disallowed by robots.txt for https://www.stackoverflow.com

Title of https://www.github.com: GitHub: Let’s build from here · GitHub
Crawling disallowed by robots.txt for https://www.medium.com
Crawling disallowed by robots.txt for https://www.quora.com
Title of https://www.nytimes.com: The New York Times - Breaking News, US News, World News and Videos
Title of https://www.bbc.co.uk/sounds: BBC Sounds - Music. Radio. Podcasts
Title of https://www.bbc.com/weather: BBC Weather - Home
Title of https://www.bbc.com/news/live/world-us-canada-69069142: Trump trial live: Former president rages at verdict in historic case - BBC News
Title of https://www.theguardian.

Crawling disallowed by robots.txt for https://apnews.com/article/trump-trial-deliberations-jury-testimony-verdict-85558c6d08efb434d05b694364470aa0
Title of https://www.foxnews.com/politics/what-happens-trump-convicted-legal-experts-break-down: What happens after Trump's conviction? Legal experts break it down | Fox News
Crawling disallowed by robots.txt for https://www.politico.com/news/2024/05/30/donald-trump-guilty-hush-money-trial-00160460
Title of https://www.theguardian.com/us-news/article/2024/may/30/trump-trial-hush-money-verdict?CMP=Share_iOSApp_Other: Donald Trump found guilty of hush-money plot to influence 2016 election | Donald Trump trials | The Guardian
Crawling disallowed by robots.txt for https://www.newsweek.com/live-updates-jury-begins-day-2-deliberations-trump-hush-money-trial-1906289
Crawling disallowed by robots.txt for https://apnews.com/live/trump-trial-jury-updates-day-2
Crawling disallowed by robots.txt for https://www.vanityfair.com/news/story/donald-trump-tri

Title of https://www.cbsnews.com/newyork/news/biden-campaign-trump-trial-verdict/?intcid=CNR-01-0623: 
    Biden campaign warns: "Convicted felon or not," Trump could still be president - CBS New York
Error crawling https://www.washingtonpost.com/politics/2024/05/30/trump-guilty-what-happens-next/?utm_campaign=wp_main&utm_medium=social&utm_source=reddit.com: HTTPSConnectionPool(host='www.washingtonpost.com', port=443): Read timed out. (read timeout=10)
Title of https://www.rollingstone.com/politics/politics-news/trump-conviction-extremists-calls-violence-1235030317/: Violence After Trump Guilty Verdict? Extremists Fantasize of War 
Title of https://www.bostonglobe.com/2024/05/30/metro/trump-verdict-reactions/?camp=bg:brief:rss:MSN&rss_id=MSN_rss_brief: Trump verdict: Politicians, notables react to guilty verdict in Trump hush-money case
Crawling disallowed by robots.txt for https://www.nytimes.com/2024/05/30/nyregion/trump-prison-secret-service.html
Crawling disallowed by robots.txt fo