In [1]:
import threading
import requests
from bs4 import BeautifulSoup
from queue import Queue
import time
import csv
import urllib.robotparser

# --- Crawl settings -------------------------------------------------------
NUM_THREADS = 5
output_file = 'crawl_results.csv'
urls_to_crawl = [
    'https://www.youtube.com',
    'https://www.wikipedia.org',
    'https://www.reddit.com',
]

# Work queue shared between main() and the worker threads.
url_queue = Queue()

# Start from a fresh CSV that contains only the header row; crawl() appends
# one row per successfully fetched page.
with open(output_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(('URL', 'Title'))

def can_fetch(url):
    """Return True if robots.txt permits the generic user agent ('*') to fetch ``url``.

    The robots.txt location is derived from the URL's scheme and host only, so
    a page such as ``https://host/a/b?x=1`` is checked against
    ``https://host/robots.txt`` instead of ``https://host/a/b?x=1/robots.txt``.

    Args:
        url: Absolute URL of the page that is about to be requested.

    Returns:
        bool: The parser's verdict for ``url``. False is also returned (fail
        closed) when ``url`` is not absolute or when robots.txt cannot be
        read, so an error here never escapes into the calling worker thread.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    if not parts.scheme or not parts.netloc:
        print(f'Error reading robots.txt for {url}: not an absolute URL')
        return False

    # robots.txt always lives at the root of the host.
    robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
    try:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch('*', url)
    except Exception as e:
        # Deliberately broad: network, HTTP and decoding failures should all
        # result in "do not crawl" rather than an unhandled exception.
        print(f'Error reading robots.txt for {url}: {e}')
        return False

# One lock shared by every worker thread. Creating a new Lock() inside the
# function would give each call its own lock and serialize nothing.
_CSV_LOCK = threading.Lock()


def crawl(url):
    """Fetch ``url``, record its page title in the CSV, and print it.

    The page is skipped when ``can_fetch`` returns False. HTTP and network
    errors raised by ``requests`` are caught and printed.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        None. One ``[url, title]`` row is appended to ``output_file`` on
        success.
    """
    if not can_fetch(url):
        print(f'Crawling disallowed by robots.txt for {url}')
        return

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # <title> may be missing, empty, or have a .string of None;
        # all of these fall back to the 'No title' placeholder.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        title = title_text.strip() if title_text else ''
        if not title:
            title = 'No title'

        # Serialize appends so rows from different threads cannot interleave.
        with _CSV_LOCK:
            with open(output_file, mode='a', newline='', encoding='utf-8') as file:
                csv.writer(file).writerow([url, title])
        print(f'Title of {url}: {title}')
    except requests.RequestException as e:
        print(f'Error crawling {url}: {e}')

def worker():
    """Thread target: crawl URLs taken from ``url_queue`` until a ``None`` sentinel arrives.

    Every item taken from the queue, including the sentinel, is acknowledged
    with ``task_done()`` in a ``finally`` clause. An unexpected exception from
    ``crawl`` is printed instead of killing the thread; otherwise the item
    would never be acknowledged and ``url_queue.join()`` would block forever.
    """
    while True:
        url = url_queue.get()
        try:
            if url is None:
                break
            crawl(url)
            time.sleep(1)  # One-second pause between requests
        except Exception as exc:
            print(f'Unexpected error while processing {url}: {exc}')
        finally:
            url_queue.task_done()

def main():
    """Run the crawl on a pool of worker threads and print the elapsed wall-clock time."""
    started_at = time.time()

    # Create the pool, then start it; idle workers block on the empty queue.
    pool = [threading.Thread(target=worker) for _ in range(NUM_THREADS)]
    for thread in pool:
        thread.start()

    for seed_url in urls_to_crawl:
        url_queue.put(seed_url)

    # Wait until every queued URL has been acknowledged by a worker.
    url_queue.join()

    # One None sentinel per thread makes each worker loop exit.
    for _ in pool:
        url_queue.put(None)
    for thread in pool:
        thread.join()

    elapsed_time = time.time() - started_at
    print(f'Total execution time: {elapsed_time:.2f} seconds')

# Entry point: start the crawl when this code runs as the main module
# (which includes being executed as a notebook cell).
if __name__ == "__main__":
    main()


Title of https://www.wikipedia.org: Wikipedia
Title of https://www.youtube.com: YouTube
Title of https://www.reddit.com: Reddit - Dive into anything
Total execution time: 2.50 seconds


In [2]:
import threading
import requests
from bs4 import BeautifulSoup
from queue import Queue
import time
import csv
import urllib.robotparser

# --- Crawl settings -------------------------------------------------------
NUM_THREADS = 5
MAX_URLS = 50  # Caps the number of URLs to crawl
output_file = 'crawl_results.csv'
initial_urls = [
    'https://www.wikipedia.org',
    'https://www.reddit.com',
    'https://www.youtube.com',
]

# Shared crawl state: pending work and the set of URLs already scheduled.
url_queue = Queue()
visited_urls = set()

# Start from a fresh CSV that contains only the header row; crawl() appends
# one row per successfully fetched page.
with open(output_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(('URL', 'Title'))

def can_fetch(url):
    """Return True if robots.txt permits the generic user agent ('*') to fetch ``url``.

    The robots.txt location is derived from the URL's scheme and host only.
    Followed links usually carry a path or query string, and appending
    ``/robots.txt`` to the full URL would request a page that is not the
    site's robots file, so the real rules would never be consulted.

    Args:
        url: Absolute URL of the page that is about to be requested.

    Returns:
        bool: The parser's verdict for ``url``. False is also returned (fail
        closed) when ``url`` is not absolute or when robots.txt cannot be read.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(url)
    if not parts.scheme or not parts.netloc:
        print(f'Error reading robots.txt for {url}: not an absolute URL')
        return False

    # robots.txt always lives at the root of the host.
    robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
    try:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch('*', url)
    except Exception as e:
        # Deliberately broad: network, HTTP and decoding failures should all
        # result in "do not crawl" rather than an unhandled exception.
        print(f'Error reading robots.txt for {url}: {e}')
        return False

# Locks shared by every worker thread. Creating a new Lock() inside the
# function would give each call its own lock and serialize nothing.
_CSV_LOCK = threading.Lock()
_VISITED_LOCK = threading.Lock()


def crawl(url):
    """Fetch ``url``, record its title in the CSV, and queue the absolute links it contains.

    The page is skipped when ``can_fetch`` returns False. HTTP and network
    errors raised by ``requests`` are caught and printed.

    Link discovery only follows ``href`` values starting with ``'http'``
    (absolute http/https links, on any host). A link is queued only if it is
    not already in ``visited_urls`` and the set holds fewer than ``MAX_URLS``
    entries.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        None. One ``[url, title]`` row is appended to ``output_file`` on
        success; newly discovered links are added to ``visited_urls`` and
        ``url_queue``.
    """
    if not can_fetch(url):
        print(f'Crawling disallowed by robots.txt for {url}')
        return

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # <title> may be missing, empty, or have a .string of None;
        # all of these fall back to the 'No title' placeholder.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        title = title_text.strip() if title_text else ''
        if not title:
            title = 'No title'

        # Serialize appends so rows from different threads cannot interleave.
        with _CSV_LOCK:
            with open(output_file, mode='a', newline='', encoding='utf-8') as file:
                csv.writer(file).writerow([url, title])
        print(f'Title of {url}: {title}')

        # Find and queue absolute http(s) links found on the page.
        for link in soup.find_all('a', href=True):
            href = link['href']
            if not href.startswith('http'):
                continue
            # Check and add under one lock so two threads cannot both accept
            # the same link or push the set past MAX_URLS.
            with _VISITED_LOCK:
                if href in visited_urls or len(visited_urls) >= MAX_URLS:
                    continue
                visited_urls.add(href)
            url_queue.put(href)
    except requests.RequestException as e:
        print(f'Error crawling {url}: {e}')

def worker():
    """Thread target: crawl URLs taken from ``url_queue`` until a ``None`` sentinel arrives.

    Every item taken from the queue, including the sentinel, is acknowledged
    with ``task_done()`` in a ``finally`` clause. An unexpected exception from
    ``crawl`` is printed instead of killing the thread; otherwise the item
    would never be acknowledged and ``url_queue.join()`` would block forever.
    """
    while True:
        url = url_queue.get()
        try:
            if url is None:
                break
            crawl(url)
            # Fixed half-second pause between requests made by this thread.
            time.sleep(0.5)
        except Exception as exc:
            print(f'Unexpected error while processing {url}: {exc}')
        finally:
            url_queue.task_done()

def main():
    """Run the link-following crawl on a pool of worker threads and print the elapsed time.

    Seed URLs are added to ``visited_urls`` before any of them is enqueued.
    Workers start consuming as soon as the first URL is put on the queue, so
    marking a seed only after enqueuing it would let a worker re-queue that
    seed when it appears as a link on another page.
    """
    start_time = time.time()

    threads = []
    for _ in range(NUM_THREADS):
        t = threading.Thread(target=worker)
        t.start()
        threads.append(t)

    # Mark every seed as seen first; workers are still idle on the empty queue.
    visited_urls.update(initial_urls)
    for url in initial_urls:
        url_queue.put(url)

    # Wait until every queued URL, including discovered links, is acknowledged.
    url_queue.join()

    # One None sentinel per thread makes each worker loop exit.
    for _ in range(NUM_THREADS):
        url_queue.put(None)
    for t in threads:
        t.join()

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'Total execution time: {elapsed_time:.2f} seconds')

# Entry point: start the crawl when this code runs as the main module
# (which includes being executed as a notebook cell).
if __name__ == "__main__":
    main()


Title of https://www.wikipedia.org: Wikipedia
Title of https://www.reddit.com: Reddit - Dive into anything
Title of https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications: List of Wikipedia mobile applications - Wikipedia
Crawling disallowed by robots.txt for https://creativecommons.org/licenses/by-sa/4.0/
Title of https://meta.wikimedia.org/wiki/Special:MyLanguage/List_of_Wikipedias: List of Wikipedias - Meta
Title of https://meta.wikimedia.org/wiki/Privacy_policy: Privacy policy - Meta
Title of https://meta.wikimedia.org/wiki/Terms_of_use: Terms of use - Meta
Title of https://donate.wikimedia.org/?utm_medium=portal&utm_campaign=portalFooter&utm_source=portalFooter: Make your donation now - Wikimedia Foundation
Title of https://play.google.com/store/apps/details?id=org.wikipedia&referrer=utm_source%3Dportal%26utm_medium%3Dbutton%26anid%3Dadmob: Wikipedia - Apps on Google Play
Title of https://accounts.reddit.com/adsregister?utm_source=web3x_consumer&utm_name=user_menu_cta