In [1]:
import os
import re
import time
import urllib.robotparser
from urllib.parse import urljoin
from urllib.parse import urlparse

import psycopg2
import requests
from bs4 import BeautifulSoup
from langdetect import detect
from psycopg2 import sql
from psycopg2.pool import ThreadedConnectionPool

In [2]:
# Suffixes of links that the crawler skips: a link whose absolute URL ends
# with one of these is never queued.
ignore_extensions = (
    # images
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp',
    # video
    '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv',
    # audio
    '.mp3', '.wav', '.ogg', '.flac', '.aac',
    # documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    # archives
    '.zip', '.rar', '.7z', '.tar', '.gz',
    # binaries / disk images
    '.exe', '.bin', '.iso', '.dmg',
)

# Substrings that disqualify a link: the crawler drops any link whose absolute
# URL contains one of these anywhere (plain substring test, not a word match).
ignore_keywords = (
    'index', 'connexion', 'inscription', 'mailto', 'tel', 'xml',
    'login', 'javascript', 'logout', 'register', 'signup', 'user',
    'account', 'settings', 'preferences', 'profile', 'admin',
    'private', 'dashboard', 'terms', 'privacy', 'policy', 'license',
    'captcha', 'auth', 'subscribe', 'unsubscribe', 'download',
    'uploads', 'file', 'files', 'attachment', 'comments', 'cgv', 'faq',
    'amazon', 'shop', 'snapchat', 'tiktok', 'twitter', 'facebook',
    'instagram',
)

In [3]:
def get_connection():
    """Open a new PostgreSQL connection to the crawler database.

    Each setting is taken from the matching libpq environment variable
    (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT) when it is set, and
    falls back to the notebook's original local value otherwise, so existing
    setups keep working while credentials can be moved out of the notebook.

    Returns:
        A new psycopg2 connection. The caller is responsible for closing it.
    """
    return psycopg2.connect(
        dbname=os.environ.get('PGDATABASE', 'Google_Crawler'),
        user=os.environ.get('PGUSER', 'postgres'),
        # NOTE(review): the fallback password is kept only for backward
        # compatibility; set PGPASSWORD and drop it before sharing the notebook.
        password=os.environ.get('PGPASSWORD', '240305'),
        host=os.environ.get('PGHOST', 'localhost'),
        port=os.environ.get('PGPORT', '5432'),
    )

In [None]:
# Ask how many seed URLs will be entered, then read them one prompt at a time.
nb_url_to_start = int(input("Combien d'url voulez-vous crawler ? "))
urls = []

for i in range(nb_url_to_start):
    url = input("URL :")
    # Remove a single trailing slash, if present, before storing the URL.
    url = url[:-1] if url.endswith('/') else url
    urls.append(url)


In [None]:
# Show the seed URLs collected by the previous cell.
print(urls)

In [None]:
def crawler(urls, depth):
    """Recursively crawl French pages starting from ``urls``.

    Each URL is downloaded and its visible text extracted. When langdetect
    reports French, the page is upserted into ``pages``. Its outgoing links
    are then filtered by the page host's robots.txt, ``ignore_keywords`` and
    ``ignore_extensions``, inserted into ``to_crawl``, and crawled in turn
    with ``depth - 1``.

    Args:
        urls: iterable of absolute URLs to fetch.
        depth: number of link levels still to follow; ``0`` (or less) stops
            immediately without touching the network or the database.

    Returns:
        The string ``'Done'``.
    """
    if depth <= 0:
        return 'Done'

    # Imported after the depth guard so the guard itself has no dependencies.
    from langdetect.lang_detect_exception import LangDetectException

    # One connection per recursion level, always closed in the ``finally``.
    conn = get_connection()
    cursor = conn.cursor()
    # robots.txt parsers already downloaded during this call, keyed by site root.
    robots_cache = {}
    try:
        for url in urls:
            # Reset per page so a failure never re-crawls the previous page's links.
            list_url = []

            try:
                response = requests.get(url, timeout=10)
            except requests.RequestException as e:
                print("Erreur de connexion")
                print(e)
                continue
            if response.status_code != 200:
                print("Erreur de connexion")
                continue

            # Undecodable bytes are replaced instead of aborting the whole crawl.
            str_content = response.content.decode('utf-8', errors='replace')

            soup = BeautifulSoup(str_content, "html.parser")
            text = soup.get_text(separator=' ', strip=True)
            # Word count stored alongside the page.
            words = re.findall(r'\w+', text)

            try:
                language = detect(text)
            except LangDetectException:
                # Raised by langdetect when the text has nothing to analyse.
                language = None
            if language != 'fr':
                print("La page n'est pas en français")
                continue

            try:
                # On conflict every stored column is refreshed, so the title and
                # word count stay in sync with the content.
                query = """
                    INSERT INTO pages (url, content, search_vector, titre, nombre_mots)
                    VALUES (%s, %s, to_tsvector('french', %s), %s, %s)
                    ON CONFLICT (url) DO UPDATE
                    SET content = EXCLUDED.content,
                        search_vector = to_tsvector('french', EXCLUDED.content),
                        titre = EXCLUDED.titre,
                        nombre_mots = EXCLUDED.nombre_mots,
                        updated_at = CURRENT_TIMESTAMP;
                    """

                cursor.execute(query, (url, text, text, soup.title.string if soup.title else '', len(words)))
                conn.commit()

                cursor.execute(sql.SQL("SELECT id FROM pages WHERE url = %s;"), (url,))
                page_id = cursor.fetchone()[0]

                # robots.txt of the site hosting the current page.
                parts = urlparse(url)
                root_url = f'{parts.scheme}://{parts.netloc}/'
                rp = robots_cache.get(root_url)
                if rp is None:
                    rp = urllib.robotparser.RobotFileParser()
                    rp.set_url(root_url + 'robots.txt')
                    rp.read()
                    robots_cache[root_url] = rp

                print("Insertion des liens dans la base de données")
                for link in soup.find_all('a', href=True):
                    absolute_url = urljoin(url, link.get('href'))

                    # NOTE(review): links to other hosts are also checked against
                    # the current page's robots.txt, as in the original code.
                    if not rp.can_fetch('*', absolute_url):
                        print("Le lien n'est pas autorisé par le robots.txt")
                        continue

                    if (absolute_url == url or
                            any(keyword in absolute_url for keyword in ignore_keywords) or
                            any(absolute_url.endswith(ext) for ext in ignore_extensions)):
                        continue

                    query = """
                    INSERT INTO to_crawl (id_url_source,url)
                    VALUES (%s, %s)
                    ON CONFLICT (url) DO NOTHING;
                    """
                    cursor.execute(query, (page_id, absolute_url))

                    list_url.append(absolute_url)
                print(len(list_url))
                conn.commit()

            except Exception as e:
                print(e)
                conn.rollback()

            crawler(list_url, depth - 1)
    finally:
        cursor.close()
        conn.close()

    return 'Done'
        

In [None]:
# NOTE(review): crawler() opens its own connection through get_connection();
# this connection/cursor pair is not passed to it. It is kept so the cell still
# defines `conn` and `cursor`, and is now closed even if the crawl raises.
conn = get_connection()
cursor = conn.cursor()
try:
    # Follow links up to three levels deep from the seed URLs.
    crawler(urls, 3)
finally:
    cursor.close()
    conn.close()

# Crawler les sites déjà présents dans la base pour l'enrichir

In [None]:
import concurrent.futures
import time
import psycopg2
import requests
from bs4 import BeautifulSoup
from langdetect import detect
from psycopg2 import sql
from psycopg2.pool import SimpleConnectionPool

# Database connection settings. Each value is read from the matching libpq
# environment variable when set and falls back to the original local value.
DB_CONFIG = {
    'dbname': os.environ.get('PGDATABASE', 'Google_Crawler'),
    'user': os.environ.get('PGUSER', 'postgres'),
    # NOTE(review): fallback password kept only for backward compatibility;
    # set PGPASSWORD and drop it before sharing the notebook.
    'password': os.environ.get('PGPASSWORD', '240305'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432')
}

# Connection pool (1 to 20 connections) shared by the crawl worker threads.
# ThreadedConnectionPool is used because getconn()/putconn() are called
# concurrently from several threads, which SimpleConnectionPool does not support.
connection_pool = ThreadedConnectionPool(1, 20, **DB_CONFIG)

def get_db_connection():
    """Borrow a connection from the module-level ``connection_pool``.

    The caller must hand it back with ``release_db_connection``.
    """
    return connection_pool.getconn()

def release_db_connection(conn):
    """Return ``conn`` to the module-level ``connection_pool``."""
    connection_pool.putconn(conn)

def normalize_url(base_url, link):
    """Return ``link`` as an absolute URL.

    Links that already carry an ``http://`` or ``https://`` scheme are
    returned unchanged; anything else is resolved against ``base_url``.
    The full scheme prefix is checked (not just ``'http'``) so that relative
    paths such as ``http-basics.html`` are still resolved.

    Args:
        base_url: absolute URL of the page the link was found on.
        link: raw ``href`` value.

    Returns:
        The absolute URL as a string.
    """
    if link.startswith(('http://', 'https://')):
        return link
    return urljoin(base_url, link)

def _mark_crawled(cursor, url):
    """Flag ``url`` as processed in ``to_crawl``; the caller commits."""
    cursor.execute(
        sql.SQL("""UPDATE to_crawl SET crawled = TRUE WHERE url = %s;"""),
        (url,)
    )


# Worker that processes a single URL.
def crawl_url(url):
    """Fetch one queued URL, store it if it is French, and record its links.

    Steps:
      1. Download the page (10 s timeout). A non-200 answer marks the URL as
         crawled and stops.
      2. Extract the visible text and detect its language. Anything other
         than French (or undetectable) marks the URL as crawled and stops.
      3. Upsert the page into ``pages``.
      4. Insert same-host links into ``links`` (only pairs whose two pages
         already exist in ``pages`` produce a row).
      5. Mark the URL as crawled in ``to_crawl``.

    Database and request errors are printed and the transaction rolled back.
    The pooled connection is always released.

    Args:
        url: absolute URL taken from ``to_crawl``.
    """
    from langdetect.lang_detect_exception import LangDetectException

    conn = get_db_connection()
    cursor = None

    try:
        cursor = conn.cursor()

        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Erreur de connexion pour l'URL: {url}")
            _mark_crawled(cursor, url)
            conn.commit()
            return

        # Undecodable bytes are replaced rather than raising: an exception here
        # would escape the handlers below and be lost inside the thread pool.
        string_content = response.content.decode('utf-8', errors='replace')

        soup = BeautifulSoup(string_content, "html.parser")
        text = soup.get_text(separator=' ', strip=True)
        words = re.findall(r'\w+', text)

        try:
            language = detect(text)
        except LangDetectException:
            # Raised by langdetect when the text has nothing to analyse.
            language = None

        if language != 'fr':
            print(f"La page {url} n'est pas en français")
            _mark_crawled(cursor, url)
            conn.commit()
            return

        # Empty title when the page has none (previously the literal text 'NULL').
        title = soup.title.string if soup.title else ''

        # On conflict every stored column is refreshed so title and word count
        # stay in sync with the content.
        cursor.execute(
            sql.SQL("""INSERT INTO pages (url, content, search_vector, titre, nombre_mots)
            VALUES (%s, %s, to_tsvector('french', %s), %s, %s)
            ON CONFLICT (url) DO UPDATE
            SET content = EXCLUDED.content,
                search_vector = to_tsvector('french', EXCLUDED.content),
                titre = EXCLUDED.titre,
                nombre_mots = EXCLUDED.nombre_mots,
                updated_at = CURRENT_TIMESTAMP;"""),
            (url, text, text, title, len(words))
        )
        conn.commit()

        # Collect the distinct internal (same host) link targets.
        source_host = urlparse(url).netloc
        targets = set()
        for link in soup.find_all('a', href=True):
            full_url = normalize_url(url, link['href'])
            if urlparse(full_url).netloc == source_host:
                targets.add(full_url)

        # Insert the links; the SELECT yields no row when either page is not
        # stored in ``pages`` yet, so such links are skipped.
        for to_url in targets:
            cursor.execute(
                sql.SQL("""INSERT INTO links (from_page_id, to_page_id)
                SELECT p1.id, p2.id
                FROM pages p1, pages p2
                WHERE p1.url = %s AND p2.url = %s
                ON CONFLICT (from_page_id, to_page_id) DO NOTHING;"""),
                (url, to_url)
            )
        conn.commit()

        _mark_crawled(cursor, url)
        conn.commit()

    except (psycopg2.Error, requests.RequestException) as e:
        print(f"Erreur pour l'URL {url}: {e}")
        conn.rollback()

    finally:
        if cursor is not None:
            cursor.close()
        release_db_connection(conn)

# Load the pending URLs, oldest first, then give the connection back to the
# pool straight away: the workers borrow their own connections.
conn = get_db_connection()
try:
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT url FROM to_crawl WHERE crawled = false ORDER BY created_at ASC ;")
        urls_to_crawl = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # Returned to the pool rather than closed, so the pool's bookkeeping stays correct.
    release_db_connection(conn)

# Number of URLs fetched in parallel per batch.
nbWorker = 8

# One executor reused for every batch instead of a new one per iteration.
with concurrent.futures.ThreadPoolExecutor(max_workers=nbWorker) as executor:
    while True:
        # One-second pause before every batch (and before the final check).
        time.sleep(1)

        if not urls_to_crawl:
            print("Aucune URL à crawler")
            break

        # Next batch: the first nbWorker pending URLs.
        urls = [row[0] for row in urls_to_crawl[:nbWorker]]
        print(urls)

        # Wait for the whole batch and report worker exceptions, which
        # executor.map would have silently discarded.
        futures = {executor.submit(crawl_url, batch_url): batch_url for batch_url in urls}
        for future in concurrent.futures.as_completed(futures):
            exc = future.exception()
            if exc is not None:
                print(f"Erreur pour l'URL {futures[future]}: {exc}")

        # Drop the processed batch from the pending list.
        del urls_to_crawl[:nbWorker]
        print("Fin de l'itération")



# Mettre dans la bdd tous les mots 

In [None]:
# Word whose index entry is displayed by this cell.
mot_recherche = 'montagne'

# The connection is opened before the ``try`` so that the ``except`` and
# ``finally`` clauses never touch an undefined (or stale) conn/cursor.
conn = get_connection()
cursor = conn.cursor()
try:
    cursor.execute(
        sql.SQL("SELECT mot, occurences FROM index_mot WHERE mot = %s"),
        (mot_recherche,),
    )
    occurences = cursor.fetchall()

    if len(occurences) > 0:
        # Second column of the single matching row: the serialized occurrences.
        # presumably a text list of "(count,url)" pairs — TODO confirm the column type.
        occurences = occurences[0][1]

        # Extract every "(count,url)" pair, then strip the backslashes, double
        # quotes and spaces left around the URL.
        pattern = re.compile(r'\((\d+),([^\)]+)\)')
        matches = [
            (int(raw_count), raw_url.replace('\\', '').replace('"', '').replace(' ', ''))
            for raw_count, raw_url in pattern.findall(occurences)
        ]

        # Highest count first (ties broken by URL, descending).
        matches.sort(reverse=True)

        for match in matches:
            print(match[0], match[1])

except psycopg2.Error as e:
    print(e)
    conn.rollback()

finally:
    cursor.close()
    conn.close()

# Mettre en place TF-IDF

In [None]:
import numpy as np
print("Recherche de la page la plus pertinente")
# The connection is opened before the ``try`` so that the ``except`` and
# ``finally`` clauses never touch an undefined (or stale) conn/cursor.
conn = get_connection()
cursor = conn.cursor()
try:
    cursor.execute(sql.SQL("SELECT COUNT(*) FROM pages"))
    nb_pages = cursor.fetchone()[0]

    cursor.execute(sql.SQL("SELECT mot FROM index_mot ORDER BY mot DESC"))
    mots = cursor.fetchall()

    # Matches every "(count,url)" pair of the serialized occurrences.
    pattern = re.compile(r'\((\d+),([^\)]+)\)')

    for mot in mots:
        print(mot[0])
        cursor.execute(sql.SQL("SELECT occurences FROM index_mot WHERE mot = %s"), (mot[0],))
        occurences = cursor.fetchall()

        if len(occurences) > 0:
            occurences = occurences[0][0]
            # Strip the backslashes, double quotes and spaces left around the URL.
            matches = [
                (int(raw_count), raw_url.replace('\\', '').replace('"', '').replace(' ', ''))
                for raw_count, raw_url in pattern.findall(occurences)
            ]
            if not matches:
                # Nothing parseable for this word: the IDF would divide by zero.
                continue

            # IDF: log(total pages / pages containing the word).
            idf = np.log(nb_pages / len(matches))

            # TF-IDF of the word for every page that contains it.
            liste_url = []
            for count, page_url in matches:
                cursor.execute(sql.SQL("SELECT nombre_mots FROM pages WHERE url = %s"), (page_url,))
                row = cursor.fetchone()
                if row is None or not row[0]:
                    # Page missing from ``pages`` or empty word count: no TF.
                    continue
                tf = count / row[0]
                # idf is already a logarithm; it must not be logged again.
                tf_idf = tf * idf
                liste_url.append((page_url, tf_idf))

            liste_url.sort(key=lambda x: x[1], reverse=True)
            for page_url, score in liste_url:
                print(page_url, score)
            if liste_url:
                print("La page la plus pertinente est : ", liste_url[0][0])
            print()
            # Only the first word that has an index entry is scored.
            break

except psycopg2.Error as e:
    print(e)
    conn.rollback()

finally:
    cursor.close()
    conn.close()

In [None]:
import numpy as np
import psycopg2
from psycopg2 import sql
import re

def get_connection():
    """Open a new PostgreSQL connection to the crawler database.

    Each setting is taken from the matching libpq environment variable
    (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT) when it is set, and
    falls back to the notebook's original local value otherwise.

    Returns:
        A new psycopg2 connection. The caller is responsible for closing it.
    """
    # Local import: this cell lists its own imports, so it stays self-contained.
    import os

    conn = psycopg2.connect(
        dbname=os.environ.get('PGDATABASE', 'Google_Crawler'),
        user=os.environ.get('PGUSER', 'postgres'),
        # NOTE(review): fallback password kept only for backward compatibility;
        # set PGPASSWORD and drop it before sharing the notebook.
        password=os.environ.get('PGPASSWORD', '240305'),
        host=os.environ.get('PGHOST', 'localhost'),
        port=os.environ.get('PGPORT', '5432'),
    )
    return conn

print("Recherche de la page la plus pertinente")
# The connection is opened before the ``try`` so that the ``except`` and
# ``finally`` clauses never touch an undefined (or stale) conn/cursor.
conn = get_connection()
cursor = conn.cursor()
try:
    # Load the pages and build the id -> index and index -> url mappings.
    cursor.execute("SELECT id, url FROM pages")
    pages = cursor.fetchall()
    # Derived from the fetched rows (not a separate COUNT(*)) so the matrix
    # size always matches the mappings, even if pages are being inserted.
    nb_pages = len(pages)
    page_ids = {page_id: index for index, (page_id, _) in enumerate(pages)}
    urls = {index: url for index, (_, url) in enumerate(pages)}
    # Reverse mapping url -> first index, for constant-time lookups below.
    url_index = {}
    for index, (_, page_url) in enumerate(pages):
        url_index.setdefault(page_url, index)

    # Word count of every page, fetched once instead of once per match.
    cursor.execute("SELECT url, nombre_mots FROM pages")
    word_counts = dict(cursor.fetchall())

    # Adjacency matrix: link_matrix[i, j] = 1 when page i links to page j.
    link_matrix = np.zeros((nb_pages, nb_pages))
    cursor.execute("SELECT from_page_id, to_page_id FROM links")
    links = cursor.fetchall()
    for from_page_id, to_page_id in links:
        if from_page_id in page_ids and to_page_id in page_ids:
            link_matrix[page_ids[from_page_id], page_ids[to_page_id]] = 1

    # Row-normalize: every page with outgoing links spreads a total weight of 1.
    # NOTE(review): pages without outgoing links keep an all-zero row, so their
    # rank is not redistributed (unchanged from the original computation).
    out_link_counts = np.sum(link_matrix, axis=1)
    has_links = out_link_counts > 0
    link_matrix[has_links] /= out_link_counts[has_links][:, None]

    # PageRank by power iteration.
    damping_factor = 0.85
    num_iterations = 100
    if nb_pages > 0:
        page_rank = np.ones(nb_pages) / nb_pages
        for _ in range(num_iterations):
            page_rank = (1 - damping_factor) / nb_pages + damping_factor * link_matrix.T.dot(page_rank)
    else:
        # Empty database: nothing to rank (avoids a division by zero).
        page_rank = np.zeros(0)

    # PageRank per URL, best first (kept for inspection; not printed here).
    page_rank_urls = [(urls[index], page_rank[index]) for index in range(nb_pages)]
    page_rank_urls.sort(key=lambda x: x[1], reverse=True)

    # Combined TF-IDF + PageRank score for every indexed word.
    cursor.execute("SELECT mot FROM index_mot ORDER BY mot DESC")
    mots = cursor.fetchall()

    # Matches every "(count,url)" pair of the serialized occurrences.
    pattern = re.compile(r'\((\d+),([^\)]+)\)')

    for mot in mots:
        print(mot[0])
        cursor.execute("SELECT occurences FROM index_mot WHERE mot = %s", (mot[0],))
        occurences = cursor.fetchone()[0]

        if occurences:
            # Strip the backslashes, double quotes and spaces left around the URL.
            matches = [
                (int(raw_count), raw_url.replace('\\', '').replace('"', '').replace(' ', ''))
                for raw_count, raw_url in pattern.findall(occurences)
            ]

            # IDF: log(total pages / pages containing the word).
            idf = np.log(nb_pages / len(matches)) if len(matches) > 0 else 0

            liste_url = []
            for count, page_url in matches:
                index = url_index.get(page_url)
                nb_mots = word_counts.get(page_url)
                if index is None or not nb_mots:
                    # Unknown page or empty word count: no score can be computed.
                    continue
                tf = count / nb_mots
                tf_idf = tf * idf
                page_rank_score = page_rank[index]
                # Weighted blend: 70 % textual relevance, 30 % link authority.
                combined_score = 0.7 * tf_idf + 0.3 * page_rank_score
                liste_url.append((page_url, combined_score))

            liste_url.sort(key=lambda x: x[1], reverse=True)
            for page_url, score in liste_url:
                print(page_url, score)

            if liste_url:
                print("La page la plus pertinente est :", liste_url[0][0])
            print()

except psycopg2.Error as e:
    print("Erreur PostgreSQL:", e)
    conn.rollback()

finally:
    cursor.close()
    conn.close()
