In [30]:
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk

# Download the NLTK resources: the stopword lists, WordNet, and the omw-1.4 package.
# The value returned by the last call is what the notebook shows as this cell's output.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


True

In [31]:
def preprocess_sentence(sentence, remove_stopwords=True, use_lemmatization=True):
    """
    Turn a sentence into a list of normalized word tokens.

    Steps, in order:
    - lowercase the sentence and split it on whitespace
    - delete every ASCII punctuation character (string.punctuation) from each
      token, dropping tokens that end up empty
    - optionally drop English stopwords
    - optionally lemmatize the remaining tokens with WordNetLemmatizer

    Args:
        sentence: Text to process. Any falsy value (None, "") yields [].
        remove_stopwords: When True, tokens found in NLTK's English stopword
            list are removed.
        use_lemmatization: When True, each kept token is passed through
            WordNetLemmatizer.lemmatize (default part of speech).

    Returns:
        list[str]: The processed tokens, in their original order.
    """
    if not sentence:
        return []

    punctuation_remover = str.maketrans("", "", string.punctuation)
    stop_words = set(stopwords.words("english")) if remove_stopwords else set()

    words = []
    for raw_token in sentence.lower().split():
        cleaned = raw_token.translate(punctuation_remover)
        if not cleaned:
            # The token was made of punctuation only (e.g. "--").
            continue
        if remove_stopwords:
            # The stopword list stores contractions with their apostrophe
            # ("don't", "isn't"). Checking only the punctuation-free form
            # ("dont") would let them through, so the token is also checked
            # with just its surrounding punctuation trimmed.
            trimmed = raw_token.strip(string.punctuation)
            if cleaned in stop_words or trimmed in stop_words:
                continue
        words.append(cleaned)

    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    return words

In [32]:
def are_synonyms(word1, word2):
    """
    Tell whether two words count as synonyms according to WordNet.

    Two identical strings are always synonyms. Otherwise the words are
    synonyms when at least one synset returned by wordnet.synsets() for the
    first word equals one returned for the second word.

    Args:
        word1: First word to compare.
        word2: Second word to compare.

    Returns:
        bool: True if the words are equal or share a synset, False otherwise
        (including when WordNet returns no synset for either word).
    """
    if word1 == word2:
        return True

    first_senses = wordnet.synsets(word1)
    second_senses = wordnet.synsets(word2)

    # any() over an empty pairing is False, which covers words that WordNet
    # does not know.
    return any(
        left_sense == right_sense
        for left_sense in first_senses
        for right_sense in second_senses
    )


In [None]:
def jaccard_similarity_sentences(sentence1, sentence2, remove_stopwords=True, use_lemmatization=True):
    """
    Compute a Jaccard similarity between two sentences.

    Both sentences are tokenized with preprocess_sentence(). The score is
    computed on multisets of words (repetitions count, via Counter), and two
    different words are treated as the same element when are_synonyms()
    says so.

    Every word occurrence is matched at most once, so the intersection never
    exceeds the size of the shorter sentence and the score stays in [0, 1].
    Identical words are paired first; the remaining occurrences are then
    paired with synonyms.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.
        remove_stopwords: Forwarded to preprocess_sentence().
        use_lemmatization: Forwarded to preprocess_sentence().

    Returns:
        float: intersection / union of the two word multisets. Returns 0.0
        when either sentence has no token left after preprocessing, or when
        any exception occurs (the error is printed, not raised).
    """
    try:
        # Preprocessing
        words1 = preprocess_sentence(sentence1, remove_stopwords, use_lemmatization)
        words2 = preprocess_sentence(sentence2, remove_stopwords, use_lemmatization)

        if not words1 or not words2:
            return 0.0

        counter1 = Counter(words1)
        counter2 = Counter(words2)

        # Pass 1: pair identical words (multiset intersection = min of counts).
        exact_matches = counter1 & counter2
        intersection_count = sum(exact_matches.values())

        # Occurrences that are still unmatched on each side.
        remaining1 = counter1 - exact_matches
        remaining2 = counter2 - exact_matches

        # Pass 2: pair leftover occurrences with synonyms. Each match consumes
        # capacity on both sides so that no occurrence is counted twice.
        # Only values of existing keys are modified while iterating.
        for w1 in remaining1:
            for w2 in remaining2:
                if remaining1[w1] == 0:
                    break
                if remaining2[w2] == 0:
                    continue
                if are_synonyms(w1, w2):
                    matched = min(remaining1[w1], remaining2[w2])
                    intersection_count += matched
                    remaining1[w1] -= matched
                    remaining2[w2] -= matched

        # Weighted union: |A| + |B| - |A ∩ B|
        union_count = sum(counter1.values()) + sum(counter2.values()) - intersection_count

        return intersection_count / union_count if union_count else 0.0

    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to 0.0.
        print(f"Erreur lors du calcul : {e}")
        return 0.0
