In [3]:
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk

# Download the NLTK resources this notebook relies on (stop-word lists and
# WordNet data). nltk.download() reports "already up-to-date" and does nothing
# more when the package is already present locally.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
def preprocess_sentence(sentence, remove_stopwords=True, use_lemmatization=True):
    """
    Normalize a sentence into a list of word tokens.

    Steps, in order: lowercasing, whitespace tokenization, optional stop-word
    removal, removal of every ``string.punctuation`` character, and optional
    lemmatization.

    Args:
        sentence: Text to process. Any falsy value (``None``, ``""``) yields ``[]``.
        remove_stopwords: When true, drop tokens found in NLTK's English
            stop-word list.
        use_lemmatization: When true, pass each remaining token through
            ``WordNetLemmatizer.lemmatize`` (default part of speech).

    Returns:
        list: The surviving tokens, in their original order.
    """
    if not sentence:
        return []

    translator = str.maketrans("", "", string.punctuation)
    stop_words = set(stopwords.words("english")) if remove_stopwords else None

    words = []
    for raw_token in sentence.lower().split():
        cleaned = raw_token.translate(translator)
        if not cleaned:
            # The token consisted only of punctuation (e.g. a lone "-").
            continue

        if stop_words is not None:
            # The stop-word list stores contractions with their apostrophe
            # ("don't", "it's"). Comparing only the fully stripped form
            # ("dont") lets them slip through, so the token is also tested
            # with just its leading/trailing punctuation removed.
            core = raw_token.strip(string.punctuation)
            if core in stop_words or cleaned in stop_words:
                continue

        words.append(cleaned)

    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(w) for w in words]

    return words

In [5]:
def get_related_forms(word):
    """
    Return ``word`` together with its derivationally related forms in WordNet.

    Every synset returned by ``wordnet.synsets(word)`` is visited, and for each
    of its lemmas the names of the derivationally related lemmas are collected.

    Args:
        word: The word to look up.

    Returns:
        set: ``word`` itself plus the collected lemma names.
    """
    derived_names = {
        related.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
        for related in lemma.derivationally_related_forms()
    }
    return {word} | derived_names

print(get_related_forms("happy"))  # e.g. {'happy', 'happiness', 'felicity', 'felicitousness'}; set order may vary


{'happy', 'felicitousness', 'felicity', 'happiness'}


In [6]:
def are_synonyms(word1, word2):
    """
    Tell whether two words are synonyms according to WordNet.

    Two words count as synonyms when they are identical or when at least one
    synset appears in the lookup results of both.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        bool: True for identical words or words sharing a synset, else False.
    """
    if word1 == word2:
        return True

    senses_first = wordnet.synsets(word1)
    senses_second = wordnet.synsets(word2)

    # An empty lookup on either side makes the generator empty, so any() is False.
    return any(
        sense_a == sense_b
        for sense_a in senses_first
        for sense_b in senses_second
    )


In [7]:
def are_related(w1, w2):
    """
    Tell whether two words are identical, synonyms, or derivationally related.

    The checks run in this order:
      1. the words are equal (ignoring case);
      2. ``w2`` is the name of a lemma in one of the WordNet synsets of ``w1``;
      3. the sets returned by ``get_related_forms`` for both words overlap.

    Args:
        w1: First word.
        w2: Second word.

    Returns:
        bool: True when any of the checks succeeds, otherwise False.
    """
    target = w2.lower()

    # Same word
    if w1 == w2 or w1.lower() == target:
        return True

    # Synonym check. WordNet keeps the original capitalisation of lemma names
    # (proper nouns, acronyms), while callers typically pass lowercased tokens,
    # so the comparison has to ignore case.
    for synset in wordnet.synsets(w1):
        for lemma in synset.lemmas():
            if lemma.name().lower() == target:
                return True

    # Shared derivational form (non-empty intersection)
    return bool(get_related_forms(w1) & get_related_forms(w2))


In [8]:
def jaccard_similarity_sentences(sentence1, sentence2, remove_stopwords=True, use_lemmatization=True):
    """
    Compute a multiset Jaccard similarity between two sentences.

    Both sentences are tokenized with ``preprocess_sentence``. Tokens of the
    first sentence are then paired with tokens of the second one, where two
    tokens may be paired when ``are_related`` accepts them (same word, synonym,
    or derived form such as happy <-> happiness). Repetitions are honoured:
    every occurrence of a token can take part in at most one pair, which keeps
    the score within [0, 1].

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.
        remove_stopwords: Forwarded to ``preprocess_sentence``.
        use_lemmatization: Forwarded to ``preprocess_sentence``.

    Returns:
        float: ``intersection / union`` where ``union`` is the total number of
        tokens minus the intersection. Returns 0.0 when either sentence has no
        tokens left after preprocessing, or when an exception occurs (the
        error is printed, not raised).
    """
    try:
        words1 = preprocess_sentence(sentence1, remove_stopwords, use_lemmatization)
        words2 = preprocess_sentence(sentence2, remove_stopwords, use_lemmatization)

        if not words1 or not words2:
            return 0.0

        counter1 = Counter(words1)
        # Occurrences of sentence-2 tokens that are still free to be paired.
        available2 = Counter(words2)

        intersection_count = 0

        # Pass 1: exact matches first, so that a synonym cannot consume an
        # occurrence that has an identical counterpart.
        unmatched1 = {}
        for w1, count1 in counter1.items():
            matched = min(count1, available2[w1])
            if matched:
                intersection_count += matched
                available2[w1] -= matched
            if count1 > matched:
                unmatched1[w1] = count1 - matched

        # Pass 2: pair what is left through synonym / derived-form relations.
        # Each pairing consumes occurrences on both sides, so one token of
        # sentence 2 can never be counted for several tokens of sentence 1.
        for w1, left in unmatched1.items():
            for w2, free in available2.items():
                if free <= 0 or not are_related(w1, w2):
                    continue
                matched = min(left, free)
                intersection_count += matched
                available2[w2] = free - matched
                left -= matched
                if left == 0:
                    break

        union_count = len(words1) + len(words2) - intersection_count
        return intersection_count / union_count if union_count else 0.0

    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to "no similarity".
        print(f"Erreur lors du calcul : {e}")
        return 0.0


In [10]:
def main():
    """
    Run the interactive sentence-similarity prompt.

    Repeatedly asks for two sentences, prints their score from
    ``jaccard_similarity_sentences`` and asks whether to continue. Entering
    ``q`` (any case, surrounding whitespace ignored) at either sentence prompt,
    or anything other than ``o`` at the continue prompt, ends the loop.
    """
    print("===  Calculateur de similarité de phrases (Jaccard) ===")
    print("Saisissez deux phrases pour comparer leur similarité.")
    print("Tapez 'q' à tout moment pour quitter.\n")

    def wants_to_quit(answer):
        # Strip like the continue prompt does, so " q" or "Q " also quits
        # instead of being scored as a sentence.
        return answer.strip().lower() == "q"

    while True:
        # User input
        s1 = input(" Entrez la première phrase : ")
        if wants_to_quit(s1):
            break

        s2 = input(" Entrez la deuxième phrase : ")
        if wants_to_quit(s2):
            break

        # Compute and display the result
        score = jaccard_similarity_sentences(s1, s2)
        print(f"\n Similarité Jaccard = {score:.2f} (entre 0 et 1)\n")

        # Ask whether the user wants another comparison
        again = input("Voulez-vous comparer d'autres phrases ? (o/n) : ").strip().lower()
        if again != "o":
            break

    print("\n Merci d'avoir utilisé le calculateur ! ")


# Start the interactive prompt when this code is executed as the main program.
# Run as a notebook cell, this launched the prompt (see the recorded output
# below), so executing the cell waits for user input.
if __name__ == "__main__":
    main()


===  Calculateur de similarité de phrases (Jaccard) ===
Saisissez deux phrases pour comparer leur similarité.
Tapez 'q' à tout moment pour quitter.


 Similarité Jaccard = 1.00 (entre 0 et 1)


 Merci d'avoir utilisé le calculateur ! 
