In [5]:
import sys
import os
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import time
import pandas as pd
from IPython.display import display
sys.path.append(os.path.abspath('..'))
from ActivesScripts.filters import filter_similar_entities, txt_to_string

In [11]:
def common_keywords(keywords, min_models=2):
    """Return the keywords that are shared by several keyword lists.

    Each inner list is treated as the output of one model. A keyword is
    counted at most once per inner list, so a keyword that is merely repeated
    inside a single list is not reported as shared.

    Args:
        keywords: Iterable of keyword lists (one list per model). Keywords
            must be hashable.
        min_models: Minimum number of distinct lists a keyword has to appear
            in to be returned. Defaults to 2, i.e. "present in more than one
            list".

    Returns:
        List of shared keywords, ordered by first appearance across the
        lists. Empty when nothing is shared or when ``keywords`` is empty.
    """
    lists_containing = {}

    for sublist in keywords:
        # dict.fromkeys removes duplicates while keeping the original order,
        # so each list contributes at most 1 to a keyword's count.
        for word in dict.fromkeys(sublist):
            lists_containing[word] = lists_containing.get(word, 0) + 1

    return [word for word, count in lists_containing.items() if count >= min_models]

# Reference keywords the extraction is expected to find. In this cell they are
# only printed, so they can be compared by eye with the model output.
keywords_selected = [
    "Employed population",
    "Poverty line",
    "Working poverty",
    "Household poverty",
    "International poverty",
    "Working poor",
    "Poverty rate",
    "Income threshold",
    "Employment rate",
    "Household income",
    "Per-capita consumption",
    "Monetary requirements",
    "Basic goods",
    "Services pay",
    "Poor households",
    "Household members",
    "Per-capita income",
    "Living households",
    "Income expenditure",
    "Consumption levels"
]


print(f"la liste de mots clés à trouvés :\n\n {keywords_selected}\n")

# Look in the current working directory for '.txt' files whose name contains
# "Metadata". os.listdir() returns entries in arbitrary order, so the matches
# are sorted to make the selected file deterministic when several match.
txt_file = sorted(
    fichier
    for fichier in os.listdir()
    if fichier.lower().endswith('.txt') and "Metadata" in fichier
)
if not txt_file:
    # Fail with an actionable message instead of a bare IndexError below.
    raise FileNotFoundError(
        f"No '.txt' file with 'Metadata' in its name was found in {os.getcwd()}"
    )

# txt_to_string is a project helper; presumably it returns the file content
# as a single string -- TODO confirm against ActivesScripts.filters.
text = txt_to_string(txt_file[0])


# Models to benchmark: display name -> identifier passed to SentenceTransformer.
# The commented-out entries are additional candidates that are currently disabled.
models = {
    "all-MiniLM-L6-v2": "all-MiniLM-L6-v2",
    "paraphrase-MiniLM-L6-v2": "paraphrase-MiniLM-L6-v2",
    # "paraphrase-distilroberta-base-v1": "paraphrase-distilroberta-base-v1",
    # "roberta-large": "roberta-large",
    # "bert-base-uncased": "bert-base-uncased",
    # "bert-base-cased": "bert-base-cased",
    # "t5-base": "t5-base",
}

# Tunable cut-offs, named so they are not buried in the loop body.
SIMILARITY_THRESHOLD = 60  # forwarded as `threshold` to filter_similar_entities
MIN_SCORE = 0.4            # keywords scoring at or below this value are dropped

all_keywords = []  # one list of kept keywords per model
results_dfs = []   # one two-column DataFrame (keywords, scores) per model
print("Début du test...\n")
for model_name, model_path in models.items():

    sentence_model = SentenceTransformer(model_path)

    kw_model = KeyBERT(model=sentence_model)

    # perf_counter is the clock intended for measuring elapsed time; unlike
    # time.time() it is not affected by system clock adjustments.
    # Model loading above is deliberately outside the timed section.
    start_time = time.perf_counter()

    raw_keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        use_maxsum=True,
        nr_candidates=20,
        top_n=15
    )
    # filter_similar_entities is a project helper; presumably it drops
    # near-duplicate keyphrases -- TODO confirm what `threshold` measures.
    entities_filtered = filter_similar_entities(raw_keywords, threshold=SIMILARITY_THRESHOLD)
    filtered_keywords = [(kw[0], kw[1]) for kw in entities_filtered if kw[1] > MIN_SCORE]

    end_time = time.perf_counter()
    duree = end_time - start_time

    # Split the (keyword, score) pairs into two parallel lists.
    keywords = [pair[0] for pair in filtered_keywords]
    scores = [pair[1] for pair in filtered_keywords]
    all_keywords.append(keywords)
    df_results = pd.DataFrame({f"Mots-clés ({model_name})": keywords, f"Scores ({model_name})": scores})

    # Store this model's results with the others
    results_dfs.append(df_results)
    print(f"Durée d'exécution pour le modèle {model_name} : {duree:.4f} secondes\n")

# Show every model's results: the per-model frames are concatenated
# side by side (axis=1) and rendered with the notebook's rich display.
final_results = pd.concat(results_dfs, axis=1)
display(final_results)

# Compute the shared keywords once and reuse the result for both the check
# and the message.
# NOTE(review): the message below says "common to all models", while
# common_keywords reports keywords counted more than once -- these coincide
# only when exactly two models are compared; confirm the intended meaning.
shared_keywords = common_keywords(all_keywords)
separator = "------------------------------------------------------------------------------------------------------------------\n"

if not shared_keywords:
    print("Aucun mot-clé commun trouvé entre les modèles.\n")
else:
    print(separator)
    print(f"Les mots-clés communs à tous les modèles : {shared_keywords}")
    print(separator)
print("...Fin du test.")


la liste de mots clés à trouvés :

 ['Employed population', 'Poverty line', 'Working poverty', 'Household poverty', 'International poverty', 'Working poor', 'Poverty rate', 'Income threshold', 'Employment rate', 'Household income', 'Per-capita consumption', 'Monetary requirements', 'Basic goods', 'Services pay', 'Poor households', 'Household members', 'Per-capita income', 'Living households', 'Income expenditure', 'Consumption levels']

Début du test...

Durée d'exécution pour le modèle all-MiniLM-L6-v2 : 1.8099 secondes

Durée d'exécution pour le modèle paraphrase-MiniLM-L6-v2 : 1.9546 secondes



Unnamed: 0,Mots-clés (all-MiniLM-L6-v2),Scores (all-MiniLM-L6-v2),Mots-clés (paraphrase-MiniLM-L6-v2),Scores (paraphrase-MiniLM-L6-v2)
0,poor employed,0.4562,threshold generally,0.4734
1,households capita,0.465,considered poor,0.4735
2,non poor,0.4802,proportion employed,0.4808
3,income consumption,0.4965,employed population,0.4822
4,defined poor,0.5019,profit poverty,0.4877
5,income,0.5342,poor threshold,0.5412
6,profit poverty,0.5497,working poor,0.5482
7,international poverty,0.6303,international poverty,0.5719
8,levels poverty,0.6398,household poverty,0.5839
9,poverty households,0.6446,poverty line,0.6084


------------------------------------------------------------------------------------------------------------------

Les mots-clés communs à tous les modèles : ['profit poverty', 'international poverty', 'poverty line']
------------------------------------------------------------------------------------------------------------------

...Fin du test.
