In [None]:
import sys
import os
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import time
import pandas as pd
from IPython.display import display
sys.path.append(os.path.abspath('..'))
from ActivesScripts.toolbox import filter_similar_entities, read_txt_file

In [None]:
def common_keywords(keywords):
    """Return the keywords that appear in more than one of the given lists.

    Args:
        keywords: An iterable of keyword lists (one list per model). Items
            must be hashable.

    Returns:
        A list of the keywords found in at least two *different* input
        lists, in order of first appearance. A keyword repeated inside a
        single list is counted once for that list, so it is not reported
        as common unless another list contains it too. Note that with more
        than two input lists, "common" means "shared by at least two", not
        "shared by all".
    """
    lists_containing = {}

    for sublist in keywords:
        # dict.fromkeys removes duplicates within one list while keeping
        # the original order, so each list contributes at most 1 per word.
        for word in dict.fromkeys(sublist):
            lists_containing[word] = lists_containing.get(word, 0) + 1

    return [word for word, count in lists_containing.items() if count > 1]

# Hand-picked reference key phrases for the document under test.
# In the code visible here this list is only printed, so the extracted
# keywords can be compared against it by eye.
keywords_selected = [
    "unemployment rate", "unemployed persons", "working age",
    "employment activities", "job opportunity", "short reference",
    "seek employment", "business undertaking", "financial resources",
    "labour force", "persons employment", "production goods",
    "remuneration exchange", "volunteer work", "labour underutilization",
]

print(f"la liste de mots clés à trouvés :\n\n {keywords_selected}\n")

# Find the metadata text file in the current working directory: any name
# ending in ".txt" (case-insensitive) that contains "Metadata".
# os.listdir() returns entries in arbitrary order, so the candidates are
# sorted to make the choice of the first one reproducible between runs.
txt_file = sorted(
    fichier
    for fichier in os.listdir()
    if fichier.lower().endswith('.txt') and "Metadata" in fichier
)

# Fail with an explicit message instead of a bare IndexError on txt_file[0].
if not txt_file:
    raise FileNotFoundError(
        f"Aucun fichier .txt contenant 'Metadata' trouvé dans {os.getcwd()}"
    )

text = read_txt_file(txt_file[0])


# Models to benchmark: display name -> identifier handed to SentenceTransformer.
models = {
    "all-MiniLM-L6-v2": "all-MiniLM-L6-v2",
    "paraphrase-MiniLM-L6-v2": "paraphrase-MiniLM-L6-v2",
    # Disabled candidates, kept for reference:
    # "paraphrase-distilroberta-base-v1": "paraphrase-distilroberta-base-v1",
    # "roberta-large": "roberta-large",
    # "bert-base-uncased": "bert-base-uncased",
    # "bert-base-cased": "bert-base-cased",
    # "t5-base": "t5-base",
}

# Tunable thresholds, named here so they are not buried in the loop body.
SIMILARITY_THRESHOLD = 60  # passed as `threshold` to filter_similar_entities
MIN_SCORE = 0.4            # only keywords scoring strictly above this are kept

all_keywords = []  # one list of retained keyword strings per model
results_dfs = []   # one two-column (keywords, scores) DataFrame per model
print("Début du test...\n")
for model_name, model_path in models.items():

    # Model loading happens before the timer starts, so it is not measured.
    sentence_model = SentenceTransformer(model_path)
    kw_model = KeyBERT(model=sentence_model)

    # perf_counter() is the clock intended for measuring short durations;
    # unlike time.time() it is not affected by system clock adjustments.
    start_time = time.perf_counter()

    # Items of the result are indexed below as (keyword, score).
    raw_keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        use_maxsum=True,
        nr_candidates=20,
        top_n=15
    )
    entities_filtered = filter_similar_entities(raw_keywords, threshold=SIMILARITY_THRESHOLD)
    filtered_keywords = [(kw[0], kw[1]) for kw in entities_filtered if kw[1] > MIN_SCORE]

    # The measured duration covers extraction plus both filtering steps.
    end_time = time.perf_counter()
    duree = end_time - start_time

    keywords = [key[0] for key in filtered_keywords]
    scores = [key[1] for key in filtered_keywords]
    all_keywords.append(keywords)
    df_results = pd.DataFrame({f"Mots-clés ({model_name})": keywords, f"Scores ({model_name})": scores})

    # Store this model's results for the combined table built after the loop.
    results_dfs.append(df_results)
    print(f"Durée d'exécution pour le modèle {model_name} : {duree:.4f} secondes\n")

# Show every model's results side by side (one pair of columns per model).
final_results = pd.concat(results_dfs, axis=1)
display(final_results)

# Compute the shared keywords once and reuse the result for both the check
# and the message. common_keywords reports keywords counted more than once
# across the per-model lists.
shared_keywords = common_keywords(all_keywords)
if shared_keywords:
    print("------------------------------------------------------------------------------------------------------------------\n")
    print(f"Les mots-clés communs à tous les modèles : {shared_keywords}")
    print("------------------------------------------------------------------------------------------------------------------\n")
else:
    print("Aucun mot-clé commun trouvé entre les modèles.\n")
print("...Fin du test.")
