In [7]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import os, sys
import pandas as pd
sys.path.append(os.path.abspath('..'))
from ActivesScripts.toolbox import read_txt_file, filter_similar_entities

In [9]:
def entities_extractions(dossier: str) -> None:
    """
    Extract keywords from the text files of one ODD folder and display them per target.

    Every ``.txt`` file in ``dossier`` whose name does not contain "keywords" is
    read, passed to KeyBERT (1- and 2-gram keyphrases, English stop words,
    top 10), passed through ``filter_similar_entities`` and then restricted to
    keywords whose score is strictly above 0.5. The remaining keywords are
    grouped by target and shown side by side (one "Cible XX" / "Scores" column
    pair per target) with ``display``. Nothing is written to disk and nothing
    is returned.

    Parameters
    ----------
    dossier : str
        Path to the folder containing the text files (.txt). Its last two
        characters (ignoring any trailing path separator) are used as the ODD
        number in the progress messages.

    Notes
    -----
    Relies on ``display`` being available without import, as it is in an
    IPython/Jupyter kernel.
    """
    # Strip a trailing separator so that ".../ODD01/" still yields "01".
    odd_number = dossier.rstrip("/\\")[-2:]

    print(f"\nDébut de traitement de l'ODD {odd_number}...")

    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    # Build the KeyBERT wrapper once: it does not depend on the file being processed.
    kw_model = KeyBERT(model=sentence_model)

    # Text files to process. os.listdir returns entries in arbitrary order, so
    # sort them to keep the row order inside each target reproducible.
    txt_files = sorted(
        fichier for fichier in os.listdir(dossier)
        if fichier.endswith(".txt") and "keywords" not in fichier
    )

    keywords_ODD = []

    for txt_file in txt_files:
        # Load the whole file as a single string.
        texte = read_txt_file(os.path.join(dossier, txt_file))

        # NOTE(review): nr_candidates is presumably only used together with
        # use_maxsum=True (left disabled here) — confirm against KeyBERT docs.
        keywords = kw_model.extract_keywords(
            texte,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            nr_candidates=20,
            top_n=10
        )

        # Drop near-duplicate entities (threshold semantics are defined by the
        # project helper — presumably a 0-100 similarity score; verify).
        entities_filtered = filter_similar_entities(keywords, threshold=80)

        # Characters 12-13 of the file name are used as the target identifier
        # (assumes the project's file naming convention — TODO confirm).
        target_id = txt_file[12:14]

        # Keep only keywords scoring strictly above 0.5, tagged with their target.
        keywords_ODD.extend(
            (kw[0], kw[1], target_id) for kw in entities_filtered if kw[1] > 0.5
        )

    # One two-column frame (keywords, scores) per target, laid out side by side.
    df = pd.DataFrame(keywords_ODD, columns=["Mots-cles", "Scores", "Cibles"])
    results_dfs = [
        pd.DataFrame({
            f"Cible {cible}": group["Mots-cles"].values,
            "Scores": group["Scores"].values,
        })
        for cible, group in df.groupby("Cibles")
    ]

    if results_dfs:
        final_results = pd.concat(results_dfs, axis=1)
        display(final_results)
    else:
        # pd.concat raises ValueError on an empty list; report the empty
        # result instead of crashing.
        print(f"Aucun mot-clé avec un score > 0.5 pour l'ODD {odd_number}.")

    print(f"...Fin de traitement de l'ODD {odd_number}\n")


# Run the keyword extraction on the ODD01 folder.
entities_extractions("../MedaDocx/ODD01")


Début de traitement de l'ODD 01...


Unnamed: 0,Cible 01,Scores,Cible 02,Scores.1,Cible 03,Scores.2,Cible 04,Scores.3,Cible 05,Scores.4,Cible 0a,Scores.5,Cible 0b,Scores.6
0,international poverty,0.6882,poverty rate,0.7206,social protection,0.6254,water services,0.5502,affected disasters,0.5608,oda grants,0.6402,monetary poverty,0.6687
1,global poverty,0.6793,poverty line,0.6732,coverage social,0.5864,water sanitation,0.5024,affected people,0.536,income oecd,0.5559,poor monetary,0.571
2,assessing poverty,0.6189,national poverty,0.6481,social insurance,0.5758,land tenure,0.5982,economic loss,0.6234,assistance oda,0.5364,consumption poverty,0.527
3,defining poverty,0.6014,country poverty,0.6478,social protection,0.6265,tenure land,0.5825,loss monetary,0.5053,defines oda,0.5157,poverty determined,0.5266
4,poverty measures,0.5812,assessing poverty,0.6347,social insurance,0.5443,secure landholder,0.5751,sendai framework,0.6154,development welfare,0.5157,poverty levels,0.5135
5,poverty line,0.5672,defining poverty,0.6229,insurance social,0.5235,tenure rights,0.5526,framework indicators,0.54,poverty reduction,0.5094,expenditures governments,0.5055
6,poverty poverty,0.5667,rural poverty,0.6204,,,tenure security,0.5319,agreements sendai,0.5217,expenditure cofog,0.6398,poverty,0.5002
7,value poverty,0.5537,poverty according,0.5986,,,land rights,0.5311,implementation sendai,0.5128,government expenditure,0.6178,,
8,poverty given,0.5527,multidimensional poverty,0.6296,,,secure tenure,0.513,disaster risk,0.6066,expenditure government,0.6007,,
9,world poorest,0.5524,poverty measure,0.6171,,,,,integrate disaster,0.5333,expenditure defined,0.5646,,


...Fin de traitement de l'ODD 01



In [10]:
# if __name__ == "__main__":
#     # Exécution de tous les ODD
#     path = "../MedaDocx"
#     odd_files = os.listdir(path)
#     for odd_file in odd_files:
#         entities_extractions(f"{path}/{odd_file}")
#         print("----------------------------------------------------------------------------------------------------------")
