In [1]:
from ctransformers import AutoModelForCausalLM
from huggingface_hub import login
from transformers import AutoTokenizer, pipeline
from keybert.llm import TextGeneration
from keybert import KeyLLM
import os, re, time, pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def login_hf(file_path="C:/Users/jeanm/Desktop/Ensg/Semestre3/ProjetRecherche/Ressources",
             line_index=3):
    """Read the Hugging Face API token from a local key file.

    The first file (in sorted order) whose name contains ``"api_keys"`` inside
    ``file_path`` is opened as UTF-8 text, split on newlines, and the line at
    position ``line_index`` is returned with surrounding whitespace removed.

    Args:
        file_path: Directory holding the key file. The default is a
            machine-specific absolute path kept for backward compatibility;
            pass your own directory on any other machine.
        line_index: Zero-based index of the line that holds the token.

    Returns:
        The token string found on the requested line.

    Raises:
        IndexError: If no ``api_keys`` file exists in ``file_path`` or the
            file has no line at ``line_index``.
        FileNotFoundError: If ``file_path`` itself does not exist
            (raised by ``os.listdir``).
    """
    # Sorting makes the choice deterministic: os.listdir order is arbitrary.
    candidates = sorted(name for name in os.listdir(file_path) if "api_keys" in name)
    if not candidates:
        raise IndexError(
            f"no file with 'api_keys' in its name was found in {file_path!r}"
        )

    key_file = os.path.join(file_path, candidates[0])
    with open(key_file, 'r', encoding='utf-8') as handle:
        lines = handle.read().split("\n")

    try:
        token_line = lines[line_index]
    except IndexError:
        raise IndexError(
            f"{key_file!r} has {len(lines)} line(s); no line at index {line_index}"
        ) from None

    # Stray spaces around the token would make authentication fail.
    return token_line.strip()
    
# Authenticate this session with the Hugging Face Hub using the token returned
# by login_hf(); the token is passed straight through and never bound to a name.
login(token=login_hf())

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\jeanm\.cache\huggingface\token
Login successful


In [3]:

# Read the full content of a text file
def read_txt_file(file_path):
    """Return the whole content of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return content

# Split a text into its non-blank lines (one chunk per paragraph line; no token-based sizing is applied)
def chunk_text(text):
    """Split ``text`` on newline characters and keep the non-blank pieces.

    A piece is kept when it contains at least one non-whitespace character.
    Kept pieces are returned unmodified (not stripped), in their original order.
    """
    return [line for line in text.split("\n") if line.strip()]

# Extract keywords from each chunk with the LLM
def _parse_generated_keywords(pieces):
    """Flatten the raw strings returned for one document into clean keywords.

    Every piece is split on newlines, the ``'* '`` bullet marker is removed,
    surrounding whitespace is stripped, and entries left empty are discarded.
    """
    parsed = []
    for piece in pieces:
        for line in piece.split('\n'):
            keyword = line.replace('* ', '').strip()
            if keyword:
                parsed.append(keyword)
    return parsed


def extract_keywords_from_chunks(chunks, test_model):
    """Ask a local Mistral GGUF model for SDG-related keywords, chunk by chunk.

    Args:
        chunks: List of text chunks (one prompt is sent per chunk).
        test_model: File name of the GGUF weights inside the
            ``TheBloke/Mistral-7B-Instruct-v0.1-GGUF`` repository.

    Returns:
        A list with one entry per chunk; each entry is the list of keyword
        strings parsed from the model's answer (possibly empty).
    """
    # Load the model and the tokenizer (CPU only: gpu_layers=0).
    model = AutoModelForCausalLM.from_pretrained(
        "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
        model_file=test_model,
        model_type="mistral",
        gpu_layers=0,
        hf=True
    )

    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

    generator = pipeline(
        model=model,
        tokenizer=tokenizer,
        task='text-generation',
        max_new_tokens=50,
        repetition_penalty=1.1
    )

    # Prompt template; KeyBERT's TextGeneration substitutes the [DOCUMENT] tag
    # with each document it receives.
    keyword_prompt = """
    [INST]
    
    I have the following document:
    - [DOCUMENT]
    
    Please extract only the keywords related to the Sustainable Development Goals (SDGs) that are explicitly mentioned in this document. 
    The keywords should consist of 2 to 3 words and should be meaningful within the context of this document. 
    Ensure that the keywords are derived solely from the text provided and do not include any external references or interpretations. 
    Return the keywords in a structured and readable format, without adding any extra explanations or phrases such as:
    "Here are the keywords present in the document."
    [/INST]
    
    """

    # The wrappers are loop-invariant, so build them once instead of per chunk.
    llm = TextGeneration(generator, prompt=keyword_prompt)
    kw_model = KeyLLM(llm)

    keywords = []
    total = len(chunks)
    for count, chunk in enumerate(chunks, start=1):
        print(f"Traitement du paragraphe {count} sur {total}")
        extracted_keywords = kw_model.extract_keywords([chunk])

        # extract_keywords returns one list per input document. Use every entry
        # of that list, not just the first: KeyBERT splits the generated text
        # on commas, so reading only index 0 drops everything after a comma.
        per_document = extracted_keywords[0] if extracted_keywords else []
        keywords.append(_parse_generated_keywords(per_document))
        print(f"Fin de traitement du paragraghe {count}")
    return keywords


def clean_keywords(keywords, max_words=2):
    """Normalise, de-duplicate and filter raw keyword strings.

    Each keyword is normalised by removing leading ``-``/space bullet
    characters, removing a leading ``"<digits>."`` numbering prefix, and
    lower-casing. Duplicates are then dropped (first occurrence wins, order
    preserved), the generic terms ``'sdg'``, ``'sdgs'`` and
    ``'sustainable development'`` are removed by exact match, and only
    keywords made of exactly ``max_words`` whitespace-separated words are kept.

    Args:
        keywords: Iterable of raw keyword strings.
        max_words: Required word count. Despite the name, the comparison is an
            equality test, not an upper bound.

    Returns:
        List of cleaned keyword strings.
    """
    excluded = {'sdg', 'sdgs', 'sustainable development'}

    normalised = []
    for kw in keywords:
        text = kw.lstrip('- ').strip()
        # Strip list numbering such as "3. " BEFORE de-duplicating, so that
        # "1. net worth" and "- net worth" collapse into a single entry.
        text = re.sub(r"^\d+\.\s*", "", text).strip()
        normalised.append(text.lower())

    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique = dict.fromkeys(normalised)

    return [
        kw for kw in unique
        if kw not in excluded and len(kw.split()) == max_words
    ]



def start(txt_file, model):
    """Run the full keyword-extraction pipeline on one text file.

    Args:
        txt_file: Either a sequence of paths (only the first one is
            processed) or a single path given as ``str`` / ``os.PathLike``.
        model: GGUF model file name, forwarded to
            ``extract_keywords_from_chunks``.

    Returns:
        A DataFrame with a ``"Keywords Detected"`` column (every raw keyword)
        and a ``"Keywords Cleaned"`` column (cleaned keywords aligned from row
        0, padded with NaN below the last cleaned keyword).

    Raises:
        IndexError: If ``txt_file`` is an empty sequence.
    """
    # Resolve the input path before doing any work, so a missing input fails
    # immediately with an explicit message.
    if isinstance(txt_file, (str, os.PathLike)):
        source_path = txt_file
    elif len(txt_file) == 0:
        raise IndexError("txt_file is empty: no input text file was provided")
    else:
        source_path = txt_file[0]

    start_time = time.perf_counter()
    print(f"Debut de traitement avec le mod√®le {model}...\n")

    # Load the text file.
    document_content = read_txt_file(source_path)

    # Split the text into paragraphs.
    chunks = chunk_text(document_content)

    # Extract the keywords of every chunk.
    keywords = extract_keywords_from_chunks(chunks, model)

    # Flatten the per-chunk keyword lists into a single list.
    list_keywords = [keyword for sublist in keywords for keyword in sublist]

    # Clean the keywords.
    cleaned_keywords = clean_keywords(list_keywords)

    # One row per detected keyword; cleaned keywords are aligned by position.
    keywords_df = pd.DataFrame(list_keywords, columns=["Keywords Detected"])
    # dtype is explicit so an empty result does not change the column type.
    keywords_df["Keywords Cleaned"] = pd.Series(cleaned_keywords, dtype="object")

    duree = time.perf_counter() - start_time
    print(f"\n...Fin de traitement avec le mod√®le {model}\nDur√©e d'ex√©cution : {duree:.4f} secondes.\n")

    return keywords_df



In [4]:
# Run the pipeline with the Q4_K_M quantisation on the "Metadata" text file(s)
# found in the current working directory.
model1 = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
txt_file = [
    fichier
    for fichier in os.listdir()
    if "Metadata" in fichier and fichier.lower().endswith('.txt')
]
start(txt_file, model1)

Debut de traitement avec le mod√®le mistral-7b-instruct-v0.1.Q4_K_M.gguf...



Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 1001.98it/s]
CTransformersModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From üëâv4.50üëà onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Traitement du paragraphe 1 sur 5
Fin de traitement du paragraghe 1
Traitement du paragraphe 2 sur 5
Fin de traitement du paragraghe 2
Traitement du paragraphe 3 sur 5
Fin de traitement du paragraghe 3
Traitement du paragraphe 4 sur 5
Fin de traitement du paragraghe 4
Traitement du paragraphe 5 sur 5
Fin de traitement du paragraghe 5

...Fin de traitement avec le mod√®le mistral-7b-instruct-v0.1.Q4_K_M.gguf
Dur√©e d'ex√©cution : 177.4727 secondes.



Unnamed: 0,Keywords Detected,Keywords Cleaned
0,Keywords related to SDGs:,fiscal sustainability
1,- Fiscal sustainability,net worth
2,- Net worth,economic classification
3,- Economic classification,social protection
4,- Functions of government,final outlays
5,- Health,old age
6,- Education,
7,- Social protection,
8,Keywords:,
9,- SDGs,


In [5]:
# model2 = "mistral-7b-instruct-v0.1.Q5_K_M.gguf"
# start(txt_file, model2)