# Classifier to NER

This notebook first uses a classifier model to automatically classify the data, and then applies the corresponding NER model to the selected texts to optimize the retrieval process.

## Specific case

In [None]:
import spacy
import pandas as pd
import json
import os

In [None]:
# Base directory onto which the model, data and output paths used in this
# notebook are joined: the parent of the current working directory ("..").
ROOT_PATH = os.pardir

In [None]:
def classify(classifier_model, data, label):
    """
    This function takes in text stored in a csv file and splits it into two lists
    according to the scores predicted by a spaCy text classifier.

    A text goes to the first list when the score of `label` is strictly greater
    than the score of the category named "N" + label; otherwise it goes to the
    second list. A category that is absent from `doc.cats` counts as 0.0, so
    ties (including both categories missing) fall into the second list.

    args:
        classifier_model: the path to the model used to classify the text
        data: the path to the csv file containing the text (read from its "text" column)
        label: the name of the positive category; the opposing category is f"N{label}"

    returns:
        A tuple (ppv_list, nppv_list): the texts classified as `label` and the
        remaining texts, each in the order they appear in the csv file.
    """

    # Load the model and the texts to classify
    nlp_classify = spacy.load(classifier_model)
    texts = pd.read_csv(data)["text"].tolist()

    negative_label = f"N{label}"

    # Store the texts in the corresponding lists
    ppv_list = []
    nppv_list = []

    # Each text goes through the model exactly once: the earlier version ran a
    # first full pass whose results were discarded, doubling the inference time.
    for text in texts:
        cats = nlp_classify(text).cats
        if cats.get(label, 0.0) > cats.get(negative_label, 0.0):
            ppv_list.append(text)
        else:
            nppv_list.append(text)
    return ppv_list, nppv_list

def NER(ner_model, list:list):
    """
    This function runs a spaCy NER model over a list of texts and collects, for
    each text, the entities the model predicted.

    args:
        ner_model: the path to the model used to predict the NER labels
        list: the list of text to be predicted

    returns:
        A list with one tuple (text, {"label": entities}) per input text, in input
        order. `entities` is a list of dicts holding the "start" and "end"
        character offsets of each entity and its label under the key "labels".
    """
    nlp_ner = spacy.load(ner_model)

    def _entities(text):
        # One dict per entity found in `text` by the NER model
        return [
            {"start": span.start_char, "end": span.end_char, "labels": span.label_}
            for span in nlp_ner(text).ents
        ]

    # Pair every text with its predicted entities
    return [(text, {"label": _entities(text)}) for text in list]

def open_json(json_file:str):
    """
    This function reads a UTF-8 encoded JSON file and returns its decoded content.

    args:
        json_file: the path to the JSON file to read
    """
    with open(json_file, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())

def save_NER_to_json(list:list, output_path:str):
    """
    This function writes the given data to a UTF-8 encoded JSON file, indented
    by 4 spaces and with non-ASCII characters kept as-is (not escaped).

    args:
        list: the data to serialize
        output_path: the path of the JSON file to write (overwritten if it exists)
    """
    with open(output_path, mode="w", encoding="utf-8") as sink:
        json.dump(list, sink, indent=4, ensure_ascii=False)
    
def json_file_length(json_file:str):
    """
    This function prints how many top-level elements a JSON file contains.

    args:
        json_file: the path to the UTF-8 encoded JSON file to inspect
    """
    with open(json_file, mode="r", encoding="utf-8") as handle:
        content = json.load(handle)
    element_count = len(content)
    print("Le dossier JSON contient", element_count, "éléments.")

def NER_pipeline(classifier_model, ner_model, data_input, output_path,label):
    """
    This function chains the classification and NER steps: it classifies the
    texts of a CSV file, applies the NER model to the texts classified as
    `label`, writes the predictions to a JSON file and prints how many entries
    that file contains. Texts not classified as `label` are left out.

    Args:
        classifier_model: The path to the model used to classify the text.
        ner_model: The path to the model used to predict the NER labels.
        data_input: The path to the CSV file containing the text.
        output_path: The path to the JSON file to store the predicted NER labels.
        label: The category name passed to the classifier step to select texts.
    """
    # Keep only the texts the classifier assigned to `label`
    selected_texts, _ = classify(classifier_model, data_input, label)

    # Predict the entities of the selected texts and persist them as JSON
    save_NER_to_json(NER(ner_model, selected_texts), output_path)

    # Report how many entries were written
    json_file_length(output_path)

Class...

In [None]:
# Labels to process: each one has its own classifier and NER model on disk.
# NOTE(review): binding this list to `all` shadows the builtin `all()` for the
# rest of the session. The name is kept here because other cells may read it;
# consider renaming it (e.g. to LABELS) once that has been checked.
all = [
    'OUV', 'INT', 'CAD', 'NOUV', 'NCAD', 'AG', 'AI', 'TOUS',
    'AG OUV', 'AG INT', 'AG CAD', 'AI OUV', 'AI INT', 'AI CAD',
    'NOUV AG', 'NCAD AG', 'NOUV AI', 'NCAD AI',
    'ATOT', 'ATOT OUV', 'ATOT INT', 'ATOT CAD',
    'PPV', 'PPVm', 'DATE',
]

# Create the output directory from ROOT_PATH so it always matches OUTPUT_PATH.
# os.makedirs replaces the former `!mkdir ... 2> /dev/null` shell escape: it
# works outside IPython and without a POSIX shell, also creates missing parent
# directories, and only ignores the "already exists" case instead of every error.
os.makedirs(os.path.join(ROOT_PATH, "data/predicted_json"), exist_ok=True)

# The input CSV is the same for every label, so its path is built once.
DATA_PATH = os.path.join(ROOT_PATH, r"data/processed/data449_cleaned.csv")

for LABEL in all:
    # Directory and file names use underscores where the label has spaces
    LABEL_UNDERSCORE = LABEL.replace(" ", "_")
    CLASSIFIER_PATH = os.path.join(ROOT_PATH, f"models/classifyer/{LABEL_UNDERSCORE}/model-best")
    NER_PATH = os.path.join(ROOT_PATH, f"models/NER/{LABEL_UNDERSCORE}/model-best")
    OUTPUT_PATH = os.path.join(ROOT_PATH, f"data/predicted_json/labeled_data_{LABEL_UNDERSCORE}_class_to_NER.json")
    NER_pipeline(
        classifier_model=CLASSIFIER_PATH,
        ner_model=NER_PATH,
        data_input=DATA_PATH,
        output_path=OUTPUT_PATH,
        label=LABEL,
    )