https://www.youtube.com/watch?v=9mXoGxAn6pM

https://github.com/dreji18/NER-Training-Spacy-3.0/blob/main/NER%20Training%20with%20Spacy%20v3%20Notebook.ipynb

https://datacorner.fr/spacy/

In [112]:
import pandas as pd
import spacy

# Load the "fr_core_news_sm" French pipeline; the conversion cell below calls
# nlp.make_doc() on it to tokenize each sample text.
nlp = spacy.load("fr_core_news_sm")


In [113]:
def create_dataset(path, separator, scale, destination="DESTINATION", depart="DEPART", error="ERROR"):
    """Build shuffled train/test NER samples from a CSV file.

    The CSV must provide a ``TEXT`` column plus the columns named by
    ``destination``, ``depart`` and ``error``. Every row becomes one sample
    of the form ``(text, {"entities": [(start, end, label), ...]})``.

    A row is annotated only when its error flag equals ``False`` and both
    the destination and the departure values occur in the text. In that case
    the entity list is always ``[DESTINATION span, DEPART span]``, in that
    order. Every other row gets an empty entity list.

    Args:
        path: Path of the CSV file.
        separator: Field separator passed to ``pandas.read_csv``.
        scale: Fraction of the rows used for training; the first
            ``int(len(rows) * scale)`` shuffled rows go to the train set,
            the remaining rows to the test set.
        destination: Name of the column holding the destination value.
        depart: Name of the column holding the departure value.
        error: Name of the column holding the error flag.

    Returns:
        A ``(dataset, datasetTest)`` tuple of sample lists.
    """
    frame = pd.read_csv(path, sep=separator)
    # Shuffle the rows so that the train/test split is random.
    frame = frame.sample(frac=1).reset_index(drop=True)
    # Number of leading rows that belong to the training set.
    train_size = int(len(frame) * scale)

    dataset = []
    datasetTest = []
    rows = zip(frame.TEXT, frame[destination], frame[depart], frame[error])
    for index, (text, destination_value, depart_value, is_error) in enumerate(rows):
        entities = {'entities': []}
        # "== False" is kept on purpose: only an explicit False flag (not a
        # missing value) marks the row as usable.
        if is_error == False:  # noqa: E712
            destination_span = _locate_entity(text, destination_value, "DESTINATION")
            depart_span = _locate_entity(text, depart_value, "DEPART")
            # Annotate only when both values were found; a failed lookup must
            # never be turned into a (-1, ...) span.
            if destination_span is not None and depart_span is not None:
                entities['entities'].append(destination_span)
                entities['entities'].append(depart_span)
        # ("text", {"entities": [(0, 5, "DESTINATION"), (6, 11, "DEPART")]})
        cell = text, entities
        if index < train_size:
            dataset.append(cell)
        else:
            datasetTest.append(cell)
    return dataset, datasetTest


def _locate_entity(text, value, label):
    """Return ``(start, end, label)`` for the first occurrence of value in text, or None."""
    # Empty cells are read by pandas as NaN (a float); treat them, and empty
    # strings, as "not present" instead of searching for them.
    if not isinstance(value, str) or not value:
        return None
    start = text.find(value)
    if start == -1:
        return None
    return start, start + len(value), label

def add_ner_to_nlp(train_data):
    """Create a blank French pipeline whose NER component knows the training labels.

    Args:
        train_data: Iterable of ``(text, annotations)`` pairs where
            ``annotations`` is a dict with an optional ``"entities"`` list of
            ``(start, end, label)`` tuples.

    Returns:
        The blank ``fr`` pipeline with an ``ner`` component on which every
        label found in ``train_data`` has been registered.
    """
    # Start from an empty pipeline that contains only the NER component.
    nlp = spacy.blank('fr')
    # Use the component instance that is actually attached to the pipeline:
    # in spaCy v3 add_pipe() builds and returns it, whereas a component made
    # with create_pipe() stays detached and labels added to it are lost.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.add_pipe('ner')

    for _text, annotations in train_data:
        for _start, _end, label in annotations.get('entities', []):
            ner.add_label(label)
    return nlp

Convert the data to the new spaCy v3 binary format (`.spacy` files)

In [114]:
from tqdm import tqdm
from spacy.tokens import DocBin

# Split the CSV into 80% training samples and 20% test samples.
dataset, datasetTest = create_dataset("../asset/SpeechDestination.csv", ",", 0.8)

# Serialize each split into its own DocBin file.
for samples, output_path in ((dataset, "./train.spacy"), (datasetTest, "./test.spacy")):
    db = DocBin()
    for text, annot in tqdm(samples):  # samples are (text, {"entities": [...]}) pairs
        doc = nlp.make_doc(text)  # tokenize only, no pipeline component is run
        ents = []
        for start, end, label in annot["entities"]:
            # Map the character offsets onto token boundaries.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is not None:
                ents.append(span)
            else:
                print("Skipping entity")
        doc.ents = ents  # attach the gold entities to the doc
        db.add(doc)
    db.to_disk(output_path)

100%|██████████| 68/68 [00:00<00:00, 1752.35it/s]
100%|██████████| 17/17 [00:00<00:00, 2133.80it/s]


#### Entraînement du modèle :

Récupérer le fichier de configuration spacy :

https://spacy.io/usage/training#config

Puis, depuis le répertoire où ce fichier est enregistré, exécuter cette commande :

python3 -m spacy init fill-config base_config.cfg config.cfg

Lancer l'entraînement :

python3 -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy 



##### Évaluer le modèle :

python3 -m spacy evaluate output/model-best/ test.spacy

ents_p, ents_r and ents_f are the precision, recall and F-score for the NER task.
tag_acc (named tags_acc in spaCy v2) is the part-of-speech tagging accuracy.
token_acc is the tokenization accuracy.

In [115]:
# Evaluate the trained model on the held-out samples: a sample is "Accurate"
# when the model predicts exactly one DEPART and one DESTINATION entity and
# both texts equal the annotated ones.
nlp = spacy.load(r"./output/model-best")
score = {'Accurate': 0, 'Error': 0, 'Total': 0}
for text, annotations in datasetTest:
    doc = nlp(text)
    gold = annotations.get('entities', [])
    score['Total'] += 1

    if len(gold) < 2:
        # No usable annotation (entities are stored as [destination, depart]).
        # NOTE(review): such samples are always counted as errors, even when
        # the model correctly predicts no entity - confirm this is intended.
        print("Error")
        score['Error'] += 1
    else:
        # Recover the annotated words from their character offsets.
        destination = text[gold[0][0]:gold[0][1]]
        depart = text[gold[1][0]:gold[1][1]]
        result = [(depart, 'DEPART'), (destination, 'DESTINATION')]
        # Texts of the predicted entities, grouped by label.
        departPrediction = [ent.text for ent in doc.ents if ent.label_ == 'DEPART']
        destinationPrediction = [ent.text for ent in doc.ents if ent.label_ == 'DESTINATION']

        # Exactly one prediction per label, each matching the annotation.
        if departPrediction == [depart] and destinationPrediction == [destination]:
            score['Accurate'] += 1
        else:
            print("Error" + str(result) + " " + str(departPrediction) + " " + str(destinationPrediction))
            score['Error'] += 1

    spacy.displacy.render(doc, style="ent", jupyter=True)

# Avoid a ZeroDivisionError when the test set is empty.
accuracy = score['Accurate'] / score['Total'] if score['Total'] else 0.0

print('Accurate results : ' + str(score['Accurate']))
print('Error results : ' + str(score['Error']))
print('Total results : ' + str(score['Total']))
print('Accuracy : ' + str(accuracy))


Error




Error[('Montpellier', 'DEPART'), ('Toulouse', 'DESTINATION')] [] []


Accurate results : 15
Error results : 2
Total results : 17
Accuracy : 0.8823529411764706


In [116]:
# Run the model on a few sample sentences and display the detected entities in Jupyter.
for sentence in (
    "Quel train dois-je prendre si je veux aller à Montpellier en partant de Toulouse?",
    "Il fait combien dehors Alexis?",
    "Je voudrais partir pour Paris en partant de Lille",
):
    doc = nlp(sentence)
    spacy.displacy.render(doc, style="ent", jupyter=True)
