In [54]:
import pandas as pd
import spacy
import random
from spacy.training.example import Example

# NOTE(review): this pretrained French pipeline is rebound by
# `nlp = add_ner_to_nlp(dataset)` in a later cell before any visible use;
# confirm it is still needed (loading it requires the model to be installed).
nlp = spacy.load("fr_core_news_sm")



In [55]:
# TODO : FULL MAJ EVERYWHERE

def _find_span(text, value, label):
    """Return ``(start, end, label)`` for the first occurrence of ``value`` in ``text``.

    Returns ``None`` when ``value`` is not a non-empty string (for example the
    NaN pandas produces for an empty CSV cell) or when it does not occur in
    ``text``.
    """
    if not isinstance(value, str) or not value:
        return None
    start = text.find(value)
    if start == -1:
        return None
    return start, start + len(value), label


def create_dataset(path, separator, destination="DESTINATION", depart="DEPART", error="ERROR"):
    """Build NER training pairs from a CSV file.

    The CSV must contain a ``TEXT`` column plus the three columns named by
    ``destination``, ``depart`` and ``error``.

    Args:
        path: anything accepted by ``pandas.read_csv`` (path or file-like object).
        separator: column separator passed to ``pandas.read_csv``.
        destination: name of the column holding the destination substring.
        depart: name of the column holding the departure substring.
        error: name of the column flagging rows that must not be annotated.

    Returns:
        A list of ``(text, {'entities': [...]})`` tuples, one per CSV row.
        A row is annotated with a ``"DESTINATION"`` span followed by a
        ``"DEPART"`` span only when it is not flagged as an error and BOTH
        substrings are found in the text; otherwise its entity list is empty.
    """
    csv = pd.read_csv(path, sep=separator)
    dataset = []
    rows = zip(csv["TEXT"], csv[destination], csv[depart], csv[error])
    for text, destination_value, depart_value, is_error in rows:
        entities = {'entities': []}
        if not is_error:
            destination_span = _find_span(text, destination_value, "DESTINATION")
            depart_span = _find_span(text, depart_value, "DEPART")
            # Annotate only when both spans exist: a missing destination used
            # to yield a bogus span starting at -1.
            if destination_span is not None and depart_span is not None:
                entities['entities'].append(destination_span)
                entities['entities'].append(depart_span)
        # e.g. ("texte", {"entities": [(0, 5, "DESTINATION"), (6, 11, "DEPART")]})
        dataset.append((text, entities))
    return dataset

def add_ner_to_nlp(train_data):
    """Create a blank French pipeline whose only component is an NER pipe.

    Every entity label found in ``train_data`` is registered on the NER
    component that is part of the returned pipeline.

    Args:
        train_data: iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is a dict whose optional ``'entities'`` entry is a
            list of ``(start, end, label)`` tuples.

    Returns:
        The newly created pipeline.
    """
    # Start from an empty French pipeline.
    nlp = spacy.blank('fr')
    # Use the component instance that lives in the pipeline: ``add_pipe``
    # builds and returns it, whereas a separate ``create_pipe`` call produces
    # a second, detached instance whose labels never reach the pipeline.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner')
    else:
        ner = nlp.get_pipe('ner')

    # Register each distinct label once, in order of first appearance.
    seen_labels = set()
    for _text, annotations in train_data:
        for _start, _end, label in annotations.get('entities', []):
            if label not in seen_labels:
                seen_labels.add(label)
                ner.add_label(label)
    return nlp

In [56]:
# train data
def train_nlp(nlp, dataset, n_iter=30, drop=0.2):
    """Train the ``'ner'`` component of ``nlp`` on ``dataset``.

    Args:
        nlp: pipeline containing a ``'ner'`` component.
        dataset: sequence of ``(text, annotations)`` pairs, with
            ``annotations`` in the dict form accepted by ``Example.from_dict``.
            The sequence itself is left untouched.
        n_iter: number of passes over the dataset.
        drop: dropout rate forwarded to ``nlp.update``.

    Returns:
        The same ``nlp`` object, updated in place.
    """
    # Shuffle a private copy so the caller's list keeps its order; otherwise
    # re-running a notebook cell would see a differently ordered ``dataset``.
    training_pairs = list(dataset)
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    # Only the NER component is enabled while training.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.initialize()
        for _ in range(n_iter):
            # Change the order of the examples before every iteration.
            random.shuffle(training_pairs)
            losses = {}
            for text, annotations in training_pairs:
                # Tokenize only, then pair the Doc with its reference annotations.
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # One example per update; ``losses`` accumulates over the pass.
                nlp.update([example], sgd=optimizer, drop=drop, losses=losses)
            # print('losses -', losses)
    return nlp


In [57]:
# TODO
def test_nlp(nlp, model):
    for text, annotations in model:
        doc = nlp(text)
        for ent in doc.ents:
            print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

In [58]:
def _entity_texts(nlp, text, label):
    """Run ``nlp`` on ``text`` and return the texts of the entities tagged ``label``."""
    return [ent.text for ent in nlp(text).ents if ent.label_ == label]


def get_destination(nlp, text) -> list:
    """Return the text of every ``"DESTINATION"`` entity found in ``text``.

    Args:
        nlp: callable turning a text into a document that exposes ``ents``.
        text: sentence to analyse.

    Returns:
        A list of entity strings in document order (empty when none is found).
    """
    return _entity_texts(nlp, text, "DESTINATION")


def get_depart(nlp, text) -> list:
    """Return the text of every ``"DEPART"`` entity found in ``text``.

    Args:
        nlp: callable turning a text into a document that exposes ``ents``.
        text: sentence to analyse.

    Returns:
        A list of entity strings in document order (empty when none is found).
    """
    return _entity_texts(nlp, text, "DEPART")


In [59]:
# Build the (text, annotations) training pairs from the comma-separated CSV.
dataset = create_dataset(path="../asset/SpeechDestination.csv", separator=",")
# Create the NER pipeline from those pairs, then train it on the same pairs.
nlp = add_ner_to_nlp(train_data=dataset)
nlp = train_nlp(nlp=nlp, dataset=dataset)



('Je veux aller de Marseille à Paris.', {'entities': [(29, 34, 'DESTINATION'), (17, 26, 'DEPART')]})
("Il faudrait que j'aille en vacance à Montpellier mais je part de Brest.", {'entities': [(37, 48, 'DESTINATION'), (65, 70, 'DEPART')]})
('Je vais rejoindre ma famille à Nice et je suis actuellement à Toulouse.', {'entities': [(31, 35, 'DESTINATION'), (62, 70, 'DEPART')]})
('Je vais bien mais je ne sais pas quel train prendre pour aller de Montpellier à Marseille.', {'entities': [(80, 89, 'DESTINATION'), (66, 77, 'DEPART')]})
('Je ne sais pas ce que je cherche.', {'entities': []})
("C'est une catastrophe mon train est annulé. Je devais faire Toulouse à Montpellier demain soir mais je ne sais pas comment faire.", {'entities': [(71, 82, 'DESTINATION'), (60, 68, 'DEPART')]})
("Comment dois-je m'y prendre pour aller de Lille à Carcassone?", {'entities': [(50, 60, 'DESTINATION'), (42, 47, 'DEPART')]})
('Quel est le meilleur itinéraire pour aller de Auxerre à Béziers en passant par Nantes?', 

In [61]:
# TODO : add more sentences, and test the model with sentences from the csv



# text = "Je veux aller à Paris en partant de Lyon"
# text = "Je veux aller à Lyon en partant de Paris"
# text = "Quel train dois-je prendre si je veux aller à Montpellier en partant de Toulouse?"
# Sentence used as a quick manual check of the trained pipeline.
text = "Je veux partir de Montpellier pour aller à Lyon"
departs = get_depart(nlp=nlp, text=text)
destinations = get_destination(nlp=nlp, text=text)

# test_nlp(nlp, model)

# Show the extracted departure and arrival entities.
print(f'Départ : {departs} \nArrivée : {destinations}')

Départ : ['Montpellier'] 
Arrivée : ['Lyon']
