In [None]:
import spacy
from enum import Enum
from spacy.symbols import PROPN, NOUN, CCONJ, ADP, VERB
import numpy as np
from spacy import displacy


# Evaluation sentences for getDirection(): each entry is a pair
# (request sentence, expected list of city names).
# NOTE(review): most rows list the cities as [departure, ..., arrival], but a few
# (e.g. "aller à Montpellier depuis Toulouse") follow sentence order instead --
# confirm the intended convention before using these rows as strict assertions.
texts = [
    ("Je pars de Paris pour arriver à Toulouse.", ['Paris', 'Toulouse']),
    ("Demain, j'irai à Lyon, mais aujourd'hui, je suis à Montpellier.", ['Lyon', 'Montpellier']),
    ("Je voudrais aller à Montpellier depuis Toulouse.", ['Montpellier', 'Toulouse']),
    ("Je vais aller de Toulouse à Lyon.", ['Toulouse', 'Lyon']),
    ("Demain, je fais le trajet Paris - Marseille-Saint-Charles", ['Paris', 'Marseille']),
    ("Demain, je ferai le trajet de Paris à Marseille", ['Paris', 'Marseille']),
    ("Je compte prendre un train depuis Paris, et avec un peu de chance si la SNCF n'est pas en retard, j'arriverai à Toulouse", ['Paris', 'Toulouse']),
    ("Ville de départ Toulouse et ville d'arrivée Paris", ['Toulouse', 'Paris']),
    ("Trajet Paris à Marseille", ['Paris', 'Marseille']),
    ("Trajet Paris depuis Marseille", ['Paris', 'Marseille']),
    # Fixed: the expectation used to be ['Paris', 'Paris'] although the sentence
    # goes from Toulouse to Paris.
    ("aller de Toulouse à Paris demain", ['Toulouse', 'Paris']),
    ("aller a Toulouse depuis Paris demain", ['Toulouse', 'Paris']),
    ("départ Toulouse vers Perpignan", ['Toulouse', 'Perpignan']),
    ("je suis à Toulouse , je voudrais aller demain à Perpignan", ['Toulouse', 'Perpignan']),
    ("trajet de Paris a Toulouse ce soir", ['Paris', 'Toulouse']),
    ("je voudrais aller de Toulouse à Paris après demain", ['Toulouse', 'Paris']),
    ("trajet Paris Toulouse aujourd'hui", ['Paris', 'Toulouse']),
    ("je voudrais me rendre a Toulouse demain , je suis à Nantes aujourd'hui", ['Nantes', 'Toulouse']),
    ("trains disponible pour aller à marseille en venant de Paris", ['Paris', 'marseille']),
    ("Paris Marseille", []),
    # Fixed: the expectation used to be ['Paris', 'Paris']; "en venant de Marseille"
    # makes Marseille the departure (same pattern as the "en venant de Paris" row).
    ("Paris en venant de Marseille", ['Marseille', 'Paris']),
    ("Je veux aller à Paris après être allé à Mulhouse depuis Lyon", ["Lyon", "Mulhouse", "Paris"]),
]


In [None]:
class RelDirection(Enum):
    """Role a cue word gives to a location: none, departure (START) or arrival (DEST)."""

    NONE, START, DEST = 1, 2, 3

class RelStrength(Enum):
    """Confidence attached to a cue word: NONE, WEAK or STRONG."""

    NONE, WEAK, STRONG = 1, 2, 3

In [None]:
class Word:
    """A single cue lemma together with the direction and strength it implies."""

    def __init__(self, word: str, direction: RelDirection, strength: RelStrength):
        # word: lemma to compare against; direction/strength: what a match means.
        self.word, self.direction, self.strength = word, direction, strength

class LinkedWord:
    """A two-part cue (head lemma + attached fixed lemma) with its direction and strength."""

    def __init__(self, word: str, fixedWord: str, direction: RelDirection, strength: RelStrength):
        # word / fixedWord: the two lemmas that must both match.
        self.word, self.fixedWord = word, fixedWord
        # direction / strength: what a match of the pair means.
        self.direction, self.strength = direction, strength


In [None]:
# Conjunction cues: getDirection() compares these lemmas with the CCONJ-tagged
# children of a location token. Every entry is a STRONG cue.
CCONJ_Relation = [
    Word(lemma, direction, RelStrength.STRONG)
    for lemma, direction in (
        # Start
        ("depuis", RelDirection.START),
        # Destination
        ("puis",   RelDirection.DEST),
        ("et",     RelDirection.DEST),
        ("enfin",  RelDirection.DEST),
    )
]

# Noun cues: getDirection() compares these lemmas with the head of a location
# token when that head is tagged NOUN.
NOUN_Relation = [
    # Start
    Word(word="provenance",  direction=RelDirection.START, strength=RelStrength.STRONG),
    # Destination
    Word(word="direction",   direction=RelDirection.DEST,  strength=RelStrength.WEAK),
    Word(word="destination", direction=RelDirection.DEST,  strength=RelStrength.WEAK),
]

# Two-word cues: an ADP child of the location token (word) that itself has a
# child with the 'fixed' dependency (fixedWord). getDirection() tries these
# before the single-word ADP_Relation table below.
ADP_FIXED_Relation = [
    # Start
    LinkedWord(word="à",    fixedWord="partir",      direction=RelDirection.START, strength=RelStrength.STRONG),
    LinkedWord(word="en",   fixedWord="partant",     direction=RelDirection.START, strength=RelStrength.STRONG),
    LinkedWord(word="mais", fixedWord="aujourd'hui", direction=RelDirection.START, strength=RelStrength.STRONG),
    # Destination
    LinkedWord(word="à",    fixedWord="destination", direction=RelDirection.DEST,  strength=RelStrength.STRONG),
    LinkedWord(word="en",   fixedWord="direction",   direction=RelDirection.DEST,  strength=RelStrength.WEAK),
]

# Single-word cues compared with the lemma of each child of the location token:
# the start cues are all STRONG, the destination cues are all WEAK.
ADP_Relation = [
    # Start
    Word(lemma, RelDirection.START, RelStrength.STRONG)
    for lemma in ("de", "du", "des", "depuis")
] + [
    # Destination ("par" as in "passer par Paris")
    Word(lemma, RelDirection.DEST, RelStrength.WEAK)
    for lemma in ("à", "au", "aux", "dans", "vers", "en", "par")
]

# "partir" is ambiguous: "partir de ..." "partir à ..."
# Marker cues: compared with the children of the location's head verb that
# carry the 'mark' dependency and are tagged ADP.
VERB_MARK_Relation = [
    Word(word="après",  direction=RelDirection.START, strength=RelStrength.WEAK),
    Word(word="avant",  direction=RelDirection.DEST,  strength=RelStrength.STRONG),
    Word(word="de",     direction=RelDirection.START, strength=RelStrength.STRONG),
    Word(word="depuis", direction=RelDirection.START, strength=RelStrength.STRONG),
]

# Verb cues: compared with the lemma of the location token's head.
VERB_Relation = (
    # Start
    [Word("décoller", RelDirection.START, RelStrength.WEAK)]
    + [
        Word(lemma, RelDirection.START, RelStrength.STRONG)
        for lemma in ("passer", "être")
    ]
    # Destination
    + [
        Word(lemma, RelDirection.DEST, RelStrength.STRONG)
        for lemma in (
            "arriver",
            "aller",
            "visiter",
            "atterrir",
            "découvrir",
            "voyager",
            "rendre",
        )
    ]
)

In [None]:
# Loaded spaCy pipelines keyed by model name, so repeated getDirection() calls
# reuse the pipeline instead of reloading it from disk for every sentence.
_SPACY_MODELS = {}


def _load_model(name="fr_core_news_sm"):
    """Return the spaCy pipeline `name`, loading it only on first use."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def _match_location_tokens(doc, locations):
    """Pick one token of `doc` for each entity text in `locations`.

    For every location the first not-yet-used PROPN token whose text is
    contained in the location text is taken; if there is none, the same search
    is done on NOUN tokens. Locations without any match are reported and
    skipped, so the result may be shorter than `locations`.
    """
    matched = []
    used_indices = set()  # token.i of tokens already assigned to a location
    for location in locations:
        chosen = None
        for wanted_pos in (PROPN, NOUN):  # priority: PROPN, then NOUN
            for token in doc:
                if (
                    token.pos == wanted_pos
                    and token.i not in used_indices
                    and token.text in location  # substring match on the entity text
                ):
                    chosen = token
                    break
            if chosen is not None:
                break
        if chosen is None:
            print(f"Localization {location} not found")
            continue
        used_indices.add(chosen.i)
        matched.append(chosen)
    return matched


def _collect_weights(token):
    """Gather every cue (Word / LinkedWord) that applies to a location token.

    CCONJ, NOUN, ADP_FIXED and ADP cues are tried in that order, each only when
    nothing was found before it. The verb-marker and verb cues are then tried
    as long as at most one cue has been collected. A weak DEST default is
    returned when nothing matched at all, so the result is never empty.
    """
    found = []
    parent = token.head

    # CCONJ: coordinating conjunctions attached to the location.
    for child in token.children:
        if child.pos == CCONJ:
            for ref in CCONJ_Relation:
                if ref.word == child.lemma_:
                    print(f"Found CCONJ: {ref.word} - {ref.strength.name} - {ref.direction.name}")
                    found.append(ref)
                    break

    # NOUN: the location depends on a cue noun (lower priority than CCONJ).
    if not found and parent.pos == NOUN:
        for ref in NOUN_Relation:
            if ref.word == parent.lemma_:
                print(f"Found NOUN: {ref.word} - {ref.strength.name} - {ref.direction.name}")
                found.append(ref)
                break

    # ADP_FIXED: two-word prepositions (ADP child + its 'fixed' child).
    if not found:
        for child in token.children:
            if child.pos != ADP:
                continue
            for subChild in child.children:
                if subChild.dep_ != 'fixed':
                    continue
                for ref in ADP_FIXED_Relation:
                    if ref.word == child.lemma_ and ref.fixedWord == subChild.lemma_:
                        print(f"Found ADP_FIXED: {ref.word} {ref.fixedWord} - {ref.strength.name} - {ref.direction.name}")
                        found.append(ref)
                        break

    # ADP: single prepositions, matched on the lemma of any child.
    if not found:
        for child in token.children:
            for ref in ADP_Relation:
                if ref.word == child.lemma_:
                    print(f"Found ADP: {ref.word} - {ref.strength.name} - {ref.direction.name}")
                    found.append(ref)
                    break

    # VERB_MARK: markers of the head verb; may complement one earlier cue.
    if len(found) <= 1 and parent.pos == VERB:
        for child in parent.children:
            if child.dep_ == 'mark' and child.pos == ADP:
                for ref in VERB_MARK_Relation:
                    if ref.word == child.lemma_:
                        print(f"Found VERB_MARK: {ref.word} - {ref.strength.name} - {ref.direction.name}")
                        found.append(ref)
                        break

    # VERB: lemma of the head itself; may complement one earlier cue.
    if len(found) <= 1:
        for ref in VERB_Relation:
            if ref.word == parent.lemma_:
                print(f"Found VERB: {ref.word} - {ref.strength.name} - {ref.direction.name}")
                found.append(ref)
                break

    # Default: no cue at all, treat the location as a weak destination.
    if not found:
        print(f"Using default weight")
        found.append(Word("default", RelDirection.DEST, RelStrength.WEAK))

    return found


def _select_weight(found):
    """Return the first STRONG cue of `found`, or its first cue if none is STRONG."""
    for candidate in found:
        if candidate.strength == RelStrength.STRONG:
            return candidate
    return found[0]


def _order_tokens(weighedTokens):
    """Order (token, weight) pairs as departures first, then arrivals.

    The resulting order is: strong START, other START, non-strong DEST, strong
    DEST. Within each group the input order is kept. Pairs whose direction is
    neither START nor DEST are dropped.
    """
    start_strong, start_other, dest_other, dest_strong = [], [], [], []

    # First pass for direction: START
    for token, weight in weighedTokens:
        if weight.direction != RelDirection.START:
            continue
        label = (weight.strength.name, weight.direction.name, token)
        if weight.strength == RelStrength.STRONG:
            print(f"First pass Ordered token : {label} for weighedStrengh Strong")
            start_strong.append(token)
        else:
            print(f"First pass Ordered token : {label} for weighedStrengh Weak")
            start_other.append(token)

    # Second pass for direction: DEST
    for token, weight in weighedTokens:
        if weight.direction != RelDirection.DEST:
            continue
        label = (weight.strength.name, weight.direction.name, token)
        print(f"Second pass Ordered token : {label}")
        if weight.strength == RelStrength.STRONG:
            dest_strong.append(token)
        else:
            dest_other.append(token)

    return start_strong + start_other + dest_other + dest_strong


def getDirection(request):
    """Extract the cities of a French travel request, departures first.

    The request is parsed with the `fr_core_news_sm` spaCy pipeline. Entities
    labelled LOC or GPE are mapped to tokens, each token is weighted from the
    cue tables (conjunctions, nouns, prepositions, verbs) and the tokens are
    ordered from departure to arrival. Progress is logged with print().

    Args:
        request: the sentence to analyse.

    Returns:
        The list of location token texts in travel order. An empty list is
        returned when fewer than two locations are detected (previously this
        path returned None implicitly).
    """
    print(f"Request: {request}")
    location_final = []
    doc = _load_model()(request)

    # For debugging, print per-token attributes or render the parse, e.g.:
    #   for token in doc:
    #       print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_)
    #   displacy.serve(doc, style="dep")

    # Extract locations
    locations = [ent.text for ent in doc.ents if ent.label_ in ('LOC', 'GPE')]
    print(f"Locations found: {locations}")

    if len(locations) <= 1:
        print("Error while finding locations")
        return location_final

    # Get a token for each location (unmatched locations are dropped).
    tokens = _match_location_tokens(doc, locations)
    print(f"Token: {tokens}")

    # Weight tokens to prepare ordering
    weighedTokens = []
    for position, token in enumerate(tokens, start=1):
        print(f"Token #{position} : {token.lemma_}")
        selectedWeight = _select_weight(_collect_weights(token))
        print(f"Using: {selectedWeight.word}")
        print("---------------")
        weighedTokens.append((token, selectedWeight))

    # Order tokens and populate the final list of city names
    location_final = [token.text for token in _order_tokens(weighedTokens)]
    print(f"Final Location: {location_final}")

    return location_final

In [None]:
def NLP():
    """Run getDirection() on every sample of `texts` and print result vs. expectation."""
    for index, (sentence, expectedResult) in enumerate(texts):
        result = getDirection(sentence)
        # The report is printed after getDirection()'s own log output.
        report = (
            f"\n\n\n===================================    # {index}    ===================================",
            f"Request Sentence :    {sentence}",
            f"result:    {result}",
            f"expected: {expectedResult}",
            "================================================================================================\n\n\n",
        )
        print("\n".join(report))
NLP()