In [1]:
import pandas as pd
import spacy
import re
from spacy.tokens import Span, Doc
from spacy.matcher import Matcher
from spacy.language import Language
from slugify import slugify

### Load French Cities

This dataset comes from DataGouv https://www.data.gouv.fr/en/datasets/regions-departements-villes-et-villages-de-france-et-doutre-mer/ <br>
As our algorithm concerns SNCF trips, we will check that the destinations are located in Metropolitan France.

- Read dataset to DataFrame
- Filter metropolitan cities, remove islands.

In [2]:
PATH_CITY = 'initial data/cities.csv'

# Read the department code as text so that pandas never infers a numeric dtype for it
# (a numeric dtype would drop leading zeros and break the `.str` accessor below).
france_cities = pd.read_csv(PATH_CITY, dtype={'department_code': str})

# Keep only the rows whose department code is exactly two digits.
# Presumably this drops overseas departments and Corsica (the "remove islands" step) - TODO confirm
# against the dataset. `na=False` keeps rows with a missing code out of the mask instead of
# producing NaN entries that `.loc` cannot use.
cities = france_cities.loc[france_cities['department_code'].str.contains(r"^\d\d$", na=False)]

### Check French City

- Slugify city to match `['slug']` column in dataframe
- The slug is lowercase, with dashes replaced by white space

In [3]:
def check_french_city(city):
    """Tell whether ``city`` is one of the cities kept in the ``cities`` DataFrame.

    The name is slugified (lowercase, words separated by a single space) and
    compared with the ``slug`` column of ``cities``.

    Args:
        city: Name of the city to look up.

    Returns:
        bool: True when at least one row of ``cities`` has this slug.
    """
    slug = slugify(city, lowercase=True, separator=' ')
    # Build the mask from `cities` itself: a mask computed on the unfiltered
    # `france_cities` frame only works as long as both frames share the same index.
    return bool((cities['slug'] == slug).any())

### Matcher's dictionary

Action lemmas (verbs, plus nouns such as `train` and `trajet`) associated with the city of departure.

In [4]:
# Lemmas that introduce a departure city when followed by an adposition and a location.
start_actions = [
    'partir',
    'être',
    'venir',
    'train',
    'trajet',
]

Action verbs associated with the arrival city.

In [5]:
# Lemmas that introduce an arrival city when followed by an adposition and a location.
finish_actions = [
    'rendre',
    'aller',
    'arriver',
]

Prefix for start. This prefix is inserted before the starting city

In [6]:
# Adpositions placed right before the departure city.
start_adps = [
    'de',
    'depuis',
]

Prefix for destination. This prefix is inserted before the destination city.

In [7]:
# Adpositions placed right before the destination city.
destination_adps = [
    'a',
    'à',
    'au',
    'aux',
    'en',
    'vers',
]

### Extract city from Span

Create a matcher with pattern `'ENT_TYPE':'LOC'`
- `LOC` will extract all names that are locations: cities, countries...
- Return city extracted

In [8]:
def extract_cities(span):
    """Return the French city names found among the LOC tokens of ``span``.

    A matcher looks for every run of one or more consecutive tokens whose entity
    type is ``LOC``; each candidate text is kept only if ``check_french_city``
    accepts it.

    Args:
        span: spaCy ``Span`` to search (the callers pass ``doc[start:end]``).

    Returns:
        list[str]: Accepted city names, in the order the matcher reports them.
    """
    # Use the vocabulary of the document being searched instead of the notebook-global
    # `nlp`, so the function does not depend on a variable defined in a later cell.
    city_matcher = Matcher(span.doc.vocab)
    city_matcher.add('CITY', [[{'ENT_TYPE': 'LOC', 'OP': '+'}]])

    found = []
    for _match_id, start, end in city_matcher(span):
        text = span[start:end].text
        # Candidates such as "Lyon -" or "- Lyon" would otherwise be accepted by
        # check_french_city (the dash disappears in the slug), so reject them here.
        if text.startswith('-') or text.endswith('-'):
            continue
        if check_french_city(text):
            found.append(text)

    return found

### Create Spacy Component

Component will be added as pipe to process input text. It will extract the departure and the destination.

- Departure pattern and destination patterns will match each string that contains in order an action, an adposition, and a location.
- To distinguish the city of departure and the city of arrival, the actions change and are defined above.

In [9]:
def lemAdpLoc1Pattern():
    """Build the "lemma + adposition + location" matcher patterns.

    Returns:
        dict: ``LEM_ADP_LOC_START`` holds one pattern per lemma of ``start_actions``,
        ``LEM_ADP_LOC_FINISH`` one pattern per lemma of ``finish_actions``.
    """
    def _patterns_for(lemmas):
        # One three-token pattern per lemma: <lemma> <any adposition> <LOC token>
        return [
            [{'LEMMA': lemma}, {'POS': 'ADP'}, {'ENT_TYPE': 'LOC'}]
            for lemma in lemmas
        ]

    return {
        "LEM_ADP_LOC_START": _patterns_for(start_actions),
        "LEM_ADP_LOC_FINISH": _patterns_for(finish_actions),
    }

def processLemAdpLoc1Pattern(matcher_index, span, pending):
    """Record the city of a "lemma + adposition + location" match in ``pending``.

    Args:
        matcher_index: Name of the rule that matched.
        span: Matched span, searched for city names with ``extract_cities``.
        pending: Trip being assembled (may hold ``'start'`` and/or ``'finish'``).

    Returns:
        dict: The trip to keep assembling. It is ``pending`` itself, updated in place,
        unless the slot to fill was already taken, in which case a fresh dict holding
        only the new city is returned. ``pending`` is returned untouched when the rule
        is not handled here or when no French city is found in the span.
    """
    slot_by_rule = {
        "LEM_ADP_LOC_START": 'start',
        "LEM_ADP_LOC_FINISH": 'finish',
    }
    slot = slot_by_rule.get(matcher_index)
    if slot is None:
        return pending

    cities = extract_cities(span)
    # The matched LOC may not be a known French city (e.g. a foreign city);
    # indexing cities[0] would then raise IndexError.
    if not cities:
        return pending

    # The slot is already filled: a new trip begins, drop what was collected so far.
    if slot in pending:
        pending = {}
    pending[slot] = cities[0]

    return pending

In [10]:
def adpLoc1AdpLoc2Pattern():
    """Build the "location + adposition + location" matcher patterns.

    Returns:
        dict: ``ADP_LOC_ADP_LOC_START`` holds one pattern per entry of
        ``destination_adps``, ``ADP_LOC_ADP_LOC_FINISH`` one pattern per entry of
        ``start_adps``.
    """
    def _patterns_for(adpositions):
        # <LOC token> <adposition with this lemma> <LOC token>
        return [
            [{'ENT_TYPE': 'LOC'}, {'LEMMA': adp, 'POS': 'ADP'}, {'ENT_TYPE': 'LOC'}]
            for adp in adpositions
        ]

    return {
        "ADP_LOC_ADP_LOC_START": _patterns_for(destination_adps),
        "ADP_LOC_ADP_LOC_FINISH": _patterns_for(start_adps),
    }

def processAdpLoc1AdpLoc2Pattern(matcher_index, span, pending):
    """Fill both ends of the trip from a "location + adposition + location" match.

    For ``ADP_LOC_ADP_LOC_START`` the first city is the departure and the second the
    destination; for ``ADP_LOC_ADP_LOC_FINISH`` the order is reversed. Nothing is
    written unless exactly two cities are extracted from the span.

    Args:
        matcher_index: Name of the rule that matched.
        span: Matched span, searched for city names with ``extract_cities``.
        pending: Trip being assembled; updated in place.

    Returns:
        dict: ``pending``.
    """
    if matcher_index not in ("ADP_LOC_ADP_LOC_START", "ADP_LOC_ADP_LOC_FINISH"):
        return pending

    found = extract_cities(span)
    if len(found) != 2:
        return pending

    first, second = found
    if matcher_index == "ADP_LOC_ADP_LOC_START":
        pending['start'], pending['finish'] = first, second
    else:
        pending['start'], pending['finish'] = second, first

    return pending

In [11]:
def loc1Loc2Pattern():
    """Build the pattern for two locations side by side, optionally separated by a dash.

    Returns:
        dict: A single ``LOC_LOC`` rule holding one pattern.
    """
    first_location = {'ENT_TYPE': 'LOC'}
    optional_dash = {'ORTH': '-', 'OP': '?'}
    second_location = {'ENT_TYPE': 'LOC'}

    return {"LOC_LOC": [[first_location, optional_dash, second_location]]}

def processLoc1Loc2Pattern(matcher_index, span, pending):
    """Fill both ends of the trip from a ``LOC_LOC`` match.

    When ``extract_cities`` yields exactly two or exactly four names, the first one
    becomes the departure and the last one the destination; any other count leaves
    ``pending`` unchanged.

    Args:
        matcher_index: Name of the rule that matched.
        span: Matched span, searched for city names with ``extract_cities``.
        pending: Trip being assembled; updated in place.

    Returns:
        dict: ``pending``.
    """
    if matcher_index != "LOC_LOC":
        return pending

    found = extract_cities(span)
    if len(found) in (2, 4):
        pending['start'] = found[0]
        pending['finish'] = found[-1]

    return pending

In [12]:
def specificPattern():
    """Build the "keyword + location" patterns for explicit departure/arrival wording.

    Returns:
        dict: ``SPECIFIC_START`` (lemmas 'départ', 'début') and ``SPECIFIC_FINISH``
        (lemmas 'arrivée', 'fin'), each keyword being followed by a LOC token.
    """
    def _keyword_then_location(keywords):
        return [[{'LEMMA': keyword}, {'ENT_TYPE': 'LOC'}] for keyword in keywords]

    return {
        "SPECIFIC_START": _keyword_then_location(['départ', 'début']),
        "SPECIFIC_FINISH": _keyword_then_location(['arrivée', 'fin']),
    }

def processSpecificPattern(matcher_index, span, pending):
    """Record the city of a ``SPECIFIC_START`` / ``SPECIFIC_FINISH`` match in ``pending``.

    Args:
        matcher_index: Name of the rule that matched.
        span: Matched span, searched for city names with ``extract_cities``.
        pending: Trip being assembled (may hold ``'start'`` and/or ``'finish'``).

    Returns:
        dict: The trip to keep assembling. It is ``pending`` itself, updated in place,
        unless the slot to fill was already taken, in which case a fresh dict holding
        only the new city is returned. ``pending`` is returned untouched when the rule
        is not handled here or when no French city is found in the span.
    """
    slot_by_rule = {
        "SPECIFIC_START": 'start',
        "SPECIFIC_FINISH": 'finish',
    }
    slot = slot_by_rule.get(matcher_index)
    if slot is None:
        return pending

    cities = extract_cities(span)
    # The matched LOC may not be a known French city; cities[0] would raise IndexError.
    if not cities:
        return pending

    # The slot is already filled: a new trip begins, drop what was collected so far.
    if slot in pending:
        pending = {}
    pending[slot] = cities[0]

    return pending

In [13]:
def stepsPattern():
    """Build the pattern for intermediate stops: 'par' followed by one or more LOC tokens.

    Returns:
        dict: A single ``STEPS`` rule holding one pattern.
    """
    via_keyword = {'LEMMA': 'par'}
    one_or_more_locations = {'ENT_TYPE': 'LOC', 'OP': '+'}

    return {"STEPS": [[via_keyword, one_or_more_locations]]}

def processStepsPattern(matcher_index, span, pending):
    """Store the cities of a ``STEPS`` match under ``pending['steps']``.

    Args:
        matcher_index: Name of the rule that matched.
        span: Matched span, searched for city names with ``extract_cities``.
        pending: Trip being assembled; updated in place.

    Returns:
        dict: ``pending``.
    """
    if matcher_index != "STEPS":
        return pending

    pending['steps'] = extract_cities(span)
    return pending

In [14]:
def findSteps(doc):
    """Collect the intermediate cities announced by the word 'par' in ``doc``.

    For every " par " in the text, the words that follow are read one by one:
    a word accepted by ``check_french_city`` is a step, a coordinating conjunction
    ('et', 'ou', ...) lets the enumeration go on, and any other word found among the
    tokens of ``doc`` ends it.

    Args:
        doc: spaCy ``Doc`` (its ``text`` and the ``text`` / ``pos_`` of its tokens are read).

    Returns:
        list[str]: Step cities in order of appearance, each listed once.
    """
    steps = []
    text = doc.text

    # Every occurrence of the standalone word 'par' opens a candidate list of steps
    for match in re.finditer(" par ", text):
        # After strip() the tail always begins with 'par', so skip that first word
        words = text[match.start():].strip().split(' ')[1:]

        for word in words:
            # Trim punctuation around the word only, so that hyphenated names such as
            # "Saint-Etienne" stay intact while "Lyon," or a lone "-" are cleaned up.
            label = word.strip(',.-')

            if not label or label == 'par':
                continue

            if check_french_city(label):
                # A later " par " rescans text already covered by an earlier one:
                # do not report the same city twice.
                if label not in steps:
                    steps.append(label)
                continue

            # Look the word up among the tokens by text. A whitespace-split position is
            # not a token index, since spaCy also splits punctuation and elisions.
            token = next((tok for tok in doc if tok.text == label), None)

            # A CCONJ such as 'et' / 'ou' chains cities; any other known token ends the
            # enumeration. Words that match no token are skipped, as before.
            if token is not None and token.pos_ != 'CCONJ':
                break

    return steps

In [15]:
# Merge the rules of every pattern family into one {rule name: list of patterns} mapping
patterns = {
    **lemAdpLoc1Pattern(),
    **adpLoc1AdpLoc2Pattern(),
    **loc1Loc2Pattern(),
    **specificPattern(),
    **stepsPattern(),
}

In [16]:
# Match processors, one per pattern family.
# Each takes (matcher_index, span, pending), reacts only to its own rule names,
# and returns the pending trip dict.
processors = [
    processLemAdpLoc1Pattern,
    processAdpLoc1AdpLoc2Pattern,
    processLoc1Loc2Pattern,
    processSpecificPattern,
    processStepsPattern
]

In [17]:
@Language.component("extract_targets")
def extract_targets(doc):
    """Extract the trips (departure -> destination) and the intermediate steps of ``doc``.

    Every rule in ``patterns`` is run over the document; each match is handed to all
    ``processors``, which fill a pending trip. As soon as the pending trip has both a
    ``'start'`` and a ``'finish'`` it is stored and a new one is begun.

    NOTE(review): this returns a tuple instead of the ``Doc``, which the rest of the
    notebook relies on (``predictions, steps = nlp(text)``). spaCy components are
    expected to return the ``Doc``, so this component presumably has to stay last in
    the pipeline - confirm before adding other pipes after it.

    Args:
        doc: spaCy ``Doc`` produced by the earlier pipeline components.

    Returns:
        tuple: ``(targets, steps)`` where ``targets`` is a list of dicts holding at
        least ``'start'`` and ``'finish'``, and ``steps`` is the list returned by
        ``findSteps``.
    """
    # Same vocabulary as the one used below to decode match ids
    matcher = Matcher(doc.vocab)
    for rule_name, rule_patterns in patterns.items():
        matcher.add(rule_name, rule_patterns)

    targets = []
    pending = {}
    for match_id, start, end in matcher(doc):
        matcher_index = doc.vocab.strings[match_id]

        for processor in processors:
            pending = processor(matcher_index, doc[start:end], pending)

        # A trip is complete only when both ends are known. Counting keys is not enough:
        # a 'steps' entry next to a lone 'start' would also add up to two.
        if 'start' in pending and 'finish' in pending:
            targets.append(pending)
            pending = {}

    # Specific search for steps
    steps = findSteps(doc)

    return (targets, steps)

In [18]:
# Load the spaCy pipeline 'fr_core_news_lg' (the package must be installed beforehand)
nlp = spacy.load('fr_core_news_lg')
# Add the component registered above under the name 'extract_targets'.
# From here on, nlp(text) returns what that component returns: (targets, steps).
nlp.add_pipe('extract_targets')

<function __main__.extract_targets(doc)>

In [19]:
# Sample French sentences used to try the pipeline in the cell below
datasets = [
    "Je pars de Lyon pour arriver à Toulouse.",
    "Demain, j'irai à Lyon, mais aujourd'hui, je suis à Montpellier.",
    "Je voudrais aller à Montpellier depuis Toulouse.",
    "Je vais aller de Paris à Lyon.",
    "Demain, je fais le trajet Lyon - Marseille.",
    "Demain, je ferai le trajet de Paris à Marseille.",
    "Je compte prendre un train depuis Lille, et avec un peu de chance si la SNCF n'est pas en retard, j'arriverai à Toulouse.",
    "Ville de départ Toulouse et ville d'arrivée Lille.",
    "Trajet Paris à Marseille.",
    "Trajet Paris depuis Marseille.",
    "Aller de Toulouse à Lille demain.",
    "Aller a Toulouse depuis Paris demain.",
    "Départ Toulouse vers Perpignan.",
    "Je suis à Toulouse, je voudrais aller demain à Perpignan.",
    "Trajet de Lille à Toulouse ce soir.",
    "Je voudrais aller de Toulouse à Paris après demain.",
    "Trajet Lyon Toulouse aujourd'hui.",
    "Je voudrais me rendre à Toulouse demain, je suis à Nantes aujourd'hui.",
    "Trains disponible pour aller à Marseille en venant de Lille.",
    "Lille Marseille.",
    "Lille en venant de Marseille.",
    "Je vais de Toulouse à Marseille en passant par Perpignan Montpellier Nice."
]

In [20]:
# Run the pipeline on every sample sentence and print what was extracted
for doc in datasets:
    print(doc)
    # The custom component makes nlp() return (trips, steps)
    predictions, steps = nlp(doc)

    if not predictions:
        print('Prediction: []')

    for prediction in predictions:
        print('Prediction : {0} -> {1}'.format(prediction['start'], prediction['finish']))

    if steps:
        print('Etapes: {0}'.format(steps))

    print('')

Je pars de Lyon pour arriver à Toulouse.
Prediction : Lyon -> Toulouse

Demain, j'irai à Lyon, mais aujourd'hui, je suis à Montpellier.
Prediction : Montpellier -> Lyon

Je voudrais aller à Montpellier depuis Toulouse.
Prediction : Toulouse -> Montpellier

Je vais aller de Paris à Lyon.
Prediction : Paris -> Lyon

Demain, je fais le trajet Lyon - Marseille.
Prediction : Lyon -> Marseille

Demain, je ferai le trajet de Paris à Marseille.
Prediction : Paris -> Marseille

Je compte prendre un train depuis Lille, et avec un peu de chance si la SNCF n'est pas en retard, j'arriverai à Toulouse.
Prediction : Lille -> Toulouse

Ville de départ Toulouse et ville d'arrivée Lille.
Prediction : Toulouse -> Lille

Trajet Paris à Marseille.
Prediction : Paris -> Marseille

Trajet Paris depuis Marseille.
Prediction : Marseille -> Paris

Aller de Toulouse à Lille demain.
Prediction : Toulouse -> Lille

Aller a Toulouse depuis Paris demain.
Prediction : Paris -> Toulouse

Départ Toulouse vers Perpignan