In [1]:
import spacy
import pandas as pd
import psycopg2
import difflib
from typing import List

In [2]:
# Load spaCy's small French pipeline. The extraction loop further down relies on it
# for named entities (`doc.ents`) and token lemmas (`token.lemma_`).
nlp = spacy.load('fr_core_news_sm')



In [3]:
# Reference list of known station/city names: first column of city_name.csv, lower-cased.
# NOTE(review): with pandas' default header inference the first CSV row is treated as a
# header, not as a city -- confirm the file really has a header row.
# NOTE(review): an empty cell would stay NaN after `.str.lower()` and break the string
# methods called on each city in the extraction loop -- presumably the file has none.
french_cities_df = pd.read_csv(r'city_name.csv')
french_cities = french_cities_df.iloc[:, 0].str.lower()

In [4]:
# Hand-written sample sentences fed to the extraction loop. The spelling and casing
# are left exactly as typed because they are the input under test, not prose.
texts = [
    "Je voudrais un trajet de paris vers brest.",
    "Je veut arriver à nantes en partant de lyon.",
    "je voudrais aller de Paris à Nantes.",
    "Il me faudrait aller de Rennes à Bordeaux.",
    "Je dois faire le trajet de Brest à Strasbourg.",
    "Il faudrait que j'aille de Tours à Paris Gare du Nord.",
    "J'aimerai faire Toulouse Nice.",
    "Comment aller de Marseille à Cannes?",
    "Il me faudrait un billet Angers Cahors.",
    "Comment aller à Nantes en partant de Rennes?",
    "Comment faire pour aller à Strasbourg depuis Toulouse?",
    "Comment se procurer un billet pour partir de Nice et aller à Brest?",
    "Comment voyager de Paris à Nantes?",
    "Comment faire le trajet Challans Orléans?",
    "Je suis à Nantes, je veux aller à Paris",
    "Je suis à Bordeaux, je veux aller à Toulouse",
    "Je voudrais un trajet de Brest à la Roche-sur-Yon",
    "Je veux aller de Paris Gare du Nord à Nantes",
    "Comment aller à Paris Gare du Nord en partant de Nantes?",
    "Comment aller à Paris Gare de Lyon en partant de Nantes?",
    "Comment aller à Nantes en partant de Paris Gare de Lyon?",
    "Comment aller à Nantes en partant de Paris Gare du Nord?",
    "j'aimerai faire un trajet Paris-Montparnasse vers Nantes",
    "j'aimerai faire un trajet Nantes vers Paris-Montparnasse",
    "Aller de Brest à Nantes",
    "En partant de Brest je veux aller à Nantes.",
    "Je veux aller à Paris",
    "Je veux aller de Nantes au Havre",
    "J'aimerai aller à Paris Gare Montparnasse en partant de Nantes",
    "Je veux aller de Paris Gare MontParnasse à Nantes",
    "Je veux aller de Nantes à Paris Gare de Lyon",
    "Je veux aller de Paris Gare du Nord à Nantes"
]

In [5]:
# Labelled evaluation set. Column 0 holds sentences and column 1 a validity flag.
# Un-commenting the `texts = ...` line swaps the hand-written samples for this data set;
# `validity_list` is only read by the commented-out evaluation code in the extraction loop.
texts_df = pd.read_csv(r'augmented_data.csv')
#texts = texts_df.iloc[:, 0].str.lower()
validity_list = texts_df.iloc[:, 1]

In [6]:
# Lemmas that mark a travel intent; each token's `lemma_` is compared against this
# list in the extraction loop.
travel_words = [
    "aller",
    "voyage",
    "destination",
    "emmener",
    "vers",
    "arriver",
    "partir",
]

In [7]:
# Words that, when placed right before a place name, mark it as the trip's origin.
prefix_origin_words = [
    "de",
    "depuis",
    "du",
]

In [8]:
# Words that, when placed right before a place name, mark it as the trip's destination.
prefix_destination_words = [
    "à",
    "a",
    "au",
    "vers",
    "-",
    "jusque",
    "jusqu'à",
    "jusqu'a",
]

In [9]:
# Every origin prefix followed by every destination prefix, as one new list.
prefix_words = [*prefix_origin_words, *prefix_destination_words]

In [10]:
# Function words that must never be taken for a place name: entities equal to one of
# them are dropped, and the word following a prefix is skipped when it is one of them.
wrong_prefix_words = [
    "le",
    "la",
    "du",
    "de",
]

In [11]:
# Evaluation counters, all starting at zero. They are only incremented by the
# commented-out evaluation code at the end of the extraction loop.
total_texts = total_valid_texts = total_invalid_texts = 0
total_valid_texts_with_cities_found = total_valid_texts_with_cities_not_found = 0
total_invalid_texts_with_cities_found = total_invalid_texts_with_cities_not_found = 0

In [12]:
# Sentences the evaluation got wrong, kept for inspection in the last cells:
# valid sentences with no result, and invalid sentences that still produced one.
valid_texts_with_cities_not_found, invalid_texts_with_cities_found = [], []

In [13]:
def clean_text(text: str) -> str:
    """Normalise a sentence or a word before matching.

    Commas are removed everywhere. Full stops, question marks and whitespace are
    removed from both ends only, so inner dots such as abbreviations are kept.
    The result is lower-cased.

    The end characters are stripped as one set rather than one after the other:
    stripping "." and then "?" left the dot in inputs such as "Paris.?", and a
    space before a closing "?" (usual in French typography) survived as a
    trailing blank that later produced an empty token.

    Args:
        text: Raw sentence or word.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    without_commas = text.replace(",", "")
    #text = text.replace("'", " ")
    return without_commas.strip(".? \t\r\n\u00a0").lower()

In [14]:
def split_text(text: str) -> List[str]:
    """Clean a sentence with `clean_text` and split it into word tokens.

    Splitting is done on any run of whitespace. Splitting on a single space
    produced empty-string tokens whenever the sentence contained repeated or
    leading/trailing blanks, which shifted token positions and broke checks
    that compare the indexes of neighbouring words.

    Args:
        text: Raw sentence.

    Returns:
        The lower-cased tokens, never containing empty strings; an empty list
        for an empty or blank sentence.
    """
    return clean_text(text).split()

In [15]:
def is_before(text: str, first_words: List[str], second_word: str) -> bool:
    """Tell whether one of `first_words` directly precedes `second_word` in `text`.

    "Directly precedes" means the word is the token just before the place, or
    just before an article ("la", "le", "l'") that itself is just before the
    place -- e.g. "à" precedes "rochelle" in "aller à la rochelle".

    `second_word` is located as follows:
      * if it is one of the tokens of the sentence, that token is used;
      * otherwise, if it appears as a substring of the cleaned sentence and
        contains a space or a hyphen (multi-word name such as
        "paris gare du nord"), its first piece is used. A hyphen split takes
        precedence over a space split when the name contains both.
      * otherwise the answer is False.

    Every occurrence of the place and of the prefix is examined, and positions
    are never looked up with a negative index. Looking only at first occurrences
    missed sentences where the prefix or the article appears more than once, and
    indexing `position - 1` for a place at position 0 wrapped around to the last
    token of the sentence.

    Args:
        text: Sentence to inspect (cleaned again internally via `split_text`).
        first_words: Candidate preceding words, compared to whole tokens.
        second_word: Lower-cased place name to look for.

    Returns:
        True if any of `first_words` directly precedes the place, else False.
    """
    articles = ("la", "le", "l'")

    tokens = split_text(text)
    # Same normalisation as the tokens, so the substring test below cannot
    # disagree with the token test because of casing or punctuation.
    normalized_text = " ".join(tokens)

    if second_word in tokens:
        anchor = second_word
    elif second_word in normalized_text:
        # Multi-word place: anchor on its first piece.
        pieces = []
        if " " in second_word:
            pieces = second_word.split(" ")
        if "-" in second_word:
            pieces = second_word.split("-")
        if not pieces or pieces[0] not in tokens:
            return False
        anchor = pieces[0]
    else:
        return False

    for position, token in enumerate(tokens):
        if token != anchor or position == 0:
            # Nothing can precede the very first token.
            continue

        previous = tokens[position - 1]
        if previous in first_words:
            return True

        # Allow one article between the prefix and the place.
        if previous in articles and position >= 2 and tokens[position - 2] in first_words:
            return True

    return False

In [16]:
def clean_city(city: str) -> str:
    """Remove a leading French article from a place name.

    Only an article at the very start of the name ("la ", "le " or "l' ") is
    dropped. Removing these substrings anywhere in the string mangled names in
    which a word merely ends in "la" or "le" before a space, e.g.
    "ville la grand" became "vilgrand".

    Args:
        city: Place name, expected to be lower-cased already.

    Returns:
        The name without its leading article and without surrounding blanks.
    """
    name = city.strip()
    for article in ("la ", "le ", "l' "):
        if name.startswith(article):
            name = name[len(article):]
    return name.strip()

In [17]:
# Rule-based extraction of the origin and destination stations of every sentence in
# `texts`. For each sentence the loop:
#   1. collects candidate place strings (`locations`) from spaCy entities, then from the
#      word following a prefix, then from capitalised words;
#   2. collects every known station whose name contains a candidate (`text_cities`);
#   3. assigns stations to `origins` / `destinations` through a cascade of rules, each
#      rule only running when the previous ones produced nothing;
#   4. narrows down lists with several stations using the other words of the sentence;
#   5. prints the result.
for text_index, text in enumerate(texts):

    doc = nlp(text)

    text_cities = []        # known station names matching any candidate place
    text_travel_words = []  # tokens of the sentence whose lemma is a travel word
    locations = []          # candidate place strings, in order of discovery

    # --- Candidates from spaCy named entities -------------------------------------
    for ent in doc.ents:
        if ent.text.lower() not in wrong_prefix_words:
            locations.append(clean_text(ent.text.lower()))
        # NOTE(review): stations are collected even for entities rejected just above
        # (e.g. an entity equal to "de") -- confirm this is intended.
        # The name is cleaned once per entity instead of once per station.
        ent_name = clean_text(ent.text)
        ent_name_hyphenated = ent_name.replace(" ", "-")
        text_cities.extend([french_city for french_city in french_cities
                            if ent_name in french_city or ent_name_hyphenated in french_city])
        if len(ent.text.split(" ")) > 1 and not text_cities:
            # The multi-word entity matched nothing as a whole: try each of its words.
            for word in ent.text.split(" "):
                word_name = clean_text(word)
                text_cities.extend([french_city for french_city in french_cities if word_name in french_city])

    for token in doc:
        if token.lemma_ in travel_words:
            text_travel_words.append(token.text)

    text_split = split_text(text)

    # The tokens do not change below, so prefix presence is computed once per sentence.
    has_origin_prefix = bool(set(text_split).intersection(prefix_origin_words))
    has_destination_prefix = bool(set(text_split).intersection(prefix_destination_words))

    # Alias kept for the commented-out debug prints.
    loca = locations

    # --- Fallback 1: take the word that follows each prefix ------------------------
    if len(locations) <= 1:
        if has_origin_prefix and has_destination_prefix:
            for index, word in enumerate(text_split):
                if word in prefix_words:
                    if len(text_split) > index + 1:
                        # Skip one article/preposition between the prefix and the place.
                        if text_split[index + 1] in wrong_prefix_words and len(text_split) > index + 2:
                            locations.append(text_split[index + 2])
                        else:
                            locations.append(text_split[index + 1])
            for location in locations:
                location_name = clean_text(location)
                text_cities.extend([french_city for french_city in french_cities if location_name in french_city])

    # --- Fallback 2: take capitalised words ----------------------------------------
    if len(locations) <= 1:
        # Lower the sentence-initial letter so the first word is not mistaken for a
        # proper noun. Slicing keeps this safe for an empty sentence.
        temp_text = text[:1].lower() + text[1:]
        temp_text_split = temp_text.split(" ")
        for tts in temp_text_split:
            # `isupper` on the first character rejects empty tokens, digits and
            # punctuation. Comparing the character with its upper-cased self accepted
            # them, so a lone "-" or "?" became a location and matched large parts of
            # the station list.
            if tts[:1].isupper() and clean_text(tts) not in locations:
                tts_name = clean_text(tts)
                locations.append(tts_name)
                text_cities.extend([french_city for french_city in french_cities if tts_name in french_city])
        if len(locations) == 2 and is_before(clean_text(text), [locations[1]], locations[0]):
            locations.reverse()

    # Remove duplicates while keeping the order of first appearance.
    locations = list(dict.fromkeys(locations))
    text_cities = list(dict.fromkeys(text_cities))
    text_travel_words = list(dict.fromkeys(text_travel_words))

    origins = []
    destinations = []

    # --- Rule 1: both an origin and a destination prefix are present ---------------
    if has_origin_prefix and has_destination_prefix:
        for location in locations:
            if is_before(clean_text(text), prefix_origin_words, clean_city(location)):
                origins.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(location) or clean_city(text_city).startswith(location.replace(" ", "-"))])
            if is_before(clean_text(text), prefix_destination_words, clean_city(location)):
                destinations.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(location) or clean_city(text_city).startswith(location.replace(" ", "-"))])

    # --- Rule 2: one entity holding two words ("Toulouse Nice") --------------------
    # `locations` may be empty when no candidate was found at all, hence the guard
    # before reading its first element.
    if not origins and not destinations and locations and len(locations[0].split(" ")) > 1:
        if is_before(clean_text(text), [clean_city(locations[0].split(" ")[0])], clean_city(locations[0].split(" ")[1])):
            origins.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(locations[0].split(" ")[0])])
            destinations.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(locations[0].split(" ")[1])])

    # --- Rule 3: exactly two places, at most one kind of prefix ---------------------
    if not origins and not destinations and len(locations) == 2:
        if (has_origin_prefix or has_destination_prefix) and text_split.count("à") < 2 and text_split.count("a") < 2:
            # 3.1: decide from whichever prefix precedes each place.
            for location in locations:
                if is_before(clean_text(text), prefix_origin_words, clean_city(location)):
                    origins.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(location) or clean_city(text_city).startswith(location.replace(" ", "-"))])
                if is_before(clean_text(text), prefix_destination_words, clean_city(location)):
                    destinations.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(location) or clean_city(text_city).startswith(location.replace(" ", "-"))])
        else:
            # 3.2: decide from the position of the first place relative to a travel word.
            for travel_word in text_travel_words:
                city = locations[0]
                if len(locations[0].split(" ")) > 1:
                    city = locations[0].split(" ")[0]
                if travel_word in text_split and clean_city(city) in text_split and (text_split.index(travel_word) > text_split.index(clean_city(city))):
                    origins.extend([text_city for text_city in text_cities if text_city.startswith(locations[0]) or text_city.startswith(locations[0].replace(" ", "-"))])
                    destinations.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(locations[1]) or clean_city(text_city).startswith(locations[1].replace(" ", "-"))])
                else:
                    origins.extend([text_city for text_city in text_cities if text_city.startswith(locations[1]) or text_city.startswith(locations[1].replace(" ", "-"))])
                    destinations.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(locations[0]) or clean_city(text_city).startswith(locations[0].replace(" ", "-"))])

    # --- Rule 4: last resort, first place is the origin and second the destination --
    if not origins and not destinations and len(locations) == 2:
        origins.extend([text_city for text_city in text_cities if text_city.startswith(locations[0]) or text_city.startswith(locations[0].replace(" ", "-"))])
        destinations.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(locations[1]) or clean_city(text_city).startswith(locations[1].replace(" ", "-"))])

    # When a place is itself an exact station name, keep only the stations close to it.
    if len(locations) == 2:
        if locations[0] in origins:
            origins = difflib.get_close_matches(locations[0], origins)
        if locations[0] in destinations:
            destinations = difflib.get_close_matches(locations[0], destinations)
        if locations[1] in origins:
            origins = difflib.get_close_matches(locations[1], origins)
        if locations[1] in destinations:
            destinations = difflib.get_close_matches(locations[1], destinations)

    # --- Rule 5: destination known, origin missing -> use the remaining places ------
    if not origins and destinations:
        for location in locations:
            if len(difflib.get_close_matches(location, destinations)) == 0:
                origins.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(location) or clean_city(text_city).startswith(location.replace(" ", "-"))])
        if not origins:
            # Still nothing: retry with each word of the multi-word places.
            for location in locations:
                if len(difflib.get_close_matches(location, destinations)) == 0 and len(location.split(" ")) > 1:
                    for loc_split in location.split(" "):
                        if loc_split not in prefix_words:
                            origins.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(loc_split) or clean_city(text_city).startswith(loc_split.replace(" ", "-"))])

    # --- Rule 6: origin known, destination missing -> mirror of rule 5 --------------
    if origins and not destinations:
        for location in locations:
            if len(difflib.get_close_matches(location, origins)) == 0:
                destinations.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(location) or clean_city(text_city).startswith(location.replace(" ", "-"))])
        if not destinations:
            for location in locations:
                if len(difflib.get_close_matches(location, origins)) == 0 and len(location.split(" ")) > 1:
                    for loc_split in location.split(" "):
                        if loc_split not in prefix_words:
                            destinations.extend([text_city for text_city in text_cities if clean_city(text_city).startswith(loc_split) or clean_city(text_city).startswith(loc_split.replace(" ", "-"))])

    # --- Rule 7: several origin stations -> keep those sharing the most words -------
    # Each informative word of the sentence that appears in some station name narrows
    # the selection to the stations containing it ("gare" and articles are ignored).
    if len(origins) > 1:
        temp_origins = []
        for t in text_split:
            if t not in wrong_prefix_words and t.lower() != "gare" and any(t.lower() in o for o in origins):
                if not temp_origins:
                    temp_origins.extend([origin for origin in origins if t.lower() in origin])
                else:
                    temp_origins = list(set(temp_origins).intersection([origin for origin in origins if t.lower() in origin]))
        origins = temp_origins

    # --- Rule 8: several destination stations -> same narrowing as rule 7 -----------
    if len(destinations) > 1:
        temp_destinations = []
        for t in text_split:
            if t not in wrong_prefix_words and t.lower() != "gare" and any(t.lower() in d for d in destinations):
                if not temp_destinations:
                    temp_destinations.extend([destination for destination in destinations if t.lower() in destination])
                else:
                    temp_destinations = list(set(temp_destinations).intersection([destination for destination in destinations if t.lower() in destination]))
        destinations = temp_destinations

    print(text)
    #print("LOCA", loca)
    #print("locations", locations)
    #print("TEXT CITIES", text_cities)
    #print("TEXT TRAVEL WORDS", text_travel_words)
    print("ORIGIN : ", list(dict.fromkeys(origins)))
    print("DESTINATION : ", list(dict.fromkeys(destinations)))
    print("------")
    
    #text_validity = validity_list[text_index]
    
    #total_texts +=1
    
    #if text_validity == 1:   
        #total_valid_texts += 1
        #if origins and destinations:
            #total_valid_texts_with_cities_found += 1
        #else:
            #total_valid_texts_with_cities_not_found += 1
            #valid_texts_with_cities_not_found.append(text)
            
            #print(text)
            #print("LOCA", loca)
            #print("locations", locations)
            #print("TEXT CITIES", text_cities)
            #print("TEXT TRAVEL WORDS", text_travel_words)
            #print("ORIGIN : ", list(dict.fromkeys(origins)))
            #print("DESTINATION : ", list(dict.fromkeys(destinations)))
            #print("------")   
        
    #else:
        #total_invalid_texts +=1
        #if origins and destinations:
            #total_invalid_texts_with_cities_found += 1
            #invalid_texts_with_cities_found.append(text)
        #else:
             #total_invalid_texts_with_cities_not_found += 1

Je voudrais un trajet de paris vers brest.
ORIGIN :  ['paris-st-lazare', 'paris gare du nord', 'paris-bercy', 'paris-montparnasse 1-2', 'paris-austerlitz', 'paris-est', 'paris-gare-de-lyon', 'paris-montp.3-vaug.']
DESTINATION :  ['brest']
------
Je veut arriver à nantes en partant de lyon.
ORIGIN :  ['lyon-jean-macé']
DESTINATION :  ['nantes']
------
je voudrais aller de Paris à Nantes.
ORIGIN :  ['paris-st-lazare', 'paris gare du nord', 'paris-bercy', 'paris-montparnasse 1-2', 'paris-austerlitz', 'paris-est', 'paris-gare-de-lyon', 'paris-montp.3-vaug.']
DESTINATION :  ['nantes']
------
Il me faudrait aller de Rennes à Bordeaux.
ORIGIN :  ['rennes']
DESTINATION :  ['bordeaux-st-jean']
------
Je dois faire le trajet de Brest à Strasbourg.
ORIGIN :  ['brest']
DESTINATION :  ['strasbourg', 'strasbourg-roethig']
------
Il faudrait que j'aille de Tours à Paris Gare du Nord.
ORIGIN :  ['tours']
DESTINATION :  ['paris gare du nord']
------
J'aimerai faire Toulouse Nice.
ORIGIN :  ['toulouse-m

In [18]:
#print("TOTAL TEXTS", total_texts)
#print("TOTAL VALID TEXTS", total_valid_texts)
#print("TOTAL VALID TEXTS WITH CITIES FOUND", total_valid_texts_with_cities_found)
#print("TOTAL VALID TEXTS WITH CITIES NOT FOUND", total_valid_texts_with_cities_not_found)
#print("TOTAL INVALID TEXTS", total_invalid_texts)
#print("TOTAL INVALID TEXTS WITH CITIES FOUND", total_invalid_texts_with_cities_found)
#print("TOTAL INVALID TEXTS WITH CITIES NOT FOUND", total_invalid_texts_with_cities_not_found)

#print("good text success percentage", (total_valid_texts_with_cities_found / total_valid_texts) * 100)#
#print("bad text success percentage", (total_invalid_texts_with_cities_not_found / total_invalid_texts) * 100)

In [19]:
# Display the valid sentences for which no origin/destination pair was found.
# The list stays empty while the evaluation code in the extraction loop is commented out.
valid_texts_with_cities_not_found

[]

In [20]:
# Display the invalid sentences for which an origin/destination pair was still found.
# The list stays empty while the evaluation code in the extraction loop is commented out.
invalid_texts_with_cities_found

[]