# Extract Cities From a Text Using NLP

### Listing Proper Nouns in a String

In [1]:
import spacy
from spacy.lang.fr.examples import sentences 

# Load the fr_core_news_lg pipeline and run it over the sample sentence.
nlp = spacy.load("fr_core_news_lg")
doc = nlp("Bonjour je viens de Paris et je suis Lucien. En France il y a l'aéroport d'Orly à Paris et Saint-Exupéry à Lyon.")

# Keep only the tokens whose part-of-speech tag is PROPN (proper noun),
# then show each one with its POS tag and dependency label.
proper_nouns = [tok for tok in doc if tok.pos_ == "PROPN"]
for tok in proper_nouns:
    print(tok.text, tok.pos_, tok.dep_)

Paris PROPN obl:arg
Lucien PROPN conj
France PROPN obl:mod
Orly PROPN nmod
Paris PROPN nmod
Saint-Exupéry PROPN conj
Lyon PROPN obl:mod


### Listing Locations in the same string

In [2]:
import spacy
from spacy.lang.fr.examples import sentences 

nlp = spacy.load('fr_core_news_lg')

sample_text = "Bonjour je viens de Paris et je suis Lucien. En France il y a l'aéroport d'Orly à Paris et Saint-Exupéry à Lyon."
doc = nlp(sample_text)

# Named entities labelled LOC are the location candidates; print each with its label.
locations = [entity for entity in doc.ents if entity.label_ == "LOC"]
for entity in locations:
    print(entity.text, entity.label_)

Paris LOC
France LOC
aéroport d'Orly LOC
Paris LOC
Saint-Exupéry LOC
Lyon LOC


### Listing Locations From an Article

In [None]:
from newspaper import Article
import spacy

nlp = spacy.load('fr_core_news_lg')

url = r"https://www.lemonde.fr/societe/article/2021/09/14/emmanuel-macron-clot-le-beauvau-de-la-securite-l-elysee-promet-des-annonces-substantielles_6094578_3224.html"

# Download and parse the article, then run the pipeline on its extracted text.
article = Article(url)
article.download()
article.parse()
doc = nlp(article.text)

# Print every entity the model labelled as a location.
for entity in filter(lambda e: e.label_ == "LOC", doc.ents):
    print(entity.text, entity.label_)

### Listing Locations From a Text File

In [None]:
import spacy

nlp = spacy.load('fr_core_news_lg')

# Read the file inside a context manager so the handle is closed as soon as
# the text is loaded, instead of leaking until garbage collection.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm. The explicit
# encoding keeps accented characters stable regardless of the platform default.
with open("../data/txt/text.txt", "r", encoding="utf-8") as handle:
    file = handle.read().lower()  # Lower case helps with detection

doc = nlp(file)
for ent in doc.ents:
    if ent.label_ == "LOC":
        # The text was lower-cased above, so capitalise the first letter for display.
        print(ent.text[0].upper() + ent.text[1:], ent.label_)


# How to extract Cities from this list

### Method One: Brute Force
#### If it's not a country or a continent it's a city

In [None]:
import spacy
import pandas as pd

nlp = spacy.load('fr_core_news_lg')

# Read the file inside a context manager so the handle is always closed.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm.
with open("../data/txt/text.txt", "r", encoding="utf-8") as handle:
    file = handle.read().lower()  # Lower case helps with detection

doc = nlp(file)

# Every entity labelled LOC is a candidate place (already lower-case because
# the whole input text was lower-cased before parsing).
places = [ent.text for ent in doc.ents if ent.label_ == "LOC"]

data = pd.read_csv("../data/csv/pays.csv", sep=";")

# Lower-case the country names so they compare equal to the lower-cased places.
countries = [country.lower() for country in data["nom"]]
# A set gives constant-time membership tests instead of scanning the list
# once per place.
country_lookup = set(countries)

# Brute-force rule: any location that is not an exact country name is kept.
res = [loc for loc in places if loc not in country_lookup]

print(res)


-  _This method mostly works, but a region (or any other non-country location) is reported as a city._
-  _Furthermore, if a country is not written exactly as in the CSV (e.g. "Tanzania" instead of "United Republic of Tanzania"), it is also reported as a city._

### Method Two: geonamescache
#### If it exists in geonamescache, it's a city

In [1]:
import geonamescache
import spacy

def _normalize_place(name):
    """Return ``name`` lower-cased with dashes replaced by spaces.

    The same normalisation is applied to primary city names, alternate names
    and detected entities so that all three compare consistently.
    """
    return name.replace('-', ' ').lower()


gc = geonamescache.GeonamesCache()
# A single-city lookup is also available, e.g. gc.search_cities("Lyon").

nlp = spacy.load('fr_core_news_lg')
# Can change that based on language

# Read the file inside a context manager so the handle is always closed.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm.
with open("../data/txt/text3.txt", "r", encoding="utf-8") as handle:
    file = handle.read().lower()  # Lower case helps with detection
doc = nlp(file)

list_cities = gc.get_cities()

# Collect every known city spelling in a set: membership tests below are then
# constant-time instead of a linear scan over every name and alternate name.
all_cities = set()
for record in list_cities.values():
    # Normalise the primary name too; previously only alternate names had
    # their dashes replaced, so a dashed primary name could never match.
    all_cities.add(_normalize_place(record['name']))
    all_cities.update(_normalize_place(alt) for alt in record['alternatenames'])
list_cities = None  # Drop the reference to the raw city records

cities = []
for ent in doc.ents:
    if ent.label_ == "LOC":
        location = _normalize_place(ent.text)
        if location in all_cities:
            cities.append(location)
print(cities)
print("Found {} cities".format(len(cities)))




[{'geonameid': 2996944, 'name': 'Lyon', 'latitude': 45.74846, 'longitude': 4.84671, 'countrycode': 'FR', 'population': 472317, 'timezone': 'Europe/Paris', 'admin1code': '84', 'alternatenames': ['LYS', 'Leon do Roine', 'León do Roine', 'Lijon', 'Lio', 'Lion', 'Liona', 'Lionas', 'Lione', 'Lioni', 'Liono', 'Liun', 'Liyon', 'Lió', 'Lión', 'Lugdunum', 'Lyon', 'Lyons', 'li ang', 'li yng', "li'om", "li'ona", 'lion', 'liong', 'lioni', 'liyon', 'lyom', 'lywn', 'riyon', 'Λυών', 'Лион', 'Ліон', 'Ліён', 'Լիոն', 'ליאן', 'ליון', 'ليون', 'لیون', 'لیۆن', 'ल्यों', 'ਲਿਓਂ', 'ଲିଓନ', 'லியோன்', 'ลียง', 'ལི་ཡོང་།', 'လီယွန်းမြို့', 'ლიონი', 'ልዮን', 'リヨン', '里昂', '리옹']}]


-   _It's already better_
-   _Depends a lot on the model's precision_
-   _Seems a bit heavy_