In [524]:
import unidecode
import pandas as pd
import numpy as np
from pandas import DataFrame
import re
import geonamescache as gc

In [525]:
def name_to_regex(name):
    """Compile a case-sensitive regex that finds ``name`` as a whole phrase.

    The pattern matches the name exactly as given. When ``unidecode`` produces
    a different spelling for it (e.g. for accented characters), the pattern
    matches that transliterated spelling as well.

    Args:
        name: Place name to search for.

    Returns:
        A compiled ``re.Pattern``. When two spellings are matched they are
        wrapped in a single capturing group, as in the previous implementation.
    """
    decoded_name = unidecode.unidecode(name)
    # Escape both spellings: place names may contain characters such as '.',
    # '(' or ')' that would otherwise be interpreted as regex syntax.
    if name != decoded_name:
        body = f'({re.escape(name)}|{re.escape(decoded_name)})'
    else:
        body = re.escape(name)
    # Lookarounds instead of \b: for names that begin and end with a word
    # character they behave exactly like \b, but they also work for names that
    # start or end with punctuation (a leading apostrophe, a trailing period),
    # where \b would demand an adjacent word character and never match.
    return re.compile(fr'(?<!\w){body}(?!\w)')

In [526]:
# For the countries, we need the geoname id and the ISO country code, both
# keyed by country name. The country table is fetched once and both lookups
# are derived from it, rather than building a second GeonamesCache and
# loading the same data again.
_countries_by_name = gc.GeonamesCache().get_countries_by_names()
countriesGeo = {k: v.get('geonameid') for k, v in _countries_by_name.items()}
countriesCC = {k: v.get('iso') for k, v in _countries_by_name.items()}

In [527]:
# Combine the two lookups into one frame indexed by country name,
# with columns 'countryGeonameid' and 'countryCode'.
countries = (
    pd.Series(countriesGeo, name='countryGeonameid')
    .to_frame()
    .join(pd.Series(countriesCC, name='countryCode'), how='outer')
)

In [528]:
# Reindexing the countries dataframe on the length of the country name:
# move the name index into a regular 'countryName' column, then compute the
# row order that puts the longest names first.
countries = countries.rename_axis('countryName').reset_index()
indexCountry = (
    countries['countryName']
    .str.len()
    .sort_values(ascending=False)
    .index
)

In [529]:
# Map each country's compiled regex to its [name, ISO code] pair.
# Entries are inserted in the longest-name-first order computed above.
country_to_name = {
    name_to_regex(record["countryName"]): [record["countryName"], record["countryCode"]]
    for record in countries.reindex(indexCountry).to_dict('records')
}


In [530]:
# For cities, we need the name and the geoname id of each city, keyed the
# same way as the geonamescache city table. The table is fetched once and
# both lookups are derived from it, rather than building a second
# GeonamesCache and loading the same data again.
_cities_by_key = gc.GeonamesCache().get_cities()
citiesName = {k: v.get('name') for k, v in _cities_by_key.items()}
citiesGeo = {k: v.get('geonameid') for k, v in _cities_by_key.items()}

In [531]:
# Combine the two city lookups into one frame with columns
# 'cityName' and 'cityGeonameid'.
cities = (
    pd.Series(citiesName, name='cityName')
    .to_frame()
    .join(pd.Series(citiesGeo, name='cityGeonameid'), how='outer')
)

In [532]:
# Row labels of `cities`, ordered from the longest city name to the shortest
citiesIndex = (
    cities['cityName']
    .str.len()
    .sort_values(ascending=False)
    .index
)

In [533]:
# Map each city's compiled regex to its [name, geoname id] pair.
# Entries are inserted in the longest-name-first order computed above.
city_to_name = {
    name_to_regex(record["cityName"]): [record["cityName"], record["cityGeonameid"]]
    for record in cities.reindex(citiesIndex).to_dict('records')
}

In [534]:
# Getting US States name and geonameid
# usStatesName = {k: v.get('name') for k, v in gc.GeonamesCache().get_us_states().items()}
# usStatesGeo = {k: v.get('geonameid') for k, v in gc.GeonamesCache().get_us_states().items()}

In [535]:
# Making a US States dataframe
# usStates = (pd.Series(usStatesName).to_frame('usStateName')
#           .join(pd.Series(usStatesGeo).to_frame('usStateGeonameid'), how='outer'))

In [536]:
# Reindexing the dataframe on the length of the US State name
# usStatesIndex = usStates.usStateName.str.len().sort_values(ascending=False).index

In [537]:
# creating a dictionary of US States and their regexp
# usStates_to_name = {}
# for line in usStates.reindex(usStatesIndex).to_dict('records'):
#    usStates_to_name[name_to_regex(line.get("usStateName"))] = [line.get("usStateName"),line.get("usStateGeonameid")]

In [538]:
# Matching helper shared by the country lookup and the city lookup.
def get_match(text, dictionary):
    """Return the ``[name, ID]`` pair of the first pattern found in ``text``.

    Args:
        text: String to search.
        dictionary: Mapping of compiled regex -> two-item ``[name, ID]`` value.
            Patterns are tried in the mapping's iteration order.

    Returns:
        A new ``[name, ID]`` list for the first pattern that matches, or
        ``None`` when no pattern matches.
    """
    hits = (
        [name, ID]
        for regex, (name, ID) in dictionary.items()
        if regex.search(text)
    )
    return next(hits, None)

In [539]:
# Reading the headlines.txt file and creating a list of headlines.
# The context manager closes the file even if reading raises.
# Note: splitting on "\n" yields a trailing empty string if the file ends
# with a newline.
with open('data/headlines.txt') as f:
    headlines = f.read().split("\n")

# First matching country per headline ([name, code], or None when nothing matches)
matched_countries = [get_match(text, country_to_name) for text in headlines]

# Finding the US States in the headlines
# matched_usStates = [get_match(headline, usStates_to_name)
#                  for headline in headlines]


# First matching city per headline ([name, geoname id], or None when nothing matches)
matched_cities = [get_match(text, city_to_name) for text in headlines]

In [540]:
# Country name per headline; the string 'None' marks headlines with no match
listcountries = [
    match[0] if isinstance(match, list) else 'None'
    for match in matched_countries
]

In [541]:
# City name per headline; the string 'None' marks headlines with no match
listcities = [
    match[0] if isinstance(match, list) else 'None'
    for match in matched_cities
]

In [542]:
# Collect the three parallel lists as named columns ...
data = dict(Headline=headlines, City=listcities, Country=listcountries)
# ... and build the dataframe from them
headlines_df = pd.DataFrame(data=data)

In [601]:
# Preview the first 20 rows of the headline / city / country table
headlines_df.head(20)

Unnamed: 0,Headline,City,Country
0,Zika Outbreak Hits Miami,Miami,
1,Could Zika Reach New York City?,New York City,
2,First Case of Zika in Miami Beach,Miami Beach,
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,
5,Trinidad confirms first Zika case,Trinidad,
6,Zika Concerns are Spreading in Houston,Houston,
7,Geneve Scientists Battle to Find Cure,Genève,
8,The CDC in Atlanta is Growing Worried,Atlanta,
9,Zika Infested Monkeys in Sao Paulo,São Paulo,
