In [1]:
# Load data

import unidecode

# Read one headline per line, lower-cased and transliterated to plain ASCII
# with unidecode (accents removed).
headlines = []
with open('data/headlines.txt') as f:
    for line in f:
        # Strip only the line terminator. Slicing with [:-1] would also chop
        # the last real character off a final line that has no trailing
        # newline.
        headline_norm = unidecode.unidecode(line.rstrip('\n').lower())
        headlines.append(headline_norm)

# Take a look at first 10 headlines

print(headlines[:10])

['zika outbreak hits miami', 'could zika reach new york city?', 'first case of zika in miami beach', 'mystery virus spreads in recife, brazil', 'dallas man comes down with case of zika', 'trinidad confirms first zika case', 'zika concerns are spreading in houston', 'geneve scientists battle to find cure', 'the cdc in atlanta is growing worried', 'zika infested monkeys in sao paulo']


In [2]:
# Create list of country and city names from geonamescache

import geonamescache

gc = geonamescache.GeonamesCache()

# Names are transliterated to ASCII and ordered longest-first, so that when
# they are later joined into a regex alternation a long name is tried before
# any shorter name it contains (avoids partial matches).
countries_dic = gc.get_countries()
countries = sorted(
    (unidecode.unidecode(entry['name']) for entry in countries_dic.values()),
    key=len,
    reverse=True,
)

cities_dic = gc.get_cities()
cities = sorted(
    (unidecode.unidecode(entry['name']) for entry in cities_dic.values()),
    key=len,
    reverse=True,
)

# Take a look at the first ten countries and cities

print('Countries: ', countries[:10], '\n')
print('Cities: ', cities[:10], '\n')


Countries:  ['South Georgia and the South Sandwich Islands', 'United States Minor Outlying Islands', 'Bonaire, Saint Eustatius and Saba ', 'Heard Island and McDonald Islands', 'Democratic Republic of the Congo', 'Saint Vincent and the Grenadines', 'British Indian Ocean Territory', 'French Southern Territories', 'Saint Pierre and Miquelon', 'Central African Republic'] 

Cities:  ['Chak Two Hundred Forty-nine Thal Development Authority', 'Dolores Hidalgo Cuna de la Independencia Nacional', 'Ampliacion San Mateo (Colonia Solidaridad)', 'Licenciado Benito Juarez (Campo Gobierno)', 'Sant Pere, Santa Caterina i La Ribera', 'Palikir - National Government Center', 'Nanchital de Lazaro Cardenas del Rio', 'San Fernando del Valle de Catamarca', "el Camp d'en Grassot i Gracia Nova", 'San Martin Texmelucan de Labastida'] 



In [3]:
# Check headlines for city and country

import re
import pandas as pd

def _build_place_regex(places):
    r"""Return an alternation pattern that matches any of ``places`` as whole words.

    Each name is stripped of surrounding whitespace, regex-escaped and wrapped
    in ``\b`` anchors so only complete names match. Stripping matters: a name
    with a trailing space would otherwise end in ``\ \b``, which can only
    match when another word character follows the space, so the name could
    never match at the end of a headline or before punctuation.

    Parameters
    ----------
    places : iterable of str
        Place names, expected longest-first so that the alternation tries a
        long name before any shorter name it contains.

    Returns
    -------
    str
        The uncompiled pattern (an empty string if ``places`` has no
        non-blank names).
    """
    return '|'.join(
        r'\b' + re.escape(place.strip()) + r'\b'
        for place in places
        if place.strip()
    )


def _longest_match(place_regex, text):
    """Return the longest substring of ``text`` matched by ``place_regex``, or ''.

    Ties are resolved in favour of the earliest match in ``text``.
    """
    return max(place_regex.findall(text), key=len, default='')


# Create regex for countries
country_pattern = _build_place_regex(countries)

# Look at first 100 characters of the country pattern
print('Country regex: ', country_pattern[:100])

# Headlines were lower-cased on load, so matching must ignore case.
country_reg_ex = re.compile(country_pattern, re.I)

# Create regex for cities
city_pattern = _build_place_regex(cities)

# Look at first 100 characters of the city pattern
print('City regex: ', city_pattern[:100])

city_reg_ex = re.compile(city_pattern, re.I)

# First row holds the column names; every following row is one headline.
headline_place_data = [['headline', 'city', 'country']]

for headline in headlines:
    # When several places match, keep the longest one; '' means no match.
    country = _longest_match(country_reg_ex, headline)
    city = _longest_match(city_reg_ex, headline)
    headline_place_data.append([headline, city, country])

headline_df = pd.DataFrame(headline_place_data[1:], columns=headline_place_data[0])

Country regex:  \bSouth\ Georgia\ and\ the\ South\ Sandwich\ Islands\b|\bUnited\ States\ Minor\ Outlying\ Islands\b|
City regex:  \bChak\ Two\ Hundred\ Forty\-nine\ Thal\ Development\ Authority\b|\bDolores\ Hidalgo\ Cuna\ de\ la\ 


In [4]:
# Count headlines per detected city, most frequent first, to check for common
# city names. The table shows cities incorrectly detected as 'hit', 'of' and
# 'come'.
(
    headline_df
    .groupby('city')
    .count()
    .sort_values('headline', ascending=False)
    .head(50)
)

Unnamed: 0_level_0,headline,country
city,Unnamed: 1_level_1,Unnamed: 2_level_1
,31,31
of,6,6
monroe,4,4
miami,4,4
madrid,4,4
barcelona,3,3
birmingham,2,2
cambridge,2,2
cancun,2,2
colombo,2,2


In [5]:
# Blank out the city wherever it was detected as 'hit', 'of' or 'come'
mask = headline_df['city'].isin(['hit', 'of', 'come'])

headline_df.loc[mask, 'city'] = ''

In [6]:
# For later: look-up from the unaccented (unidecode'd) city name back to the
# original accented spelling. Cities whose name is unchanged by unidecode are
# left out.
accented_names = {
    plain: accented
    for accented in (entry['name'] for entry in cities_dic.values())
    # single-element list: binds the transliterated name once per city
    for plain in [unidecode.unidecode(accented)]
    if plain != accented
}