In [2]:
with open("headlines.txt") as file:
    data = [headline.strip() for headline in file]
    
data[:4]

['Zika Outbreak Hits Miami',
 'Could Zika Reach New York City?',
 'First Case of Zika in Miami Beach',
 'Mystery Virus Spreads in Recife, Brazil']

In [7]:
import geonamescache

gc = geonamescache.GeonamesCache()
countries = [country["name"] for country in gc.get_countries().values()]
countries[:10]

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla',
 'Albania',
 'Armenia',
 'Angola',
 'Antarctica',
 'Argentina']

In [4]:
cities = [city['name'] for city in gc.get_cities().values()]
cities[:4]

['Andorra la Vella', 'Umm Al Quwain City', 'Ras Al Khaimah City', 'Zayed City']

In [8]:
continents = [continents['name'] for continents in gc.get_continents().values()]
continents[:10]

['Africa',
 'Asia',
 'Europe',
 'North America',
 'Oceania',
 'South America',
 'Antarctica']

In [14]:
from collections import Counter

city_counts = Counter(cities)
city_counts.most_common(100)

[('Springfield', 8),
 ('San Pedro', 7),
 ('Richmond', 7),
 ('San Fernando', 7),
 ('Mercedes', 6),
 ('La Paz', 6),
 ('Victoria', 6),
 ('San Francisco', 6),
 ('Auburn', 6),
 ('Santa Cruz', 6),
 ('Burlington', 6),
 ('San Carlos', 6),
 ('La Unión', 6),
 ('San Marcos', 6),
 ('San Vicente', 5),
 ('San Lorenzo', 5),
 ('San Isidro', 5),
 ('Santa Rosa', 5),
 ('San Juan', 5),
 ('Lincoln', 5),
 ('San Miguel', 5),
 ('Albany', 5),
 ('Orange', 5),
 ('Hamilton', 5),
 ('Santa Maria', 5),
 ('Aurora', 5),
 ('Windsor', 5),
 ('San Felipe', 5),
 ('San Antonio', 5),
 ('Winchester', 5),
 ('Wellington', 5),
 ('Portsmouth', 5),
 ('Lancaster', 5),
 ('Bristol', 5),
 ('Salem', 5),
 ('Florence', 5),
 ('Greenville', 5),
 ('Madison', 5),
 ('Middletown', 5),
 ('Columbus', 5),
 ('Ashland', 5),
 ('Concord', 5),
 ('Farmington', 5),
 ('Lebanon', 5),
 ('Lakewood', 5),
 ('Pilar', 4),
 ('Santa Lucía', 4),
 ('San Rafael', 4),
 ('San Luis', 4),
 ('Scarborough', 4),
 ('Fairfield', 4),
 ('Brunswick', 4),
 ('Brighton', 4),
 ('Na

In [17]:
import unidecode

country_accent_mapping = {
    unidecode.unidecode(country): country for country in countries
}

city_accent_mapping = {
    unidecode.unidecode(city): city for city in cities
}
city_accent_mapping["Asmar"]

'Āsmār'

In [12]:
data = [unidecode.unidecode(headline) for headline in data]
data[-10:]

['Authorities are Worried about the Spread of Varicella in Clovis',
 'More Zika patients reported in Fort Worth',
 'Zika symptoms spotted in Boynton Beach',
 'Outbreak of Zika in Portoviejo',
 'Influenza Exposure in Muscat',
 'Rumors about Rabies spreading in Jerusalem have been refuted',
 'More Zika patients reported in Indang',
 'Suva authorities confirmed the spread of Rotavirus',
 'More Zika patients reported in Bella Vista',
 'Zika Outbreak in Wichita Falls']

In [13]:
# Create list of cities and countries
unaccented_cities = list(city_accent_mapping.keys())
unaccented_countries = set(country_accent_mapping.keys())

print(f"There are {len(unaccented_cities)} cities to look through.")
print(f"There are {len(unaccented_countries)} countries to look through.")

There are 23022 cities to look through.
There are 252 countries to look through.


In [18]:
import re

problem_city = 'San Jose'
re.search('\\bSan\\b|\\bSan Jose\\b', problem_city)

<_sre.SRE_Match object; span=(0, 3), match='San'>

In [19]:
re.search('\\bSan Jose\\b|\\bSan\\b', problem_city)

<_sre.SRE_Match object; span=(0, 8), match='San Jose'>

In [22]:
unaccented_cities = sorted(unaccented_cities, key=lambda x: len(x), reverse=True)
unaccented_cities[:100]

['Chak Two Hundred Forty-nine Thal Development Authority',
 'Dolores Hidalgo Cuna de la Independencia Nacional',
 'Ampliacion San Mateo (Colonia Solidaridad)',
 'Licenciado Benito Juarez (Campo Gobierno)',
 'Sant Pere, Santa Caterina i La Ribera',
 'Palikir - National Government Center',
 'Nanchital de Lazaro Cardenas del Rio',
 'San Fernando del Valle de Catamarca',
 "el Camp d'en Grassot i Gracia Nova",
 'San Martin Texmelucan de Labastida',
 'Acilia-Castel Fusano-Ostia Antica',
 'Chak One Hundred Twenty Nine Left',
 'Sydney Central Business District',
 'Brandys nad Labem-Stara Boleslav',
 'Rosignano Solvay-Castiglioncello',
 'Montecchio Maggiore-Alte Ceccato',
 'Delegacion Cuajimalpa de Morelos',
 'Socorro Mission Number 1 Colonia',
 "l'Antiga Esquerra de l'Eixample",
 'Marina di Ardea-Tor San Lorenzo',
 'Jardines de la Silla (Jardines)',
 'Parque Industrial Ciudad Mitras',
 'Zurich (Kreis 2) / Wollishofen',
 'Zurich (Kreis 6) / Unterstrass',
 'Zurich (Kreis 9) / Albisrieden',
 'Ise

In [23]:
unaccented_countries = sorted(unaccented_countries, key=lambda x: len(x), reverse=True)
unaccented_countries[:100]

['South Georgia and the South Sandwich Islands',
 'United States Minor Outlying Islands',
 'Bonaire, Saint Eustatius and Saba ',
 'Heard Island and McDonald Islands',
 'Saint Vincent and the Grenadines',
 'Democratic Republic of the Congo',
 'British Indian Ocean Territory',
 'French Southern Territories',
 'Saint Pierre and Miquelon',
 'Northern Mariana Islands',
 'Turks and Caicos Islands',
 'Central African Republic',
 'Svalbard and Jan Mayen',
 'Bosnia and Herzegovina',
 'British Virgin Islands',
 'Republic of the Congo',
 'Serbia and Montenegro',
 'Palestinian Territory',
 'Sao Tome and Principe',
 'Saint Kitts and Nevis',
 'United Arab Emirates',
 'Netherlands Antilles',
 'U.S. Virgin Islands',
 'Trinidad and Tobago',
 'Antigua and Barbuda',
 'Dominican Republic',
 'Wallis and Futuna',
 'Equatorial Guinea',
 'Papua New Guinea',
 'Marshall Islands',
 'Saint Barthelemy',
 'French Polynesia',
 'Christmas Island',
 'Falkland Islands',
 'North Macedonia',
 'Solomon Islands',
 'Western

In [24]:
city_regex = r'\b|\b'.join(unaccented_cities)
city_regex[1500:1800]

'-Baume\\b|\\bTamuning-Tumon-Harmon Village\\b|\\bTultitlan de Mariano Escobedo\\b|\\bSan Bernardino Tlaxcalancingo\\b|\\bSan Francisco Tlalcilalcalpan\\b|\\bFraccionamiento Ciudad Olmeca\\b|\\bPresidencia Roque Saenz Pena\\b|\\bZurich (Kreis 11) / Oerlikon\\b|\\bSan Fernando de Monte Cristi\\b|\\bPuerto Francisco de '

In [25]:
import numpy as np

np.random.seed(50)

test_headlines = np.random.choice(data, 10)

for test_headline in test_headlines:
    print(test_headline)
    match = re.search(city_regex, test_headline)
    if match:
        print(match.group(0), "\n")

More Zika patients reported in Custodia
Custodia 

Tokyo Encounters Severe Symptoms of Meningitis
Tokyo 

Zika Troubles come to Kampong Cham
Kampong Cham 

19 new Zika Cases in Sengkang
Sengkang 

Mumbai's Health Minister warns of more Zika cases
Mumbai 

Varicella re-emerges in Lagos
Lagos 

Mumbai's Health Minister warns of more Zika cases
Mumbai 

Milwaukee authorities confirmed the spread of Rhinovirus
Milwaukee 

Zika cases concern Charlotte residents
Charlotte 

Four cases of Zika in Hidalgo County
Hidalgo 



In [26]:
country_regex = r"\b|\b".join(unaccented_countries)
country_regex[:100]

'South Georgia and the South Sandwich Islands\\b|\\bUnited States Minor Outlying Islands\\b|\\bBonaire, S'

In [27]:
np.random.seed(100)
test_headlines = np.random.choice(data, 10)

for test_headline in test_headlines:
    print(test_headline)
    match = re.search(country_regex, test_headline)
    if match:
        print(match.group(0), "\n")

Longwood volunteers spreading Zika awareness
More Zika cases in Soyapango
Spike of Dengue Cases in Stockholm
Case of Measles Reported in Vancouver
Zika arrives in Belmopan
Outbreak of Zika in Colombo
Zika symptoms spotted in Arlington
Malaria re-emerges in Boise
Southampton Patient in Critical Condition after Contracting Tuberculosis
Manassas Encounters Severe Symptoms of Measles


In [28]:
test_headline = data[3]
print(test_headline)
print(re.search(city_regex, test_headline).group(0))
print(re.search(country_regex, test_headline).group(0))

Mystery Virus Spreads in Recife, Brazil
Recife
Brazil


In [29]:
print(city_accent_mapping["Recife"])
print(country_accent_mapping["Brazil"])

Recife
Brazil


In [30]:
def find_city_and_country_in_headline(headline):
    """
    Find the city(s) and/or country(s) in a text headline.
    
    :param headline: string for headline
    
    :return dict: a dictionary mapping the headline to city(s) and/or countries.
    """
    city_match = re.search(city_regex, headline)
    country_match = re.search(country_regex, headline)
    cities = None if not city_match else city_match.group(0)
    countries = None if not country_match else country_match.group(0)
    return dict(headline=headline, countries=countries, cities=cities)

In [31]:
find_city_and_country_in_headline(data[3])

{'headline': 'Mystery Virus Spreads in Recife, Brazil',
 'countries': 'Brazil',
 'cities': 'Recife'}

In [32]:
find_city_and_country_in_headline(data[1])

{'headline': 'Could Zika Reach New York City?',
 'countries': None,
 'cities': 'New York City'}

In [33]:
headline_cities_and_countries = [
    find_city_and_country_in_headline(headline) for headline in data
]
headline_cities_and_countries[-10:]

[{'headline': 'Authorities are Worried about the Spread of Varicella in Clovis',
  'countries': None,
  'cities': 'Clovis'},
 {'headline': 'More Zika patients reported in Fort Worth',
  'countries': None,
  'cities': 'Fort Worth'},
 {'headline': 'Zika symptoms spotted in Boynton Beach',
  'countries': None,
  'cities': 'Boynton Beach'},
 {'headline': 'Outbreak of Zika in Portoviejo',
  'countries': None,
  'cities': 'Portoviejo'},
 {'headline': 'Influenza Exposure in Muscat',
  'countries': None,
  'cities': 'Muscat'},
 {'headline': 'Rumors about Rabies spreading in Jerusalem have been refuted',
  'countries': None,
  'cities': 'Jerusalem'},
 {'headline': 'More Zika patients reported in Indang',
  'countries': None,
  'cities': 'Indang'},
 {'headline': 'Suva authorities confirmed the spread of Rotavirus',
  'countries': None,
  'cities': 'Suva'},
 {'headline': 'More Zika patients reported in Bella Vista',
  'countries': None,
  'cities': 'Bella Vista'},
 {'headline': 'Zika Outbreak in 

In [35]:
import json

save_file = "with open(save_file, "r") as fin:
    check_data = json.loads(fin.read())headline_cities_and_countries.json"
with open(save_file, "w") as fout:
    fout.write(json.dumps(headline_cities_and_countries))

In [36]:
with open(save_file, "r") as fin:
    check_data = json.loads(fin.read())

In [37]:
check_data[-10:]

[{'headline': 'Authorities are Worried about the Spread of Varicella in Clovis',
  'countries': None,
  'cities': 'Clovis'},
 {'headline': 'More Zika patients reported in Fort Worth',
  'countries': None,
  'cities': 'Fort Worth'},
 {'headline': 'Zika symptoms spotted in Boynton Beach',
  'countries': None,
  'cities': 'Boynton Beach'},
 {'headline': 'Outbreak of Zika in Portoviejo',
  'countries': None,
  'cities': 'Portoviejo'},
 {'headline': 'Influenza Exposure in Muscat',
  'countries': None,
  'cities': 'Muscat'},
 {'headline': 'Rumors about Rabies spreading in Jerusalem have been refuted',
  'countries': None,
  'cities': 'Jerusalem'},
 {'headline': 'More Zika patients reported in Indang',
  'countries': None,
  'cities': 'Indang'},
 {'headline': 'Suva authorities confirmed the spread of Rotavirus',
  'countries': None,
  'cities': 'Suva'},
 {'headline': 'More Zika patients reported in Bella Vista',
  'countries': None,
  'cities': 'Bella Vista'},
 {'headline': 'Zika Outbreak in 

In [38]:
check_data[:5]

[{'headline': 'Zika Outbreak Hits Miami',
  'countries': None,
  'cities': 'Miami'},
 {'headline': 'Could Zika Reach New York City?',
  'countries': None,
  'cities': 'New York City'},
 {'headline': 'First Case of Zika in Miami Beach',
  'countries': None,
  'cities': 'Miami Beach'},
 {'headline': 'Mystery Virus Spreads in Recife, Brazil',
  'countries': 'Brazil',
  'cities': 'Recife'},
 {'headline': 'Dallas man comes down with case of Zika',
  'countries': None,
  'cities': 'Dallas'}]

In [42]:
with open("city_accent_mapping.json", "w") as fout:
    fout.write(json.dumps(city_accent_mapping))

In [44]:
with open("country_accent_mapping.json", "w") as fout:
    fout.write(json.dumps(country_accent_mapping))

In [46]:
import pandas as pd

data = pd.read_json("headline_cities_and_countries.json")
data = data.replace({None: np.nan})

data.head(100)

Unnamed: 0,cities,countries,headline
0,Miami,,Zika Outbreak Hits Miami
1,New York City,,Could Zika Reach New York City?
2,Miami Beach,,First Case of Zika in Miami Beach
3,Recife,Brazil,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas,,Dallas man comes down with case of Zika
5,Trinidad,,Trinidad confirms first Zika case
6,Houston,,Zika Concerns are Spreading in Houston
7,Geneve,,Geneve Scientists Battle to Find Cure
8,Atlanta,,The CDC in Atlanta is Growing Worried
9,Sao Paulo,,Zika Infested Monkeys in Sao Paulo
