In [1]:
with open("./data/headlines.txt") as file:
    data = [headline.strip() for headline in file]
    
data[:4]

['Zika Outbreak Hits Miami',
 'Could Zika Reach New York City?',
 'First Case of Zika in Miami Beach',
 'Mystery Virus Spreads in Recife, Brazil']

In [2]:
import geonamescache

gc = geonamescache.GeonamesCache()
countries = [country["name"] for country in gc.get_countries().values()]
countries[:4]

['Andorra', 'United Arab Emirates', 'Afghanistan', 'Antigua and Barbuda']

In [3]:
cities = [city['name'] for city in gc.get_cities().values()]
cities[:4]

['Andorra la Vella', 'Umm Al Quwain City', 'Ras Al Khaimah City', 'Zayed City']

In [4]:
import unidecode

country_accent_mapping = {
    unidecode.unidecode(country): country for country in countries
}

city_accent_mapping = {
    unidecode.unidecode(city): city for city in cities
}
city_accent_mapping["Asmar"]

'Āsmār'

In [5]:
data = [unidecode.unidecode(headline) for headline in data]
data[-4:]

['More Zika patients reported in Indang',
 'Suva authorities confirmed the spread of Rotavirus',
 'More Zika patients reported in Bella Vista',
 'Zika Outbreak in Wichita Falls']

In [6]:
# Create list of cities and countries
unaccented_cities = list(city_accent_mapping.keys())
unaccented_countries = set(country_accent_mapping.keys())

print(f"There are {len(unaccented_cities)} cities to look through.")
print(f"There are {len(unaccented_countries)} countries to look through.")

There are 23022 cities to look through.
There are 252 countries to look through.


In [7]:
import re

In [8]:
unaccented_cities = sorted(unaccented_cities, key=lambda x: len(x), reverse=True)
unaccented_cities[:2]

['Chak Two Hundred Forty-nine Thal Development Authority',
 'Dolores Hidalgo Cuna de la Independencia Nacional']

In [9]:
unaccented_countries = sorted(unaccented_countries, key=lambda x: len(x), reverse=True)
unaccented_countries[:2]

['South Georgia and the South Sandwich Islands',
 'United States Minor Outlying Islands']

In [10]:
city_regex = r'\b|\b'.join(unaccented_cities)
city_regex[1500:1800]

'-Baume\\b|\\bTamuning-Tumon-Harmon Village\\b|\\bTultitlan de Mariano Escobedo\\b|\\bSan Bernardino Tlaxcalancingo\\b|\\bSan Francisco Tlalcilalcalpan\\b|\\bFraccionamiento Ciudad Olmeca\\b|\\bPresidencia Roque Saenz Pena\\b|\\bZurich (Kreis 11) / Oerlikon\\b|\\bSan Fernando de Monte Cristi\\b|\\bPuerto Francisco de '

In [11]:
country_regex = r"\b|\b".join(unaccented_countries)
country_regex[:100]

'South Georgia and the South Sandwich Islands\\b|\\bUnited States Minor Outlying Islands\\b|\\bBonaire, S'

In [12]:
def find_city_and_country_in_headline(headline):
    """
    Find the city(s) and/or country(s) in a text headline.
    
    :param headline: string for headline
    
    :return dict: a dictionary mapping the headline to city(s) and/or countries.
    """
    city_match = re.search(city_regex, headline)
    country_match = re.search(country_regex, headline)
    cities = None if not city_match else city_match.group(0)
    countries = None if not country_match else country_match.group(0)
    return dict(headline=headline, countries=countries, cities=cities)

In [13]:
headline_cities_and_countries = [
    find_city_and_country_in_headline(headline) for headline in data
]
headline_cities_and_countries[-10:]

[{'headline': 'Authorities are Worried about the Spread of Varicella in Clovis',
  'countries': None,
  'cities': 'Clovis'},
 {'headline': 'More Zika patients reported in Fort Worth',
  'countries': None,
  'cities': 'Fort Worth'},
 {'headline': 'Zika symptoms spotted in Boynton Beach',
  'countries': None,
  'cities': 'Boynton Beach'},
 {'headline': 'Outbreak of Zika in Portoviejo',
  'countries': None,
  'cities': 'Portoviejo'},
 {'headline': 'Influenza Exposure in Muscat',
  'countries': None,
  'cities': 'Muscat'},
 {'headline': 'Rumors about Rabies spreading in Jerusalem have been refuted',
  'countries': None,
  'cities': 'Jerusalem'},
 {'headline': 'More Zika patients reported in Indang',
  'countries': None,
  'cities': 'Indang'},
 {'headline': 'Suva authorities confirmed the spread of Rotavirus',
  'countries': None,
  'cities': 'Suva'},
 {'headline': 'More Zika patients reported in Bella Vista',
  'countries': None,
  'cities': 'Bella Vista'},
 {'headline': 'Zika Outbreak in 

In [15]:
import json

save_file = "./data/headline_cities_and_countries.json"
with open(save_file, "w") as fout:
    fout.write(json.dumps(headline_cities_and_countries))

In [16]:
with open(save_file, "r") as fin:
    check_data = json.loads(fin.read())

In [17]:
save_city = "./data/city_accent_mapping.json"
with open(save_city, "w") as fout:
    fout.write(json.dumps(city_accent_mapping))

In [18]:
with open(save_city, "r") as fin:
    check_data = json.loads(fin.read())

In [19]:
save_country = "./data/country_accent_mapping.json"
with open(save_country, "w") as fout:
    fout.write(json.dumps(country_accent_mapping))

In [20]:
with open(save_country, "r") as fin:
    check_data = json.loads(fin.read())

In [22]:
import pandas as pd
import numpy as np

data = pd.read_json("./data/headline_cities_and_countries.json")
data = data.replace({None: np.nan})

data.head(10)

Unnamed: 0,headline,countries,cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
5,Trinidad confirms first Zika case,,Trinidad
6,Zika Concerns are Spreading in Houston,,Houston
7,Geneve Scientists Battle to Find Cure,,Geneve
8,The CDC in Atlanta is Growing Worried,,Atlanta
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo


In [23]:
acc_cities = pd.read_json("./data/city_accent_mapping.json")
acc_cities = acc_cities.replace({None: np.nan})
    
acc_cities.head(10)

ValueError: If using all scalar values, you must pass an index