In [81]:
import re # text extration regex
import pandas as pd # data manipulation and analysis
import numpy as np
import matplotlib as plt # general visualisation
import geonamescache as gc # identify cities and countries and their location
#from matplotlib import basemap # geographical visualisation
from collections import Counter # check duplicates in dictionary
import unidecode # handling accent marks, editing text
import json # saving data

In [12]:
# inport data as list
# strip off newline marks 
with open('headlines.txt') as file:
    data = [headline.strip() for headline in file]

# show first 5 rows
data[:5]

['Zika Outbreak Hits Miami',
 'Could Zika Reach New York City?',
 'First Case of Zika in Miami Beach',
 'Mystery Virus Spreads in Recife, Brazil',
 'Dallas man comes down with case of Zika']

In [15]:
# GET COUNTRY NAMES
# save ONLY country name in each entry in countries
# .values() retunrs list of all values available in dictionary
countries = [country['name'] for country in gc.GeonamesCache().get_countries().values()]

# show first 5 rows
countries[:5]

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla']

In [18]:
# GET CITY NAMES
# save ONLY city names
cities = [city['name'] for city in gc.GeonamesCache().get_cities().values()]

# show first 5 rows
cities[:5]

['Andorra la Vella',
 'Umm Al Quwain City',
 'Ras Al Khaimah City',
 'Zayed City',
 'Khawr Fakkān']

In [22]:
# CHECK CITY DUPLICATES in package that are recored more than once 
city_tally = Counter(cities)


city_tally.most_common(5)

[('San Fernando', 8),
 ('Springfield', 8),
 ('San Pedro', 7),
 ('Richmond', 7),
 ('Mercedes', 6)]

In [31]:
# REMOVE ACCENT MARKS
# unidecode.unidecode() method decodes text 

# create dictionary mapping unaccented country name to accented country name
country_accent_map = {
    unidecode.unidecode(country): country for country in countries
}

# create city dictionary
city_accent_map = {
    unidecode.unidecode(city): city for city in cities
}

#city_accent_map['Qarqin']

# remove accents from headlines
# apply same method to each line in data
data = [unidecode.unidecode(headline) for headline in data]


In [37]:
# SEARCH FOR CITY AND COUNTY NAMES IN HEADLINES
# use regex created from accented cities and countries

# list of cities and countries 
unaccented_cities = list(city_accent_map.keys())
unaccented_countries = list(country_accent_map.keys())

# length of list
print("There are {} cities".format(len(unaccented_cities)))
print("There are {} countries".format(len(unaccented_countries)))


There are 23151 cities
There are 252 countries


In [49]:
# match entire word and match ENTIRE CITY NAME with regex
# e.g. San and San Jose are both cities, but San Jose in headlines will be matched to San
# set priority as longest name to shortest: Jose > San

# sort entire lists with sorted()
# key = has to be function applied to each element, returns values for checking,
#       use key to check length
# reverse = true so that longest at top of list
unaccented_cities = sorted(unaccented_cities, key=lambda x: len(x), reverse=True)

unaccented_cities[:5]

['Chak Two Hundred Forty-nine Thal Development Authority',
 'Dolores Hidalgo Cuna de la Independencia Nacional',
 'Ampliacion San Mateo (Colonia Solidaridad)',
 'Licenciado Benito Juarez (Campo Gobierno)',
 'Sant Pere, Santa Caterina i La Ribera']

In [50]:
unaccented_countries = sorted(unaccented_countries, key=lambda x: len(x), reverse=True)
unaccented_countries[:5]

['South Georgia and the South Sandwich Islands',
 'United States Minor Outlying Islands',
 'Bonaire, Saint Eustatius and Saba ',
 'Heard Island and McDonald Islands',
 'Democratic Republic of the Congo']

In [52]:
# CONSTRUCT REGEX
city_regex = r'\b|\b'.join(unaccented_cities)
city_regex[0:500]

"Chak Two Hundred Forty-nine Thal Development Authority\\b|\\bDolores Hidalgo Cuna de la Independencia Nacional\\b|\\bAmpliacion San Mateo (Colonia Solidaridad)\\b|\\bLicenciado Benito Juarez (Campo Gobierno)\\b|\\bSant Pere, Santa Caterina i La Ribera\\b|\\bPalikir - National Government Center\\b|\\bNanchital de Lazaro Cardenas del Rio\\b|\\bSan Fernando del Valle de Catamarca\\b|\\bDovercourt-Wallace Emerson-Junction\\b|\\bel Camp d'en Grassot i Gracia Nova\\b|\\bSan Martin Texmelucan de Labastida\\b|\\bWaterfront C"

In [65]:
# test regex

# init generator
np.random.seed(5)

# randomly select 10 headlines
test_headlines = np.random.choice(data, 10)

# find matching city name in each headline
for test_headline in test_headlines:
    match = re.search(city_regex, test_headline)
    print(test_headline)
    if match:
        print(match.group(0), "\n") #.group(0) returns ALL MATCHES

Oxford tests new cure for HIV
Oxford 

Spanish Flu Outbreak in Barcelona
Barcelona 

Zika spreads to Lewisville
Lewisville 

Zika case reported in Oton
The CDC in Atlanta is Growing Worried
Atlanta 

Rumors about Varicella spreading in Carlsbad have been refuted
Carlsbad 

Hepatitis C Outbreak in Bethlehem
Bethlehem 

Lower Hospitalization in Palm Springs after Hepatitis A Vaccine becomes Mandatory
Palm Springs 

More Zika patients reported in Davao
Davao 

Birmingham Residents Recieve Rabies vaccine
Birmingham 



In [56]:
# create regex for country
country_regex = r'\b|\b'.join(unaccented_countries)
country_regex[0:500]

'South Georgia and the South Sandwich Islands\\b|\\bUnited States Minor Outlying Islands\\b|\\bBonaire, Saint Eustatius and Saba \\b|\\bHeard Island and McDonald Islands\\b|\\bDemocratic Republic of the Congo\\b|\\bSaint Vincent and the Grenadines\\b|\\bBritish Indian Ocean Territory\\b|\\bFrench Southern Territories\\b|\\bSaint Pierre and Miquelon\\b|\\bCentral African Republic\\b|\\bNorthern Mariana Islands\\b|\\bTurks and Caicos Islands\\b|\\bBosnia and Herzegovina\\b|\\bSvalbard and Jan Mayen\\b|\\bBritish Virgin Island'

In [63]:
# test countries regex
np.random.seed(10)

test_headlines = np.random.choice(data, 10)

for test_headline in test_headlines:
    match = re.search(country_regex, test_headline)
    print(test_headline)
    if match:
        print(match.group(0), "\n")

Rumors about Mumps Spreading in New Bedford have been Refuted
New medicine wipes out West Nile Virus in Ventura
Will Rotavirus vaccine help Addis Ababa?
Chikungunya re-emerges in Kobe
Zika symptoms spotted in Quisqueya
New Delhi Addressing Zika Concerns
West Nile Virus Outbreak in Saint Johns
Toms River Encounters Severe Symptoms of Respiratory Syncytial Virus
The CDC in Atlanta is Growing Worried
Zika case reported in Oton


In [62]:
# test on headline with BOTH city and country
test_headline = data[3]
print(test_headline)
print(re.search(city_regex, test_headline).group(0))
print(re.search(country_regex, test_headline).group(0))

Mystery Virus Spreads in Recife, Brazil
Recife
Brazil


In [73]:
# function for finding city and country
def city_and_country_in_headline(headline):
    """
    Finds any city or country names in text headline.
    
    :param headline: string of the headline
    
    :return dict: dictionary which maps headline to cities and countries contained
    """

    # match with regex
    country_match = re.search(country_regex, headline)
    city_match = re.search(city_regex, headline)
    
    # save actual match (from group(0)) into var or else 'None'
    countries = None if not country_match else country_match.group(0)
    cities = None if not city_match else city_match.group(0)

    return dict(Headline=headline, Countries=countries, Cities=cities)

In [74]:
# test finding function
city_and_country_in_headline(data[3])

{'Headline': 'Mystery Virus Spreads in Recife, Brazil',
 'Countries': 'Brazil',
 'Cities': 'Recife'}

In [75]:
city_and_country_in_headline(data[10])

{'Headline': 'Brownsville teen contracts Zika virus',
 'Countries': None,
 'Cities': 'Brownsville'}

In [76]:
# try applying to all headlines
headline_cities_and_countries = [city_and_country_in_headline(headline) for headline in data]

In [79]:
headline_cities_and_countries[:5]

[{'Headline': 'Zika Outbreak Hits Miami',
  'Countries': None,
  'Cities': 'Miami'},
 {'Headline': 'Could Zika Reach New York City?',
  'Countries': None,
  'Cities': 'New York City'},
 {'Headline': 'First Case of Zika in Miami Beach',
  'Countries': None,
  'Cities': 'Miami Beach'},
 {'Headline': 'Mystery Virus Spreads in Recife, Brazil',
  'Countries': 'Brazil',
  'Cities': 'Recife'},
 {'Headline': 'Dallas man comes down with case of Zika',
  'Countries': None,
  'Cities': 'Dallas'}]

In [82]:
# saving data as JSON
save_file = 'headline_cities_and_countries.json'
with open(save_file, 'w') as fout:
    # convert dict to JSON using .dumps()
    fout.write(json.dumps(headline_cities_and_countries))

In [84]:
# test loading back in
with open(save_file, 'r') as fin:
    check_data = json.loads(fin.read())

check_data[:5]

[{'Headline': 'Zika Outbreak Hits Miami',
  'Countries': None,
  'Cities': 'Miami'},
 {'Headline': 'Could Zika Reach New York City?',
  'Countries': None,
  'Cities': 'New York City'},
 {'Headline': 'First Case of Zika in Miami Beach',
  'Countries': None,
  'Cities': 'Miami Beach'},
 {'Headline': 'Mystery Virus Spreads in Recife, Brazil',
  'Countries': 'Brazil',
  'Cities': 'Recife'},
 {'Headline': 'Dallas man comes down with case of Zika',
  'Countries': None,
  'Cities': 'Dallas'}]

In [85]:
# save mappings
with open('city_accent_mapping.json', 'w') as fout:
    fout.write(json.dumps(city_accent_map))

with open('country_accent_mapping.json', 'w') as fout:
    fout.write(json.dumps(country_accent_map))

In [86]:
# convert to dataframe
data = pd.read_json('headline_cities_and_countries.json')
# replace Nones with NaN so that df can read
data = data.replace({None: np.nan})

data.head(10)

Unnamed: 0,Headline,Countries,Cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
5,Trinidad confirms first Zika case,,Trinidad
6,Zika Concerns are Spreading in Houston,,Houston
7,Geneve Scientists Battle to Find Cure,,Geneve
8,The CDC in Atlanta is Growing Worried,,Atlanta
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo
