In [2]:
import re # text extration regex
import pandas as pd # data manipulation and analysis
import numpy as np
import matplotlib as plt # general visualisation
import geonamescache as gc # identify cities and countries and their location
#from matplotlib import basemap # geographical visualisation
from collections import Counter # check duplicates in dictionary
import unidecode # handling accent marks, editing text
import json # saving data

In [3]:
df = pd.read_json('headline_cities_and_countries.json')
df = df.replace({None: np.nan})
df

Unnamed: 0,Headline,Countries,Cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,,Jerusalem
646,More Zika patients reported in Indang,,Indang
647,Suva authorities confirmed the spread of Rotav...,,Suva
648,More Zika patients reported in Bella Vista,,Bella Vista


In [4]:
# extract longitude and latitude information
countries = gc.GeonamesCache().get_countries()

# access 0th dict key in countries dict
# convert to list
country_key = list(countries.keys())[0]
print(countries[country_key])

{'geonameid': 3041565, 'name': 'Andorra', 'iso': 'AD', 'iso3': 'AND', 'isonumeric': 20, 'fips': 'AN', 'continentcode': 'EU', 'capital': 'Andorra la Vella', 'areakm2': 468, 'population': 84000, 'tld': '.ad', 'currencycode': 'EUR', 'currencyname': 'Euro', 'phone': '376', 'postalcoderegex': '^(?:AD)*(\\d{3})$', 'languages': 'ca', 'neighbours': 'ES,FR'}


In [8]:
# take city name from df column
test_city = 'Boston'

# put into get_cities_by_name()
test_list = gc.GeonamesCache().get_cities_by_name(test_city)

# test_list is a list of dicts

# choose most populous city
pops = [0]*len(test_list)

for entry in range(len(test_list)):
    pops[entry] = [city['population'] for city in list(test_list[entry].values())]

max_ind = pops.index(max(pops))

# extract country name
code = [city['countrycode'] for city in list(test_list[0].values())]

code[0]

'GB'

In [178]:
# encapsulate previous block into function

def extract_data(city, property):
    """
    Takes a specific property of a specific city from geonamescache.
    
    :param city: string of city.
    :param property: string of property value you wish to acquire.
    
    :return result: outputs property you want as a string
     """

    # make sure only actual city names go in
    if city == 'nan': return np.nan

    cities = gc.GeonamesCache().get_cities_by_name(city)

    # makes sure its not an empty list
    if not cities: return np.nan
    
    # choose most populous country
    population_list = [0]*len(cities)

    for entry in range(len(cities)):
        population_list[entry] = [city['population'] for city in cities[entry].values()]

    #print(population_list)

    max_ind = population_list.index(max(population_list)) 

    result = [city[property][0] for city in cities[max_ind].values()]

    return result

In [139]:
# test function
extract_data('Miami Beach', 'countrycode')

['US']

In [179]:
country_codes = [0]*len(df['Cities'])

# iterate through df column
for city in range(len(df['Cities'])):
    #print(city)
    country_codes[city] = extract_data(df['Cities'][city], 'countrycode')
    

In [181]:
longitudes = [0]*len(df['Cities'])
latitudes = [0]*len(df['Cities'])

# iterate through df column
for city in range(len(df['Cities'])):
    #print(city)
    longitudes[city] = extract_data(df['Cities'][city], 'longitude')
    latitudes[city] = extract_data(df['Cities'][city], 'latitude')

In [185]:
# add to df
df['Latitudes'] = latitudes
df['Longitudes'] = longitudes
df['Country Codes'] = country_codes

In [194]:
df

Unnamed: 0,Headline,Countries,Cities,Latitudes,Longitudes,Country Codes
0,Zika Outbreak Hits Miami,,Miami,[25.77427],[-80.19366],[US]
1,Could Zika Reach New York City?,,New York City,[40.71427],[-74.00597],[US]
2,First Case of Zika in Miami Beach,,Miami Beach,[25.79065],[-80.13005],[US]
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife,[-8.05389],[-34.88111],[BR]
4,Dallas man comes down with case of Zika,,Dallas,[32.78306],[-96.80667],[US]
...,...,...,...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,,Jerusalem,[31.76904],[35.21633],[IL]
646,More Zika patients reported in Indang,,Indang,[14.19528],[120.87694],[PH]
647,Suva authorities confirmed the spread of Rotav...,,Suva,[-18.14161],[178.44149],[FJ]
648,More Zika patients reported in Bella Vista,,Bella Vista,[18.45539],[-69.9454],[DO]
