In [1]:
import numpy as np
import json
import pandas as pd

data = pd.read_json("./data/headline_cities_and_countries.json")
data = data.replace({None: np.nan})
    
data.head(10)

Unnamed: 0,headline,countries,cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
5,Trinidad confirms first Zika case,,Trinidad
6,Zika Concerns are Spreading in Houston,,Houston
7,Geneve Scientists Battle to Find Cure,,Geneve
8,The CDC in Atlanta is Growing Worried,,Atlanta
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo


In [3]:
import geonamescache
gc = geonamescache.GeonamesCache()


In [4]:
acc_cities = [city['name'] for city in gc.get_cities().values()]

In [5]:
import unidecode

city_accent_mapping = {
    unidecode.unidecode(city): city for city in acc_cities
}

city_accent_mapping["Geneve"]

'Genève'

In [6]:
data.loc[6:10, ["headline","countries"]]

Unnamed: 0,headline,countries
6,Zika Concerns are Spreading in Houston,
7,Geneve Scientists Battle to Find Cure,
8,The CDC in Atlanta is Growing Worried,
9,Zika Infested Monkeys in Sao Paulo,
10,Brownsville teen contracts Zika virus,


In [7]:
data.describe()

Unnamed: 0,headline,countries,cities
count,650,15,608
unique,647,10,573
top,Spanish Flu Outbreak in Lisbon,Malaysia,Miami
freq,2,3,4


In [8]:
data[data.countries.notnull()].describe()

Unnamed: 0,headline,countries,cities
count,15,15,15
unique,15,10,14
top,"Zika reaches Johor Bahru, Malaysia",Brazil,Panama City
freq,1,3,2


In [9]:
data[data.cities.notnull()].describe()

Unnamed: 0,headline,countries,cities
count,608,15,608
unique,605,10,573
top,Spanish Flu Spreading through Madrid,Malaysia,Miami
freq,2,3,4


In [10]:
data.loc[data["cities"] == "Monroe", "headline"]

355    Lower Hospitalization in Monroe after Hepatiti...
458               Spike of Syphilis Cases in West Monroe
542                          West Nile Virus Hits Monroe
607    The Spread of Respiratory Syncytial Virus in M...
Name: headline, dtype: object

In [11]:
# Get the dataframe with only not null cities and headlines
data = data[~data.cities.isnull()][['cities', 'headline']]
data.head(20)

Unnamed: 0,cities,headline
0,Miami,Zika Outbreak Hits Miami
1,New York City,Could Zika Reach New York City?
2,Miami Beach,First Case of Zika in Miami Beach
3,Recife,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas,Dallas man comes down with case of Zika
5,Trinidad,Trinidad confirms first Zika case
6,Houston,Zika Concerns are Spreading in Houston
7,Geneve,Geneve Scientists Battle to Find Cure
8,Atlanta,The CDC in Atlanta is Growing Worried
9,Sao Paulo,Zika Infested Monkeys in Sao Paulo


In [12]:
data.describe

<bound method NDFrame.describe of             cities                                           headline
0            Miami                           Zika Outbreak Hits Miami
1    New York City                    Could Zika Reach New York City?
2      Miami Beach                  First Case of Zika in Miami Beach
3           Recife            Mystery Virus Spreads in Recife, Brazil
4           Dallas            Dallas man comes down with case of Zika
..             ...                                                ...
645      Jerusalem  Rumors about Rabies spreading in Jerusalem hav...
646         Indang              More Zika patients reported in Indang
647           Suva  Suva authorities confirmed the spread of Rotav...
648    Bella Vista         More Zika patients reported in Bella Vista
649  Wichita Falls                     Zika Outbreak in Wichita Falls

[608 rows x 2 columns]>

In [19]:
for index, element in data.cities.items():
    print(index, element)

0 Miami
1 New York City
2 Miami Beach
3 Recife
4 Dallas
5 Trinidad
6 Houston
7 Geneve
8 Atlanta
9 Sao Paulo
10 Brownsville
11 St. Louis
12 San Juan
13 Galveston
14 Manila
15 Iloilo
16 Los Angeles
18 Orlando
20 Chicago
21 Tampa
22 Flint
23 Baltimore
24 London
25 Ho Chi Minh City
26 Philadelphia
27 Boston
28 Paris
29 San Diego
30 Bangkok
31 Beijing
32 Salvador
33 Kuala Lumpur
34 Yangon
35 Tallahassee
36 San Francisco
37 Bethesda
38 Townsville
39 Mandaluyong City
40 Santa Rosa
41 San Salvador
42 Cleveland
43 Austin
44 Piracicaba
45 Lima
46 Toronto
47 Bogota
49 Brisbane
50 Dakar
51 Havana
52 Key West
53 Vancouver
54 Seattle
55 Nashville
56 Saint Croix
57 Fort Collins
58 Klang
59 Guatemala City
60 Madison
61 Sarasota
62 Entebbe
64 Brasilia
65 Jacksonville
66 Shenzhen
67 Caracas
68 Quezon City
69 Sydney
70 Mumbai
71 Pune
72 Hanoi
74 Silver Spring
75 Delhi
77 Belize City
78 Campinas
79 Soyapango
80 Rome
81 Seoul
82 Quebec
83 Mexico City
84 Rochester
85 El Paso
86 Cucuta
87 Jakarta
89 Salt Lak

In [31]:
print(data.headline.iloc[18])

Chicago's First Zika Case Confirmed


In [42]:
data

Unnamed: 0,cities,headline
0,Miami,Zika Outbreak Hits Miami
1,New York City,Could Zika Reach New York City?
2,Miami Beach,First Case of Zika in Miami Beach
3,Recife,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas,Dallas man comes down with case of Zika
5,Trinidad,Trinidad confirms first Zika case
6,Houston,Zika Concerns are Spreading in Houston
7,Geneve,Geneve Scientists Battle to Find Cure
8,Atlanta,The CDC in Atlanta is Growing Worried
9,Sao Paulo,Zika Infested Monkeys in Sao Paulo


In [44]:
# Create a mapping from countrycode given by the city 
# to Country name used in the resulting dataframe
from geonamescache.mappers import country
mapper = country(from_key='iso', to_key='name')

latitudes = []
longitudes = []
countries =[]
# Iterate on the cities and append the values for the max population
for element in data.cities.values:
    city = max(gc.get_cities_by_name(city_accent_mapping[element]),key=lambda x: list(x.values())[0]['population'])
    city = list(city.values())[0]
#    headline=data.headline.iloc[index-1]
    cityname=None if not city['name'] else city['name']
    latitude=None if not city['latitude'] else city['latitude']
    longitude=None if not city["longitude"] else city["longitude"]
    country=mapper(city["countrycode"])
    latitudes.append(latitude)
    longitudes.append(longitude)
    countries.append(country)
    
data = data.assign(Latitude=latitudes, Longitude=longitudes, Countries=countries)

In [45]:
data.head()

Unnamed: 0,cities,headline,Latitude,Longitude,Countries
0,Miami,Zika Outbreak Hits Miami,25.77427,-80.19366,United States
1,New York City,Could Zika Reach New York City?,40.71427,-74.00597,United States
2,Miami Beach,First Case of Zika in Miami Beach,25.79065,-80.13005,United States
3,Recife,"Mystery Virus Spreads in Recife, Brazil",-8.05389,-34.88111,Brazil
4,Dallas,Dallas man comes down with case of Zika,32.78306,-96.80667,United States


In [59]:
import pandas as pd
import numpy as np

data.to_json('./data/headlines_and_locations.json',
             orient='records',lines=True)
