# Part 1 - Get Location Data
In this notebook, I get the top 100 metropolitan areas by population from Wikipedia.  
Link: https://en.wikipedia.org/wiki/List_of_metropolitan_areas_by_population  
I then look up the latitude and longitude of each metro area.  
Areas that cannot be located are corrected and looked up again, so that every metro area ends up with a latitude and longitude.  
Finally, the dataframe is stored as a JSON file for later use.

<div style="text-align: right">
    <h2><a href="https://nbviewer.jupyter.org/github/KathrynDH/IBMCapstoneFinalProject/blob/master/Final%20Project%20-%20Get%20FourSquare%20Data.ipynb">Part 2 &rarr;</a>
    </h2>
</div>

In [1]:
#import libraries
from io import StringIO

import ibm_boto3
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim

In [2]:
# The code was removed by Watson Studio for sharing.

In [3]:
# Download the Wikipedia page that lists metropolitan areas by population
wikipedia_link='https://en.wikipedia.org/wiki/List_of_metropolitan_areas_by_population'
# A timeout keeps this cell from hanging indefinitely if the server never answers
raw_page = requests.get(wikipedia_link, timeout=30)
# Fail here on an HTTP error (4xx/5xx) rather than handing an error page to the parser
raw_page.raise_for_status()
page = raw_page.text

In [4]:
# Parse only the HTML tables whose text contains 'Rank'
match = 'Rank'
# Wrap the markup in StringIO: recent pandas deprecates passing a literal HTML
# string to read_html, while a file-like object is accepted by every version
dfArray = pd.read_html(StringIO(page), match=match)
# read_html returns a list of DataFrames; the first match is the dataset of interest
df=dfArray[0]

In [5]:
# Keep only the descriptive columns of the scraped table
unused_columns = ['Rank', 'Official population', 'Year']
df_Metro = df.drop(columns=unused_columns)
# Append empty Latitude / Longitude columns, to be filled in by the geocoder
empty_coordinates = pd.DataFrame(columns=['Latitude', 'Longitude'])
df_Metro = pd.concat([df_Metro, empty_coordinates], sort=False)
df_Metro.head()

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude
0,Tokyo,Japan,Asia,,
1,Delhi,India,Asia,,
2,Shanghai,China,Asia,,
3,Jakarta,Indonesia,Asia,,
4,Seoul,South Korea,Asia,,


In [6]:
# Look up the latitude and longitude of every metropolitan area
geolocator = Nominatim(user_agent="m_explorer")

# Go through each row
for index, row in df_Metro.iterrows():
    location = geolocator.geocode(row['Metropolitan'] +', '+ row['Country'])
    # Rows without a geocoding result are skipped, so their coordinates stay NaN
    if location is not None:
        # Write through the DataFrame itself: the `row` yielded by iterrows()
        # may be a copy, so assigning to it is not guaranteed to update df_Metro
        df_Metro.at[index, 'Latitude'] = location.latitude
        df_Metro.at[index, 'Longitude'] = location.longitude
df_Metro.head()

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude
0,Tokyo,Japan,Asia,35.6828,139.759
1,Delhi,India,Asia,28.6517,77.2219
2,Shanghai,China,Asia,31.2323,121.469
3,Jakarta,Indonesia,Asia,-6.17539,106.827
4,Seoul,South Korea,Asia,37.5667,126.978


In [7]:
# Collect the areas the geocoder could not locate (latitude still missing)
latitude_missing = df_Metro['Latitude'].isna()
df_nan = df_Metro[latitude_missing]
# Remember which rows they are so that they can be retried
nan_index = df_nan.index
df_nan

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude
15,Keihanshin (Kyoto-Osaka-Kobe),Japan,Asia,,
47,"Washington, D.C. - Baltimore",United States,North America,,
53,San Francisco-San Jose-Oakland,United States,North America,,
65,Hong Kong,CHN,Asia,,


In [8]:
# Manual corrections for the rows that could not be geocoded.
# Each entry is (row position, column position, replacement value);
# column 0 is 'Metropolitan' and column 1 is 'Country'.
corrections = [
    (15, 0, 'Kyoto'),
    (47, 0, 'Washington, D.C.'),
    (53, 0, 'San Francisco'),
    (65, 1, 'China'),
]
for row_pos, col_pos, new_value in corrections:
    df_Metro.iloc[row_pos, col_pos] = new_value

In [9]:
# Show the rows whose latitude is still missing
still_missing = df_Metro['Latitude'].isna()
df_Metro[still_missing]

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude
15,Kyoto,Japan,Asia,,
47,"Washington, D.C.",United States,North America,,
53,San Francisco,United States,North America,,
65,Hong Kong,China,Asia,,


In [10]:
# Retry the geocoding for the rows that had no coordinates
for i in nan_index:
    # nan_index holds index labels, so rows are addressed by label and columns
    # by name instead of relying on labels and positions happening to coincide
    location = geolocator.geocode(df_Metro.at[i, 'Metropolitan'] +', '+ df_Metro.at[i, 'Country'])
    # Rows without a geocoding result keep their NaN coordinates
    if location is not None:
        df_Metro.at[i, 'Latitude'] = location.latitude
        df_Metro.at[i, 'Longitude'] = location.longitude

In [11]:
# Any row listed here still has no latitude; an empty result means all were resolved
df_Metro.loc[df_Metro['Latitude'].isna()]

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude


In [12]:
# Preview the first five rows of the table
df_Metro.head(n=5)

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude
0,Tokyo,Japan,Asia,35.6828,139.759
1,Delhi,India,Asia,28.6517,77.2219
2,Shanghai,China,Asia,31.2323,121.469
3,Jakarta,Indonesia,Asia,-6.17539,106.827
4,Seoul,South Korea,Asia,37.5667,126.978


In [13]:
# Upload the table as JSON under the key 'MetroData.json'
# NOTE(review): `resource` and `bucket` are not defined in the visible cells;
# presumably they come from the cell Watson Studio removed - confirm
metro_bucket = resource.Bucket(name=bucket)
metro_json = df_Metro.to_json()
metro_bucket.put_object(Key='MetroData.json', Body=metro_json)
print('Saved')

Saved


<div style="text-align: right">
    <h2><a href="https://nbviewer.jupyter.org/github/KathrynDH/IBMCapstoneFinalProject/blob/master/Final%20Project%20-%20Get%20FourSquare%20Data.ipynb">Part 2 &rarr;</a>
    </h2>
</div>