# Get Data

In [1]:
#import libraries
from io import StringIO

import requests
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim

In [2]:
#Download the Wikipedia article that lists metropolitan areas by population
wikipedia_link='https://en.wikipedia.org/wiki/List_of_metropolitan_areas_by_population'
#A timeout keeps the cell from hanging indefinitely on a stalled connection
raw_page = requests.get(wikipedia_link, timeout=30)
#Fail loudly on HTTP errors instead of parsing an error page as if it were the article
raw_page.raise_for_status()
#Keep the decoded HTML text for the table parser in the next cell
page = raw_page.text

In [3]:
#Read the HTML table(s) whose text contains 'Rank'
match = 'Rank'
#Wrap the HTML in StringIO (literal HTML strings are deprecated as input) and
#pass match by keyword (it is keyword-only in recent pandas versions)
dfArray = pd.read_html(StringIO(page), match=match)
#First matching table is the dataset of interest
df = dfArray[0]

In [4]:
#Keep only the descriptive columns of the scraped table
columns_to_remove = ['Rank', 'Official population', 'Year']
df_Metro = df.drop(columns=columns_to_remove)
#Append empty Latitude/Longitude columns; they are filled in by the geocoding step
empty_coords = pd.DataFrame(columns=['Latitude', 'Longitude'])
df_Metro = pd.concat([df_Metro, empty_coords], sort=False)
df_Metro.head()

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude
0,Tokyo,Japan,Asia,,
1,Delhi,India,Asia,,
2,Shanghai,China,Asia,,
3,Jakarta,Indonesia,Asia,,
4,Seoul,South Korea,Asia,,


In [5]:
#Update latitude and longitude for each Metropolitan area
geolocator = Nominatim(user_agent="tor_explorer")

#Go through each row and geocode the query "<Metropolitan>, <Country>"
for index, row in df_Metro.iterrows():
    location = geolocator.geocode(row['Metropolitan'] +', '+ row['Country'])
    #Rows without a geocoding result keep their NaN coordinates
    if location is not None:
        #Write through df_Metro.loc: the row yielded by iterrows() may be a copy,
        #so assigning to it is not guaranteed to update the dataframe
        df_Metro.loc[index, 'Latitude'] = location.latitude
        df_Metro.loc[index, 'Longitude'] = location.longitude
df_Metro.head()

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude
0,Tokyo,Japan,Asia,35.6828,139.759
1,Delhi,India,Asia,28.6517,77.2219
2,Shanghai,China,Asia,31.2323,121.469
3,Jakarta,Indonesia,Asia,-6.17539,106.827
4,Seoul,South Korea,Asia,37.5667,126.978


In [6]:
#Collect the rows that still have no latitude, i.e. were not located
missing_mask = df_Metro['Latitude'].isna()
df_nan = df_Metro[missing_mask]
#Remember their index so only these rows are geocoded again later
nan_index = df_nan.index
df_nan

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude
15,Keihanshin (Kyoto-Osaka-Kobe),Japan,Asia,,
47,"Washington, D.C. - Baltimore",United States,North America,,
64,Hong Kong,CHN,Asia,,
96,San Francisco-Oakland-Hayward,United States,North America,,


In [7]:
#Manual corrections for the rows that could not be located.
#Each entry is (row position, column position, replacement value);
#column 0 is Metropolitan and column 1 is Country.
manual_fixes = [
    (15, 0, 'Kyoto'),
    (47, 0, 'Washington, D.C.'),
    (64, 1, 'China'),
    (96, 0, 'San Francisco'),
]
for row_pos, col_pos, new_value in manual_fixes:
    df_Metro.iloc[row_pos, col_pos] = new_value

In [8]:
#Try again, only for the rows that previously had no coordinates
for i in nan_index:
    #nan_index holds index labels, so use label-based .loc with column names
    #rather than positional .iloc with hard-coded column numbers
    query = df_Metro.loc[i, 'Metropolitan'] + ', ' + df_Metro.loc[i, 'Country']
    location = geolocator.geocode(query)
    #Rows that still cannot be located keep their NaN coordinates
    if location is not None:
        df_Metro.loc[i, 'Latitude'] = location.latitude
        df_Metro.loc[i, 'Longitude'] = location.longitude

In [9]:
#Verify the fixes: list any rows that are still missing a latitude
still_missing = df_Metro['Latitude'].isna()
df_Metro[still_missing]

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude


In [10]:
#Preview the geocoded table instead of dumping every row into the notebook
df_Metro.head(10)

Unnamed: 0,Metropolitan,Country,Continent,Latitude,Longitude
0,Tokyo,Japan,Asia,35.6828,139.759
1,Delhi,India,Asia,28.6517,77.2219
2,Shanghai,China,Asia,31.2323,121.469
3,Jakarta,Indonesia,Asia,-6.17539,106.827
4,Seoul,South Korea,Asia,37.5667,126.978
5,Guangzhou,China,Asia,23.1302,113.259
6,Beijing,China,Asia,40.1906,116.412
7,Manila,Philippines,Asia,14.5906,120.98
8,New York City,United States,North America,40.7127,-74.006
9,Shenzhen,China,Asia,22.5446,114.055


In [12]:
#Persist the geocoded dataframe as JSON in the working directory
output_path = r'MetroData'
df_Metro.to_json(output_path)