Location data is scraped from OpenStreetMap (OSM) using the Overpass API.  
PSGC (Philippine Standard Geographic Code) data is taken from https://data.humdata.org/dataset/philippines-administrative-levels-0-to-3  
OSMPythonTools package: https://wiki.openstreetmap.org/wiki/OSMPythonTools


In [17]:
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely
from OSMPythonTools.overpass import Overpass

# Shared Overpass API client; query_city_province() below sends its queries through it.
overpass = Overpass()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Functions

In [50]:
def string_contains(x, list_of_values, values_map=None, ignore_words=None):
    '''
    Resolves a free-text value x to the matching entry (or entries) of list_of_values.

    Matching is case-insensitive and is attempted in the order below; the first step
    that produces a match wins:
      1. exact match of x after whitespace normalization,
      2. exact match of values_map[x.lower()] when x is a key of values_map,
      3. exact match after removing every ignore_words substring from x,
      4. substring match in either direction (x inside a value, or a value inside x).

    Inputs:
    x: string to evaluate. None, non-string values (e.g. NaN) and blank strings give None.
    list_of_values: list of strings, the accepted canonical values.
    values_map: optional dict mapping lower-cased aliases to a replacement string.
    ignore_words: optional list of substrings to strip from x before steps 3 and 4.

    Returns:
    The single matching string from list_of_values, a list of strings when the
    substring step finds several candidates, or None when nothing matches.
    '''
    # Missing values arrive as None or float NaN; neither can be matched.
    if not isinstance(x, str):
        return None

    # Collapse every run of whitespace to one space and trim the ends.
    x = ' '.join(x.split())
    if not x:
        return None

    def exact_match(text):
        # First canonical value equal to text ignoring case, else None.
        text = text.lower()
        return next((val for val in list_of_values if val.lower() == text), None)

    match = exact_match(x)
    if match is not None:
        return match

    if values_map and x.lower() in values_map:
        # The mapped value replaces x for all later steps as well.
        x = values_map[x.lower()]

        match = exact_match(x)
        if match is not None:
            return match

    if ignore_words:
        x = x.lower()
        for word in ignore_words:
            # Plain substring removal; re-normalize so leftover gaps do not block a match.
            x = ' '.join(x.replace(word.lower(), '').split())

        # TODO: match on complete words shared between x and list_of_values
        # (placeholder left by the original author; not implemented).
        if x:
            match = exact_match(x)
            if match is not None:
                return match

    # An empty string is a substring of every value, so without this guard an input
    # made up only of ignored words (e.g. "City") would return the entire list.
    if not x.strip():
        return None

    needle = x.lower()
    output = [val for val in list_of_values
              if (val.lower() in needle) or (needle in val.lower())]

    if not output:
        return None
    if len(output) == 1:
        return output[0]
    return output

In [51]:
def query_city_province(name, city=True, province=True):
    '''
    Queries OSM ways by exact name through the Overpass API and reads the address
    tags of the first element returned.

    Inputs:
    name: string, name of location to query.
    city: bool, True to include the 'city' key (from the addr:city tag) in the result.
    province: bool, True to include the 'province' key (from the addr:province tag).

    Returns:
    A dictionary holding the requested keys; a key is None when the way has no
    (or an empty) value for that tag. Returns None when the query yields no element,
    so callers must check the result before indexing it.
    '''
    # Escape backslashes first, then double quotes, so a name such as
    # 'St. "Luke" Hospital' cannot terminate the quoted Overpass QL string early.
    escaped_name = str(name).replace('\\', '\\\\').replace('"', '\\"')
    osm_result = overpass.query(f'way["name"="{escaped_name}"]; out body;')
    try:
        osm_way = osm_result.elements()[0]
    except Exception:
        # Best effort: an empty or unusable result means "not found". Unlike a bare
        # except, this does not trap KeyboardInterrupt or SystemExit.
        return None

    result = {}
    if city:
        result['city'] = osm_way.tag('addr:city') or None
    if province:
        result['province'] = osm_way.tag('addr:province') or None
    return result

## Loading Data

In [20]:
# Load the processed records; header=0 consumes the file's own header row and
# `names` replaces it with the column labels used throughout this notebook.
df = pd.read_csv(
    'data/processed/data.csv',
    header=0,
    names=['Name', 'City/Municipality', 'Province', 'Region'],
)

In [21]:
# Administrative-boundary shapefile (level 3, per the file name). Its ADM2_EN and
# ADM3_EN columns are used below as the province and city/municipality name lists.
cities_gdf = gpd.read_file('data/raw/phl_admbnda_adm3_psa_namria_20200529.shp')

### Cleaning Provinces

#### Evaluating Province Names

In [22]:
# Working copy of the province name in which every entry containing 'NCR'
# is collapsed into the single label 'NCR'.
cities_gdf['ADM2_EN2'] = cities_gdf['ADM2_EN'].mask(
    cities_gdf['ADM2_EN'].str.contains('NCR'),
    'NCR',
)

In [23]:
# Canonical province names, taken from the NCR-collapsed boundary column.
provinces = cities_gdf['ADM2_EN2'].unique().tolist()

# Lower-cased province labels as used in OpenStreetMap -> name used in `provinces`.
province_map = {'metro manila': 'NCR'}

In [24]:
# Start from the raw province text with missing values blanked and edges trimmed.
# The result is assigned back explicitly: calling fillna(inplace=True) on
# df['province_map'] is chained assignment, which warns in pandas 2.x and no longer
# updates df once Copy-on-Write is enabled.
df['province_map'] = df['Province'].fillna('').str.strip()

# Resolve each value against the canonical province list; the result is a string,
# a list of candidates when ambiguous, or None when nothing matches.
df['province_map'] = [string_contains(x, provinces, province_map) for x in df['province_map'].values]

In [52]:
# Rows whose province resolved to several candidates (a list) and need manual review.
df.loc[df['province_map'].map(lambda match: isinstance(match, list)), 'province_map']

1189    [Davao del Norte, Davao Oriental, Davao del Su...
Name: province_map, dtype: object

In [54]:
# Inspect the row flagged by the multiple-match review (index label 1189).
df.loc[1189]

Name                                            Davao Doctors Hospital
City/Municipality                                               Davao 
Province                                                        Davao 
Region                                                             NaN
province_map         [Davao del Norte, Davao Oriental, Davao del Su...
city_map                                                    Davao City
Name: 1189, dtype: object

In [26]:
# Clean Davao values: a bare "Davao" city is assigned to Davao del Sur.
# Compare on the stripped text because the raw column carries stray whitespace
# (e.g. 'Davao '), which an exact == 'Davao' test on the raw value would miss.
df.loc[df['City/Municipality'].str.strip() == 'Davao', 'province_map'] = 'Davao del Sur'

#### Identifying Province using OSM

In [38]:
# Query OSM once and reuse the answer; fall back to an empty dict when the lookup
# finds nothing so the demo prints None instead of raising on None['province'].
osm_lookup = query_city_province('Rizal Medical Center') or {}
osm_province = osm_lookup.get('province')

print("for Rizal Medical Center\n")

print("Evaluate what is the appropriate province from osm:")
print(osm_province)
print()
print("Map to the appropriate value in our provinces list via province_map:")
print(string_contains(osm_province, provinces, province_map))

for Rizal Medical Center

Evaluate what is the appropriate province from osm:
Metro Manila

Evaluate what is the appropriate value in our provinces list via province_map:
NCR


In [43]:
# Update the institution with the correct province.
institution = 'Rizal Medical Center'

# A failed lookup yields None; treat it as "no tags" rather than indexing into it.
osm_lookup = query_city_province(institution) or {}
province_match = string_contains(osm_lookup.get('province'), provinces, province_map)

# Only overwrite when OSM resolves to exactly one known province, so an
# unmatched (None) or ambiguous (list) result never wipes the existing value.
if isinstance(province_match, str):
    df.loc[df['Name'] == institution, 'province_map'] = province_match

### Cleaning Cities

In [40]:
# Manually created mapping from lower-cased free-text city values to the canonical
# names used in the boundary data. This can be automated using text analysis.
# Keys must be unique: a repeated key silently overrides the earlier entry
# (the duplicated 'lucena' entry has been removed).
city_map = {
    'malabon city': 'City of Malabon',
    'san jose del monte city': 'City of San Jose del Monte',
    'mandaluyong city': 'City of Mandaluyong',
    'meycauayan city': 'City of Meycauayan',
    'meycauayan': 'City of Meycauayan',
    'tagum': 'City of Tagum',
    'san jose del monte': 'City of San Jose del Monte',
    'malabon': 'City of Malabon',
    'kidapawan city': 'City of Kidapawan',
    'tagum city': 'City of Tagum',
    'sampaloc manila': 'Sampaloc',
    'bacoor': 'Bacoor City',
    'canlubang laguna': 'City of Calamba',
    'cabatauan, iloilo': 'Cabatuan',  # misspelling as it appears in the source data
    'cabatuan, iloilo': 'Cabatuan',
    'mandaue': 'Mandaue City',
    'lawaan, roxas': 'Roxas City',
    'lawaan roxas': 'Roxas City',
    'banica, roxas': 'Roxas City',
    'banica roxas': 'Roxas City',
    'brgy. baybay, roxas': 'Roxas City',
    'cabanatuan': 'Cabanatuan City',
    'mandaluyong': 'City of Mandaluyong',
    'tondo': 'Tondo I / II',
    'caloocan': 'Caloocan City',
    'san miguel, bulacan': 'San Miguel',
    'batangas': 'Batangas City',
    'lucena': 'Lucena City',
    'baguio': 'Baguio City',
    'canlubang': 'City of Calamba',
    'victorias': 'City of Victorias',
    'calbayog': 'Calbayog City',
    'tabaco': 'City of Tabaco',
    'vigan': 'City of Vigan',
    'tagbilaran': 'Tagbilaran City',
    'bogo': 'City of Bogo',
    'cotabato': 'Cotabato City',
    'kidapawan': 'City of Kidapawan',
    'lipa': 'Lipa City',
    'taal lemery': 'Lemery',
    'santiago bayan': 'Santiago',
    'naga panganiban': 'Naga',
    'san pablo rizal': 'San Pablo',
    'malaybalay': 'City of Malaybalay'
}

In [41]:
# Start from the raw city text with missing values blanked and edges trimmed.
# Assigned back explicitly: fillna(inplace=True) on df['city_map'] is chained
# assignment, which warns in pandas 2.x and is a no-op under Copy-on-Write.
df['city_map'] = df['City/Municipality'].fillna('').str.strip()

# Entries that mention these places anywhere in the text collapse to one name.
df.loc[df['city_map'].str.contains('roxas city', case=False), 'city_map'] = 'Roxas City'
df.loc[df['city_map'].str.contains('angono', case=False), 'city_map'] = 'Angono'

# Build the candidate list once; inside the comprehension it would be recomputed
# with unique() for every row of df.
city_names = list(cities_gdf['ADM3_EN'].unique())
df['city_map'] = [string_contains(x, city_names, city_map, ignore_words=['city']) for x in df['city_map'].values]

In [46]:
# Rows whose city resolved to several candidates (a list) and need manual review.
# To see every row, uncomment: pd.set_option('display.max_rows', None)
df.loc[
    df['city_map'].map(lambda match: isinstance(match, list)),
    ['City/Municipality', 'city_map'],
]

Unnamed: 0,City/Municipality,city_map
10857,Gao,"[Argao, Baggao, Baliangao, Balungao, Bongao, C..."
10860,Gao,"[Argao, Baggao, Baliangao, Balungao, Bongao, C..."
11408,Gao,"[Argao, Baggao, Baliangao, Balungao, Bongao, C..."
11611,Gen,"[City of General Trias, Gen. Mariano Alvarez, ..."


In [45]:
# A failed OSM lookup returns None; fall back to an empty dict so the cell shows
# no match instead of raising on None['city'].
string_contains((query_city_province('Rizal Medical Center') or {}).get('city'),
                list(cities_gdf['ADM3_EN'].unique()),
                ignore_words=['city'])

'City of Pasig'

In [16]:
institution = 'Rizal Medical Center'

# A failed lookup yields None; treat it as "no tags" rather than indexing into it.
osm_lookup = query_city_province(institution) or {}
city_match = string_contains(osm_lookup.get('city'),
                             list(cities_gdf['ADM3_EN'].unique()),
                             ignore_words=['city'])

# The institution column is named 'Name' when df is loaded; the old label
# 'Name of Institution/Hospital' raises KeyError on a fresh run.
# Only overwrite when OSM resolves to exactly one known city/municipality.
if isinstance(city_match, str):
    df.loc[df['Name'] == institution, 'city_map'] = city_match
