# IBM Applied Data Science Capstone Course by Coursera

### Designing an optimal location map for the online pharmacy warehouses in London

### Step 1: Library import

In [1]:
import numpy as np
import pandas as pd
import json # library to handle JSON files
!pip install geocoder
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder
import requests # library to handle requests

!pip install BeautifulSoup4
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# install and import folium library 
!pip -q install folium
import folium

print("Libraries import completed.")

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 15.7MB/s ta 0:00:01
[?25hCollecting click (from geocoder)
[?25l  Downloading https://files.pythonhosted.org/packages/d2/3d/fa76db83bf75c4f8d338c2fd15c8d33fdd7ad23a9b5e57eb6c5de26b430e/click-7.1.2-py2.py3-none-any.whl (82kB)
[K     |████████████████████████████████| 92kB 20.0MB/s eta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Collecting future (from geocoder)
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 6.3MB/s eta 0:00:01
Building wheels

### Step 2: Data scraping

In [2]:
# Download the Wikipedia page that lists the areas of London.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the article.
response = requests.get("https://en.wikipedia.org/wiki/List_of_areas_of_London", timeout=30)
response.raise_for_status()
data = response.text

In [3]:
# build the BeautifulSoup tree from the downloaded HTML with the 'html.parser' backend
soup = BeautifulSoup(markup=data, features='html.parser')

In [4]:
# create an empty list that will hold one neighborhood name per element
neighborhoodList = []

In [5]:
# Rebuild the list from scratch (instead of appending to it) so that re-running
# this cell does not duplicate every neighborhood.
# The names are the text of the <li> items inside the third
# "navbox-list navbox-even" table cell found on the page.
navbox_cells = soup.find_all("td", class_="navbox-list navbox-even")
neighborhoodList = [row.text for row in navbox_cells[2].find_all("li")]

In [6]:
# wrap the scraped names in a single-column DataFrame
lon_df = pd.DataFrame(neighborhoodList, columns=["Neighborhood"])

lon_df

Unnamed: 0,Neighborhood
0,Abbey Wood
1,Alperton
2,Anerley
3,Archway
4,Barnes
...,...
64,Walworth
65,Wapping
66,West Drayton
67,Worcester Park


In [7]:
# (rows, columns) of the neighborhood table
lon_df.shape

(69, 1)

### Step 3: Get geographical coordinates

In [8]:
# define a function to get coordinates
def get_latlng(neighborhood, max_attempts=5, retry_delay=1.0):
    """Geocode a London neighborhood with the ArcGIS provider of ``geocoder``.

    Parameters
    ----------
    neighborhood : str
        Neighborhood name; it is formatted into ``'<name>, London, UK'``.
    max_attempts : int, optional
        Maximum number of geocoding requests made before giving up.
    retry_delay : float, optional
        Seconds to wait between two failed attempts.

    Returns
    -------
    The ``latlng`` value of the first response that has one (in this
    notebook's recorded output: ``[latitude, longitude]``).

    Raises
    ------
    ValueError
        If none of the ``max_attempts`` requests returned coordinates.
    """
    import time  # local import: only needed for the pause between retries

    address = '{}, London, UK'.format(neighborhood)
    # Bounded retry: an unresolvable name must not spin forever, and the pause
    # avoids hammering the geocoding service while it is failing.
    for attempt in range(1, max_attempts + 1):
        g = geocoder.arcgis(address)
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        if attempt < max_attempts:
            time.sleep(retry_delay)
    raise ValueError(
        'Could not geocode {!r} after {} attempt(s)'.format(address, max_attempts))

In [9]:
# geocode every neighborhood in table order and collect the results in a list
coords = list(map(get_latlng, lon_df["Neighborhood"]))

In [10]:
# display the geocoding results, one entry per neighborhood
coords

[[51.492450000000076, 0.12127000000003818],
 [51.52687087712042, -0.2064400519240089],
 [51.412330000000054, -0.06538999999997941],
 [51.56574690648617, -0.13491918108038964],
 [51.47457000000003, -0.24211999999994305],
 [51.536480534879935, -0.1109095599068421],
 [51.46760000000006, -0.16289999999997917],
 [51.53292000000005, 0.05461000000002514],
 [51.51906000000008, -0.1289499999999748],
 [51.49790000000007, -0.08143999999992957],
 [51.52702000000005, -0.02593999999993457],
 [51.57649791767992, -0.2175705771750798],
 [51.462680000000034, -0.035579999999924894],
 [51.54868000000005, -0.0917499999999336],
 [51.48699000000005, 0.031870000000026266],
 [51.49014000000005, -0.16247999999995955],
 [51.591498213325025, -0.2029268077365924],
 [51.650000000000034, -0.1999999999999318],
 [51.606445473009835, -0.18057574083288533],
 [51.522164999999994, -0.10715175000000866],
 [51.40715585742943, -0.058433918986020864],
 [51.5191284392908, -0.14264381403357546],
 [51.5267074289349, -0.342207000

In [11]:
# create a temporary dataframe that splits each coordinate pair into Latitude and Longitude columns
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

In [12]:
# copy the coordinates into the original dataframe as two new columns
# (this is a column assignment, not a merge on a key)
lon_df['Latitude'] = df_coords['Latitude']
lon_df['Longitude'] = df_coords['Longitude']

In [13]:
# check the neighborhoods and the coordinates: print the table size, then display the table
print(lon_df.shape)
lon_df

(69, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Abbey Wood,51.492450,0.121270
1,Alperton,51.526871,-0.206440
2,Anerley,51.412330,-0.065390
3,Archway,51.565747,-0.134919
4,Barnes,51.474570,-0.242120
...,...,...,...
64,Walworth,51.487640,-0.095420
65,Wapping,51.504580,-0.055990
66,West Drayton,51.595020,-0.011722
67,Worcester Park,51.370997,-0.228087


### Step 4: Create a map of London

In [14]:
# get the coordinates of London
address = 'London, UK'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() gives back None when the address cannot be resolved; stop here with
# a clear message rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London, UK {}, {}.'.format(latitude, longitude))

The geograpical coordinate of London, UK 51.5073219, -0.1276474.


In [15]:
# base map centred on the London coordinates obtained above
map_lon = folium.Map(location=[latitude, longitude], zoom_start=11)

# one blue circle per neighborhood; its popup shows the neighborhood name
for lat, lng, neighborhood in zip(lon_df['Latitude'], lon_df['Longitude'], lon_df['Neighborhood']):
    label = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_lon)

map_lon

In [16]:
# save the neighborhood map as an HTML file named 'map_lon.html'
map_lon.save('map_lon.html')

### Step 5: Use the Foursquare API to explore the neighborhoods for the Pharmacy shops

In [21]:
# First I define Foursquare Credentials and Version
# The credentials are read from environment variables (with an interactive
# prompt as fallback) so that secrets are never stored in the notebook source
# or in its saved output. Keys that were previously committed in plain text
# should be treated as compromised and rotated.
import os
from getpass import getpass

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare CLIENT_ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare CLIENT_SECRET: ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# confirm that the credentials are loaded without echoing their values
print('Your credentails:')
print('CLIENT_ID: ' + '<redacted>')
print('CLIENT_SECRET:' + '<redacted>')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


#### Let's retrieve up to 100 pharmacy venues within a 5 km radius of each neighborhood's centre.

In [22]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
query = 'Pharmacy'
radius = 5000 # define radius

def getNearbyVenues(names, latitudes, longitudes, radius=5000):
    """Query the Foursquare ``venues/explore`` endpoint once per neighborhood.

    The search term and the result cap come from the module-level ``query``
    and ``LIMIT``; the credentials come from ``CLIENT_ID``, ``CLIENT_SECRET``
    and ``VERSION``.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences: neighborhood name and the coordinates to search around.
    radius : int, optional
        Value sent as the ``radius`` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude', 'Venue Category'. The frame has
        these seven columns even when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting
        # the values into the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'query': query,
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail loudly on HTTP errors and never hang forever
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()

        # a reply without any group simply contributes no rows
        groups = response.json().get('response', {}).get('groups') or []
        results = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue may come without categories; record it with a missing category
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works for zero rows,
    # where assigning .columns on an empty frame would raise
    return pd.DataFrame(venue_rows, columns=columns)

In [23]:
# query Foursquare once per neighborhood, around each neighborhood's own coordinates
london_venues = getNearbyVenues(
    lon_df['Neighborhood'],
    lon_df['Latitude'],
    lon_df['Longitude'],
)

Abbey Wood
Alperton
Anerley
Archway
Barnes
Barnsbury
Battersea
Beckton
Bedford Park
Bermondsey
Bow
Brent Cross
Brockley
Canonbury
Charlton
Chelsea
Chessington
Chipping Barnet
Chislehurst
Clerkenwell
Elmers End
Gidea Park
Greenford
Gunnersbury
Hackbridge
Hackney
Ham
Hampton
Hanwell
Hanworth
Harold Wood
Highams Park
Highbury
Highgate
Hillingdon
Hook
Holloway
Hoxton
Ickenham
Isle of Dogs
Isleworth
Islington
Kensal Green
Kew
Lambeth
Manor Park
Mortlake
Neasden
Northolt
Nunhead
Plaistow (Newham)
Poplar
Roehampton
Rotherhithe
Seven Kings
Seven Sisters
Shoreditch
Stamford Hill
Stepney
St Helier
Surrey Quays
Tottenham
Upper Clapton
Upper Holloway
Walworth
Wapping
West Drayton
Worcester Park
Yiewsley


In [24]:
# print the size of the venue table, then display its first rows
print(london_venues.shape)
london_venues.head()

(3391, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Abbey Wood,51.49245,0.12127,Superdrug,51.506883,0.106192,Pharmacy
1,Abbey Wood,51.49245,0.12127,Morrisons Pharmacy,51.507656,0.105978,Pharmacy
2,Abbey Wood,51.49245,0.12127,LloydsPharmacy,51.48337,0.147426,Pharmacy
3,Abbey Wood,51.49245,0.12127,Superdrug,51.462711,0.10761,Pharmacy
4,Abbey Wood,51.49245,0.12127,Superdrug,51.490997,0.067669,Pharmacy


In [25]:
# keep only the venues whose category is exactly 'Pharmacy'
london_venues[london_venues['Venue Category'].eq('Pharmacy')]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Abbey Wood,51.49245,0.12127,Superdrug,51.506883,0.106192,Pharmacy
1,Abbey Wood,51.49245,0.12127,Morrisons Pharmacy,51.507656,0.105978,Pharmacy
2,Abbey Wood,51.49245,0.12127,LloydsPharmacy,51.483370,0.147426,Pharmacy
3,Abbey Wood,51.49245,0.12127,Superdrug,51.462711,0.107610,Pharmacy
4,Abbey Wood,51.49245,0.12127,Superdrug,51.490997,0.067669,Pharmacy
...,...,...,...,...,...,...,...
3386,Yiewsley,51.51263,-0.47259,Boots,51.545489,-0.477378,Pharmacy
3387,Yiewsley,51.51263,-0.47259,Boots,51.470376,-0.458656,Pharmacy
3388,Yiewsley,51.51263,-0.47259,Adell Pharmacy,51.551479,-0.448780,Pharmacy
3389,Yiewsley,51.51263,-0.47259,Savers,51.546329,-0.480300,Pharmacy


In [26]:
# number of non-null values per column for each neighborhood (i.e. venues returned per neighborhood)
london_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abbey Wood,14,14,14,14,14,14
Alperton,81,81,81,81,81,81
Anerley,23,23,23,23,23,23
Archway,52,52,52,52,52,52
Barnes,49,49,49,49,49,49
...,...,...,...,...,...,...
Walworth,100,100,100,100,100,100
Wapping,71,71,71,71,71,71
West Drayton,21,21,21,21,21,21
Worcester Park,13,13,13,13,13,13


In [27]:
# count the distinct venue categories (a missing value would count as one category)
n_categories = london_venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(n_categories))

There are 5 uniques categories.


### Step 6: Analyze Each Neighborhood

In [28]:
# one hot encoding
# dtype=int keeps the indicators as 0/1 integers regardless of the pandas
# version (newer releases default to booleans)
category_dummies = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="", dtype=int)

# A venue category that is itself called "Neighborhood" would collide with the
# key column added below and be silently overwritten, so rename that indicator first.
if 'Neighborhood' in category_dummies.columns:
    category_dummies = category_dummies.rename(columns={'Neighborhood': 'Neighborhood (category)'})

# put the neighborhood name in the first column, followed by the indicator columns
london_onehot = pd.concat([london_venues['Neighborhood'], category_dummies], axis=1)

london_onehot.head()

Unnamed: 0,Neighborhood,Department Store,Doctor's Office,Health Food Store,Pharmacy,Supermarket
0,Abbey Wood,0,0,0,1,0
1,Abbey Wood,0,0,0,1,0
2,Abbey Wood,0,0,0,1,0
3,Abbey Wood,0,0,0,1,0
4,Abbey Wood,0,0,0,1,0


#### Group rows by neighborhood, taking the mean frequency of occurrence of each category

In [29]:
# average each indicator column per neighborhood, keeping 'Neighborhood' as a regular column
london_grouped = london_onehot.groupby('Neighborhood', as_index=False).mean()
london_grouped

Unnamed: 0,Neighborhood,Department Store,Doctor's Office,Health Food Store,Pharmacy,Supermarket
0,Abbey Wood,0.000000,0.000000,0.000000,0.928571,0.071429
1,Alperton,0.024691,0.000000,0.012346,0.950617,0.012346
2,Anerley,0.000000,0.000000,0.000000,0.956522,0.043478
3,Archway,0.000000,0.019231,0.000000,0.961538,0.019231
4,Barnes,0.000000,0.000000,0.000000,0.959184,0.040816
...,...,...,...,...,...,...
64,Walworth,0.010000,0.000000,0.000000,0.970000,0.020000
65,Wapping,0.000000,0.014085,0.000000,0.957746,0.028169
66,West Drayton,0.000000,0.000000,0.000000,0.952381,0.047619
67,Worcester Park,0.000000,0.000000,0.000000,1.000000,0.000000


In [30]:
# number of neighborhoods with a non-zero pharmacy share
int((london_grouped["Pharmacy"] > 0).sum())

69

#### Creating a dataframe for Pharmacy shops in London.

In [31]:
# independent copy holding only the neighborhood name and its pharmacy share
lon_pharm = london_grouped.loc[:, ['Neighborhood', 'Pharmacy']].copy()
lon_pharm.tail()

Unnamed: 0,Neighborhood,Pharmacy
64,Walworth,0.97
65,Wapping,0.957746
66,West Drayton,0.952381
67,Worcester Park,1.0
68,Yiewsley,1.0


### Step 7: k-means clustering

In [32]:
# set number of clusters
kclusters = 3

# keep only the numeric feature; the axis is named with `columns=` because the
# positional form drop(labels, 1) is rejected by pandas 2.x
lon_clustering = lon_pharm.drop(columns=["Neighborhood"])

# run k-means clustering
# n_init is set explicitly so the number of restarts does not depend on the
# default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(lon_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 1, 1, 1, 1, 0, 1, 0, 0, 1], dtype=int32)

In [33]:
# new dataframe: the per-neighborhood pharmacy share plus the k-means label of each row
lon_merged = lon_pharm.assign(**{"Cluster Labels": kmeans.labels_})

In [34]:
# preview the labelled table
lon_merged.head()

Unnamed: 0,Neighborhood,Pharmacy,Cluster Labels
0,Abbey Wood,0.928571,2
1,Alperton,0.950617,1
2,Anerley,0.956522,1
3,Archway,0.961538,1
4,Barnes,0.959184,1


In [35]:
# join lon_merged with lon_df on the neighborhood name to add latitude/longitude for each neighborhood
lon_merged = lon_merged.join(lon_df.set_index("Neighborhood"), on="Neighborhood")

print(lon_merged.shape)

(69, 5)


In [36]:
# preview the table after the coordinates were added
lon_merged.head()

Unnamed: 0,Neighborhood,Pharmacy,Cluster Labels,Latitude,Longitude
0,Abbey Wood,0.928571,2,51.49245,0.12127
1,Alperton,0.950617,1,51.526871,-0.20644
2,Anerley,0.956522,1,51.41233,-0.06539
3,Archway,0.961538,1,51.565747,-0.134919
4,Barnes,0.959184,1,51.47457,-0.24212


In [37]:
# report the table size, then order the rows by cluster label
print(lon_merged.shape)
lon_merged = lon_merged.sort_values(by="Cluster Labels")
lon_merged

(69, 5)


Unnamed: 0,Neighborhood,Pharmacy,Cluster Labels,Latitude,Longitude
68,Yiewsley,1.000000,0,51.512630,-0.472590
20,Elmers End,1.000000,0,51.407156,-0.058434
42,Kensal Green,0.981132,0,51.530540,-0.225480
22,Greenford,1.000000,0,51.526707,-0.342207
24,Hackbridge,1.000000,0,51.377690,-0.154170
...,...,...,...,...,...
12,Brockley,0.942857,2,51.462680,-0.035580
39,Isle of Dogs,0.928571,2,51.487210,-0.013810
47,Neasden,0.933333,2,51.559708,-0.250301
58,Stamford Hill,0.944444,2,51.570230,-0.072830


### Step 8: Visualization

In [38]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced colour of the rainbow
# colormap per cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(lon_merged['Latitude'], lon_merged['Longitude'], lon_merged['Neighborhood'], lon_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # Cluster k is drawn with palette entry k-1, wrapping around so that
    # cluster 0 takes the last colour. The modulo states this wrap-around
    # explicitly instead of relying on negative list indexing.
    cluster_color = rainbow[(cluster - 1) % kclusters]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [39]:
# save the cluster map as an HTML file named 'map_clusters.html'
map_clusters.save('map_clusters.html')

### Step 9: Result analysis

Let's overview each cluster:

#### Cluster 0 - Red colour

In [36]:
# neighborhoods that k-means assigned to cluster 0
lon_merged[lon_merged['Cluster Labels'].eq(0)]

Unnamed: 0,Neighborhood,Pharmacy,Cluster Labels,Latitude,Longitude
68,Yiewsley,1.0,0,51.51263,-0.47259
20,Elmers End,1.0,0,51.407156,-0.058434
41,Islington,1.0,0,51.53279,-0.10614
22,Greenford,1.0,0,51.526707,-0.342207
24,Hackbridge,1.0,0,51.37769,-0.15417
28,Hanwell,1.0,0,51.50878,-0.3363
29,Hanworth,1.0,0,51.625332,-0.050611
19,Clerkenwell,1.0,0,51.522165,-0.107152
30,Harold Wood,1.0,0,51.59207,0.23277
32,Highbury,0.986111,0,51.55318,-0.09839


#### Cluster 1 - Purple colour

In [37]:
# neighborhoods that k-means assigned to cluster 1
lon_merged[lon_merged['Cluster Labels'].eq(1)]

Unnamed: 0,Neighborhood,Pharmacy,Cluster Labels,Latitude,Longitude
53,Rotherhithe,0.969697,1,51.49574,-0.05157
66,West Drayton,0.952381,1,51.59502,-0.011722
46,Mortlake,0.96875,1,51.46482,-0.26591
65,Wapping,0.956522,1,51.50458,-0.05599
64,Walworth,0.959184,1,51.48764,-0.09542
49,Nunhead,0.953488,1,51.46671,-0.0515
50,Plaistow (Newham),0.961538,1,51.52653,0.02876
62,Upper Clapton,0.95122,1,51.559163,-0.05611
54,Seven Kings,0.952381,1,51.53407,0.036
60,Surrey Quays,0.96875,1,51.49356,-0.04782


#### Cluster 2 - Blue colour

In [38]:
# neighborhoods that k-means assigned to cluster 2
lon_merged[lon_merged['Cluster Labels'].eq(2)]

Unnamed: 0,Neighborhood,Pharmacy,Cluster Labels,Latitude,Longitude
16,Chessington,0.941176,2,51.591498,-0.202927
12,Brockley,0.942857,2,51.46268,-0.03558
47,Neasden,0.933333,2,51.559708,-0.250301
39,Isle of Dogs,0.928571,2,51.48721,-0.01381
45,Manor Park,0.9,2,51.55241,0.05258
58,Stamford Hill,0.944444,2,51.57023,-0.07283
0,Abbey Wood,0.928571,2,51.49245,0.12127


#### Decision

Therefore, the top-tier target locations for warehouses are the neighborhoods highlighted in red (Cluster 0).