# Coursera Capstone project

### Import and install all of the necessary packages
Uncomment and run any installations as necessary

In [25]:
import os

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans

#!pip install geocoder
import geocoder
from geopy.exc import GeocoderServiceError
from geopy.geocoders import Nominatim

#!pip install pgeocode
import pgeocode

#!pip install folium
import folium

### Creating a London Dataset
London's postal codes are divided up into a few different regions and some of the postcodes have sub postcodes for areas with particularly high postal traffic.

A list of postcodes is not easily accessible on the web, so I created one manually with the main postcodes, leaving out any sub-postcodes.

In [4]:
# Create a list of all main postcode districts in London.
# Each entry is (area prefix, highest district number); districts are numbered
# from 1. The order of the areas is significant: it fixes the row order of the
# lookup result built from this list.
postcode_areas = [
    ('WC', 2),   # West Central
    ('EC', 4),   # East Central
    ('E', 20),   # Eastern
    ('N', 22),   # Northern
    ('NW', 11),  # North West
    ('SE', 28),  # South East
    ('SW', 20),  # South West
    ('W', 14),   # Western (was built with the 'SW' prefix, duplicating SW1-SW14)
]

postcodes = [
    '{}{}'.format(area, district)
    for area, last_district in postcode_areas
    for district in range(1, last_district + 1)
]

In [6]:
# Load the Great Britain postcode lookup
lon = pgeocode.Nominatim(country='gb')

# Resolve every postcode in one call; the result is a dataframe of postcode details
london_data = lon.query_postal_code(postcodes)

In [7]:
# Show the rows whose state name is missing, i.e. postcodes the lookup could not resolve
missing_state = london_data['state_name'].isna()
london_data[missing_state]

Unnamed: 0,postal_code,country_code,place_name,state_name,state_code,county_name,county_code,community_name,community_code,latitude,longitude,accuracy
24,E19,,,,,,,,,,,
34,N9,,,,,,,,,,,


In [8]:
# Manually fill in the missing values for postal code N9.
# The row is selected by its postcode and the columns by name instead of by
# position, so the patch still lands on the right cells if the postcode list or
# the column order of the lookup result ever changes.
is_n9 = london_data['postal_code'] == 'N9'
london_data.loc[is_n9, 'latitude'] = 51.6281
london_data.loc[is_n9, 'longitude'] = -0.0560
london_data.loc[is_n9, 'place_name'] = 'Lower Edmonton'

# Re-display the rows that still have no state name to confirm the patch
london_data[london_data['state_name'].isna()]

Unnamed: 0,postal_code,country_code,place_name,state_name,state_code,county_name,county_code,community_name,community_code,latitude,longitude,accuracy
24,E19,,,,,,,,,,,
34,N9,,Lower Edmonton,,,,,,,51.6281,-0.056,


In [9]:
# E19 does not exist, so keep every row except that one
is_e19 = london_data['postal_code'] == 'E19'
london_data = london_data[~is_e19]

In [10]:
# Final target dataset: keep only the columns without any missing value and
# label the place name column as the neighbourhood
london = (
    london_data
    .dropna(axis='columns', how='any')
    .rename(columns={'place_name': 'neighbourhood'})
)
london.head()

Unnamed: 0,postal_code,neighbourhood,latitude,longitude
0,WC1,"St Pancras, Bloomsbury, King's Cross, Mount Pl...",51.5333,-0.1223
1,WC2,"Covent Garden, Charing Cross, Holborn, Trafalg...",51.5142,-0.123382
2,EC1,"Old Street, Farringdon, Holborn, Smithfield, C...",51.5262,-0.105518
3,EC2,"Shoreditch, Barbican, London Wall, Bank, Liver...",51.5251,-0.0857
4,EC3,"Billingsgate, Monument, Bank, Tower Hill, Fenc...",51.5085,-0.1257


In [5]:
# Scrape every table on the Wikipedia page; the area list is the second one
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_areas_of_London')

# Retrieve only the locations with a London post town. The selection is copied
# so the edits below act on an independent frame rather than on a slice.
areas = table[1]
df = areas[areas['Post town'] == "LONDON"].copy()

# Remove parenthesised notes from the location names. regex=True is required:
# pandas 2 treats the pattern as literal text by default and would remove
# nothing. The whitespace left behind by the removal is stripped as well.
df['Location'] = df['Location'].str.replace(r"\(.*\)", "", regex=True).str.strip()

# Correct a misspelt area name. Only the 'Location' cell is rewritten; indexing
# with the boolean mask alone would overwrite every column of the matching row.
df.loc[df['Location'] == 'Somerstown', 'Location'] = 'Somers Town'
df.head()

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
6,Aldgate,City[10],LONDON,EC3,20,TQ334813
7,Aldwych,Westminster[10],LONDON,WC2,20,TQ307810
9,Anerley,Bromley[11],LONDON,SE20,20,TQ345695


In [6]:
# Geocode every London area name.
# Lat_list and Lng_list always end up with one entry per location: an area that
# cannot be geocoded is reported and recorded as NaN, so the lists stay aligned
# with the rows of df.
Lat_list = []
Lng_list = []

locations = df['Location'].tolist()

geolocator = Nominatim(user_agent = 'lo_explorer')

for place in locations:
    address = "{}, London, UK".format(place)
    try:
        location = geolocator.geocode(address)
    except GeocoderServiceError as err:
        # The geocoding service failed (timeout, rate limit, ...): report the
        # reason and treat the place as unresolved instead of aborting the loop
        print('{}: {}'.format(place, err))
        location = None
    else:
        if location is None:
            # geocode() returns None when no match is found for the address
            print(place)

    if location is None:
        Lat_list.append(float('nan'))
        Lng_list.append(float('nan'))
    else:
        Lat_list.append(location.latitude)
        Lng_list.append(location.longitude)

In [7]:
# Build the London dataset as a new frame. assign() returns a copy, so df itself
# is left untouched (a plain `london = df` would only create a second name for
# the same object).
london = (
    df
    .assign(latitude=Lat_list, longitude=Lng_list)
    # A location without coordinates can neither be mapped nor queried for venues
    .dropna(subset=['latitude', 'longitude'])
    .rename(columns={'Location': 'neighbourhood'})
)
london.head()

Unnamed: 0,neighbourhood,London borough,Post town,Postcode district,Dial code,OS grid ref,latitude,longitude
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785,51.487621,0.11405
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805,51.50814,-0.273261
6,Aldgate,City[10],LONDON,EC3,20,TQ334813,51.514248,-0.075719
7,Aldwych,Westminster[10],LONDON,WC2,20,TQ307810,51.51294,-0.118101
9,Anerley,Bromley[11],LONDON,SE20,20,TQ345695,51.407599,-0.061939


### Visualize our London postcodes on a map

In [187]:
# Look up the coordinates of London to centre the map on
address = 'London, UK'
geolocator = Nominatim(user_agent="lo_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

# Base map of London
map_london = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per neighbourhood, with the neighbourhood name as its popup
marker_rows = zip(london['latitude'], london['longitude'], london['neighbourhood'])
for lat, lng, neighbourhood in marker_rows:
    popup = folium.Popup('{}'.format(neighbourhood), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_london)

map_london

### Creating a Paris Dataset
Paris is a smaller city and has far fewer postcodes, so this task is much easier.

In [8]:
# Postal codes of the 20 neighbourhoods in Paris: 75001 up to and including 75020
paris_pc = [str(75000 + district) for district in range(1, 21)]

paris_pc

['75001',
 '75002',
 '75003',
 '75004',
 '75005',
 '75006',
 '75007',
 '75008',
 '75009',
 '75010',
 '75011',
 '75012',
 '75013',
 '75014',
 '75015',
 '75016',
 '75017',
 '75018',
 '75019',
 '75020']

In [9]:
# Load the French postcode lookup
par = pgeocode.Nominatim(country='fr')

# Resolve the Paris postal codes into a dataframe of postcode details
paris_data = par.query_postal_code(paris_pc)
paris_data

Unnamed: 0,postal_code,country_code,place_name,state_name,state_code,county_name,county_code,community_name,community_code,latitude,longitude,accuracy
0,75001,FR,"Paris 01, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8592,2.34525,5
1,75002,FR,"Paris 02, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8655,2.3457,5
2,75003,FR,"Paris 03, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8637,2.35515,5
3,75004,FR,"Paris 04, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8601,2.34975,5
4,75005,FR,"Paris 05, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8448,2.34795,5
5,75006,FR,"Paris 06, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8493,2.3394,5
6,75007,FR,"Paris 07, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8565,2.3349,5
7,75008,FR,"Paris 08, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8763,2.33355,5
8,75009,FR,"Paris 09, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8718,2.34435,5
9,75010,FR,"Paris 10, Paris",Île-de-France,11.0,Paris,75,Paris,751,48.8709,2.35245,5


In [10]:
# Keep only the columns needed for the analysis and label the place name
# column as the neighbourhood
keep_columns = ['postal_code', 'place_name', 'latitude', 'longitude']
paris = paris_data.loc[:, keep_columns].rename(columns={'place_name': 'neighbourhood'})
paris

Unnamed: 0,postal_code,neighbourhood,latitude,longitude
0,75001,"Paris 01, Paris",48.8592,2.34525
1,75002,"Paris 02, Paris",48.8655,2.3457
2,75003,"Paris 03, Paris",48.8637,2.35515
3,75004,"Paris 04, Paris",48.8601,2.34975
4,75005,"Paris 05, Paris",48.8448,2.34795
5,75006,"Paris 06, Paris",48.8493,2.3394
6,75007,"Paris 07, Paris",48.8565,2.3349
7,75008,"Paris 08, Paris",48.8763,2.33355
8,75009,"Paris 09, Paris",48.8718,2.34435
9,75010,"Paris 10, Paris",48.8709,2.35245


### Visualise the Paris neighbourhoods

In [15]:
# Look up the coordinates of Paris to centre the map on
address = 'Paris, FR'
geolocator = Nominatim(user_agent="pa_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

# Base map of Paris
map_paris = folium.Map(location=[latitude, longitude], zoom_start=13)

# One circle marker per neighbourhood, with the neighbourhood name as its popup
marker_rows = zip(paris['latitude'], paris['longitude'], paris['neighbourhood'])
for lat, lng, neighbourhood in marker_rows:
    popup = folium.Popup('{}'.format(neighbourhood), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_paris)

map_paris

### Creating a Berlin Dataset
The Berlin postcodes are not sequential, so it would be harder to create a dataset similar to the Paris dataset. However, unlike for the London postcodes, there is conveniently a table online which can be downloaded.

In [14]:
table = pd.read_html('https://en.wikipedia.org/wiki/Boroughs_and_neighborhoods_of_Berlin')

# Each borough has its own locality table on the page. The boroughs are listed
# here in the order of their tables, the first of which is table[2].
FIRST_BOROUGH_TABLE = 2
boroughs = [
    'Mitte',
    'Friedrichshain-Kreuzberg',
    'Pankow',
    'Charlottenburg-Wilmersdorf',
    'Spandau',
    'Steglitz-Zehlendorf',
    'Tempelhof-Schöneberg',
    'Neukölln',
    'Treptow-Köpenick',
    'Marzahn-Hellersdorf',
    'Lichtenberg',
    'Reinickendorf',
]

# Tag every locality table with the borough it belongs to
frames = [
    table[position].assign(Borough=borough)
    for position, borough in enumerate(boroughs, start=FIRST_BOROUGH_TABLE)
]

# Stack the borough tables; each table keeps its own row labels, as before
berlin_data = pd.concat(frames)
berlin_data.head()

Unnamed: 0,Locality,Area in km2,Population as of 2008,Density inhabitants per km2,Map,Borough
0,(0101) Mitte,10.7,79582,7445,,Mitte
1,(0102) Moabit,7.72,69425,8993,,Mitte
2,(0103) Hansaviertel,0.53,5889,11111,,Mitte
3,(0104) Tiergarten,5.17,12486,2415,,Mitte
4,(0105) Wedding,9.23,76363,8273,,Mitte


In [18]:
# Remove the parenthesised locality number, e.g. "(0101) Mitte" -> "Mitte".
# regex=True is required: pandas 2 treats the pattern as literal text by default
# and would remove nothing. strip() drops the space left in front of the name.
berlin_data['Locality'] = (
    berlin_data['Locality']
    .str.replace(r"\(.*\)", "", regex=True)
    .str.strip()
)
berlin_data.head()

Unnamed: 0,Locality,Area in km2,Population as of 2008,Density inhabitants per km2,Map,Borough
0,Mitte,10.7,79582,7445,,Mitte
1,Moabit,7.72,69425,8993,,Mitte
2,Hansaviertel,0.53,5889,11111,,Mitte
3,Tiergarten,5.17,12486,2415,,Mitte
4,Wedding,9.23,76363,8273,,Mitte


In [19]:
# Geocode every Berlin locality.
# Lat_list and Lng_list always end up with one entry per locality: a locality
# that cannot be geocoded is reported and recorded as NaN, so the lists stay
# aligned with the rows of berlin_data.
Lat_list = []
Lng_list = []

localities = berlin_data['Locality'].tolist()

geolocator = Nominatim(user_agent = 'berlin')

for locality in localities:
    address = "{}, Berlin, Germany".format(locality)
    try:
        location = geolocator.geocode(address)
    except GeocoderServiceError as err:
        # The geocoding service failed (timeout, rate limit, ...): report the
        # reason and treat the locality as unresolved instead of aborting
        print('{}: {}'.format(locality, err))
        location = None
    else:
        if location is None:
            # geocode() returns None when no match is found for the address
            print(locality)

    if location is None:
        Lat_list.append(float('nan'))
        Lng_list.append(float('nan'))
    else:
        Lat_list.append(location.latitude)
        Lng_list.append(location.longitude)

In [20]:
# Build the Berlin dataset as a new frame. assign() returns a copy, so
# berlin_data itself is left untouched (a plain `berlin = berlin_data` would
# only create a second name for the same object).
berlin = (
    berlin_data
    .assign(latitude=Lat_list, longitude=Lng_list)
    # A locality without coordinates can neither be mapped nor queried for venues
    .dropna(subset=['latitude', 'longitude'])
    .loc[:, ['Locality', 'Borough', 'latitude', 'longitude']]
    .rename(columns={'Locality': 'neighbourhood'})
)
berlin.head()

Unnamed: 0,neighbourhood,Borough,latitude,longitude
0,Mitte,Mitte,52.517885,13.40406
1,Moabit,Mitte,52.530102,13.342542
2,Hansaviertel,Mitte,52.519123,13.341872
3,Tiergarten,Mitte,52.509778,13.35726
4,Wedding,Mitte,52.550123,13.34197


In [22]:
# Look up the coordinates of Berlin to centre the map on
address = 'Berlin, DE'
geolocator = Nominatim(user_agent="be_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

# Base map of Berlin
map_berlin = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per neighbourhood; the popup shows "neighbourhood, borough"
marker_rows = zip(berlin['latitude'], berlin['longitude'], berlin['neighbourhood'], berlin['Borough'])
for lat, lng, neighbourhood, borough in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_berlin)

map_berlin

### Using the Foursquare API
Create a function to retrieve foursquare data and put it into a dataframe

In [26]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. Export FOURSQUARE_CLIENT_ID, FOURSQUARE_CLIENT_SECRET
# and FOURSQUARE_ACCESS_TOKEN before starting the kernel.
# NOTE: the keys that used to be hardcoded in this cell have been published with
# the notebook and should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '') # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30

# Point out a missing configuration here rather than at the first API request
if not (CLIENT_ID and CLIENT_SECRET):
    print('Foursquare credentials are not set: export FOURSQUARE_CLIENT_ID '
          'and FOURSQUARE_CLIENT_SECRET before requesting venues.')

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Return the venues Foursquare reports around each neighbourhood.

    One request is sent to the Foursquare "explore" endpoint per neighbourhood,
    using the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Name and coordinates of each neighbourhood, consumed in parallel.
    radius : int, default 500
        Value of the ``radius`` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but has these columns) when no venue is found at all. A venue
        without any category gets None as its 'Venue Category'.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers a request with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'
    request_timeout = 30  # seconds; stops a stalled connection from hanging the notebook

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request and fail loudly on an error status
        response = requests.get(explore_url, params=params, timeout=request_timeout)
        response.raise_for_status()

        # A response without groups simply contributes no venues
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works with zero rows
    return pd.DataFrame(venue_rows, columns=columns)

In [27]:
# Collect the nearby venues of every London neighbourhood
london_venues = getNearbyVenues(
    names=london['neighbourhood'],
    latitudes=london['latitude'],
    longitudes=london['longitude'],
)

In [28]:
# Preview the first five London venues
london_venues.head(n=5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Abbey Wood,51.487621,0.11405,Co-op Food,51.48765,0.11349,Grocery Store
1,Abbey Wood,51.487621,0.11405,Bostal Gardens,51.48667,0.110462,Playground
2,Abbey Wood,51.487621,0.11405,Abbey Wood Caravan Club,51.485502,0.120014,Campground
3,Acton,51.50814,-0.273261,London Star Hotel,51.509624,-0.272456,Hotel
4,Acton,51.50814,-0.273261,The Aeronaut,51.508376,-0.275216,Pub


In [40]:
# One-hot encode the venue categories: one indicator column per category
london_oh = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")

# Add the neighbourhood of each venue back to the encoded frame
london_oh['Neighbourhood'] = london_venues['Neighborhood']

# Rotate the last column to the front so the neighbourhood comes first
column_order = london_oh.columns.tolist()
fixed_columns = column_order[-1:] + column_order[:-1]
london_oh = london_oh[fixed_columns]

# Average the indicators per neighbourhood
london_grouped = london_oh.groupby('Neighbourhood').mean().reset_index()

london_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,Argentinian Restaurant,...,Watch Shop,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo Exhibit
0,Abbey Wood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Acton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0
2,Aldgate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Aldwych,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.033333,0.033333,0.0,0.0,0.0,0.0,0.0
4,Anerley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Collect the nearby venues of every Paris neighbourhood
paris_venues = getNearbyVenues(
    names=paris['neighbourhood'],
    latitudes=paris['latitude'],
    longitudes=paris['longitude'],
)

In [31]:
# Preview the first five Paris venues
paris_venues.head(n=5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Paris 01, Paris",48.8592,2.34525,59 Rivoli,48.859172,2.345648,Art Gallery
1,"Paris 01, Paris",48.8592,2.34525,Hôtel Britannique,48.858155,2.346339,Hotel
2,"Paris 01, Paris",48.8592,2.34525,Hôtel Maison Albar Paris Céline,48.860552,2.344259,Hotel
3,"Paris 01, Paris",48.8592,2.34525,Au Vieux Comptoir,48.858893,2.346129,French Restaurant
4,"Paris 01, Paris",48.8592,2.34525,Novotel Paris Les Halles,48.860777,2.346322,Hotel


In [33]:
# Collect the nearby venues of every Berlin neighbourhood
berlin_venues = getNearbyVenues(
    names=berlin['neighbourhood'],
    latitudes=berlin['latitude'],
    longitudes=berlin['longitude'],
)

In [34]:
# Preview the first five Berlin venues
berlin_venues.head(n=5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Mitte,52.517885,13.40406,Designpanoptikum - surreales Museum für indust...,52.516941,13.406072,Museum
1,Mitte,52.517885,13.40406,"Bronzestatue ""Heiliger St. Georg im Kampf mit ...",52.51629,13.405558,Outdoor Sculpture
2,Mitte,52.517885,13.40406,Kuppelumgang Berliner Dom,52.518966,13.400981,Scenic Lookout
3,Mitte,52.517885,13.40406,Radisson Blu,52.519561,13.402857,Hotel
4,Mitte,52.517885,13.40406,Lustgarten,52.518469,13.399454,Garden
