# CAPSTONE PROJECT

#### THIS NOTEBOOK WILL BE MAINLY USED FOR THE CAPSTONE PROJECT FROM IBM DATA SCIENCE PROFESSIONAL CERTIFICATE

## 1. Import important libraries

In [2]:
#!pip install --upgrade requests branca six jinja2 numpy chardet idna urllib3 certifi MarkupSafe
#!conda install -c conda-forge folium --yes
#!conda install -c conda-forge geopy --yes
import numpy as np
import pandas as pd
import requests
from sklearn.cluster import KMeans
from pandas.io.json import json_normalize
import folium
from folium.plugins import MiniMap
from geopy.geocoders import Nominatim
import json
import matplotlib.pyplot as plt

print('Libraries imported.')

Libraries imported.


## 2. Load Neighborhoods in New York and Toronto from:
* New York: https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
* Toronto: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

### 2.1. Download and load New York data

In [3]:
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')
print('Loading data...')
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
    
print('\n...data loaded succesfully!')

Data downloaded!
Loading data...

...data loaded succesfully!


#### 2.1.1. Let's take a look at the relevant data in a *pandas* dataframe

In [4]:
neighborhoods_ny = pd.DataFrame(columns = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'])
for data in newyork_data['features']:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods_ny = neighborhoods_ny.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
    
neighborhoods_ny.drop_duplicates(subset = 'Neighborhood', keep = 'first', inplace = True)
neighborhoods_ny.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [5]:
print("{} neighborhoods found in New York city.".format(neighborhoods_ny.shape[0]))

302 neighborhoods found in New York city.


### 2.2. Download and load Toronto data

In [6]:
webPage = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
print('Data downloaded!')
print('Loading data...')
html = webPage.text                                                                       
tableInit = html.find('<table class="wikitable sortable">')                            
tableFinal = html.find('</table>')                                                      
htmlTable = html[tableInit:tableFinal]
print('\n...data loaded succesfully!')

Data downloaded!
Loading data...

...data loaded succesfully!


#### 2.2.1. Let's take a look at the relevant data in a *pandas* dataframe

In [7]:
table = pd.read_html(htmlTable, header = 0)[0]                                            
table["Borough"] = table["Borough"].replace({"Not assigned":np.nan})                
table.dropna(inplace = True)                                                          
table.where(table != "Not assigned", table["Borough"], axis = 0, inplace = True)      
joinedRows = table.groupby("Postcode")["Neighbourhood"].apply(lambda x: ", ".join(x)) 
table.drop_duplicates(["Postcode"],inplace = True)                                    
df = table.join(joinedRows, on = "Postcode", lsuffix='_single')                       
df.drop(columns = ["Neighbourhood_single"], inplace = True)                         
df.reset_index(drop = True, inplace = True)

Add latitude and longitude from: http://cocl.us/Geospatial_data

In [8]:
!wget -q -O 'Geospatial_Coordinates.csv' http://cocl.us/Geospatial_data
geo = pd.read_csv("Geospatial_Coordinates.csv", index_col = 0)         
neighborhoods_to = df.join(geo, on = "Postcode")                     
neighborhoods_to.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [9]:
print("\n{} neighborhoods found in the city of Toronto.".format(neighborhoods_to.shape[0]))


103 neighborhoods found in the city of Toronto.


### 2.3. Visualize the neighborhoods in New York and Toronto

#### 2.3.1. Get center coordinates to visualize both cities at the same time

In [10]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
ny_latitude = location.latitude
ny_longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(ny_latitude, ny_longitude))

address = 'Toronto'

location = geolocator.geocode(address)
to_latitude = location.latitude
to_longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(to_latitude, to_longitude))

centerLatitude = (ny_latitude + to_latitude)/2
centerLongitude = (ny_longitude + to_longitude)/2

print('\nThe geograpical central coordinates are {}, {}.'.format(centerLatitude, centerLongitude))

The geograpical coordinate of New York City are 40.7308619, -73.9871558.
The geograpical coordinate of Toronto are 43.653963, -79.387207.

The geograpical central coordinates are 42.19241245, -76.6871814.


#### 2.3.2 Append New York and Toronto tables

In [33]:
tmp = neighborhoods_to.drop("Postcode",axis = 1)
tmp.rename(columns = {"Neighbourhood":"Neighborhood"}, inplace = True)
df = neighborhoods_ny.append(tmp, True).reset_index(drop = True)
pd.set_option('max_rows', 6)
df

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
...,...,...,...,...
402,East Toronto,Business reply mail Processing Centre969 Eastern,43.662744,-79.321558
403,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509
404,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999


#### 2.3.3 Visualize neighborhoods in a map using *Folium*

In [34]:
map_tony = folium.Map(location=[centerLatitude, centerLongitude], zoom_start=7, width='100%', height='100%')

for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_tony)
map_tony

NEW YORK CITY

In [35]:
map_tony = folium.Map(location=[ny_latitude, ny_longitude], zoom_start=9.5, width='100%', height='100%')

for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_tony)

minimap = MiniMap(position = 'topleft', center_fixed = (ny_latitude, ny_longitude), zoom_level_fixed = 8)
minimap.add_to(map_tony)

minimap = MiniMap(position='bottomright',  center_fixed = (to_latitude, to_longitude), zoom_level_fixed = 8)
minimap.add_to(map_tony)
map_tony


CITY OF TORONTO

In [36]:
map_tony = folium.Map(location=[to_latitude, to_longitude], zoom_start=10, width='100%', height='100%')

for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_tony)

minimap = MiniMap(position = 'topleft', center_fixed = (ny_latitude, ny_longitude), zoom_level_fixed = 8)
minimap.add_to(map_tony)

minimap = MiniMap(position='bottomright',  center_fixed = (to_latitude, to_longitude), zoom_level_fixed = 8)
minimap.add_to(map_tony)
map_tony

## 3. Import data from Foursquare

### 3.1. Setting up the credentials

In [37]:
CLIENT_ID = '2I0S3UDT4JPCUVSECPX2NUVA1DCGMOJCICB5PMJJIVQNMXHV' # your Foursquare ID
CLIENT_SECRET = 'HR1NF5MW52YXVDV0KWI5G3XFVWXFRHRNOBYRQEUCGKXA10CL' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 2I0S3UDT4JPCUVSECPX2NUVA1DCGMOJCICB5PMJJIVQNMXHV
CLIENT_SECRET:HR1NF5MW52YXVDV0KWI5G3XFVWXFRHRNOBYRQEUCGKXA10CL


In [78]:
def getNearbyVenues(names, boroughs, latitudes, longitudes, radius=500):
    ind = 1
    venues_list=[]
    for name, borough, lat, lng in zip(names, boroughs, latitudes, longitudes):
        print(ind,"|",name)
        ind += 1
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            borough,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                  'Borough',
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    print("\nDone!")
    return(nearby_venues)    

In [79]:
tony_venues = getNearbyVenues(names=df['Neighborhood'],
                                   boroughs = df['Borough'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

1 | Wakefield
2 | Co-op City
3 | Eastchester
4 | Fieldston
5 | Riverdale
6 | Kingsbridge
7 | Marble Hill
8 | Woodlawn
9 | Norwood
10 | Williamsbridge
11 | Baychester
12 | Pelham Parkway
13 | City Island
14 | Bedford Park
15 | University Heights
16 | Morris Heights
17 | Fordham
18 | East Tremont
19 | West Farms
20 | High  Bridge
21 | Melrose
22 | Mott Haven
23 | Port Morris
24 | Longwood
25 | Hunts Point
26 | Morrisania
27 | Soundview
28 | Clason Point
29 | Throgs Neck
30 | Country Club
31 | Parkchester
32 | Westchester Square
33 | Van Nest
34 | Morris Park
35 | Belmont
36 | Spuyten Duyvil
37 | North Riverdale
38 | Pelham Bay
39 | Schuylerville
40 | Edgewater Park
41 | Castle Hill
42 | Olinville
43 | Pelham Gardens
44 | Concourse
45 | Unionport
46 | Edenwald
47 | Bay Ridge
48 | Bensonhurst
49 | Sunset Park
50 | Greenpoint
51 | Gravesend
52 | Brighton Beach
53 | Sheepshead Bay
54 | Manhattan Terrace
55 | Flatbush
56 | Crown Heights
57 | East Flatbush
58 | Kensington
59 | Windsor Terrace


KeyError: 'groups'

In [77]:
# one hot encoding
tony_onehot = pd.get_dummies(tony_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tony_onehot['Neighborhood'] = tony_venues['Neighborhood'] 
tony_onehot['Borough'] = tony_venues['Borough']
tony_onehot['Neighborhood Latitude'] = tony_venues['Neighborhood Latitude']
tony_onehot['Neighborhood Longitude'] = tony_venues['Neighborhood Longitude']


# move neighborhood column to the first column
fixed_columns = [tony_onehot.columns[-1]] + list(tony_onehot.columns[:-1])
tony_onehot = tony_onehot[fixed_columns]

tony_grouped = tony_onehot.groupby(['Borough','Neighborhood Latitude','Neighborhood Longitude']).mean().reset_index()
pd.set_option('max_rows', 500)
tony_grouped.shape

(403, 462)

In [41]:
# set number of clusters
kclusters = 5

tony_grouped_clustering = tony_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tony_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print(kmeans.labels_[:6])

[4 4 0 1 1 1]


In [45]:
print(df.shape)
print(len(kmeans.labels_[:]))
tony_merged = df

# add clustering labels
tony_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
tony_merged = tony_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

(409, 4)
401


ValueError: Length of values does not match length of index