# Coursera Capstone Project - Battle of the Neighborhoods


### Loading libraries to extract, explore, segment and cluster information about neighborhoods in Philadelphia and  Toronto.

In [2]:
import numpy as np #Library required for vectorized data handling
import pandas as pd #Library required for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json #Library required to handle JSON files

from bs4 import BeautifulSoup #utilized for web scraping data

#!conda install -c conda-forge geopy=1.49.0 --yes # uncomment this line if you haven't installed the following library for python
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge geocoder --yes
import geocoder

import requests # library to handle requests
import urllib.request

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't installed the following library for python
import folium # map rendering library

print('All libraries are imported!')

All libraries are imported!


**Obtain and visualize the underlying HTML code within our webpage**

In [3]:
#Here we establish the URL that gives us the pertinent information on the Neighborhoods an 
urlT = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
urlP = 'https://en.wikipedia.org/wiki/List_of_Philadelphia_neighborhoods'

pageT = urllib.request.urlopen(urlT)
pageP = urllib.request.urlopen(urlP)

#parse the HTML data into the BeautifulSoup parse tree format and visualize using prettify
soupT = BeautifulSoup(pageT, "html.parser")
soupP = BeautifulSoup(pageP, "html.parser")


#print(soupT.prettify())
#print(soupP.prettify())


**Extract the table from our underlying HTML code that contains the data important to us**

In [4]:
#Established the specific table we are looking for within the parsed html for our web page
tablesT = soupT.find("table", class_='wikitable sortable')
tablesP = soupP.findAll("a")
#tablesT
#tablesP

**Now that we have the raw html output, I will begin cleaning the Philadelphia data.**

In [5]:
#Here we grab the majority of the information that is required, plus a bit of excess. Unfortunately this was the cleanest data I could find on Philadelphia.
PNL = []
for row in soupP.findAll("a"):
    PNL.append(row.get('title'))
    
#print(PNL)

In [6]:
#Here we take a rough outline and filter out the data that is not needed.
dfPRough = pd.DataFrame(PNL, columns=['Neighborhood'])
dfPRough = dfPRough.dropna()
dfPRough = dfPRough.reset_index(drop=True)
dfPRough = dfPRough[8:238]
l2drop=['SEPTA', 'Delaware River', 'Schuylkill River', 'Media/Elwyn Line', 'Semi-detached house', 'Bucks County', 'American Middle class', 'Whites', 'White flight', 'Hispanic and Latino Americans', 'African American', 'Asian American', 'Irish-American',  'Enlarge']
dfPR = dfPRough[~dfPRough.Neighborhood.isin(l2drop)]

In [7]:
#These are to clean up the values for the neighborhoods in the cells so that don't contain redundant values
dfPRtemp = dfPR['Neighborhood'].str.replace('\(page does not exist\)','')
dfPRtemp = dfPRtemp.str.replace('(', "")
dfPRtemp = dfPRtemp.str.replace(')', "")
dfPRtemp = dfPRtemp.str.replace(', Philadelphia, Pennsylvania', "")
dfPRtemp = dfPRtemp.str.replace(', Philadelphia', "")
dfPRtemp = dfPRtemp.str.replace(', PA', "")
dfPRTemp = dfPRtemp.str.replace(', Pennsylvania', "")
dfPR = dfPRtemp.to_frame()

In [8]:
#We sort through finding our boroughs through the use of the Edit Section flag that was leftover. Removing the string
#to set as our borough variable. This will set up our dataframe for finalization
PB = []
PN = []

for row in dfPR['Neighborhood']:
    if "Edit" in row:
        borough = row.replace('Edit section: ', '')
        PB.append(borough)
        PN.append(None)
    else:
        PB.append(borough)
        PN.append(row)

In [31]:
#Finally this gets rid of any None values and ensures that the Borough is not defined as a neighborhood as well.
dfP=pd.DataFrame(PB,columns=['Borough'])
dfP['Neighborhood'] = PN
dfP.dropna(inplace=True)
borP = dfP['Borough'].unique()
dfP = dfP[~dfP.Neighborhood.isin(borP)]
dfP
    

array(['Center City', 'South Philadelphia', 'Southwest Philadelphia',
       'West Philadelphia', 'Lower North Philadelphia',
       'Upper North Philadelphia', 'Bridesburg-Kensington-Port Richmond',
       'Roxborough-Manayunk', 'Germantown-Chestnut Hill',
       'Olney-Oak Lane', 'Near Northeast Philadelphia',
       'Far Northeast Philadelphia'], dtype=object)

**Create and Clean the dataframe to best suit our needs**

**Now that the Philadelphia data has been compiled we shall compile the Toronto data as well.**

In [10]:
#We define 3 empty list to be established with our 3 columns from our web page
#these columns are Postal Code, Borough and Neighbourhood.

A=[]
B=[]
C=[]

#Creates a loop to find all rows that begin with <tr> and contain at least 3 columns. If both
#stipulations are met we extract the text values within <td> for each element in that row and
#append them to our lists.
for row in tablesT.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True).replace("\n",""))
        B.append(cells[1].find(text=True).replace("\n",""))
        C.append(cells[2].find(text=True).replace("\n",""))

In [16]:
#Here we simply define the column headers and apply their respective lists to fill the value below
dfT=pd.DataFrame(A,columns=['Postal Code'])
dfT['Borough']=B
dfT['Neighborhood']=C

#Finally we exclude any Borough that has a value of 'Not assigned'
dfT.drop(dfT[dfT['Borough'] == 'Not assigned'].index, inplace=True)
dfT = dfT.reset_index(drop=True)
dfT

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


**Here I set up the neighborhoods to be search through geocoder to find the latitude and longtiude coordinates for each respective neighborhood.** 

In [28]:
#Repeat a similar process to extracting our data from wikipedia for utilizing
#geocoder to generate latitude and longitude coordinates from the postal codes.

DT=[]
ET=[]
for pc, borough, neighborhood in zip(dfT['Postal Code'], dfT['Borough'], dfT['Neighborhood']):
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(pc))
    lat_lng_coords = g.latlng
    
    lat = lat_lng_coords[0]
    lng = lat_lng_coords[1]
    DT.append(lat)
    ET.append(lng)

#output the list of corresponding latitude and longitude to new columns in the dataframe.
dfT['Latitude']=DT
dfT['Longitude']=ET
dfT.drop(columns=['Postal Code'], inplace=True)
dfT

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.752935,-79.335641
1,North York,Victoria Village,43.728102,-79.31189
2,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
5,Etobicoke,"Islington Avenue, Humber Valley Village",43.667481,-79.528953
6,Scarborough,"Malvern, Rouge",43.808626,-79.189913
7,North York,Don Mills,43.7489,-79.35722
8,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529
9,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529


In [29]:
dfT.to_csv('toronto_neighborhoods.csv', index=False)

In [30]:
dfTor = pd.read_csv('toronto_neighborhoods.csv')
dfTor

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.752935,-79.335641
1,North York,Victoria Village,43.728102,-79.31189
2,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
5,Etobicoke,"Islington Avenue, Humber Valley Village",43.667481,-79.528953
6,Scarborough,"Malvern, Rouge",43.808626,-79.189913
7,North York,Don Mills,43.7489,-79.35722
8,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529
9,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529


In [18]:

#Repeat a similar process to extracting our data from wikipedia for utilizing
#geocoder to generate latitude and longitude coordinates from the postal codes.

DP=[]
EP=[]
for neighborhood in zip(dfP['Neighborhood']):
    g = geocoder.arcgis('{}, Philadelphia, Pennsylvania'.format(neighborhood))
    lat_lng_coords = g.latlng
    
    lat = lat_lng_coords[0]
    lng = lat_lng_coords[1]
    DP.append(lat)
    EP.append(lng)
    
#output the list of corresponding latitude and longitude to new columns in the dataframe.
dfP['Latitude']=DP
dfP['Longitude']=EP
dfP

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
2,Center City,Avenue of the Arts Philadelphia,39.950487,-75.164029
3,Center City,Callowhill,39.960198,-75.166814
4,Center City,Chinatown,39.95535,-75.1558
5,Center City,Elfreth's Alley,39.952763,-75.142396
6,Center City,French Quarter,39.98493,-75.149041
7,Center City,Logan Square,39.95763,-75.16986
8,Center City,Naval Square,39.95222,-75.16218
9,Center City,Jewelers' Row,39.94875,-75.15386
10,Center City,Market East,39.960466,-75.229333
11,Center City,Old City,39.95009,-75.14507


In [22]:
dfT.to_csv('toronto_neighborhoods.csv', index=False)

In [23]:
dfTor = pd.read_csv('toronto_neighborhoods.csv')
dfTor

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667481,-79.528953
6,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
7,M3B,North York,Don Mills,43.7489,-79.35722
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529


**Here I now add the top 25 venues from each neighborhood**

In [14]:
import os

CLIENT_ID =  os.environ.get("CLIENT_ID")# your Foursquare ID
CLIENT_SECRET =  os.environ.get("CLIENT_SECRET") # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
ACCESS_TOKEN = os.environ.get("ACCESS_TOKEN")

print('Your credentails:')
print('CLIENT_ID: Are contained within your conda environment variables!' )
print('CLIENT_SECRET: Are contained within your conda environment variables!')

Your credentails:
CLIENT_ID: Are contained within your conda environment variables!
CLIENT_SECRET: Are contained within your conda environment variables!


In [15]:
limit=10

def getNearbyVenues(names, latitude, longitude, radius = 500) :
    venues_list = []
    
    for name, lat, long in zip(names, latitude, longitude):
        print(name)
        
        #Generate the API request url
        url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            ACCESS_TOKEN,
            VERSION,
            lat,
            long,
            radius,
            limit)
        
        #Make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        #return only relevant information on nearby venues
        venues_list.append([(
            name,
            lat,
            long,
            v['venue']['name'],
            v['venue']['id'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue ID',
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Venue Category']
    return(nearby_venues)          

In [None]:
toronto_venues = getNearbyVenues(names=dfT['Neighborhood'],
                                latitude=dfT['Latitude'],
                                longitude=dfT['Longitude']
                                )

In [45]:
print(toronto_venues.shape)
toronto_venues.head()

(802, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue ID,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.752935,-79.335641,Brookbanks Park,4e8d9dcdd5fbbbb6b3003c7b,43.751976,-79.33214,Park
1,Parkwoods,43.752935,-79.335641,GTA Restoration,53e72d92498e336f61db5bf5,43.753396,-79.333477,Fireworks Store
2,Parkwoods,43.752935,-79.335641,Corrosion Service Company Limited,5921cc82e96d0c63d980640c,43.752432,-79.334661,Construction & Landscaping
3,Parkwoods,43.752935,-79.335641,Variety Store,4cb11e2075ebb60cd1c4caad,43.751974,-79.333114,Food & Drink Shop
4,Parkwoods,43.752935,-79.335641,Three Valleys Park,53695918498e12363ff0d9ad,43.751195,-79.337356,Park


In [47]:
philadelphia_venues = getNearbyVenues(names=dfP['Neighborhood'],
                                latitude=dfP['Latitude'],
                                longitude=dfP['Longitude']
                                )

Avenue of the Arts Philadelphia


KeyError: 'groups'

In [13]:
lat=40.078440
long=-74.987950
radius =500

url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            ACCESS_TOKEN,
            VERSION,
            lat,
            long,
            radius,
            limit)

res = requests.get(url).json()["response"]['groups'][0]['items']
res

NameError: name 'CLIENT_ID' is not defined

In [85]:
for v in res:
    print(v)

{'reasons': {'count': 0, 'items': [{'summary': 'This spot is popular', 'type': 'general', 'reasonName': 'globalInteractionReason'}]}, 'venue': {'id': '4ba3d6b2f964a520236538e3', 'name': 'GameStop', 'location': {'address': '3292 Red Lion Rd', 'crossStreet': 'at Academy Plaza', 'lat': 40.07933147730603, 'lng': -74.99259614323135, 'labeledLatLngs': [{'label': 'display', 'lat': 40.07933147730603, 'lng': -74.99259614323135}], 'distance': 407, 'postalCode': '19114', 'cc': 'US', 'city': 'Philadelphia', 'state': 'PA', 'country': 'United States', 'formattedAddress': ['3292 Red Lion Rd (at Academy Plaza)', 'Philadelphia, PA 19114']}, 'categories': [{'id': '4bf58dd8d48988d10b951735', 'name': 'Video Game Store', 'pluralName': 'Video Game Stores', 'shortName': 'Video Games', 'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/videogames_', 'suffix': '.png'}, 'primary': True}], 'photos': {'count': 0, 'groups': []}}, 'referralId': 'e-0-4ba3d6b2f964a520236538e3-0'}
{'reasons': {'count': 0

In [38]:
print(philadelphia_venues.shape)
philadelphia_venues.head()

NameError: name 'philadelphia_venues' is not defined

In [58]:
vID = '4e8d9dcdd5fbbbb6b3003c7b'
url='https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&oauth_token={}&v={}'.format(
            vID,
            CLIENT_ID,
            CLIENT_SECRET,
            ACCESS_TOKEN,
            VERSION)
requests.get(url).json()

{'meta': {'code': 403,
  'errorType': 'rate_limit_exceeded',
  'errorDetail': 'Quota exceeded',
  'requestId': '5ed4df0998205d001b5dce8a'},
 'response': {}}

In [None]:
def getVenuesDetails(names, vIDs, main_df) :
    
    for name, vID in zip(names, vIDs):
        print(name)
        
        #Generate the API request url
        url='https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
            vID,
            CLIENT_ID,
            CLIENT_SECRET,
            ACCESS_TOKEN,
            VERSION)
        
        #Make the GET request
        results = requests.get(url).json()["response"]['venue']
        
        
        #return only relevant information on nearby venues
        main_df.append([(
            v['venue']['name'],
            v['venue']['id'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue ID',
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Venue Category']
    return(nearby_venues)          