In [1]:
# Data Manipulation
import pandas as pd
import numpy as np
# Map Data and Visual
import folium
from geopy.geocoders import Nominatim
# Web Data
import requests
from bs4 import BeautifulSoup
import json
import re

from geopy.geocoders import Nominatim

# Battle of the Neighborhoods: Seattle Neighborhood Clustering

## Notebook Reference:
1. ***Data Acqusition: https://github.com/JoeBattafarano/Battle-of-the-Neighborhoods/blob/master/Data_Acqusition.ipynb***
2. Data Preparation, Exploration, and Analysis: https://github.com/JoeBattafarano/Battle-of-the-Neighborhoods/blob/master/Data_Acqusition.ipynb
3. Report: https://github.com/JoeBattafarano/Battle-of-the-Neighborhoods/blob/master/Seattle-Neighborhood-Clustering.docx
4. Presentation: https://github.com/JoeBattafarano/Battle-of-the-Neighborhoods/blob/master/Seattle-Neighborhood-Clustering.pptx

## Data Acquisition
* [Scraping Addresses](#wiki)
* [Geocoding Addresses](#geo)
* [Foursquare Explore Endpoint](#explore)
* [Foursquare Details Endpoint](#details)

## Scraping Addresses: <a name='wiki'></a>

In [3]:
# Getting Neighborhoods from Wikapedia
neighborhood_url = 'https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Seattle'
neighborhood_response = requests.get(neighborhood_url)
neighborhood_response_soup = BeautifulSoup(neighborhood_response.text)
neighborhood_response_table = neighborhood_response_soup.find('tbody')
neighborhood_response_table_rows = neighborhood_response_table.find_all('tr')[1:]

In [4]:
# Setting up the dataframe
headers  = ['Neighborhood name','Within larger district','Annexed','Locator Map','Street Map','Image','Notes']
sn = pd.DataFrame(columns=headers)

In [5]:
for i in range(len(neighborhood_response_table_rows)):
    sn.loc[i] = [td.text.strip() for td in neighborhood_response_table_rows[i].find_all('td')]

In [6]:
sn.drop(['Annexed','Locator Map','Street Map','Image','Notes'], axis=1,inplace=True)

In [7]:
# Rename Columns:
sn = sn.rename(columns={'Neighborhood name':'Neighborhood','Within larger district':'District'})

In [8]:
#Looking for data that needs to be cleaned:
sn.head()

Unnamed: 0,Neighborhood,District
0,North Seattle,Seattle
1,Broadview,North Seattle[42]
2,Bitter Lake,North Seattle[42]
3,North Beach / Blue Ridge,North Seattle[42]
4,Crown Hill,North Seattle[42]


In [9]:
# Cleaning up addresses:
sn['District'].replace(r'\[.*?\]','',regex=True, inplace=True)
sn['District'] = sn['District'].str.split('/',1, expand=True)
sn['Neighborhood'].replace(r'\[.*?\]','',regex=True, inplace=True)
sn['Neighborhood'] = sn['Neighborhood'].str.split('/',1, expand=True)
sn['Neighborhood'] = sn['Neighborhood'].str.split('(',1, expand=True)
sn['Neighborhood'] = sn['Neighborhood'].str.split('&',1, expand=True)
sn['Neighborhood'] = sn['Neighborhood'].str.split(',',1, expand=True)
sn['Neighborhood'] = sn['Neighborhood'].str.split('\n',1, expand=True)

In [11]:
# Check the shape prior to deletion
sn.shape

(127, 2)

In [12]:
#Check the Shape prior to deletion
# Removing addresses that did not return lat and longs or completely wrong data
sn.drop([34,53,58,67,100,101,103,104], inplace=True)
sn.reset_index(inplace=True,drop=True)
# 34 = West Woodland
# 53 = Pike-Pine Corridor
# 58 = Renton Hill, returns renton slightly outside main area
# 67 = Denny Triangle
# 100 = North Beacon Hill
# 101 = Mid Beacon Hill
# 103 = South Beach Hill
# 104 = Industrial district

In [13]:
#Check the Shape post deletion
sn.shape

(119, 2)

In [14]:
# Prep data to get addresses
city_state = ', Seattle, Washington'
addresses = sn['Neighborhood'] + city_state
locations = pd.DataFrame(columns=['Lat','Long'])
addresses.head()
addresses.shape

(119,)

## Geocoding Addresses <a name='geo'></a>

In [15]:
# Feed in the addresses to get the lat and long
geolocator = Nominatim(user_agent="Seattle_Neighborhoods")
n = 0
for address in addresses:
    location = geolocator.geocode(address)
    locations.at[n,'Lat'] = location.latitude
    locations.at[n,'Long'] = location.longitude
    n = n + 1

In [16]:
# Concat neighborhoods with locations since index is the same
sn = pd.concat([sn,locations], axis = 1)
sn.head()

Unnamed: 0,Neighborhood,District,Lat,Long
0,North Seattle,Seattle,47.6608,-122.291
1,Broadview,North Seattle,47.7223,-122.36
2,Bitter Lake,North Seattle,47.7262,-122.349
3,North Beach,North Seattle,47.6962,-122.392
4,Crown Hill,North Seattle,47.6947,-122.371


In [34]:
# Export data for use
sn.to_csv('sn.csv')

In [35]:
# Quick visual to ensure they are within proximity of each other
# Base lat,long
seattle_lat = 47.6062
seattle_long = -122.3321

# Mapping Toronto
map_seattle = folium.Map(location=[seattle_lat,seattle_long], zoom_start = 11)

# add markers to map
for lat, lng, neighborhood, district in zip(sn['Lat'], sn['Long'], sn['Neighborhood'], sn['District']):
    label = '{}, {}'.format(neighborhood, district)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_seattle)  
map_seattle

## Foursquare Explore Endpoint: <a name='explore'></a>

In [8]:
# Parameters
Client_ID = 'Insert your own'
Client_Secret = 'Insert your own'
version = '20180605'
seattle_lat = 47.6062
seattle_long = -122.3321
limit = 100
radius = 500

In [37]:

# Explore Data
# Can be made into a function to gather each neighborhoods info -- see toronto testing
explore_url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    Client_ID, 
    Client_Secret, 
    version, 
    seattle_lat, 
    seattle_long, 
    radius,
    limit)
# Details data (Premium Calls)


In [20]:
# Make calls to every seattle neighborhood
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            Client_ID, 
            Client_Secret, 
            version, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['id'],
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'id',
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
# Dataset with all neighborhoods, their venues, and more
seattle_venues = getNearbyVenues(names=sn['Neighborhood'],
                                   latitudes=sn['Lat'],
                                   longitudes=sn['Long']
                                  )

In [22]:
seattle_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,id,Venue Latitude,Venue Longitude,Venue Category
0,North Seattle,47.660773,-122.291497,Burke-Gilman Brewing Company,5b538f7e2b9844002ca8dde6,47.661308,-122.288067,Brewery
1,North Seattle,47.660773,-122.291497,Center for Urban Horticulture,4aa2e52ef964a520824220e3,47.657978,-122.290237,College Science Building
2,North Seattle,47.660773,-122.291497,Jak's Grill,4abc357af964a520c98620e3,47.661072,-122.288073,Steakhouse
3,North Seattle,47.660773,-122.291497,Baskin-Robbins,4af5ed48f964a520eafe21e3,47.661336,-122.292004,Ice Cream Shop
4,North Seattle,47.660773,-122.291497,The Seattle Gym,4b774cc8f964a52019902ee3,47.661273,-122.286642,Gym


In [23]:
seattle_venues.shape

(3390, 8)

In [25]:
print('There are {} uniques categories.'.format(len(seattle_venues['Venue Category'].unique())))

There are 308 uniques categories.


In [27]:
# Exporting data for use
seattle_venues.to_csv('seattle_venues.csv')

## Foursquare Details Endpoint <a name='details'></a>

In [7]:
# Obtain details for all venues acquired
def getVenueDetails(venue_ids):
    
    details_list=[]
    for venue_id in venue_ids:
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&v={}'.format(
            venue_id,
            Client_ID, 
            Client_Secret,
            version)
            
        # make the GET request
        details_response = requests.get(url).json()["response"]['venue']
        
        # return only relevant information for each nearby venue
        details_list.append([
        details_response['id'],
        details_response['price']['tier'] if 'price' in details_response else 0,
        details_response['rating'] if 'rating' in details_response else 0])
        details_table = pd.DataFrame(details_list)
        details_table.columns = ['id','price','rating']
    
    return(details_table)

In [2]:
# read in initial query
seattle_venues = pd.read_csv('seattle_venues.csv')

In [3]:
seattle_venues.head()

Unnamed: 0.1,Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,id,Venue Latitude,Venue Longitude,Venue Category
0,0,North Seattle,47.660773,-122.291497,Burke-Gilman Brewing Company,5b538f7e2b9844002ca8dde6,47.661308,-122.288067,Brewery
1,1,North Seattle,47.660773,-122.291497,Center for Urban Horticulture,4aa2e52ef964a520824220e3,47.657978,-122.290237,College Science Building
2,2,North Seattle,47.660773,-122.291497,Jak's Grill,4abc357af964a520c98620e3,47.661072,-122.288073,Steakhouse
3,3,North Seattle,47.660773,-122.291497,Baskin-Robbins,4af5ed48f964a520eafe21e3,47.661336,-122.292004,Ice Cream Shop
4,4,North Seattle,47.660773,-122.291497,The Seattle Gym,4b774cc8f964a52019902ee3,47.661273,-122.286642,Gym


In [4]:
# Cleaning up
venue_ids = seattle_venues['id']
venue_ids.drop_duplicates(inplace=True)
venue_ids.reset_index(inplace=True,drop=True)

## Note: The Premium Call limit is 500 per day. Therefore, I had to parse out my premium calls to each unique venue

In [99]:
#First Day: First 300 of 2285
first_venue_details = venue_ids[0:300]
first_seattle_venues_details = getVenueDetails(first_venue_details)

In [105]:
first_seattle_venues_details.to_csv('first_seattle_venues_details.csv')

In [27]:
# Second Day:300 to 799
second_venue_details = venue_ids[300:800]
second_seattle_venues_details = getVenueDetails(second_venue_details)

In [28]:
second_seattle_venues_details.head()

Unnamed: 0,id,price,rating
0,52e572e4498ec75b76158ae1,0,0.0
1,4b834228f964a52080ff30e3,1,7.1
2,4f31f205e4b09c2f100ab3c2,0,0.0
3,5bbfb34aad1789002c51aba1,1,0.0
4,4b68c40bf964a520b58b2be3,0,0.0


In [29]:
second_seattle_venues_details.to_csv('second_seattle_venues_details.csv')

In [61]:
# Third Day: 800 - 1299
third_venue_details = venue_ids[800:1300]
third_seattle_venues_details = getVenueDetails(third_venue_details)

In [63]:
third_seattle_venues_details.head()

Unnamed: 0,id,price,rating
0,562999b3498ed376b4722af2,0,8.2
1,57767834498e7b1d7599e9db,1,8.2
2,4ad2595cf964a5200be120e3,0,7.8
3,4b53aaaaf964a52002a727e3,1,7.9
4,4b0f0bd0f964a5208c5e23e3,0,8.3


In [64]:
third_seattle_venues_details.to_csv('third_seattle_venues_details.csv')

In [10]:
# Fourth Day: 1300 - 1799
fourth_venue_details = venue_ids[1300:1800]
fourth_seattle_venues_details = getVenueDetails(fourth_venue_details)

In [11]:
fourth_seattle_venues_details.head()

Unnamed: 0,id,price,rating
0,4c114ad6d41e76b0dc26310d,3,7.2
1,4f2c386ee4b09513aefc06ad,1,7.2
2,46216d22f964a52091451fe3,4,8.5
3,59fd011881635b4c218c13bc,1,7.9
4,5a625db412138424d8d8362a,1,7.3


In [13]:
fourth_seattle_venues_details.to_csv('fourth_seattle_venues_details.csv')

In [9]:
# Fifth Day: 1800 - 2285
fifth_venue_details = venue_ids[1800:2285]
fifth_seattle_venues_details = getVenueDetails(fifth_venue_details)

In [10]:
fifth_seattle_venues_details.head()

Unnamed: 0,id,price,rating
0,4ac14ba2f964a5209e9620e3,0,7.8
1,4b69f47df964a5200dbe2be3,2,7.6
2,4c13e6c1583c9c7477833fa4,0,7.2
3,4c5a55e07b049521c644891f,0,7.2
4,4bd7227c4e32d13ae9c8c380,1,6.8


In [11]:
fifth_seattle_venues_details.to_csv('fifth_seattle_venues_details.csv')