# Capstone Project - The Battle of the Neighborhoods (Week 2)
### Applied Data Science Capstone by IBM/Coursera



## Introduction: Business Problem <a name="introduction"></a>
What location should we place a new Mexican restaurant in Dallas, Texas?

In [978]:
import requests # library to handle requests
from bs4 import BeautifulSoup
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from geopy.geocoders import Nominatim# module to convert an address into latitude and longitude values
import geocoder

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

# transforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

import folium # plotting library

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets.samples_generator import make_blobs 
from sklearn.preprocessing import StandardScaler 
import matplotlib.pyplot as plt 
%matplotlib inline
from sklearn import preprocessing


#### Define Foursquare Credentials and Version

In [979]:
CLIENT_ID = 'H02GZU3YVDIY5GWT3JMWZ0VZQS1K10ZE5DHV13F03PHY5TUB' # your Foursquare ID
CLIENT_SECRET = 'Q1CEIAOSDCGVEFYCZ3FY3303UJVYN04ZFJAAMXQGDT3EAMOR' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: H02GZU3YVDIY5GWT3JMWZ0VZQS1K10ZE5DHV13F03PHY5TUB
CLIENT_SECRET:Q1CEIAOSDCGVEFYCZ3FY3303UJVYN04ZFJAAMXQGDT3EAMOR


In [980]:
# Load data on neighborhoods in Dallas from wiki and put it into tables using pandas
source = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Downtown_Dallas").text
soup = BeautifulSoup(source, 'html.parser')
neighborhoods = []
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoods.append(row.text)
neighborhoods_df = pd.DataFrame({"Neighborhood": neighborhoods})

display(neighborhoods_df)

Unnamed: 0,Neighborhood
0,Downtown Dallas
1,"Arts District, Dallas"
2,"City Center District, Dallas"
3,"Convention Center District, Dallas"
4,"Farmers Market District, Dallas"
5,"Government District, Dallas"
6,"Main Street District, Dallas"
7,"Reunion District, Dallas"
8,West End Historic District (Dallas)


In [981]:
# Get the coordinates of the districts in Downtown Dallas
def get_coordinates(neighborhood):
    coords = None
    while(coords is None):
        loc = geocoder.arcgis('{}, Dallas, Texas'.format(neighborhood))
        coords = loc.latlng
    return coords

coords = [get_coordinates(neighborhood) for neighborhood in neighborhoods_df["Neighborhood"].tolist()]
# Put the coordinates into a new data frame
coords_df = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
coords_df.head()

Unnamed: 0,Latitude,Longitude
0,32.78137,-96.79758
1,32.78944,-96.79717
2,32.80555,-96.79212
3,32.77276,-96.803
4,32.77469,-96.80153


In [982]:
# Combine the neighborhoods and coordinates data frames
neighborhoods_df['Latitude'] = coords_df['Latitude']
neighborhoods_df['Longitude'] = coords_df['Longitude']
neighborhoods_df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Downtown Dallas,32.78137,-96.79758
1,"Arts District, Dallas",32.78944,-96.79717
2,"City Center District, Dallas",32.80555,-96.79212
3,"Convention Center District, Dallas",32.77276,-96.803
4,"Farmers Market District, Dallas",32.77469,-96.80153


# Map the different Neighborhoods

In [983]:
# Get the coordinates of Dallas
address = 'Dallas, Texas'
geolocator = Nominatim(user_agent="Dal_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Dallas are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Dallas are 32.7762719, -96.7968559.


In [984]:
# Map the neighborhoods of dallas
neighborhoods_map = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, neighborhood in zip(neighborhoods_df['Latitude'], neighborhoods_df['Longitude'], neighborhoods_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(neighborhoods_map)  
    
neighborhoods_map
# neighborhoods_map.save('neighborhoods_map.html')

In [985]:
# address = '1601 Main St, Dallas, TX 75201'
# geolocator = Nominatim(user_agent="Dal_explorer")
# location = geolocator.geocode(address)
# latitude = location.latitude
# longitude = location.longitude
# print('The geographical coordinate of Dallas are {}, {}.'.format(latitude, longitude))

In [986]:
category_id = "4bf58dd8d48988d1c1941735" # Category ID for Mexican Food in FourSquare
LIMIT = 500
city = "Dallas, TX"
print(search_query + ' .... OK!')

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&near={}&limit={}&categoryId={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        city,
        LIMIT,
        category_id) # FOOD Category ID
results = requests.get(url).json()

Food .... OK!


In [987]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [988]:
#Normalize results of dataframe
venues = json_normalize(results['response']['groups'][0]['items'])
venues.head()

  


Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.address,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,venue.location.postalCode,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups,venue.location.crossStreet,venue.delivery.id,venue.delivery.url,venue.delivery.provider.name,venue.delivery.provider.icon.prefix,venue.delivery.provider.icon.sizes,venue.delivery.provider.icon.name,venue.venuePage.id,flags.outsideRadius,venue.location.neighborhood
0,e-0-50c29756e4b047e9d36dc1a9-0,0,"[{'summary': 'This spot is popular', 'type': '...",50c29756e4b047e9d36dc1a9,Meso Maya,1611 McKinney Ave,32.78777,-96.804677,"[{'label': 'display', 'lat': 32.78776962197217...",75202,US,Dallas,TX,United States,"[1611 McKinney Ave, Dallas, TX 75202, United S...","[{'id': '4bf58dd8d48988d1c1941735', 'name': 'M...",0,[],,,,,,,,,,
1,e-0-4a36f23ef964a520099e1fe3-1,0,"[{'summary': 'This spot is popular', 'type': '...",4a36f23ef964a520099e1fe3,Javier's Gourmet Mexicano,4912 Cole Ave,32.827877,-96.786058,"[{'label': 'display', 'lat': 32.82787667540906...",75205,US,Dallas,TX,United States,"[4912 Cole Ave (at Harvard Ave.), Dallas, TX 7...","[{'id': '4bf58dd8d48988d1c1941735', 'name': 'M...",0,[],at Harvard Ave.,1875568.0,https://www.grubhub.com/restaurant/javiers-491...,grubhub,https://fastly.4sqi.net/img/general/cap/,"[40, 50]",/delivery_provider_grubhub_20180129.png,37771309.0,,
2,e-0-4ae0df23f964a520398321e3-2,0,"[{'summary': 'This spot is popular', 'type': '...",4ae0df23f964a520398321e3,Pappasito's Cantina,10433 Lombardy Ln,32.86554,-96.8983,"[{'label': 'display', 'lat': 32.86554, 'lng': ...",75220,US,Dallas,TX,United States,"[10433 Lombardy Ln (at Northwest Hwy), Dallas,...","[{'id': '4bf58dd8d48988d1c1941735', 'name': 'M...",0,[],at Northwest Hwy,1632102.0,https://www.grubhub.com/restaurant/pappasitos-...,grubhub,https://fastly.4sqi.net/img/general/cap/,"[40, 50]",/delivery_provider_grubhub_20180129.png,,,
3,e-0-4f7f7a44e4b06aeda3a4b934-3,0,"[{'summary': 'This spot is popular', 'type': '...",4f7f7a44e4b06aeda3a4b934,El Tizoncito - Lemmon Ave,5150 Lemmon Ave,32.825176,-96.820766,"[{'label': 'display', 'lat': 32.82517634868372...",75209,US,Dallas,TX,United States,"[5150 Lemmon Ave (at Wheeler St), Dallas, TX 7...","[{'id': '4bf58dd8d48988d1c1941735', 'name': 'M...",0,[],at Wheeler St,1591477.0,https://www.grubhub.com/restaurant/el-tizoncit...,grubhub,https://fastly.4sqi.net/img/general/cap/,"[40, 50]",/delivery_provider_grubhub_20180129.png,333958868.0,,
4,e-0-4e4721abc65bd6ffbe9c4bfc-4,0,"[{'summary': 'This spot is popular', 'type': '...",4e4721abc65bd6ffbe9c4bfc,MesoMaya,11909 Preston Rd,32.911259,-96.805183,"[{'label': 'display', 'lat': 32.91125881114663...",75230,US,Dallas,TX,United States,"[11909 Preston Rd, Dallas, TX 75230, United St...","[{'id': '4bf58dd8d48988d1c1941735', 'name': 'M...",0,[],,,,,,,,,,


Next we will clean the json and structure it into a pandas dataframe 

In [989]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
dallas_df = venues.loc[:, filtered_columns]

# filter the category for each row
dallas_df['venue.categories'] = dallas_df.apply(get_category_type, axis=1)

# clean columns
dallas_df.columns = [col.split(".")[-1] for col in dallas_df.columns]
dallas_df.columns = ['Name', 'Categories', 'Latitude', 'Longitude']
dallas_df.head()

Unnamed: 0,Name,Categories,Latitude,Longitude
0,Meso Maya,Mexican Restaurant,32.78777,-96.804677
1,Javier's Gourmet Mexicano,Mexican Restaurant,32.827877,-96.786058
2,Pappasito's Cantina,Mexican Restaurant,32.86554,-96.8983
3,El Tizoncito - Lemmon Ave,Mexican Restaurant,32.825176,-96.820766
4,MesoMaya,Mexican Restaurant,32.911259,-96.805183


And how many venues were returned by Foursquare?

In [990]:
print('{} venues were returned by Foursquare.'.format(dallas_df.shape[0]))


100 venues were returned by Foursquare.


In [991]:
dallas_map = folium.Map(location=[latitude, longitude], zoom_start=9)

folium.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Dallas Eye',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(dallas_map)

for lat, lng, label in zip(dallas_df['Latitude'], dallas_df['Longitude'], dallas_df['Name']):
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6,
            parse_html=False).add_to(dallas_map)  

dallas_map

In [992]:
# Finds the mean latitude and longitude of Dallas
dallas_lat = np.mean([results['response']['geocode']['geometry']['bounds']['ne']['lat'],
                    results['response']['geocode']['geometry']['bounds']['sw']['lat']])
dallas_lng = np.mean([results['response']['geocode']['geometry']['bounds']['ne']['lng'],
                    results['response']['geocode']['geometry']['bounds']['sw']['lng']])
print(dallas_lat, dallas_lng)

# Not sure if this is correctly working ****************************

32.818504000000004 -96.73210999999999


In [993]:
# Mean coordinates of the restaurants in dallas
restaurant_mean = [dallas_df['Latitude'].mean(), dallas_df['Longitude'].mean()]
print(restaurant_mean)

# Mean distance from mean coordinates
print(np.mean(np.apply_along_axis(lambda x: np.linalg.norm(x - restaurant_mean),1,dallas_df[['Latitude','Longitude']].values)))

[32.83880148735847, -96.80346908923764]
0.091398980397128


In [994]:
# Preparing for K-Cluster

In [995]:
clusterList = []
labels = []
clusterArray = np.array([])

for row in range(len(dallas_df)):
    long = dallas_df.at[row, "Longitude"]
    lat  = dallas_df.at[row, "Latitude"]
    labels.append(dallas_df.at[row, "Name"])
    clusterList.append( (long,lat) )

clusterList

[(-96.80467677649854, 32.78776962197217),
 (-96.78605777572541, 32.82787667540906),
 (-96.8983, 32.86554),
 (-96.82076629996793, 32.82517634868372),
 (-96.80518302630891, 32.91125881114663),
 (-96.82550449246007, 32.81422667998143),
 (-96.78745228225957, 32.822439291688546),
 (-96.82497543, 32.81286019),
 (-96.68347, 32.755982),
 (-96.82069405691287, 32.96944128331272),
 (-96.8289519535759, 32.74141207241214),
 (-96.80481386859715, 32.78745530074029),
 (-96.83675358, 32.86570614),
 (-96.82848885698846, 32.77913949947765),
 (-96.86833262060509, 32.909353068379346),
 (-96.93795806859764, 32.869847845164145),
 (-96.76203051263424, 32.81346602671393),
 (-96.78579335030425, 32.8014622160881),
 (-96.80484909425734, 32.78796548521667),
 (-96.80602737375143, 32.83573147018587),
 (-96.85225280120643, 32.86064445066195),
 (-96.82069868678417, 32.850642439015026),
 (-96.81165344113775, 32.8175619278834),
 (-96.78199709027253, 32.78509927848842),
 (-96.82308170087705, 32.827188079153366),
 (-96.85

In [996]:
normalized_X = preprocessing.normalize(clusterList)
normalized_X

array([[-0.94714728,  0.32079904],
       [-0.94700921,  0.3212064 ],
       [-0.94701042,  0.32120284],
       [-0.94705226,  0.32107944],
       [-0.94678021,  0.32188079],
       [-0.9470896 ,  0.32096929],
       [-0.9470268 ,  0.32115454],
       [-0.94709313,  0.32095887],
       [-0.9471197 ,  0.32088046],
       [-0.94662245,  0.32234445],
       [-0.94730941,  0.32031998],
       [-0.94714836,  0.32079587],
       [-0.94694782,  0.32138736],
       [-0.9471969 ,  0.32065252],
       [-0.94684981,  0.32167597],
       [-0.94703759,  0.32112273],
       [-0.94702785,  0.32115145],
       [-0.94708754,  0.32097537],
       [-0.94714688,  0.32080024],
       [-0.94700599,  0.32121589],
       [-0.94697853,  0.32129686],
       [-0.94697643,  0.32130303],
       [-0.94706572,  0.32103974],
       [-0.94713238,  0.32084303],
       [-0.94704862,  0.3210902 ],
       [-0.94712616,  0.32086141],
       [-0.94666664,  0.32221462],
       [-0.94686249,  0.32163865],
       [-0.94650081,

In [997]:
epsilon = 0.3
minimumSamples = 50
db = DBSCAN(eps=epsilon, min_samples=minimumSamples).fit(clusterList)

In [998]:
# Firts, create an array of booleans using the labels from db.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
core_samples_mask

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [999]:
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_clusters_

81

In [1000]:
# Remove repetition in labels by turning it into a set.
unique_labels = set(labels)
unique_labels

{'Amigos Restaurant',
 "Angela's Cafe",
 "Avila's",
 'Beto & Son',
 'Blue Goose Cantina',
 'Cantina Laredo',
 'Casa Navarro',
 'Chichen Itza',
 'Chipotle Mexican Grill',
 "Chuy's",
 'City View Tacos',
 'Desperados Mexican Restaurant',
 "Don Pepe's Rancho Mexican Grill",
 'E Bar Tex-Mex',
 'El Bolero',
 'El Fenix',
 'El Fenix Restaurant',
 'El Paisa Cocina Mexicana',
 'El Palote Panaderia',
 'El Pollo Loco',
 'El Pollo Regio',
 'El Rincon Tapatio',
 'El Taquito',
 'El Tizoncito',
 'El Tizoncito - Lemmon Ave',
 "Fernando's Mexican Cuisine",
 'Freebirds World Burrito',
 "Fuzzy's Taco Shop",
 "Gabriela's & Sofia's",
 'Gloria’s Latin Cuisine',
 'Gonzalez Restaurant',
 "Herrera's",
 'Hugo’s Invitados',
 'Iron Cactus',
 'Jalisco  Mex Kitchen + Bar',
 "Javier's Gourmet Mexicano",
 'Komali',
 'La Calle Doce',
 'La Hacienda Ranch Preston Trail',
 'La Ventana',
 "La Victoria's Restaurant",
 "Manny's Uptown Tex-Mex Restaurante",
 'Mannys Uptown Tex-Mex',
 "Mariano's Hacienda Ranch Dallas",
 "Mario

In [1001]:
# Create colors for the clusters.
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))


In [1002]:
# Plot the points with colors
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels == k)

    # Plot the datapoints that are clustered
    xy = clusterList[class_member_mask & core_samples_mask]
    plt.scatter(xy[:, 0], xy[:, 1],s=50, c=[col], marker=u'o', alpha=0.5)

    # Plot the outliers
    xy = clusterList[class_member_mask & ~core_samples_mask]
    plt.scatter(xy[:, 0], xy[:, 1],s=50, c=[col], marker=u'o', alpha=0.5)

TypeError: only integer scalar arrays can be converted to a scalar index