# Neighbourhoods of New York and Toronto

#### Finding best city from two cities(New york or Toronto) to open new Restaurant.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
page= requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

toronto_list = pd.read_html(page, header=0, attrs={"class":"wikitable sortable"})[0]
toronto_list.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
toronto_df=pd.DataFrame(toronto_list)

In [4]:
toronto_df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [5]:
toronto_df.shape

(289, 3)

### Preprocessing the Newyork Data

In [6]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [7]:
borough_indexes=toronto_df[toronto_df['Borough']=='Not assigned'].index
borough_indexes

Int64Index([  0,   1,   9,  13,  20,  21,  30,  36,  37,  45,  46,  50,  51,
             52,  54,  55,  59,  60,  61,  73,  74,  75,  88,  89,  90, 104,
            105, 106, 120, 121, 136, 137, 148, 149, 155, 161, 162, 167, 175,
            181, 182, 188, 189, 190, 194, 195, 201, 202, 203, 204, 209, 210,
            223, 224, 238, 239, 242, 243, 248, 249, 254, 255, 259, 260, 261,
            262, 264, 265, 275, 276, 277, 278, 279, 280, 281, 282, 288],
           dtype='int64')

In [8]:
toronto_df.drop(toronto_df.index[borough_indexes],inplace=True)

In [9]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [10]:
neighbour_indexes=toronto_df[toronto_df['Neighbourhood']=='Not assigned'].index
neighbour_indexes

Int64Index([8], dtype='int64')

In [11]:
toronto_df.drop(toronto_df.index[neighbour_indexes],inplace=True)

In [12]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


#### Merging the Toronto cities into one row

In [13]:
toronto_data=toronto_df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()

In [14]:
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Malvern
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [15]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_data['Borough'].unique()),
        toronto_data.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


### Now we have to set Latitude and Longitude of Toronto

In [16]:
geo_data_df = pd.read_csv('https://cocl.us/Geospatial_data')
geo_data_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
geo_data_df.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

In [18]:
merged_toronto_data=pd.merge(toronto_data,geo_data_df,left_on='Postcode',right_on='Postal Code',how='left').drop('Postal Code',axis=1)
merged_toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Malvern,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [19]:
tor_data=merged_toronto_data.drop(['Postcode'],axis=1)
tor_data.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Scarborough,Malvern,43.806686,-79.194353
1,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


In [20]:
tor_data.shape

(103, 4)

## Now we have to Import New York Data

In [21]:
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


In [22]:
import json # library to handle JSON files

In [23]:
with open('newyork_data.json') as json_data:
    newyork_data=json.load(json_data)

In [24]:
neighbourhoods_data=newyork_data['features']
neighbourhoods_data[0]

{'geometry': {'coordinates': [-73.84720052054902, 40.89470517661],
  'type': 'Point'},
 'geometry_name': 'geom',
 'id': 'nyu_2451_34572.1',
 'properties': {'annoangle': 0.0,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661],
  'borough': 'Bronx',
  'name': 'Wakefield',
  'stacked': 1},
 'type': 'Feature'}

In [25]:
Coloumn_names=['Borough','Neighbourhood','Latitude','Longitude']
newyorkdata=pd.DataFrame(columns=Coloumn_names)

In [26]:
for data in neighbourhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    newyorkdata = newyorkdata.append({'Borough': borough,
                                          'Neighbourhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [27]:
newyorkdata.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [28]:
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(
        len(newyorkdata['Borough'].unique()),
        newyorkdata.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighbourhoods.


In [29]:
newyorkdata.shape

(306, 4)

We have 306 neighborhoods and 5 boroughs in Newyork Data <br>
and 103 neighborhoods and 11 boroughs in Toronto Data 

### Now we have to findout the different venues in two Cities (Newyork and Toronto)

Installing Geographic library for finding location 

In [30]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.18.1                     py_0    conda-forge


Creating Map for Newyork Neighborhoods

In [31]:
address = 'New York City, NY'

geolocator = Nominatim()
location = geolocator.geocode(address)
newyork_latitude = location.latitude
newyork_longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(newyork_latitude, newyork_longitude))

  app.launch_new_instance()


GeocoderQuotaExceeded: HTTP Error 429: Too Many Requests

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [None]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[newyork_latitude, newyork_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(newyorkdata['Latitude'], newyorkdata['Longitude'], newyorkdata['Borough'], newyorkdata['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

## Creating Map for Toronto Neighbourhoods

In [None]:
address = 'Toronto, T'

geolocator = Nominatim()
location = geolocator.geocode(address)
toronto_latitude = location.latitude
toronto_longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

In [None]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(tor_data['Latitude'], tor_data['Longitude'], tor_data['Borough'], tor_data['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Creating Foursquare Url for New York to find the locations

In [None]:
CLIENT_ID = '1SWLON1MUHPTQN1HSBNKZE11YBLHILN3YVP0BADY14LAK1MH' # your Foursquare ID
CLIENT_SECRET = 'JZ2SXLABGNVZPA0BDMQOK5K5BALYCDQGAWXYNU0G11F1ISTP' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

In [None]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    newyork_latitude, 
    newyork_longitude, 
    radius, 
    LIMIT)
url # display URL

Send the GET request and examine the resutls

In [None]:
newyork_results = requests.get(url).json()
newyork_results

### Writing the function to extracts the category of the venue

In [None]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#### cleaning the json and structure it into a *pandas* dataframe.

In [None]:
venues = newyork_results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
newyork_venues = getNearbyVenues(names=newyorkdata['Neighbourhood'],
                                   latitudes=newyorkdata['Latitude'],
                                   longitudes=newyorkdata['Longitude']
                                  )


In [None]:
newyork_venues.head()

In [None]:
newyork_venues.shape

In [None]:
newyork_venues.groupby('Neighbourhood').count()

### Finding the Unique Categories 

In [None]:
len(newyork_venues['Venue Category'].unique())

In [None]:
# one hot encoding
newyork_onehot = pd.get_dummies(newyork_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
newyork_onehot['Neighbourhood'] = newyork_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [newyork_onehot.columns[-1]] + list(newyork_onehot.columns[:-1])
newyork_onehot = newyork_onehot[fixed_columns]

newyork_onehot.head()

In [None]:
newyork_grouped = newyork_onehot.groupby('Neighbourhood').mean().reset_index()
newyork_grouped

### Finding the Frequency of Neighbourhoods for New York

In [None]:
num_top_venues = 5

for hood in newyork_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = newyork_grouped[newyork_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=True).reset_index(drop=True).head(num_top_venues))
    print('\n')

## Finding top diffent venues in New York

In [None]:
def return_most_different_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=True)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
import numpy as np

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most different Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most different Venue'.format(ind+1))

# create a new dataframe
newyork_venues_sorted = pd.DataFrame(columns=columns)
newyork_venues_sorted['Neighbourhood'] = newyork_grouped['Neighbourhood']

for ind in np.arange(newyork_grouped.shape[0]):
    newyork_venues_sorted.iloc[ind, 1:] = return_most_different_venues(newyork_grouped.iloc[ind, :], num_top_venues)

newyork_venues_sorted

## Creating Foursquare Url for Toronto to find the locations

In [None]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    toronto_latitude, 
    toronto_longitude, 
    radius, 
    LIMIT)
url # display URL

Send the GET request and examine the resutls

In [None]:
newyork_results = requests.get(url).json()
newyork_results

### Writing the function to extracts the category of the venue

In [None]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#### cleaning the json and structure it into a *pandas* dataframe.

In [None]:
venues = newyork_results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
toronto_venues = getNearbyVenues(names=tor_data['Neighbourhood'],
                                   latitudes=tor_data['Latitude'],
                                   longitudes=tor_data['Longitude']
                                  )

In [None]:
toronto_venues.head()

In [None]:
toronto_venues.shape

In [None]:
toronto_venues.groupby('Neighbourhood').count()

In [None]:
len(toronto_venues['Venue Category'].unique())

### One hot encoding for Normalizing the Data

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

### Finding the Frequency of Neighbourhoods for Toronto

In [None]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=True).reset_index(drop=True).head(num_top_venues))
    print('\n')

## Finding top diffent venues in Toronto

In [None]:
def return_most_different_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=True)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most different Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most different Venue'.format(ind+1))

# create a new dataframe
toronto_venues_sorted = pd.DataFrame(columns=columns)
toronto_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    toronto_venues_sorted.iloc[ind, 1:] = return_most_different_venues(toronto_grouped.iloc[ind, :], num_top_venues)

toronto_venues_sorted

In [None]:
print("Toronto neighbourhood values :",toronto_venues_sorted.shape)
print("New York neighbourhood Values :",newyork_venues_sorted.shape)