# Peer-graded assignment : Segmentation and Clustering Neighborhoods of Toronto

In [1]:
import numpy as np
import pandas as pd

## Step 1 :
Getting data from wikipedia : https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M,

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df_raw = pd.read_html(url)

print('There are {} table(s) on this pages'.format(len(df_raw)))

There are 3 table(s) on this pages


## Step 2:
We only need the first table. 

In [3]:
df_toronto = df_raw[0]
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Step 3 : Clean Data

### Remove Borough equals 'Not Assigned' rows
### Replace Nan Neighborhoods by Borough.

In [4]:
#### Function to replace Nan or 'Not Assigned Neighborhoods with the Borough value'

def normalize_Neighborhood(row):
    if (row['Neighborhood'] == np.nan or row['Neighborhood'] == 'Not Assigned'):
        print('Replacing {} by {}'.format(row['Neighborhood'], row['Borough']))
        return row['Borough']
    else:
        return row['Neighborhood']


Removing Not assigned rows en reset index

In [5]:
print('Number of rows before Not Assigned cleaning {}', len(df_toronto))
df_toronto = df_toronto[df_toronto['Borough'] != 'Not assigned'].reset_index(drop=True)
print('Number of rows after N ot Assigned cleaning {}', len(df_toronto))

Number of rows before Not Assigned cleaning {} 180
Number of rows after N ot Assigned cleaning {} 103


replace 'Not assigned neighborhoods by Borough values.

In [6]:
df_toronto['Neighborhood'] = df_toronto.apply(normalize_Neighborhood, axis=1)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
print('thez dimensions of the dataframe is {}.'.format(df_toronto.shape))

thez dimensions of the dataframe is (103, 3).


## Step 4 : Get coordinates from  http://cocl.us/Geospatial_data (csv file) and merge with df_toronto dataframe

Save data local

In [8]:
#!wget -q -O 'Geospatial_data.csv' http://cocl.us/Geospatial_data
#print('Data downloaded!')

Load data from local file to dataframe. If the file doesn't exists fetch it from http://cocl.us/Geospatial_data.

In [9]:
try:
    df_geo = pd.read_csv("Geospatial_data.csv")
except:
    !wget -q -O 'Geospatial_data.csv' http://cocl.us/Geospatial_data
    print('Data downloaded!')
    df_geo = pd.read_csv("Geospatial_data.csv")

df_geo.head()

Data downloaded!


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge df_toronto with df_geo

In [10]:
df_toronto = pd.merge(df_toronto, df_geo, on="Postal Code")
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [11]:
print('The Totronto dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df_toronto['Borough'].unique()),
        df_toronto.shape[0]
    )
)
df_toronto['Borough'].unique()

The Totronto dataframe has 10 boroughs and 103 neighborhoods.


array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

## Get more information of the Neighborhoods of Toronto from Foursquare

In [12]:
# The code was removed by Watson Studio for sharing.

Your credentails:
CLIENT_ID: E5A3GWKSLRSOJZIPZZD5V3FOXFWMI1LDXZYHGCC3WFW52BZY
CLIENT_SECRET:Q5TZKSCDK41J1F0DEVKBMYCE0KE2L5RB2AJ4RGEQZUWYQDYX


Borrowed from the lab Clustering Neighborhoods. Get NearbyVenues

In [13]:
import requests #

def getNearbyVenues(boroughs, names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for borough,name, lat, lng in zip(boroughs, names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            borough,
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough',
                  'Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
#!rm df_toronto_venues.csv 
# First try to see if data is already uploaded from Foursquare (save credits). If you want to start always from fresh data uncomment line abouve to remove local file.
file_toronto_venues='df_toronto_venues.csv'
try:
    df_toronto_venues = pd.read_csv(file_toronto_venues)
    print('Reading data from local file {}'.format(file_toronto_venues))
except:
    radius = 500
    LIMIT = 100
    df_toronto_venues = getNearbyVenues(boroughs=df_toronto['Borough'], names=df_toronto['Neighborhood'], latitudes=df_toronto['Latitude'], longitudes=df_toronto['Longitude'], radius=radius, LIMIT=LIMIT)
    df_toronto_venues.to_csv(file_toronto_venues)

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview
The Danforth West, Ri

In [44]:
df_toronto_number_neighborhoods_borough = df_toronto_venues[['Borough','Neighborhood','Venue']].groupby(['Borough','Neighborhood'], as_index=False).count()

df_toronto_venus_analysis = df_toronto_number_neighborhoods_borough.groupby('Borough').agg(['count','sum','max', 'min', 'mean', 'std']).rename( columns={"count":"#Boroughs","sum" : "#Venues"})
df_toronto_venus_analysis

Unnamed: 0_level_0,Venue,Venue,Venue,Venue,Venue,Venue
Unnamed: 0_level_1,#Boroughs,#Venues,max,min,mean,std
Borough,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Central Toronto,9,114,35,2,12.666667,11.726039
Downtown Toronto,19,1210,100,4,63.684211,32.405508
East Toronto,5,124,42,4,24.8,16.037456
East York,5,74,34,3,14.8,11.987493
Etobicoke,11,75,13,1,6.818182,4.729021
Mississauga,1,12,12,12,12.0,
North York,18,239,64,1,13.277778,16.516678
Scarborough,16,94,14,1,5.875,4.129165
West Toronto,6,156,42,13,26.0,11.207141
York,5,17,4,2,3.4,0.894427


### Analysis of the Boroughs and Venues in Toronto
There is a hugh difference in the number of venues per Borough. ie. York has just 17 venues in 5 neighborhoods where as Downtown Toronto has 1210 in 19 neighborhoods. 

# Now the data is ready lets play.

## Visualize Boroughs, Neighborhoods en number of venues on map
Mark Neigborhoods on map with circle with:
<ul>
    <li> 
        the bordor-color to distiguist between Boroughs
    </li>
    <li>
        fill-color indicating the number of venues in that neighborhood
    </li>
    <li>
        popup when clicking on the circle displaying Neigborhood, Borough and number of Venues in the Neighborhood.
    </li>
        
</ul>
    

### Import libraries

In [48]:
try:
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
except errorValue:
    print('geopy not installed. Installing now. May take a while')
    !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

try:
    import folium # map rendering library
except:
    print('Folium not installed. Installing now. This can take a while')
    !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
    import folium # map rendering library

print('Libraries imported.')

Folium not installed. Installing now. This can take a while
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    ------------------------

Funcion to define and calculate the color.

In [49]:
import matplotlib.cm as cm
import matplotlib.colors as colors

def make_rainbow(plist):
    colors_array = cm.rainbow(np.linspace(0, 1, len(plist)))
    return [colors.rgb2hex(i) for i in colors_array]

def get_color(elem, plist, rainbow):
    index = np.where(plist == elem)[0]
    return rainbow[index[0]]

def get_color_cat(total):
    if total < 5:
        return 'green'
    elif total < 10:
        return 'blue'
    elif total < 15:
        return 'yellow'
    elif total < 20:
        return 'orange'
    elif total < 30:
        return 'red'
    else:
        return 'purple'

In [57]:

def create_map(address, df_map):
    

    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
    print('Boroughs of Toronto {}'.format(df_toronto['Borough'].unique()))
    
    map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)


    #df_map = df_toronto_venues #[(df_toronto['Borough'] == 'Downtown Toronto') | (df_toronto['Borough'] == 'North York')]
    unique_list = df_map['Borough'].unique()
    rainbow = make_rainbow(unique_list)
    print('Availabe colors {}'.format(rainbow))
    # add markers to map
    for lat, lng, borough, neighborhood, venues in zip(df_map['Latitude'], df_map['Longitude'], df_map['Borough'], df_map['Neighborhood'], df_map['Venues']):
        label = '{}, {} has {} venues'.format(neighborhood, borough, venues)
        label = folium.Popup(label, parse_html=True)

        folium.CircleMarker(
            [lat, lng],
            radius=10,
            popup=label,
            color=get_color(elem=borough,plist=unique_list, rainbow=rainbow),
            fill=True,
            fill_color=get_color_cat(venues),
            fill_opacity=0.7,
            parse_html=False).add_to(map_toronto)
    
        
    return map_toronto

## Map of Toronto with the neighborhoods marked with a circle. Neighborhoods of the same Borough have the same border-color. The fill-color represent a qualification of the number of venues per neighborhood. green < 5, blue < 10, yellow < 15, orange 

In [58]:
address = 'Toronto, Canada'
df = df_toronto_venues.groupby(['Borough', 'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude'], as_index=False).count()
df = df[['Borough', 'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue']]
df.columns = ['Borough', 'Neighborhood', 'Latitude', 'Longitude', 'Venues']
map_toronto = create_map(address=address, df_map=df)
map_toronto

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.
Boroughs of Toronto ['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']
Availabe colors ['#8000ff', '#4856fb', '#10a2f0', '#2adddd', '#62fbc4', '#9cfba4', '#d4dd80', '#ffa256', '#ff562c', '#ff0000']


In [None]:
df = 