# Capstone Project Part 2
### In this part I will be segmenting and clustering Toronto neighbourhoods based on location and vicinity qualities

In [153]:
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors


In [24]:
# Pull every HTML table found on the Wikipedia page of Toronto ("M") postal codes;
# read_html hands them back as a list of DataFrames.
postal_codes = pd.read_html(
    io='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
)

In [25]:
# read_html returned a list of tables; take the first one as the working data frame
df=postal_codes[0]
# report (rows, columns), then preview the first rows
print(df.shape)
df.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### We cannot use entries that are not assigned a Borough. We replace 'Not assigned' in the 'Borough' column with NaN and then drop those rows

In [26]:
# Mark unassigned boroughs as missing values.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained form may act
# on a temporary copy and leave df itself unchanged.
df['Borough'] = df['Borough'].replace('Not assigned', np.nan)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [27]:
# Keep only the entries whose 'Borough' is not NaN.
# .copy() makes the filtered result an independent frame, so later modifications
# of df do not trigger SettingWithCopyWarning.
df = df[df['Borough'].notna()].copy()

In [28]:
# Preview the frame after removing the rows without a borough
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [29]:
# Where a postal code has no neighbourhood name, fall back to its borough name.
# fillna aligns the two columns row by row; assign() returns a new frame, which
# avoids the chained in-place write on a filtered slice (SettingWithCopyWarning).
df = df.assign(Neighborhood=df['Neighborhood'].fillna(df['Borough']))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [30]:
# Renumber the rows 0..n-1 now that some have been dropped.
# Rebinding df to the returned frame (instead of inplace=True) keeps the cell
# free of in-place mutation on a filtered slice.
df = df.reset_index(drop=True)

Let's look at the shape of the dataframe after cleaning:

In [31]:
# (rows, columns) of the cleaned frame
df.shape

(103, 3)

We need to import a csv that contains the geographical coordinates of each postal code

In [32]:
# Load the CSV that lists a latitude/longitude pair for each postal code
postal_coords = pd.read_csv(
    filepath_or_buffer='https://cocl.us/Geospatial_data',
)

In [33]:
# Order the coordinate table by postal code.
# (The former leading head() call was dropped: it was not the last expression of
# the cell, so its result was never displayed.)
postal_coords = postal_coords.sort_values(by='Postal Code')


In [34]:
# Order the neighbourhood table by postal code as well.
# sort_values returns a new frame; rebinding it avoids the SettingWithCopyWarning
# raised by sorting a filtered slice in place.
df = df.sort_values(by=['Postal Code'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [35]:
# Preview df now that it is ordered by postal code
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [36]:
# Preview the coordinate table (Postal Code, Latitude, Longitude)
postal_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


I created a new dataframe to merge the latitude and longitude columns

In [37]:
# Start geo_df as an independent copy of the cleaned table. pd.DataFrame(df)
# does not by default copy the underlying data, so it is not a truly new frame.
geo_df = df.copy()

In [38]:
# Attach the coordinates to each postal code; with no key given, the join uses
# the column names the two frames have in common.
geo_df = pd.merge(geo_df, postal_coords)

In [320]:
# Preview the merged frame with Latitude/Longitude attached to each postal code
geo_df.head()

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,1,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,1,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,1,M1G,Scarborough,Woburn,43.770992,-79.216917
4,1,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Map of Toronto

Import Folium to render the map

In [41]:
# Install the pinned folium release from conda-forge, then import it
!conda install -c conda-forge folium=0.5.0 --yes # comment this line out if folium 0.5.0 is already installed
import folium # map rendering library
print('Folium imported')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    ------------------------------------------------------------
                       

In [135]:
# List the distinct boroughs present in the merged data
geo_df['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

Use the latitude and longitude of the first row as the location for the city of Toronto

In [136]:
# Coordinates of the first row, used as the centre of the maps.
# The values are looked up by column label rather than by position, so the cell
# keeps working after extra columns (e.g. 'Cluster Labels') are inserted.
latitude = geo_df['Latitude'].iloc[0]
longitude = geo_df['Longitude'].iloc[0]

In [137]:
# Base map centred on the latitude/longitude pair
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per row; its popup reads "<neighbourhood>, <borough>"
marker_rows = zip(
    geo_df['Latitude'],
    geo_df['Longitude'],
    geo_df['Borough'],
    geo_df['Neighborhood'],
)
for point_lat, point_lng, borough_name, hood_name in marker_rows:
    popup_text = '{}, {}'.format(hood_name, borough_name)
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Cluster the neighbourhoods

We have several unique boroughs we can cluster

In [332]:
# The distinct boroughs that will be grouped and clustered
geo_df['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

In [322]:
# One row per borough; each remaining column holds that borough's count of
# non-null entries.
grouped = geo_df.groupby('Borough').count()

In [323]:
from sklearn.cluster import KMeans
kclusters = 4

# run k-means clustering on the per-borough counts
# n_init is set explicitly so the outcome does not depend on the default of the
# installed scikit-learn version; random_state fixes the initialisation seed.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(grouped)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 0, 1, 1, 2, 1, 3, 0, 1, 1], dtype=int32)

In [324]:
# Put the k-means label of each borough in the first column of grouped.
# A 'Cluster Labels' column left over from a previous run is removed first, so
# the cell can be re-executed without insert() raising "already exists".
grouped = grouped.drop(columns='Cluster Labels', errors='ignore')
grouped.insert(0, 'Cluster Labels', kmeans.labels_)


In [325]:
# Turn the 'Borough' group key (the index after groupby) into a regular column,
# because the cells below look boroughs up through grouped['Borough'].
# The guard makes the cell safe to run more than once.
if 'Borough' not in grouped.columns:
    grouped = grouped.reset_index()

In [326]:
# Show the per-borough counts together with their cluster labels
grouped

Unnamed: 0,Borough,Cluster Labels,Postal Code,Neighborhood,Latitude,Longitude
0,Central Toronto,2,9,9,9,9
1,Downtown Toronto,0,19,19,19,19
2,East Toronto,1,5,5,5,5
3,East York,1,5,5,5,5
4,Etobicoke,2,12,12,12,12
5,Mississauga,1,1,1,1,1
6,North York,3,24,24,24,24
7,Scarborough,0,17,17,17,17
8,West Toronto,1,6,6,6,6
9,York,1,5,5,5,5


Add the cluster labels to geo_df

In [327]:
# Look up, for every row of geo_df, the cluster label of its borough.
# Columns are addressed by name rather than position, so the result does not
# depend on whether 'Cluster Labels' has already been inserted into geo_df.
# reset_index() followed by set_index('Borough') works whether 'Borough' is
# currently the index of grouped or an ordinary column.
borough_to_cluster = grouped.reset_index().set_index('Borough')['Cluster Labels']
clusters = geo_df['Borough'].map(borough_to_cluster).tolist()

In [328]:
# Place the cluster labels in the first column of geo_df.
# errors='ignore' lets the drop succeed on a first run, when the column does not
# exist yet; columns= replaces the positional axis argument, which newer pandas
# versions no longer accept.
geo_df = geo_df.drop(columns='Cluster Labels', errors='ignore')
geo_df.insert(0, 'Cluster Labels', clusters)

In [329]:
# Preview geo_df with the 'Cluster Labels' column in first position
geo_df.head()

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,0,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,0,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,0,M1G,Scarborough,Woburn,43.770992,-79.216917
4,0,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [330]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat,lon,neigh,borough,cluster in zip(geo_df['Latitude'], geo_df['Longitude'],geo_df['Neighborhood'],geo_df['Borough'],geo_df['Cluster Labels']):
    label = '{},{}'.format(cluster,borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7,
        parse_html= False).add_to(map_clusters)
       
map_clusters


As you can see, every neighbourhood takes the cluster label of its borough, so the markers on the map are coloured by borough-level cluster