In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
toronto_table = pd.read_csv('scraped_table.tsv', sep='\t', na_values='Not assigned')
'''In the line above we are changing all \'Not assigned\' values as NaN'''
toronto_table.dropna(subset=['Borough'],inplace=True)
print(toronto_table.shape)

(210, 3)


We set 'Not assigned' values as Nan for easier data handling.

### In order to perform groupby and agg function we are converting all data tpes to string for uniformity.
   We then do groupby-agg and join the returned list from groupby object using a comma

In [8]:
#df.groupby('A')['C'].apply(lambda x: "{%s}" % ', '.join(x))
toronto_table['Neighbourhood']  = toronto_table['Neighbourhood'].astype(str)
toronto_table['Postcode'] = toronto_table['Postcode'].astype(str)
toronto_table['Borough'] = toronto_table['Borough'].astype(str)
'''In order to perform groupby and agg function we are converting all data tpes to string for uniformity.
   We then do groupby-agg and join the returned list from groupby object using a comma '''
toronto_table = toronto_table.groupby(['Postcode','Borough'])['Neighbourhood'].agg(lambda col: ','.join(col)).reset_index()
#toronto_table.groupby('Postcode')['Neighbourhood'].apply(lambda x: ', '.join(x))
print(toronto_table)

    Postcode      Borough                                      Neighbourhood
0        M1B  Scarborough                                      Rouge,Malvern
1        M1C  Scarborough               Highland Creek,Rouge Hill,Port Union
2        M1E  Scarborough                    Guildwood,Morningside,West Hill
3        M1G  Scarborough                                             Woburn
4        M1H  Scarborough                                          Cedarbrae
..       ...          ...                                                ...
98       M9N         York                                             Weston
99       M9P    Etobicoke                                          Westmount
100      M9R    Etobicoke  Kingsview Village,Martin Grove Gardens,Richvie...
101      M9V    Etobicoke  Albion Gardens,Beaumond Heights,Humbergate,Jam...
102      M9W    Etobicoke                                          Northwest

[103 rows x 3 columns]


In [9]:
print(toronto_table.tail(40))
print(toronto_table.shape)

    Postcode           Borough  \
63       M5N   Central Toronto   
64       M5P   Central Toronto   
65       M5R   Central Toronto   
66       M5S  Downtown Toronto   
67       M5T  Downtown Toronto   
68       M5V  Downtown Toronto   
69       M5W  Downtown Toronto   
70       M5X  Downtown Toronto   
71       M6A        North York   
72       M6B        North York   
73       M6C              York   
74       M6E              York   
75       M6G  Downtown Toronto   
76       M6H      West Toronto   
77       M6J      West Toronto   
78       M6K      West Toronto   
79       M6L        North York   
80       M6M              York   
81       M6N              York   
82       M6P      West Toronto   
83       M6R      West Toronto   
84       M6S      West Toronto   
85       M7A  Downtown Toronto   
86       M7R       Mississauga   
87       M7Y      East Toronto   
88       M8V         Etobicoke   
89       M8W         Etobicoke   
90       M8X         Etobicoke   
91       M8Y  

#### We replace the values of 'nan' (remember it is a string rather than a np.nan value we check for since we converted all the values in all fields as 'str')

In [10]:
toronto_table['Neighbourhood'].replace('nan', toronto_table['Borough'],inplace=True)
toronto_table.shape


(103, 3)

In [11]:
toronto_table.tail(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
92,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."
93,M9A,Queen's Park,Queen's Park
94,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar..."
95,M9C,Etobicoke,"Bloordale Gardens,Eringate,Markland Wood,Old B..."
96,M9L,North York,Humber Summit
97,M9M,North York,"Emery,Humberlea"
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [12]:
toronto_table.shape

(103, 3)

In [13]:
lat_long = pd.read_csv('Geospatial_Coordinates.csv')
lat_long.tail(5)

Unnamed: 0,Postal Code,Latitude,Longitude
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437
102,M9W,43.706748,-79.594054


In [14]:
lat_long['Postal Code'].dtypes

dtype('O')

## Part 2:
### Let us sort both the DFs by the column - 'Postal code'. Since both have 103 columns, just the sliced columns must be mapping perfectly.

In [15]:
toronto_table.sort_values(by='Postcode') 
lat_long[['Latitude','Longitude']].sort_values(by='Latitude')
toronto_table[['Latitude','Longitude']] = lat_long[['Latitude','Longitude']]

In [16]:
toronto_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [17]:
toronto_venues = toronto_table[toronto_table.Borough.str.contains('Toronto')]

## Using K clusters of value '5'

In [20]:
toronto_venues

# import k-means from clustering stage
from sklearn.cluster import KMeans
# run k-means clustering
kclusters = 5
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_venues.drop(columns=['Postcode', 'Borough', 'Neighbourhood']) )
kmeans.labels_

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       1, 1, 3, 3, 3, 4, 4, 4, 3, 2, 3, 3, 2, 2, 2, 4, 0])

In [21]:
toronto_merged = toronto_venues
toronto_merged['Cluster Labels'] = kmeans.labels_
toronto_merged

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0
43,M4M,East Toronto,Studio District,43.659526,-79.340923,0
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,1
47,M4S,Central Toronto,Davisville,43.704324,-79.38879,1
48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,1
49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049,1


### Analyzing Each Neighborhood

In [24]:
#!conda install -c conda-forge folium=0.5.0 --yes 
# uncomment this line if you haven't completed the Foursquare API lab

# map rendering library
import folium 

toronto_merged

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0
43,M4M,East Toronto,Studio District,43.659526,-79.340923,0
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,1
47,M4S,Central Toronto,Davisville,43.704324,-79.38879,1
48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,1
49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049,1


In [28]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

latitude = 43.6
longitude = -79.3

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters