# Capstone Week 3 Part 3

#### Importing the library

In [1]:
import pandas as pd

#### Scraping the wiki to get the data

In [2]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


#### Renaming the column postal code as shown and correcting the spelling of Neighborhood. This is not the band.

In [3]:
df.rename(columns={'Postcode':'PostalCode', 'Neighbourhood':'Neighborhood'}, inplace=True)
df.head(1)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned


#### Removing rows where **Borough** is **Not assigned**

In [4]:
badrows = df[df['Borough']=='Not assigned'].index
df.drop(badrows, inplace=True)
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


#### Copying the Borough name to the Neighborhood for the, If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough requirement.

In [5]:
df.loc[df['Neighborhood']=='Not assigned','Neighborhood'] = df.loc[df['Neighborhood']=='Not assigned'].Borough
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


#### I use a groupby to combine redundant postal codes and apply a list for each neighborhood. I then use a join to convert eash neighborhood list into a comma delimited string for each row.

In [6]:
df = df.groupby(['PostalCode','Borough'], sort=False, as_index=True)['Neighborhood'].apply(list).reset_index(drop=False)
df['Neighborhood'] = df['Neighborhood'].apply(', '.join)
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


#### This is the shape of the dataframe.

In [7]:
df.shape

(103, 3)

## Getting the CSV data for Question 2

In [8]:
df1 = pd.read_csv('Geospatial_Coordinates.csv')
df1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Checking the shape

In [9]:
df1.shape

(103, 3)

#### Pre-processing, fixing the column header so it matches the or dataframe and can be used as a key to match the rows in a join.

In [10]:
df1.rename(columns={'Postal Code':'PostalCode'}, inplace=True)
df1.head(1)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353


#### In SQL this is a LEFT JOIN, more info on https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html for doing it in pandas. Using validate to check for duplicate keys.

In [11]:
result = pd.merge(df, df1, how='left', on=['PostalCode'], validate="one_to_one")
result.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [12]:
import folium
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans
%matplotlib inline

In [13]:
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

for lat, lng, borough, neighborhood in zip(result['Latitude'], result['Longitude'], result['Borough'], result['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## For question 3 I decieded to use k-means to cluster areas based on thier homicide data to see in which areas you would be more likely shot, stabbed, or other. I went to the toronto police database to pull the data. see below.

In [14]:
#Toronto Homicide Data
#http://data.torontopolice.on.ca/datasets/homicide-1
df2 = pd.read_csv('Homicide.csv')
df2.head(12)

Unnamed: 0,X,Y,Index_,Event_Unique_ID,Occurrence_year,Division,Homicide_Type,Occurrence_Date,Hood_ID,Neighbourhood,Lat,Long,ObjectId
0,-79.38662,43.64864,201,1787320,2006,D52,Shooting,2006-10-27T04:00:00.000Z,77,Waterfront Communities-The Island (77),43.64864,-79.38662,1
1,-79.304939,43.715988,202,1470621,2006,D54,Stabbing,2006-11-05T05:00:00.000Z,43,Victoria Village (43),43.715988,-79.304939,2
2,-79.304939,43.715988,203,1470621,2006,D54,Stabbing,2006-11-05T05:00:00.000Z,43,Victoria Village (43),43.715988,-79.304939,3
3,-79.228691,43.809391,204,1448992,2006,D42,Shooting,2006-11-14T05:00:00.000Z,132,Malvern (132),43.809391,-79.228691,4
4,-79.484482,43.649334,205,1945800,2006,D11,Other,2006-11-25T05:00:00.000Z,87,High Park-Swansea (87),43.649334,-79.484482,5
5,-79.393929,43.642738,206,1796411,2006,D14,Shooting,2006-12-03T05:00:00.000Z,77,Waterfront Communities-The Island (77),43.642738,-79.393929,6
6,-79.197136,43.796772,207,1462848,2006,D42,Other,2006-12-03T05:00:00.000Z,132,Malvern (132),43.796772,-79.197136,7
7,-79.358688,43.662418,208,1451923,2006,D51,Other,2006-12-20T05:00:00.000Z,72,Regent Park (72),43.662418,-79.358688,8
8,-79.427376,43.638962,209,1501929,2006,D14,Shooting,2006-12-25T05:00:00.000Z,85,South Parkdale (85),43.638962,-79.427376,9
9,-79.573471,43.725754,210,1490523,2006,D23,Stabbing,2006-12-26T05:00:00.000Z,4,Rexdale-Kipling (4),43.725754,-79.573471,10


In [15]:
# filter columns
filtered_columns = df2[['Neighbourhood', 'Homicide_Type', 'Lat', 'Long']]
filtered_columns.rename(columns={'Neighbourhood':'Neighborhood'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [16]:
filtered_columns

Unnamed: 0,Neighborhood,Homicide_Type,Lat,Long
0,Waterfront Communities-The Island (77),Shooting,43.648640,-79.386620
1,Victoria Village (43),Stabbing,43.715988,-79.304939
2,Victoria Village (43),Stabbing,43.715988,-79.304939
3,Malvern (132),Shooting,43.809391,-79.228691
4,High Park-Swansea (87),Other,43.649334,-79.484482
...,...,...,...,...
1088,Moss Park (73),Stabbing,43.658295,-79.371033
1089,Etobicoke West Mall (13),Shooting,43.639656,-79.562935
1090,Eglinton East (138),Shooting,43.741611,-79.239731
1091,Malvern (132),Shooting,43.810932,-79.227135


In [17]:
import string
filtered_columns['Neighborhood'] = filtered_columns['Neighborhood'].str.strip("()")
filtered_columns['Neighborhood'] = filtered_columns['Neighborhood'].str.strip(string.digits)
filtered_columns['Neighborhood'] = filtered_columns['Neighborhood'].str.strip(" ()")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [18]:
filtered_columns.Homicide_Type.unique()

array(['Shooting', 'Stabbing', 'Other'], dtype=object)

In [19]:
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

for lat, lng, borough, Neighborhood in zip(filtered_columns['Lat'], filtered_columns['Long'], filtered_columns['Homicide_Type'], filtered_columns['Neighborhood']):
    label = '{}, {}'.format(Neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [20]:
filtered_columns.groupby('Neighborhood').count()

Unnamed: 0_level_0,Homicide_Type,Lat,Long
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Agincourt North,5,5,5
Agincourt South-Malvern West,6,6,6
Alderwood,6,6,6
Annex,7,7,7
Banbury-Don Mills,6,6,6
...,...,...,...
Wychwood,5,5,5
Yonge-Eglinton,3,3,3
Yonge-St.Clair,1,1,1
York University Heights,11,11,11


In [21]:
# one hot encoding
homicide_onehot = pd.get_dummies(filtered_columns[['Homicide_Type']], prefix="", prefix_sep="")

In [22]:
homicide_onehot['Neighborhood'] = filtered_columns['Neighborhood']

In [23]:
fixed_columns = [homicide_onehot.columns[-1]] + list(homicide_onehot.columns[:-1])
homicide_onehot = homicide_onehot[fixed_columns]

homicide_onehot.head()

Unnamed: 0,Neighborhood,Other,Shooting,Stabbing
0,Waterfront Communities-The Island,0,1,0
1,Victoria Village,0,0,1
2,Victoria Village,0,0,1
3,Malvern,0,1,0
4,High Park-Swansea,1,0,0


In [24]:
homicide_grouped = homicide_onehot.groupby('Neighborhood').mean().reset_index()

In [26]:
homicide_grouped

Unnamed: 0,Neighborhood,Other,Shooting,Stabbing
0,Agincourt North,0.600000,0.200000,0.200000
1,Agincourt South-Malvern West,0.166667,0.833333,0.000000
2,Alderwood,0.000000,0.833333,0.166667
3,Annex,0.142857,0.285714,0.571429
4,Banbury-Don Mills,0.166667,0.500000,0.333333
...,...,...,...,...
129,Wychwood,0.400000,0.400000,0.200000
130,Yonge-Eglinton,0.333333,0.666667,0.000000
131,Yonge-St.Clair,0.000000,0.000000,1.000000
132,York University Heights,0.272727,0.363636,0.363636


In [27]:
num_top_venues = 5

for hood in homicide_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = homicide_grouped[homicide_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt North----
      venue  freq
0     Other   0.6
1  Shooting   0.2
2  Stabbing   0.2


----Agincourt South-Malvern West----
      venue  freq
0  Shooting  0.83
1     Other  0.17
2  Stabbing  0.00


----Alderwood----
      venue  freq
0  Shooting  0.83
1  Stabbing  0.17
2     Other  0.00


----Annex----
      venue  freq
0  Stabbing  0.57
1  Shooting  0.29
2     Other  0.14


----Banbury-Don Mills----
      venue  freq
0  Shooting  0.50
1  Stabbing  0.33
2     Other  0.17


----Bathurst Manor----
      venue  freq
0  Stabbing   1.0
1     Other   0.0
2  Shooting   0.0


----Bay Street Corridor----
      venue  freq
0  Shooting   0.5
1  Stabbing   0.5
2     Other   0.0


----Bayview Village----
      venue  freq
0  Shooting   0.5
1  Stabbing   0.5
2     Other   0.0


----Bayview Woods-Steeles----
      venue  freq
0  Shooting   1.0
1     Other   0.0
2  Stabbing   0.0


----Bedford Park-Nortown----
      venue  freq
0     Other  0.67
1  Stabbing  0.33
2  Shooting  0.00


----Bee

2  Stabbing  0.17


----Rockcliffe-Smythe----
      venue  freq
0  Shooting  0.47
1  Stabbing  0.33
2     Other  0.20


----Roncesvalles----
      venue  freq
0     Other   0.8
1  Shooting   0.2
2  Stabbing   0.0


----Rosedale-Moore Park----
      venue  freq
0     Other   1.0
1  Shooting   0.0
2  Stabbing   0.0


----Rouge----
      venue  freq
0  Shooting  0.81
1  Stabbing  0.19
2     Other  0.00


----Rustic----
      venue  freq
0  Shooting  0.78
1     Other  0.11
2  Stabbing  0.11


----Scarborough Village----
      venue  freq
0  Shooting  0.50
1     Other  0.36
2  Stabbing  0.14


----South Parkdale----
      venue  freq
0  Shooting  0.62
1     Other  0.23
2  Stabbing  0.15


----South Riverdale----
      venue  freq
0  Shooting  0.50
1  Stabbing  0.32
2     Other  0.18


----St.Andrew-Windfields----
      venue  freq
0     Other  0.50
1  Shooting  0.25
2  Stabbing  0.25


----Steeles----
      venue  freq
0     Other  0.75
1  Stabbing  0.25
2  Shooting  0.00


----Stonegate-Qu

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [29]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_homicides_sorted = pd.DataFrame(columns=columns)
neighborhoods_homicides_sorted['Neighborhood'] = homicide_grouped['Neighborhood']

for ind in np.arange(homicide_grouped.shape[0]):
    neighborhoods_homicides_sorted.iloc[ind, 1:] = return_most_common_venues(homicide_grouped.iloc[ind, :], num_top_venues)

neighborhoods_homicides_sorted.head(10)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Agincourt North,Other,Stabbing,Shooting
1,Agincourt South-Malvern West,Shooting,Other,Stabbing
2,Alderwood,Shooting,Stabbing,Other
3,Annex,Stabbing,Shooting,Other
4,Banbury-Don Mills,Shooting,Stabbing,Other
5,Bathurst Manor,Stabbing,Shooting,Other
6,Bay Street Corridor,Stabbing,Shooting,Other
7,Bayview Village,Stabbing,Shooting,Other
8,Bayview Woods-Steeles,Shooting,Stabbing,Other
9,Bedford Park-Nortown,Other,Stabbing,Shooting


In [30]:
# set number of clusters
kclusters = 3

homicide_grouped_clustering = homicide_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(homicide_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 1, 1, 2, 1, 2, 1, 1, 1, 0])

In [31]:
# add clustering labels
neighborhoods_homicides_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

homicide_merged = filtered_columns

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
homicide_merged = homicide_merged.join(neighborhoods_homicides_sorted.set_index('Neighborhood'), on='Neighborhood')

homicide_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Homicide_Type,Lat,Long,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Waterfront Communities-The Island,Shooting,43.64864,-79.38662,1,Shooting,Other,Stabbing
1,Victoria Village,Stabbing,43.715988,-79.304939,1,Shooting,Stabbing,Other
2,Victoria Village,Stabbing,43.715988,-79.304939,1,Shooting,Stabbing,Other
3,Malvern,Shooting,43.809391,-79.228691,1,Shooting,Other,Stabbing
4,High Park-Swansea,Other,43.649334,-79.484482,0,Shooting,Other,Stabbing


In [37]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(homicide_merged['Lat'], homicide_merged['Long'], homicide_merged['Neighborhood'], homicide_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters