# Install required libs

In [60]:
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import numpy as np

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Solving environment: done

# All requested packages already installed.



# Get table from web

In [4]:
html = urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(html.read(), "html.parser")
table = soup.find("table")
output_rows = []

for table_row in table.findAll('tr'):
    columns = table_row.findAll('td')
    output_row = []
    for column in columns:
        output_row.append(column.text)
    output_rows.append(output_row)

df = pd.DataFrame(output_rows)


# Requirement 1
# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [7]:
df.columns = ['PostalCode','Borough','Neighborhood']
df["Neighborhood"] = df["Neighborhood"].str.replace("\n", "")
print (df)
print (df.shape) #result 289 cols x 3 rows

    PostalCode           Borough  \
0         None              None   
1          M1A      Not assigned   
2          M2A      Not assigned   
3          M3A        North York   
4          M4A        North York   
5          M5A  Downtown Toronto   
6          M5A  Downtown Toronto   
7          M6A        North York   
8          M6A        North York   
9          M7A      Queen's Park   
10         M8A      Not assigned   
11         M9A         Etobicoke   
12         M1B       Scarborough   
13         M1B       Scarborough   
14         M2B      Not assigned   
15         M3B        North York   
16         M4B         East York   
17         M4B         East York   
18         M5B  Downtown Toronto   
19         M5B  Downtown Toronto   
20         M6B        North York   
21         M7B      Not assigned   
22         M8B      Not assigned   
23         M9B         Etobicoke   
24         M9B         Etobicoke   
25         M9B         Etobicoke   
26         M9B         Etobi

# Requirement 2
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [9]:
df = df[df.Borough != 'Not assigned']
print (df.shape) #result 212 cols x 3 rows
#Excluded 77 columns

(212, 3)


# Requirement 3
# More than one neighborhood can exist in one postal code area.

In [16]:
#Assumption keep original dataframe
df2 = df 
df2 = df2.groupby(['PostalCode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
print (df2.shape)
print (df2['Neighborhood']) # combining post codes

(103, 3)
0                                         Rouge, Malvern
1                 Highland Creek, Rouge Hill, Port Union
2                      Guildwood, Morningside, West Hill
3                                                 Woburn
4                                              Cedarbrae
5                                    Scarborough Village
6            East Birchmount Park, Ionview, Kennedy Park
7                        Clairlea, Golden Mile, Oakridge
8        Cliffcrest, Cliffside, Scarborough Village West
9                            Birch Cliff, Cliffside West
10     Dorset Park, Scarborough Town Centre, Wexford ...
11                                     Maryvale, Wexford
12                                             Agincourt
13               Clarks Corners, Sullivan, Tam O'Shanter
14     Agincourt North, L'Amoreaux East, Milliken, St...
15                                       L'Amoreaux West
16                                           Upper Rouge
17                    

# Requirement 4
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [None]:
df2 = df2[(df.Borough != 'Not assigned') & (df.Neighborhood == 'Not assigned')]
df2 = df2.replace(['Not assigned'], df.Borough)
print (df2.shape)
print (df2)

#Assumption replace original values that needed correcting
df2 = df2[df2.Neighborhood != 'Not assigned']
result = pd.concat([df2,df], keys='PostalCode')
print(result.shape)
print (result)


# Requirement 5
### Markdown as seen
### Assumptions are stated as comments in code

# Requirement 6
## Links
https://github.com/M-P-L/Capstone/commit/a115a71b2d484378085f5e3a72f771d3a747ba9c to code
https://github.com/M-P-L/Capstone/blob/master/Week%203%20Submission.ipynb to Notebook

# Requirement 7
## Given that this package can be very unreliable, I used the csv to create the data frame

In [34]:
data = pd.read_csv('https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv')
#print(data)
df_merge_col = pd.merge(result, data, left_on='PostalCode', right_on='Postal Code')
print (df_merge_col.head())
print (data.head())


  PostalCode      Borough                            Neighborhood Postal Code  \
0        M1B  Scarborough                          Rouge, Malvern         M1B   
1        M1C  Scarborough  Highland Creek, Rouge Hill, Port Union         M1C   
2        M1E  Scarborough       Guildwood, Morningside, West Hill         M1E   
3        M1G  Scarborough                                  Woburn         M1G   
4        M1H  Scarborough                               Cedarbrae         M1H   

    Latitude  Longitude  
0  43.806686 -79.194353  
1  43.784535 -79.160497  
2  43.763573 -79.188711  
3  43.770992 -79.216917  
4  43.773136 -79.239476  
  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


# Requirement 8
## Once you are able to create the above dataframe, submit a link to the new Notebook on your Github repository. (2 marks)

https://github.com/M-P-L/Capstone/blob/master/Week%203%20Submission.ipynb 

# Requirement 9
## Clustering

In [55]:
# set number of clusters
kclusters = 5

data.drop(['Postal Code'], axis=1, inplace=True)
#print (data.head())

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(data)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

# add clustering labels
df_merge_col.insert(0, 'Cluster Labels', kmeans.labels_)

print (df_merge_col)


    Latitude  Longitude
0  43.806686 -79.194353
1  43.784535 -79.160497
2  43.763573 -79.188711
3  43.770992 -79.216917
4  43.773136 -79.239476
     Cluster Labels PostalCode           Borough  \
0                 0        M1B       Scarborough   
1                 0        M1C       Scarborough   
2                 0        M1E       Scarborough   
3                 0        M1G       Scarborough   
4                 0        M1H       Scarborough   
5                 0        M1J       Scarborough   
6                 0        M1K       Scarborough   
7                 2        M1L       Scarborough   
8                 0        M1M       Scarborough   
9                 2        M1N       Scarborough   
10                0        M1P       Scarborough   
11                0        M1R       Scarborough   
12                0        M1S       Scarborough   
13                0        M1T       Scarborough   
14                0        M1V       Scarborough   
15                0     

# Requirement 10
## Generate maps to visualize your neighborhoods and how they cluster together

In [65]:
# create map
latitude = 43.6532
longitude = -79.3832
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merge_col['Latitude'], df_merge_col['Longitude'], df_merge_col['Neighborhood'], df_merge_col['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters