In [2]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes

import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

!pip install bs4
from bs4 import BeautifulSoup
import requests

print('Libraries imported.')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1272 sha256=3f4f19860e9886d3e7b05f74b9d8a8e6be2e8df531e7420bda82df1a69145039
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/0a/9e/ba/20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Libraries imported.


In [3]:
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969"

In [60]:
data = requests.get(url).text
soup = BeautifulSoup(data,"html5lib")
#print(soup.prettify())

In [61]:
tables = soup.find_all('table')

In [62]:
len(tables)

3

Expecting index to be 0

In [63]:
for index, table in enumerate(tables):
    if ("Postal Code" in str(table)):
        table_index = index
print(table_index)

0


View HTML data

In [65]:
#print(tables[table_index].prettify())

Writting data from HTML into dataframe

In [66]:
postal_code_data = pd.DataFrame(columns=["Postal Code","Borough","Neighbourhood"])

for row in tables[0].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        postal_code = col[0].text
        borough = col[1].text
        neighbourhood = col[2].text
        postal_code_data = postal_code_data.append({"Postal Code":postal_code, "Borough":borough, "Neighbourhood":neighbourhood}, ignore_index=True)

postal_code_data.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


Clean up by removing "\n" at the end of every cell

In [67]:
postal_code_data = postal_code_data.replace('\\n','',regex=True)
postal_code_data.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Cleaning up data (remove "Not assigned") and reset index

In [68]:
postal_code_data = postal_code_data[postal_code_data["Borough"] != "Not assigned"]
postal_code_data = postal_code_data[postal_code_data["Neighbourhood"] != "Not assigned"]
postal_code_data.reset_index(drop=True, inplace=True)
postal_code_data.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Getting dimensions of data frame for report

In [69]:
postal_code_data.shape

(103, 3)

The GeoCoder tunction kept failing so I decided to use the given CSV file.

In [72]:
csvurl = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv"

In [74]:
!wget -q -O 'lat_lng.csv' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv
print ("CSV dowloaded")

CSV dowloaded


In [80]:
lat_lng_df = pd.read_csv('lat_lng.csv')
lat_lng_df.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [81]:
lat_lng_df.shape

(103, 3)

The two data frames have the exact same dimensions

In [92]:
datatypes1 = postal_code_data.dtypes
datatypes2 = lat_lng_df.dtypes
print(datatypes1)
print(datatypes2)

Postal Code      object
Borough          object
Neighbourhood    object
dtype: object
Postal Code     object
Latitude       float64
Longitude      float64
dtype: object


Both DFs have the same data type for their "Postal Code"-column. Now we can start merging the two.

In [95]:
total_df = postal_code_data.merge(lat_lng_df, on='Postal Code')
total_df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Creating a DF with only Toronto Boroughs and reseting index.

In [98]:
toronto_df = total_df[total_df["Borough"].str.contains("Toronto")]
toronto_df.reset_index(drop=True, inplace=True)
toronto_df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


These value_counts give an overview of the data we are trying to cluster

In [99]:
total_df["Borough"].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
East York            5
York                 4
Toronto/York         1
Mississauga          1
Name: Borough, dtype: int64

In [100]:
toronto_df["Borough"].value_counts()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Toronto/York         1
Name: Borough, dtype: int64

We learn the coordinates of Toronto: 

In [107]:
from geopy.geocoders import Nominatim
! conda install -c conda-forge folium
#import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

address = 'Toronto'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - folium


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |      conda_forge           3 KB  conda-forge
    _openmp_mutex-4.5          |           1_llvm           5 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    _pytorch_select-0.2        |            gpu_0           2 KB
    absl-py-0.12.0         

In [111]:
import folium
print(f'The lat/lng coordinates of Toronto are {latitude}, {longitude}.')

The lat/lng coordinates of Toronto are 43.6534817, -79.3839347.


In [136]:
#adding labels for easy colour application in graph
toronto_df['Label']=toronto_df['Borough'].replace(to_replace=['Downtown Toronto','Central Toronto','West Toronto','East Toronto'],value=[1,2,3,4],inplace=False)

#setting cluster number as unique borough names
k=len(toronto_df["Borough"].unique())

# create map
toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# setting color scheme for the clusters
X = np.arange(k)
Ys = [i + X + (i*X)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(Ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# adding markers to the map
markers_colors = []
for lat, lng, counter in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Label']):
    label = folium.Popup(str(toronto_df['Borough']) + ' Cluster ' + str(counter), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[counter-1],
        fill=True,
        fill_color=rainbow[counter-1],
        fill_opacity=0.7).add_to(toronto)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


TypeError: unsupported operand type(s) for -: 'str' and 'int'

Seems like some bug reporting an error message while the map has been assigned to toronto...

In [137]:
toronto