# Importing and installing required modules

In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
import folium
import json
import sklearn.cluster as kmeans
!pip install inline beautifulsoup4
!pip install inline lxml
!pip install inline requests



## Importing Modules for Webscraping and for CSV

In [3]:
from bs4 import BeautifulSoup
import requests
import csv

# Scraping data using BeautifulSoup and writing the data to a csv file

In [4]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# A timeout keeps the cell from hanging forever, and raise_for_status() stops the
# notebook on an HTTP error instead of parsing an error page.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, "lxml")
table = soup.find("table", class_="wikitable sortable")
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on the next line.
    raise ValueError('No table with class "wikitable sortable" found on the page')
info = table.find_all("tr")

# Write one CSV row per table row. The encoding is fixed to UTF-8 so the file
# reads back the same way on every platform (pandas.read_csv defaults to UTF-8).
with open("nb.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for row in info:
        csv_row = [cell.get_text() for cell in row.find_all("td")]
        # Rows without any <td> cells (presumably the header row) yield an empty
        # list; skip them instead of writing a blank line into the CSV.
        if csv_row:
            writer.writerow(csv_row)

# Preprocessing data using Pandas. 
#### First, I imported the data from the CSV file. Secondly, I removed the rows whose Borough is 'Not assigned'.
#### Thirdly, I stripped the unnecessary newline characters from the text values.
#### Fourthly, I replaced the 'Not assigned' values in the Neighbourhood column with the corresponding Borough.
#### Lastly, I grouped the duplicate rows so that each Postal Code has all of its assigned Neighbourhoods in one row, and sorted the result
#### by postal code.

In [5]:
# Load the scraped table; the CSV has no header row, so column names are supplied here.
df = pd.read_csv('nb.csv', names=["Postal Code", "Borough", "Neighbourhood"])

# Drop rows whose borough is unassigned. The comparison is made on stripped text so
# the filter still matches if the scraped cell carries a trailing newline.
df_drop = df[df.Borough.str.strip() != 'Not assigned'].reset_index(drop=True)

# Remove surrounding whitespace/newlines left over from the HTML cells.
# All three columns are text, so the .str accessor is applied to each of them.
df_stripped = df_drop.apply(lambda col: col.str.strip())

# Where the neighbourhood is unassigned, fall back to the borough name of the same row.
# A boolean mask is used instead of a chained `replace(..., inplace=True)`, which does
# not reliably write back to the parent frame.
unassigned = df_stripped['Neighbourhood'] == 'Not assigned'
df_stripped['Neighbourhood'] = df_stripped['Neighbourhood'].mask(
    unassigned, df_stripped['Borough']
)

# Collapse duplicate postal codes: one row per (Postal Code, Borough) pair with the
# neighbourhoods joined by ", ". groupby sorts by the group keys by default.
new_df = (
    df_stripped
    .groupby(['Postal Code', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)
new_df.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


# Sample of the data to prove correctness of processing

In [6]:
# Show 20 random rows as a spot check; the fixed seed makes the same rows appear
# on every Restart & Run All.
new_df.sample(n=20, random_state=42)

Unnamed: 0,Postal Code,Borough,Neighbourhood
21,M2M,North York,"Newtonbrook, Willowdale"
14,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
27,M3C,North York,"Flemingdon Park, Don Mills South"
16,M1X,Scarborough,Upper Rouge
11,M1R,Scarborough,"Maryvale, Wexford"
71,M6A,North York,"Lawrence Heights, Lawrence Manor"
66,M5S,Downtown Toronto,"Harbord, University of Toronto"
74,M6E,York,Caledonia-Fairbanks
5,M1J,Scarborough,Scarborough Village


In [7]:
# (rows, columns) of the grouped frame: one row per (Postal Code, Borough) pair.
new_df.shape

(103, 3)