# Capstone Project Notebook
This notebook will be used for the IBM Data Science Professional Certificate capstone project

In [None]:
import pandas as pd
import numpy as np

In [2]:
print('Hello Capstone Project Course!')

## Segmenting and Clustering Neighborhoods in Toronto

#### Setting up BeautifulSoup 

In [3]:
import urllib.request
from bs4 import BeautifulSoup

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urllib.request.urlopen(url)

In [5]:
soup = BeautifulSoup(page, 'lxml')

#### Scraping data from Wikipedia into a DataFrame

In [6]:
all_tables = soup.find_all('table')

In [7]:
right_table = soup.find('table', class_ = 'wikitable sortable')

In [8]:
# Initialises column lists
Postal_Code = []
Borough = []
Neighbourhood = []

# Scrapes table and inputs data into the column lists
for row in right_table.findAll('tr'):
    cells = row.findAll('td')
    
    if len(cells) == 3:
        Postal_Code.append (cells[0].find(text = True))
        Borough.append (cells[1].find(text = True))
        Neighbourhood.append (cells[2].find(text = True))

In [9]:
print(Postal_Code[0:5])
print(Borough[0:5])
print(Neighbourhood[0:5])

['M1A\n', 'M2A\n', 'M3A\n', 'M4A\n', 'M5A\n']
['Not assigned\n', 'Not assigned\n', 'North York\n', 'North York\n', 'Downtown Toronto\n']
['Not assigned\n', 'Not assigned\n', 'Parkwoods\n', 'Victoria Village\n', 'Regent Park, Harbourfront\n']


In [10]:
# Convert column lists to a pandas dataframe
df = pd.DataFrame(Postal_Code, columns = ['Postal Code'])
df['Borough'] = Borough
df['Neighbourhood'] = Neighbourhood
df[0:5]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


#### Cleaning the data

In [11]:
df = df.replace('\n','', regex=True) # Removes the newline characters from the end of the line
df[0:5]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [12]:
df['Borough'].replace('Not assigned', np.nan, inplace = True) # Converts to NaN values

In [13]:
df.dropna(subset = ['Borough'], axis = 0, inplace = True) # Drops rows in the 'Borough' column with NaN values

In [14]:
df.reset_index(inplace = True) # Resets the index

In [15]:
df.drop('index', axis = 1, inplace = True) # Resets the columns

In [16]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [17]:
df.shape

(103, 3)

#### Using pgeocode to merge location data

In [18]:
import pgeocode

In [19]:
# Iterates through the rows and add the latitude and longitude coordinates
i = 0 # Initialises the index number
for row in df.iterrows():
    location = geolocator.query_postal_code(row[1][0]) # Inputs the postal code into the geolocator
    df.loc[i, 'Latitude'] = location.latitude # Inputs the latitude
    df.loc[i, 'Longitude'] = location.longitude # Inputs the longitude
    i += 1 # Increases index by 1

NameError: name 'geolocator' is not defined

In [None]:
df

In [None]:
df.shape