# IBM Data Science Professional Certificate Final Project
This notebook will be the work environment for the "Neighborhood Battles" project. It is the final project of the IBM Data Science Professional Certificate on Coursera.

### Importing the neighborhood data from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
Web scraping using urllib, BeautifulSoup and pandas

In [1]:
# Importing libraries
import os
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Requesting the page; the context manager closes the HTTP response once it has been parsed
with urllib.request.urlopen(url) as page:
    # Creating BeautifulSoup Object
    soup = BeautifulSoup(page, "lxml")

# Getting the table
table = soup.find('table', class_='wikitable sortable')

print(soup.prettify())

# soup.find returns None when nothing matches; fail here with a clear message
# instead of an AttributeError on the first use of `table`
if table is None:
    raise ValueError("No 'wikitable sortable' table found at {}".format(url))

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"3f996570-256b-4f45-92a3-87ba6c847372","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":960187814,"wgRevisionId":960187814,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toron

In [3]:
# Collect the first text node of each cell, one list per table column
A, B, C = [], [], []

for tr in table.findAll('tr'):
    tds = tr.findAll('td')
    # Only rows with exactly three data cells are kept (header rows have no <td>)
    if len(tds) != 3:
        continue
    code, borough, neighborhood = (td.find(text=True) for td in tds)
    A.append(code)
    B.append(borough)
    C.append(neighborhood)

# Assemble the three columns into a single DataFrame
df = pd.DataFrame(
    {'PostalCode': A, 'Borough': B, 'Neighborhood': C},
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Explore how the data is written in the table: look at the raw cell values of rows 1-9
df.values[1:10]

array([['M2A\n', 'Not assigned\n', 'Not assigned\n'],
       ['M3A\n', 'North York\n', 'Parkwoods\n'],
       ['M4A\n', 'North York\n', 'Victoria Village\n'],
       ['M5A\n', 'Downtown Toronto\n', 'Regent Park, Harbourfront\n'],
       ['M6A\n', 'North York\n', 'Lawrence Manor, Lawrence Heights\n'],
       ['M7A\n', 'Downtown Toronto\n',
        "Queen's Park, Ontario Provincial Government\n"],
       ['M8A\n', 'Not assigned\n', 'Not assigned\n'],
       ['M9A\n', 'Etobicoke\n',
        'Islington Avenue, Humber Valley Village\n'],
       ['M1B\n', 'Scarborough\n', 'Malvern, Rouge\n']], dtype=object)

In [5]:
# Strip the trailing "\n" from every value, one column at a time
strip_newline = lambda text: text.rstrip('\n')
for name in df.columns:
    df[name] = df[name].map(strip_newline)

# Keep only the rows whose Borough is assigned, renumbering the index from 0
has_borough = df['Borough'] != "Not assigned"
df = df[has_borough].reset_index(drop=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Assigning the Borough value to every Neighborhood that is "Not assigned".
# A single .loc assignment replaces the row-by-row chained indexing
# (df['Neighborhood'][n] = ...), which may write to a temporary copy instead of df.
unassigned = df['Neighborhood'] == "Not assigned"
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Dimensions (rows, columns) of the cleaned postal-code table
df.shape

(103, 3)

### Getting the latitude and the longitude coordinates of each neighborhood

In [8]:
# Importing the coordinates DataFrame from a CSV file served at the URL below
df_coord = pd.read_csv("http://cocl.us/Geospatial_data")

df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Matching the values of the two DataFrames
# Index the coordinates by postal code so each lookup is direct instead of a scan over every row;
# duplicates are dropped so that each postal code maps to a single coordinate pair.
coord_by_code = df_coord.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# One entry per row of df, in df's row order; a postal code without coordinates yields NaN,
# so the lists always have the same length as df.
Latitude = df['PostalCode'].map(coord_by_code['Latitude']).tolist()
Longitude = df['PostalCode'].map(coord_by_code['Longitude']).tolist()

In [10]:
# Append the matched coordinates as two new columns
df = df.assign(Latitude=Latitude, Longitude=Longitude)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Exploring and clustering the neighborhoods in Toronto

In [11]:
# Exploring the Borough column: number of rows per borough
df["Borough"].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

In [12]:
# Keep the rows whose Borough name includes "Toronto", renumbering the index from 0
in_toronto = df["Borough"].str.contains('Toronto')
df_toronto = df[in_toronto].reset_index(drop=True)

df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [13]:
# Checking the values: number of rows per borough after the "Toronto" filter
df_toronto['Borough'].value_counts()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

In [14]:
# Defining FourSquare credentials
# Read from the environment so real keys never have to be saved in the notebook;
# both fall back to the empty placeholder when the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 500 # Value limit

In [15]:
import requests
# Defining a function to get all the venues in each neighbourhood
def GetNearbyVenues(borough, names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each neighborhood and tabulate the venues.

    Parameters
    ----------
    borough : str or iterable
        Either a single label applied to every neighborhood, or an iterable
        holding one label per entry of `names` (paired row by row).
    names, latitudes, longitudes : iterable
        Neighborhood names and their coordinates, iterated in parallel.
    radius : int, default 500
        Value sent as the `radius` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns Borough, Neighborhood,
        Neighborhood Latitude, Neighborhood Longitude, Venue, Venue Latitude,
        Venue Longitude and Venue Category. The columns are present even when
        no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Borough',
               'Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    names = list(names)
    # A plain string (or any non-iterable) is one label shared by all neighborhoods;
    # anything else is paired element-wise so each row gets its own borough.
    if isinstance(borough, str) or not hasattr(borough, '__iter__'):
        boroughs = [borough] * len(names)
    else:
        boroughs = list(borough)

    rows = []
    for boro, name, lat, lng in zip(boroughs, names, latitudes, longitudes):

        # make the GET request; requests builds and encodes the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang the notebook forever on a stalled connection
        )
        # surface API errors directly instead of a KeyError on the missing payload
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            rows.append((
                boro,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets None rather than raising IndexError
                categories[0]['name'] if categories else None))

    # passing the columns up front keeps the schema intact when `rows` is empty
    return pd.DataFrame(rows, columns=columns)

In [16]:
# Collect the venues around every Toronto neighborhood (radius left at the function's default)
TorontoVenues = GetNearbyVenues(
    borough=df_toronto['Borough'],
    names=df_toronto['Neighborhood'],
    latitudes=df_toronto['Latitude'],
    longitudes=df_toronto['Longitude'],
)

In [17]:
# Number of venues returned for each neighborhood
TorontoVenues.groupby('Neighborhood')['Venue'].count()

Neighborhood
Berczy Park                                                                                                    57
Brockton, Parkdale Village, Exhibition Place                                                                   23
Business reply mail Processing Centre, South Central Letter Processing Plant Toronto                           15
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport     17
Central Bay Street                                                                                             65
Christie                                                                                                       17
Church and Wellesley                                                                                           81
Commerce Court, Victoria Hotel                                                                                100
Davisville                                                                 

In [18]:
# Count the distinct values in the 'Venue Category' column
print('There are {} uniques categories.'.format(len(TorontoVenues['Venue Category'].unique())))

There are 236 uniques categories.


In [19]:
# One indicator column per venue category
TorontoDummies = pd.get_dummies(TorontoVenues[['Venue Category']])
# Put the neighborhood name in front as the first column
TorontoDummies.insert(0, 'Neighborhood', TorontoVenues['Neighborhood'])
TorontoDummies.head()

Unnamed: 0,Neighborhood,Venue Category_Afghan Restaurant,Venue Category_Airport,Venue Category_Airport Food Court,Venue Category_Airport Lounge,Venue Category_Airport Service,Venue Category_Airport Terminal,Venue Category_American Restaurant,Venue Category_Antique Shop,Venue Category_Aquarium,...,Venue Category_Toy / Game Store,Venue Category_Trail,Venue Category_Train Station,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Video Game Store,Venue Category_Vietnamese Restaurant,Venue Category_Wine Bar,Venue Category_Wine Shop,Venue Category_Women's Store,Venue Category_Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Average each category indicator within a neighborhood, keeping 'Neighborhood' as a regular column
TorontoDummies = TorontoDummies.groupby('Neighborhood', as_index=False).mean()
TorontoDummies.head()

Unnamed: 0,Neighborhood,Venue Category_Afghan Restaurant,Venue Category_Airport,Venue Category_Airport Food Court,Venue Category_Airport Lounge,Venue Category_Airport Service,Venue Category_Airport Terminal,Venue Category_American Restaurant,Venue Category_Antique Shop,Venue Category_Aquarium,...,Venue Category_Toy / Game Store,Venue Category_Trail,Venue Category_Train Station,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Video Game Store,Venue Category_Vietnamese Restaurant,Venue Category_Wine Bar,Venue Category_Wine Shop,Venue Category_Women's Store,Venue Category_Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015385,0.0,0.0,0.015385,0.0,0.0,0.015385


In [21]:
from sklearn.cluster import KMeans # Importing KMeans clustering
k=5
# Keyword form: the positional `axis` argument of DataFrame.drop was removed in pandas 2.0
TorontoClustering = TorontoDummies.drop(columns='Neighborhood')

# Creating the clusters; n_init is pinned to the historical default of 10 so the
# result does not depend on which scikit-learn version is installed
Kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(TorontoClustering)

# Viewing the labels
Kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [22]:
def MostCommonVenues(row, NumTopVenues):
    """Return the index labels of the largest entries of `row`, skipping its first element.

    The first element of `row` is excluded, the remaining values are sorted in
    descending order, and the labels of the first `NumTopVenues` of them are
    returned as an array (fewer if the row has fewer entries).
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:NumTopVenues]

In [23]:
NumTopVenues = 10


def OrdinalSuffix(n):
    """Return the English ordinal suffix for a positive integer (1 -> 'st', 11 -> 'th', 22 -> 'nd')."""
    # 11th, 12th and 13th (and 111th, ...) are the exceptions to the last-digit rule
    if 10 <= n % 100 <= 20:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')


# create columns according to number of top venues
columns = ['Neighborhood'] + [
    '{}{} Most Common Venue'.format(rank, OrdinalSuffix(rank))
    for rank in range(1, NumTopVenues + 1)
]

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = TorontoDummies['Neighborhood']

# fill every row with the labels of that neighborhood's most frequent categories
for ind in np.arange(TorontoDummies.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = MostCommonVenues(TorontoDummies.iloc[ind, :], NumTopVenues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Venue Category_Coffee Shop,Venue Category_Cocktail Bar,Venue Category_Bakery,Venue Category_Seafood Restaurant,Venue Category_Restaurant,Venue Category_Café,Venue Category_Cheese Shop,Venue Category_Beer Bar,Venue Category_Park,Venue Category_Japanese Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Venue Category_Café,Venue Category_Breakfast Spot,Venue Category_Coffee Shop,Venue Category_Yoga Studio,Venue Category_Bakery,Venue Category_Stadium,Venue Category_Burrito Place,Venue Category_Restaurant,Venue Category_Climbing Gym,Venue Category_Performing Arts Venue
2,"Business reply mail Processing Centre, South C...",Venue Category_Gym / Fitness Center,Venue Category_Auto Workshop,Venue Category_Comic Shop,Venue Category_Pizza Place,Venue Category_Recording Studio,Venue Category_Restaurant,Venue Category_Burrito Place,Venue Category_Skate Park,Venue Category_Brewery,Venue Category_Light Rail Station
3,"CN Tower, King and Spadina, Railway Lands, Har...",Venue Category_Airport Service,Venue Category_Airport Lounge,Venue Category_Airport Terminal,Venue Category_Boutique,Venue Category_Sculpture Garden,Venue Category_Bar,Venue Category_Coffee Shop,Venue Category_Plane,Venue Category_Boat or Ferry,Venue Category_Rental Car Location
4,Central Bay Street,Venue Category_Coffee Shop,Venue Category_Italian Restaurant,Venue Category_Sandwich Place,Venue Category_Japanese Restaurant,Venue Category_Café,Venue Category_Salad Place,Venue Category_Thai Restaurant,Venue Category_Department Store,Venue Category_Burger Joint,Venue Category_Bubble Tea Shop


In [24]:
# add clustering labels; an existing column is dropped first so that re-running
# this cell does not fail on inserting a column that is already there
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', Kmeans.labels_)

# join the cluster label and most common venues onto df_toronto (which carries the
# latitude/longitude of each neighborhood), matching rows on the neighborhood name
TorontoMerged = df_toronto.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
TorontoMerged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Venue Category_Coffee Shop,Venue Category_Pub,Venue Category_Bakery,Venue Category_Park,Venue Category_Breakfast Spot,Venue Category_Café,Venue Category_Theater,Venue Category_Yoga Studio,Venue Category_Event Space,Venue Category_Restaurant
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Venue Category_Coffee Shop,Venue Category_Sushi Restaurant,Venue Category_Yoga Studio,Venue Category_Café,Venue Category_Bar,Venue Category_Beer Bar,Venue Category_Italian Restaurant,Venue Category_Japanese Restaurant,Venue Category_Sandwich Place,Venue Category_Burrito Place
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Venue Category_Clothing Store,Venue Category_Coffee Shop,Venue Category_Bubble Tea Shop,Venue Category_Café,Venue Category_Middle Eastern Restaurant,Venue Category_Cosmetics Shop,Venue Category_Italian Restaurant,Venue Category_Japanese Restaurant,Venue Category_Bookstore,Venue Category_Tea Room
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Venue Category_Coffee Shop,Venue Category_Café,Venue Category_Cocktail Bar,Venue Category_Gastropub,Venue Category_American Restaurant,Venue Category_Cosmetics Shop,Venue Category_Moroccan Restaurant,Venue Category_Gym,Venue Category_Restaurant,Venue Category_Art Gallery
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Venue Category_Trail,Venue Category_Neighborhood,Venue Category_Health Food Store,Venue Category_Pub,Venue Category_Doner Restaurant,Venue Category_Diner,Venue Category_Discount Store,Venue Category_Distribution Center,Venue Category_Dog Run,Venue Category_Yoga Studio


In [27]:
# Shell command (IPython `!` syntax): install folium pinned to 0.5.0 from the conda-forge channel, answering yes to prompts
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.4.5.2         |   py36h9f0ad1d_0         152 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    ca-certificates-2020.4.5.2 |       hecda079_0         147 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                       

In [29]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [31]:
# Toronto latitude and longitude
latitude = 43.6532
longitude = -79.3832
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(TorontoMerged['Latitude'], TorontoMerged['Longitude'], TorontoMerged['Neighborhood'], TorontoMerged['Cluster Labels']):
    # the join that built TorontoMerged leaves NaN for a neighborhood without a
    # matching row; such a row has no cluster to color, so it is skipped
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # index by the label itself (0..k-1) rather than relying on negative-index wrap-around
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters