# Coursera Capstone Project

- This Jupyter Notebook will be used primarily for the requirements set forth in the final Capstone project

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Print a greeting as a quick check that the notebook executes.
print("Hello Capstone Project Course!")

# Continuing to build upon the previous notebook submission
### Utilizing BeautifulSoup, urllib and read_html to scrape data from the source Wikipedia page into a Pandas Dataframe

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded, so `urllib.request.urlopen` could
# fail with AttributeError on a fresh kernel.
import urllib.request

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Confirm the source page is reachable. The context manager closes the
# connection instead of leaving the response object open.
with urllib.request.urlopen(url) as response:
    status_code = response.status
status_code

In [None]:
# read_html returns a list with one DataFrame per HTML table found at `url`;
# keep the first table and preview its first 10 rows.
dfs = pd.read_html(url)
df = dfs[0]
df.head(10)

In [None]:
# Show the type of the object taken from the read_html result.
type(df)

## The starting dimensions of the table are 289 rows and 3 columns

In [None]:
# (rows, columns) of the freshly scraped table.
df.shape

## This step will rename the columns to the column headers scraped from the website

In [None]:
# df.iloc[0] is the first row as a Series indexed by the current column labels,
# so passing it to rename() maps each current label to the value found in row 0.
# rename() returns a new frame; `df` itself is left untouched.
df1 = df.rename(columns=df.iloc[0])
df1.head(10)

## The following step identifies which of the Borough values equal "Not assigned" and drops those rows from the table, resulting in the removal of 77 rows

In [None]:
# Row labels of every record whose Borough is still 'Not assigned'
indexNames = df1.index[df1['Borough'] == 'Not assigned']

# Remove those records from df1 itself (in place), then report the new dimensions
df1.drop(labels=indexNames, axis=0, inplace=True)

df1.shape

In [None]:
# Preview df1 after the 'Not assigned' boroughs were dropped.
df1.head(10)

## The following step is performed to clean up the table appearance by removing the first row containing values equal to the column headers.

In [None]:
# Drop the row carrying the first index label of df1 (per the heading above, the
# row that repeats the column headers). drop() returns a new frame, so df1 is unchanged.
df2 = df1.drop(df1.index[0])
df2.head(10)

In [None]:
# (rows, columns) after removing the header row.
df2.shape

## For rows whose Neighbourhood is "Not assigned", replacing the Neighbourhood with the row's Borough value (see row index 9)

In [None]:
# The right-hand side is the whole Borough column; pandas aligns it on the index,
# so only the rows selected by the mask receive their own Borough value.
df2.loc[df2['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df2['Borough']
df2.head(15)

## Identifying and grouping rows with identical Postcode values and Borough values.  At the same time, combining the Neighbourhood values into a single row using a join function and resetting the index, but ensuring the groupby does not conduct the default sort based on the Postcode.

In [None]:
# One row per (Postcode, Borough) pair, in order of first appearance (sort=False),
# with that pair's neighbourhood names joined into a single comma-separated string.
df3 = (
    df2.groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood']
    .apply(lambda names: ', '.join(names.astype(str)))
    .reset_index()
)
df3.head(15)

## Outputting the final dimensions of the dataframe; the previous step removed a significant number of rows resulting in a final count of 103 rows and 3 columns.

In [None]:
# (rows, columns) of the grouped table.
df3.shape

In [None]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json

! conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering

print('All have been imported and you are ready to proceed!')

#### Contingency Data
https://cocl.us/Geospatial_data

In [None]:
# Column labels of the cleaned postal-code table, for comparison before merging.
df3.columns

In [None]:
# Load the geospatial CSV linked above and list its column labels.
df4 = pd.read_csv("https://cocl.us/Geospatial_data")
df4.columns

In [None]:
# Give the key column the same label as in df3 so the two tables can be merged on it
df4 = df4.rename(columns={'Postal Code': 'Postcode'})
df4.columns

In [None]:
# Combine the cleaned postal-code table with the coordinate table on 'Postcode'.
# how='outer' keeps postcodes that appear in only one of the two tables; such rows
# carry NaN in the columns coming from the other table.
# (A bare `df5.groupby('Postcode', sort=False)` whose result was discarded has been
# dropped here: it never modified df5.)
df5 = pd.merge(df3, df4, on='Postcode', how='outer')
df5

### Use the geopy library to get the latitude and longitude values of Toronto

In [None]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto Canada are {}, {}.'.format(latitude, longitude))

### Create a map of Toronto

In [None]:
# Create a map of Toronto centred on the geocoded coordinates
#latitude=43.70011
#longitude=-79.4163
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of df5, with a "<borough> , <neighbourhood>" popup
for lat, lng, borough, neighbourhood in zip(df5['Latitude'], df5['Longitude'], df5['Borough'], df5['Neighbourhood']):
    # format() instead of string concatenation so a non-string value (e.g. NaN
    # left by the outer merge) cannot raise a TypeError while building the label
    label = folium.Popup('{} , {}'.format(borough, neighbourhood), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,  # opacity is a fraction between 0 and 1 (was 7)
    ).add_to(map_Toronto)

map_Toronto

print(df5[df5['Borough'].str.contains("Toronto")])

The cell above can be changed from Markdown to Code in order to identify Boroughs with "Toronto" in the name

In [None]:
# Keep only the rows whose Borough contains "Toronto" and renumber them from 0.
# na=False treats a missing Borough (possible after the outer merge) as "no match";
# without it the mask would contain NaN and the boolean selection would raise.
toronto_data = df5[df5['Borough'].str.contains("Toronto", na=False)].reset_index(drop=True)
toronto_data

This cell can be changed from Markdown to limit the data set to Boroughs that contain "Toronto" in the name

In [None]:
# Sensitive Cell
# The Foursquare credentials are read from the environment, or prompted for when the
# variables are not set, so that secrets are never stored in the notebook itself.
# NOTE(review): the key pair that used to be hard-coded in this cell should be
# treated as exposed and rotated in the Foursquare developer console.
import os
from getpass import getpass

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client id: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ')
VERSION = '20180605'

### Taking the first entry in our dataframe, we can explore nearby venues
#### (Extract the name, latitude, longitude, etc.)

In [None]:
# Neighbourhood value of the row labelled 0 in toronto_data.
toronto_data.loc[0, 'Neighbourhood']

In [None]:
# Latitude, longitude and name of the row labelled 0 in toronto_data
neighbourhood_lat, neighbourhood_lng, neighbourhood_name = (
    toronto_data.loc[0, field] for field in ('Latitude', 'Longitude', 'Neighbourhood')
)

print(
    'Latitude and longitude of {} are {}, {}'.format(
        neighbourhood_name, neighbourhood_lat, neighbourhood_lng
    )
)

#### The top 100 venues within a 500 meter radius of the first neighbourhood in the filtered data

In [None]:
LIMIT = 100   # value sent as the `limit` query parameter
radius = 500  # value sent as the `radius` query parameter

# Fill the explore-endpoint template positionally: credentials, API version,
# the neighbourhood coordinates, then radius and limit.
url2 = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    *(CLIENT_ID, CLIENT_SECRET, VERSION, neighbourhood_lat, neighbourhood_lng, radius, LIMIT)
)

In [None]:
# A timeout keeps the cell from hanging indefinitely on an unresponsive server, and
# raise_for_status() surfaces HTTP errors (e.g. rejected credentials) here rather
# than as a KeyError when the payload is unpacked in a later cell.
response = requests.get(url2, timeout=30)
response.raise_for_status()
results = response.json()
results

### Extract the category of the venue

In [None]:
def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide the category list under the key 'categories' or,
        failing that, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the category
    list is missing (None) or empty.

    Raises
    ------
    KeyError
        If neither key is present in `row`.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate.
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

### Clean and structure the JSON data into a pandas dataframe

In [None]:
venues = results['response']['groups'][0]['items']

# flatten the nested JSON and keep only the columns of interest
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# replace each category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the last dotted component of every column label (e.g. 'venue.name' -> 'name')
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues

### The number of venues returned by Foursquare

In [None]:
# Number of rows in nearby_venues, i.e. venues in the response
print('{} venues were returned by Foursquare.'.format(len(nearby_venues)))

### Apply the function to all neighbourhoods

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences with each neighbourhood's name and coordinates.
    radius : int, optional
        Value sent as the `radius` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        'Venue Category' is None for a venue that lists no categories, and an
        empty frame with these columns is returned when no venue is found.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each neighbourhood name as it is processed.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Create the API request URL
        url3 = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url3).json()["response"]['groups'][0]['items']

        # keep only the relevant fields of each venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come back without any category
                categories[0]['name'] if categories else None))

    # Passing the labels to the constructor keeps the columns defined even when
    # venue_rows is empty (assigning .columns afterwards would raise in that case).
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)
    return nearby_venues

In [None]:
# Query venues around every neighbourhood in toronto_data (default radius)
toronto_venues = getNearbyVenues(
    toronto_data['Neighbourhood'],
    toronto_data['Latitude'],
    toronto_data['Longitude'],
)

### check the size of the new Dataframe

In [None]:
# Print (rows, columns) of the combined venue table, then preview its first 5 rows.
print(toronto_venues.shape)
toronto_venues.head(5)

### The number of venues for each neighbourhood

In [None]:
# count() reports, per neighbourhood, the number of non-null entries in each column.
toronto_venues.groupby('Neighbourhood').count()

### Unique categories from all of the venues

In [None]:
# Number of distinct values in the 'Venue Category' column
print('There are {} unique categories.'.format(toronto_venues['Venue Category'].unique().size))

### Analyze each of the Neighbourhoods

In [None]:
# one indicator column per venue category (no prefix on the column labels)
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# attach the neighbourhood of each venue
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# rotate the last column to the front of the column order
fixed_columns = list(toronto_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop())
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot

### What is the new dataframe size?

In [None]:
# (rows, columns) of the one-hot encoded venue table.
toronto_onehot.shape

Group the rows by neighbourhood and take the mean of each category's frequency of occurrence

In [None]:
# Average every indicator column within each neighbourhood; reset_index() turns
# 'Neighbourhood' back into a regular first column.
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

### What is the new size?

In [None]:
# (rows, columns) of the per-neighbourhood table.
toronto_grouped.shape

### Each neighbourhood with its top 5 most common venue categories

In [None]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # transpose this neighbourhood's single row into (label, value) pairs
    temp = (
        toronto_grouped[toronto_grouped['Neighbourhood'] == hood]
        .T
        .reset_index()
    )
    temp.columns = ['venue','freq']
    # skip the first pair (the neighbourhood name itself) and round the frequencies
    temp = (
        temp.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
    )
    print(
        temp.sort_values('freq', ascending=False)
        .reset_index(drop=True)
        .head(num_top_venues)
    )
    print('\n')

### Add data to a pandas dataframe

#### (Function to sort the venues in descending order)

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first entry of `row` is skipped; the remaining entries are sorted
    in descending order and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

#### (Create dataframe and display top 10 venues per neighbourhood)

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: 1st, 2nd, 3rd, then 4th, 5th, ...
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare `except:`, which would also
    # swallow unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# fill the ranked venue columns row by row
for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

### Cluster the Neighbourhoods

#### Run k-means to group the neighbourhoods into 5 different clusters

In [None]:
# set number of clusters
kclusters = 5

# feature matrix: every column except the neighbourhood name.
# `axis` is passed by keyword because the positional form was deprecated and
# later removed from pandas. The rows are deliberately left untouched (the former
# bare `dropna()` call discarded its result): the fitted labels are attached
# positionally to the per-neighbourhood table afterwards.
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', axis=1)

# run k-means clustering; a fixed random_state makes the labels reproducible
# from one run of the notebook to the next
k_means = KMeans(init="k-means++", n_clusters=kclusters, n_init=50, random_state=0)
k_means.fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
k_means.labels_.astype(int)

In [None]:
# Display the ranked-venue table before the cluster labels are attached.
neighbourhoods_venues_sorted

#### (Create a new dataframe that includes the clusters as well as the top 10 venues)

In [None]:
# add clustering labels as the first column; a copy left by an earlier run of
# this cell is removed first so the cell can be re-executed without
# "cannot insert Cluster Labels, already exists"
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted = neighbourhoods_venues_sorted.drop('Cluster Labels', axis=1)
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', k_means.labels_)

# join the labelled venue table onto toronto_data by neighbourhood name so that
# every row keeps its latitude/longitude; join() returns a new frame
toronto_merged = toronto_data.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head(6) # check the last columns!

In [None]:
# Inspect the cluster label assigned to each row of toronto_merged.
toronto_merged['Cluster Labels']

In [None]:
# Discard every row that has a missing value in any column, then display the result
toronto_merged = toronto_merged.dropna()
toronto_merged

### Map the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lng, poi, cluster in zip(toronto_merged['Latitude'],
                                  toronto_merged['Longitude'],
                                  toronto_merged['Neighbourhood'],
                                  toronto_merged['Cluster Labels']):
    # the label column can hold floats (e.g. 2.0); convert once so the popup
    # reads "Cluster 3" rather than "Cluster 3.0"
    cluster = int(cluster)
    label = folium.Popup('{} included in Cluster {}'.format(poi, cluster + 1), parse_html=True)
    # use the same colour for the outline and the fill (the outline used to be
    # taken from a different position of the palette than the fill)
    marker_colour = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=1).add_to(map_clusters)

map_clusters

## By grouping the results by the 1st Most Common Venue, we can classify the Neighbourhoods

### Examine the Clusters

### Cluster 1

In [None]:
# Rows with cluster label 0, restricted to the columns at positions 1, 2 and 6 onward
# (presumably Borough, Neighbourhood and the ranked venue columns — confirm against toronto_merged.columns).
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0,
                   toronto_merged.columns[[1,2] + list(range(6, toronto_merged.shape[1]))]]

In [None]:
# Number of label-0 rows for each value of '1st Most Common Venue'.
toronto_merged[toronto_merged['Cluster Labels'] == 0].groupby('1st Most Common Venue').size()

### Cluster 2

In [None]:
# Rows with cluster label 1, restricted to the columns at positions 1, 2 and 6 onward
# (presumably Borough, Neighbourhood and the ranked venue columns — confirm against toronto_merged.columns).
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1,
                   toronto_merged.columns[[1,2] + list(range(6, toronto_merged.shape[1]))]]

In [None]:
# Number of label-1 rows for each value of '1st Most Common Venue'.
toronto_merged[toronto_merged['Cluster Labels'] == 1].groupby('1st Most Common Venue').size()

### Cluster 3

In [None]:
# Rows with cluster label 2, restricted to the columns at positions 1, 2 and 6 onward
# (presumably Borough, Neighbourhood and the ranked venue columns — confirm against toronto_merged.columns).
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2,
                   toronto_merged.columns[[1,2] + list(range(6, toronto_merged.shape[1]))]]

In [None]:
# Number of label-2 rows for each value of '1st Most Common Venue'.
toronto_merged[toronto_merged['Cluster Labels'] == 2].groupby('1st Most Common Venue').size()

### Cluster 4

In [None]:
# Rows with cluster label 3, restricted to the columns at positions 1, 2 and 6 onward
# (presumably Borough, Neighbourhood and the ranked venue columns — confirm against toronto_merged.columns).
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3,
                   toronto_merged.columns[[1,2] + list(range(6, toronto_merged.shape[1]))]]

In [None]:
# Number of label-3 rows for each value of '1st Most Common Venue'.
toronto_merged[toronto_merged['Cluster Labels'] == 3].groupby('1st Most Common Venue').size()

### Cluster 5

In [None]:
# Rows with cluster label 4, restricted to the columns at positions 1, 2 and 6 onward
# (presumably Borough, Neighbourhood and the ranked venue columns — confirm against toronto_merged.columns).
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4,
                   toronto_merged.columns[[1,2] + list(range(6, toronto_merged.shape[1]))]]

In [None]:
# Number of label-4 rows for each value of '1st Most Common Venue'.
toronto_merged[toronto_merged['Cluster Labels'] == 4].groupby('1st Most Common Venue').size()