# Final Capstone Project


# Analysis of opening a new shopping centre in Sydney

### Hamid Doost


_____________________________________________________________________________________________________________________________

### 1.1 Importing necessary libraries

In [18]:
from bs4 import BeautifulSoup # Library for web scraping
import requests   # Library to handle requests
import numpy as np # Library for numericals
import pandas as pd # Library for working with dataframs

from sklearn.cluster import KMeans # Library for machine learning

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # Library for map rendering

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

#!conda install -c conda-forge geocoder --yes
import geocoder

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print ("Libraries imported!")

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\doost\anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.21.0-py_0



Downloading and Extracting Packages

geopy-1.21.0         | 58 KB     |            |   0% 
geopy-1.21.0         | 58 KB     | ##7        |  27% 
geopy-1.21.0         | 58 KB

### 1.2 Web scraping for list of Sydneys suburbs from Wikipedia

In [5]:
# Scraping list of Sydneys suburbs from Wikipedia
List_url = "https://en.wikipedia.org/wiki/Category:Suburbs_of_Sydney"
source = requests.get(List_url).text
soup = BeautifulSoup(source, 'html.parser')

# create a list to store neighborhood data
neighborhoodList = []


# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("a"):
    neighborhoodList.append(row.text)

# create a new DataFrame from the list
Syd_df = pd.DataFrame({"Neighborhood": neighborhoodList})

Syd_df.head()


Unnamed: 0,Neighborhood
0,"Agnes Banks, New South Wales"
1,"Alexandria, New South Wales"
2,"Allambie Heights, New South Wales"
3,"Annandale, New South Wales"
4,"Appin, New South Wales"


In [6]:
Syd_df.shape

(200, 1)

## 2. Get geographical data for Sydneys suburbs

In [7]:

# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Sydney, Australia'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [8]:
# call the function to get the coordinates, store in a new list using list comprehension

coords = [ get_latlng(neighborhood) for neighborhood in Syd_df["Neighborhood"].tolist() ]
coords

[[-33.61444999999998, 150.7108300000001],
 [-33.91236999999995, 151.19703000000004],
 [-33.76560999999998, 151.25159000000008],
 [-33.88004999999998, 151.1713000000001],
 [-34.08474165091943, 150.80666092840218],
 [-33.93664999999993, 151.14679],
 [-33.80813999999998, 151.18380000000002],
 [-33.68870999999996, 151.1094700000001],
 [-33.84847999999994, 151.02951000000007],
 [-33.897611192481975, 151.15457451924817],
 [-33.63585999999998, 151.3280800000001],
 [-33.85767999999996, 151.19137000000012],
 [-33.85599999999994, 151.17587000000003],
 [-33.94551999999993, 151.1414400000001],
 [-33.914589999999976, 151.03428000000008],
 [-34.07858772306238, 151.12971724808014],
 [-33.75758999999994, 150.9896000000001],
 [-33.73795999999993, 150.95551000000012],
 [-33.87884999999994, 151.25407000000007],
 [-33.728569999999934, 151.22090000000003],
 [-33.95088999999996, 151.1255500000001],
 [-33.85185999999993, 151.18099000000007],
 [-33.89235999999994, 151.02068000000008],
 [-33.98796999999996, 15

In [13]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

In [14]:
# merge the coordinates into the original dataframe
Syd_df['Latitude'] = df_coords['Latitude']
Syd_df['Longitude'] = df_coords['Longitude']

Syd_df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Agnes Banks, New South Wales",-33.61445,150.71083
1,"Alexandria, New South Wales",-33.91237,151.19703
2,"Allambie Heights, New South Wales",-33.76561,151.25159
3,"Annandale, New South Wales",-33.88005,151.1713
4,"Appin, New South Wales",-34.084742,150.806661


In [15]:
Syd_df.shape

(200, 3)

In [16]:

# save the DataFrame as CSV file
Syd_df.to_csv("Syd_df.csv", index=False)

## 3. Creating a map of Sydney with lables from datafram

In [19]:
# get the coordinates of Sydney
address = 'Sydney, Australia'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Sydney, Australia {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Sydney, Australia -33.8548157, 151.2164539.


In [20]:
# create map of Sydney using latitude and longitude values
map_Syd = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(Syd_df['Latitude'], Syd_df['Longitude'], Syd_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Syd)  
    
map_Syd

## 4. Getting information from Foursquare for neighbourhood

### Foursquare information

In [21]:
CLIENT_ID = 'KCUQTOFTF4HZ0ROJNJTNXQJFTNFH32A1FKQOUF2QCYKLIA4X'
CLIENT_SECRET = 'JF3S4NHLZPERTTEOG4ATCPWRYTJIYKLBQ1YFEXEV2TZ3XYCW'
VERSION = '20200404'



### Now, let's get the top 100 venues that are within a radius of 2000 meters.



In [22]:
radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(Syd_df['Latitude'], Syd_df['Longitude'], Syd_df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [23]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(12369, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,"Agnes Banks, New South Wales",-33.61445,150.71083,Wog Mobile,-33.619594,150.706412,Rental Car Location
1,"Agnes Banks, New South Wales",-33.61445,150.71083,Yarramundi Reserve,-33.613377,150.698378,Nature Preserve
2,"Agnes Banks, New South Wales",-33.61445,150.71083,D & V Turf Supplies Pty Ltd,-33.623196,150.702574,Other Repair Shop
3,"Agnes Banks, New South Wales",-33.61445,150.71083,Navua Reserve,-33.608786,150.69602,Park
4,"Agnes Banks, New South Wales",-33.61445,150.71083,Trees Adventure,-33.612809,150.692359,Rock Climbing Spot


In [24]:
# Number of venues which were returned for each neighorhood

venues_df.groupby(["Neighborhood"]).count()



Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Agnes Banks, New South Wales",5,5,5,5,5,5
"Alexandria, New South Wales",100,100,100,100,100,100
"Allambie Heights, New South Wales",50,50,50,50,50,50
"Annandale, New South Wales",100,100,100,100,100,100
"Appin, New South Wales",53,53,53,53,53,53
...,...,...,...,...,...,...
"Summer Hill, New South Wales",100,100,100,100,100,100
"Surry Hills, New South Wales",100,100,100,100,100,100
Sydney Olympic Park,100,100,100,100,100,100
"Tahmoor, New South Wales",7,7,7,7,7,7


In [25]:
# Let's find out how many unique categories can be curated from all the returned venues

print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))


There are 331 uniques categories.


In [26]:
# print out the list of categories
venues_df['VenueCategory'].unique()[:50]


array(['Rental Car Location', 'Nature Preserve', 'Other Repair Shop',
       'Park', 'Rock Climbing Spot', 'Bar', 'Café', 'Flea Market',
       'Brewery', 'Distillery', 'Souvlaki Shop', 'Bakery',
       'Furniture / Home Store', 'Ice Cream Shop', 'Recreation Center',
       'Climbing Gym', 'Gym / Fitness Center', 'Lebanese Restaurant',
       'Coffee Shop', 'Playground', 'Italian Restaurant',
       'Basketball Court', 'Dog Run', 'Liquor Store', 'Burger Joint',
       'Thai Restaurant', 'Home Service', 'Vietnamese Restaurant',
       'Seafood Restaurant', 'Greek Restaurant', 'Skating Rink',
       'Shopping Mall', 'Pet Store', 'Gym', 'Sporting Goods Shop',
       'Pizza Place', 'Supermarket', 'Hotel', 'Pub', 'Bistro',
       'Gourmet Shop', 'Dive Bar', 'Sake Bar', 'Portuguese Restaurant',
       'Grocery Store', 'Garden Center', 'Cocktail Bar', 'Theater',
       'Farmers Market', 'Fast Food Restaurant'], dtype=object)

In [41]:
# check if the results contain "Shopping Mall"
"Shopping Mall" in venues_df['VenueCategory'].unique()


True

## 5. Analysing neighbourhoods

In [28]:
# one hot encoding
Syd_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Syd_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Syd_onehot.columns[-1]] + list(Syd_onehot.columns[:-1])
Syd_onehot = Syd_onehot[fixed_columns]

print(Syd_onehot.shape)
Syd_onehot.head()

(12369, 332)


Unnamed: 0,Neighborhoods,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arepa Restaurant,...,Warehouse Store,Water Park,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Agnes Banks, New South Wales",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Agnes Banks, New South Wales",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Agnes Banks, New South Wales",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Agnes Banks, New South Wales",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Agnes Banks, New South Wales",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

Syd_grouped = Syd_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(Syd_grouped.shape)
Syd_grouped

(200, 332)


Unnamed: 0,Neighborhoods,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arepa Restaurant,...,Warehouse Store,Water Park,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Agnes Banks, New South Wales",0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0
1,"Alexandria, New South Wales",0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0
2,"Allambie Heights, New South Wales",0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.02,0.0,...,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0
3,"Annandale, New South Wales",0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.00,0.0,0.00,0.01,0.0,0.00,0.00,0.00,0.0,0.0
4,"Appin, New South Wales",0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,"Summer Hill, New South Wales",0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0
196,"Surry Hills, New South Wales",0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.00,0.0,0.01,0.02,0.0,0.01,0.00,0.01,0.0,0.0
197,Sydney Olympic Park,0.0,0.01,0.0,0.0,0.00,0.0,0.0,0.01,0.0,...,0.01,0.0,0.00,0.00,0.0,0.00,0.01,0.00,0.0,0.0
198,"Tahmoor, New South Wales",0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,...,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0


In [30]:
# Create a new DataFrame for Shopping Mall data only

Syd_mall = Syd_grouped[["Neighborhoods","Shopping Mall"]]
Syd_mall.head()


Unnamed: 0,Neighborhoods,Shopping Mall
0,"Agnes Banks, New South Wales",0.0
1,"Alexandria, New South Wales",0.01
2,"Allambie Heights, New South Wales",0.02
3,"Annandale, New South Wales",0.0
4,"Appin, New South Wales",0.09434


## 6. Clustering neighbourhoods

In [31]:
# set number of clusters
kclusters = 3

Syd_clustering = Syd_mall.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Syd_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 2, 0, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 0,
       2, 0, 2, 2, 1, 1, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 2,
       2, 0, 0, 2, 2, 0, 0, 2, 0, 1, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2,
       0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 0, 2, 0,
       0, 1, 0, 0, 0, 0, 2, 2, 0, 0, 1, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0, 1,
       1, 2, 0, 0, 2, 2, 0, 0, 2, 1, 2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2,
       1, 0])

In [32]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
Syd_merged = Syd_mall.copy()

# add clustering labels
Syd_merged["Cluster Labels"] = kmeans.labels_


In [33]:
Syd_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
Syd_merged.head()


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels
0,"Agnes Banks, New South Wales",0.0,0
1,"Alexandria, New South Wales",0.01,0
2,"Allambie Heights, New South Wales",0.02,2
3,"Annandale, New South Wales",0.0,0
4,"Appin, New South Wales",0.09434,1


In [34]:
Syd_merged = Syd_merged.join(Syd_df.set_index("Neighborhood"), on="Neighborhood")

print(Syd_merged.shape)
Syd_merged.head() # check the last columns!

(200, 5)


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,"Agnes Banks, New South Wales",0.0,0,-33.61445,150.71083
1,"Alexandria, New South Wales",0.01,0,-33.91237,151.19703
2,"Allambie Heights, New South Wales",0.02,2,-33.76561,151.25159
3,"Annandale, New South Wales",0.0,0,-33.88005,151.1713
4,"Appin, New South Wales",0.09434,1,-34.084742,150.806661


In [35]:
# sort the results by Cluster Labels
print(Syd_merged.shape)
Syd_merged.sort_values(["Cluster Labels"], inplace=True)
Syd_merged


(200, 5)


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,"Agnes Banks, New South Wales",0.000000,0,-33.61445,150.71083
112,Kurraba Point,0.000000,0,-33.84249,151.22256
113,"Kyeemagh, New South Wales",0.000000,0,-33.94986,151.16380
114,"La Perouse, New South Wales",0.000000,0,-33.98795,151.23110
116,"Lane Cove, New South Wales",0.014493,0,-33.81347,151.17017
...,...,...,...,...,...
160,"Peakhurst, New South Wales",0.030303,2,-33.96257,151.05809
161,"Penrith, New South Wales",0.024096,2,-33.75374,150.69820
20,"Bexley, New South Wales",0.028169,2,-33.95089,151.12555
37,"Campbelltown, New South Wales",0.050847,2,-34.06408,150.81658


## 7.Visualising the resulting clusters



In [36]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Syd_merged['Latitude'], Syd_merged['Longitude'], Syd_merged['Neighborhood'], Syd_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [37]:
# save the map as HTML file
map_clusters.save('map_clusters.html')


## 8. Examining clusters

In [38]:
# Cluster 0
Syd_merged.loc[Syd_merged['Cluster Labels'] == 0]


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,"Agnes Banks, New South Wales",0.000000,0,-33.61445,150.71083
112,Kurraba Point,0.000000,0,-33.84249,151.22256
113,"Kyeemagh, New South Wales",0.000000,0,-33.94986,151.16380
114,"La Perouse, New South Wales",0.000000,0,-33.98795,151.23110
116,"Lane Cove, New South Wales",0.014493,0,-33.81347,151.17017
...,...,...,...,...,...
55,"Chippendale, New South Wales",0.000000,0,-33.88846,151.19811
76,"Double Bay, New South Wales",0.000000,0,-33.87730,151.24421
74,"Denistone, New South Wales",0.015152,0,-33.79642,151.08832
75,"Dolans Bay, New South Wales",0.000000,0,-34.06255,151.12612


In [39]:
# Cluster 1
Syd_merged.loc[Syd_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
175,"Richmond, New South Wales",0.1,1,-33.60486,150.757608
97,"Harrington Park, New South Wales",0.1,1,-34.03034,150.73269
49,"Cecil Hills, New South Wales",0.111111,1,-33.89395,150.85633
198,"Tahmoor, New South Wales",0.142857,1,-34.22542,150.5923
176,"Riverstone, New South Wales",0.090909,1,-33.67821,150.8638
146,"Narellan, New South Wales",0.105263,1,-34.03971,150.73516
155,"Oran Park, New South Wales",0.25,1,-34.00565,150.7234
48,"Cattai, New South Wales",0.142857,1,-33.962643,150.95277
191,"St Marys, New South Wales",0.125,1,-33.74723,150.7716
16,"Baulkham Hills, New South Wales",0.08,1,-33.75759,150.9896


In [40]:
# Cluster 2
Syd_merged.loc[Syd_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
197,Sydney Olympic Park,0.03,2,-33.84356,151.07064
88,"Gladesville, New South Wales",0.033898,2,-33.83136,151.12627
87,"Gilead, New South Wales",0.03,2,-33.8696,151.20691
170,"Pyrmont, New South Wales",0.02,2,-33.87076,151.19561
168,"Prospect, New South Wales",0.05,2,-33.79963,150.91699
167,"Potts Point, New South Wales",0.02,2,-33.87157,151.22288
187,"South Maroota, New South Wales",0.021739,2,-33.742953,151.057224
7,"Asquith, New South Wales",0.02,2,-33.68871,151.10947
177,"Rockdale, New South Wales",0.021978,2,-33.95181,151.13811
85,"Frenchs Forest, New South Wales",0.043478,2,-33.75081,151.23479
