# A Recommender System for a Wholesaler

In [27]:
# importing libraries
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
from bs4 import BeautifulSoup
import requests # library to handle requests
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import lxml.html as lh
import bs4 as bs
import urllib.request

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
import geopy.geocoders # convert an address into latitude and longitude values

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries are imported.')

Libraries are imported.


### Postal Codes in Toronto

In [28]:
## Source URL
url   = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

The following two functions below allows us to scrape html tables using

Python BeautifulSoup - python library for pulling data out of HTML and XML files, or Python lxml - simmilar library that supports scraping with xpath

In [29]:
# -----------------------------------------------------
# Using BS4 as suggested in Assignment.
# scrape_table_bs4 <tableClassName> <expected numberOfColumns>
# -----------------------------------------------------
def scrape_table_bs4(cname,cols):
    page  = urllib.request.urlopen(url).read()
    soup  = bs.BeautifulSoup(page,'lxml')
    table = soup.find("table",class_=cname)
    header = [head.findAll(text=True)[0].strip() for head in table.find_all("th")]
    data   = [[td.findAll(text=True)[0].strip() for td in tr.find_all("td")]
              for tr in table.find_all("tr")]
    data    = [row for row in data if len(row) == cols]
    # Store data to this temporary dataframe
    raw_df = pd.DataFrame(data,columns=header)
    return raw_df

# -----------------------------------------------------
# Parsing using xpath.
# -----------------------------------------------------
def scrape_table_lxml(XPATH,cols):
    page = requests.get(url)
    doc = lh.fromstring(page.content)
    table_content = doc.xpath(XPATH)
    for table in table_content:
        headers = [th.text_content().strip() for th in table.xpath('//th')]
        headers = headers[0:3]
        data    = [[td.text_content().strip() for td in tr.xpath('td')] 
                   for tr in table.xpath('//tbody/tr')]
        data    = [row for row in data if len(row) == cols]
        raw_df = pd.DataFrame(data,columns=headers)
        return raw_df

In [30]:
#Test in beautifulSoup
raw_TorontoPostalCodes = scrape_table_bs4("wikitable",3)

#Test in lxml ( for xpath based extraction)
#raw_TorontoPostalCodes = scrape_table_lxml("/html/body/div[3]/div[3]/div[4]/div/table[1]",3)

print("# Toronto Postal codes stored in data")
print(raw_TorontoPostalCodes.info(verbose=True))

# Toronto Postal codes stored in data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 3 columns):
Postcode         288 non-null object
Borough          288 non-null object
Neighbourhood    288 non-null object
dtypes: object(3)
memory usage: 6.8+ KB
None


In [31]:
# -----------------------------------------------------
# Only process the cells that have an assigned borough. 
# Ignore cells with a borough that is Not assigned.
# -----------------------------------------------------
TorontoPostalCodes=raw_TorontoPostalCodes[~raw_TorontoPostalCodes['Borough'].isin(['Not assigned'])]

# Sort and Reset index.
TorontoPostalCodes=TorontoPostalCodes.sort_values(by=['Postcode','Borough','Neighbourhood'], ascending=[1,1,1]).reset_index(drop=True)

# -----------------------------------------------------
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
# For example the value of the Borough and the Neighborhood columns will be Queen's Park.
# -----------------------------------------------------
TorontoPostalCodes.loc[TorontoPostalCodes['Neighbourhood'] == 'Not assigned', ['Neighbourhood']] = TorontoPostalCodes['Borough']
check_unassigned_post_state_sample = TorontoPostalCodes.loc[TorontoPostalCodes['Borough'] == 'Queen\'s Park']
#print('DEBUG:',check_unassigned_post_state_sample) ; # Print sample borough problem post state

# -----------------------------------------------------
# More than one neighborhood can exist in one postal code area. 
# For example, in the table on the Wikipedia page, you will notice that M5A is listed twice 
# and has two neighborhoods: Harbourfront and Regent Park. 
# These two rows will be combined into one row with the neighborhoods separated with a comma.
# -----------------------------------------------------
TorontoPostalCodes = TorontoPostalCodes.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
TorontoPostalCodes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [32]:
TorontoPostalCodes.to_csv('Toronto.csv',index=False)

In [33]:
Toronto = "Toronto.csv"
TorontoPostalCodes = pd.read_csv(Toronto).set_index("Postcode")
TorontoPostalCodes.rename_axis("Postal Code", axis='index', inplace=True)
TorontoPostalCodes.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


### Retrieve postcode coordinates.

In [34]:
toronto_geocsv = 'https://cocl.us/Geospatial_data'
!wget -q -O 'toronto_m.geospatial_data.csv' toronto_geocsv
geocsv_data = pd.read_csv(toronto_geocsv).set_index("Postal Code")
geocsv_data.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [39]:
toronto_neighborhoods = TorontoPostalCodes.join(geocsv_data)
toronto_neighborhoods.head()
toronto_neighborhoods.reset_index(inplace = True) 
toronto_neighborhoods

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [40]:
toronto_neighborhoods.to_csv('Toronto_data.csv',index=False)

In [42]:
df_toronto = pd.read_csv('Toronto_data.csv')
df_toronto.shape
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Create a Map of Toronto City (with its Postal Codes' Regions)

In [43]:
# for the city Toronto, latitude and longtitude are manually extracted via google search
toronto_latitude = 43.6932; toronto_longitude = -79.3832
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start = 10.7)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    

map_toronto

### Focusing on the "North York" Borough in Toronto (its neighborhoods)

In [58]:
# df_toronto['Borough'] == 'North York'

# selecting only neighborhoods regarding to "North York" borough.
NorthYork_data = df_toronto[df_toronto['Borough'] == 'North York']
NorthYork_data.reset_index(inplace = True)
NorthYork_data.drop(columns = 'index', inplace = True)
NorthYork_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"Silver Hills, York Mills",43.75749,-79.374714
4,M2M,North York,"Newtonbrook, Willowdale",43.789053,-79.408493


### Create a Map of North York and Its Neighbourhoods

In [59]:
address_scar = 'North York, Toronto'
latitude_scar = 43.803762
longitude_scar = -79.363452
print('The geograpical coordinate of "North York" are: {}, {}.'.format(latitude_scar, longitude_scar))

map_NorthYork = folium.Map(location=[latitude_scar, longitude_scar], zoom_start=11.5)

# add markers to map
for lat, lng, label in zip(NorthYork_data['Latitude'], NorthYork_data['Longitude'], NorthYork_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 10,
        popup = label,
        color ='blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7).add_to(map_NorthYork)  
    
map_NorthYork

The geograpical coordinate of "North York" are: 43.803762, -79.363452.


In [60]:
def foursquare_crawler (postal_code_list, neighborhood_list, lat_list, lng_list, LIMIT = 500, radius = 1000):
    result_ds = []
    counter = 0
    for postal_code, neighborhood, lat, lng in zip(postal_code_list, neighborhood_list, lat_list, lng_list):
         
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, 
            lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        tmp_dict = {}
        tmp_dict['Postal Code'] = postal_code; tmp_dict['Neighborhood(s)'] = neighborhood; 
        tmp_dict['Latitude'] = lat; tmp_dict['Longitude'] = lng;
        tmp_dict['Crawling_result'] = results;
        result_ds.append(tmp_dict)
        counter += 1
        print('{}.'.format(counter))
        print('Data is Obtained, for the Postal Code {} (and Neighborhoods {}) SUCCESSFULLY.'.format(postal_code, neighborhood))
    return result_ds;

In [61]:
# @hiddel_cell
CLIENT_ID = 'AQ0MT4GNV1GNORTODMJD5ZM45X05RZTYG2ALGI4PZA2DGXGL' # your Foursquare ID
CLIENT_SECRET = 'HRDTXWBLG4RPIIT0QA1HY1Q24M5K4ZFVPIRTF2LXJ5QFE4BT' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

### Crawling Foursquare database for Venues in the Neighborhoods inside "North York"

In [63]:
print('Crawling different neighborhoods inside "North York"')
NorthYork_foursquare_dataset = foursquare_crawler(list(NorthYork_data['Postal Code']),
                                                   list(NorthYork_data['Neighbourhood']),
                                                   list(NorthYork_data['Latitude']),
                                                   list(NorthYork_data['Longitude']),)

Crawling different neighborhoods inside "North York"
1.
Data is Obtained, for the Postal Code M2H (and Neighborhoods Hillcrest Village) SUCCESSFULLY.
2.
Data is Obtained, for the Postal Code M2J (and Neighborhoods Fairview, Henry Farm, Oriole) SUCCESSFULLY.
3.
Data is Obtained, for the Postal Code M2K (and Neighborhoods Bayview Village) SUCCESSFULLY.
4.
Data is Obtained, for the Postal Code M2L (and Neighborhoods Silver Hills, York Mills) SUCCESSFULLY.
5.
Data is Obtained, for the Postal Code M2M (and Neighborhoods Newtonbrook, Willowdale) SUCCESSFULLY.
6.
Data is Obtained, for the Postal Code M2N (and Neighborhoods Willowdale South) SUCCESSFULLY.
7.
Data is Obtained, for the Postal Code M2P (and Neighborhoods York Mills West) SUCCESSFULLY.
8.
Data is Obtained, for the Postal Code M2R (and Neighborhoods Willowdale West) SUCCESSFULLY.
9.
Data is Obtained, for the Postal Code M3A (and Neighborhoods Parkwoods) SUCCESSFULLY.
10.
Data is Obtained, for the Postal Code M3B (and Neighborhoods 

In [64]:
# Saving results of Foursquare, so that we would not need to connect every time to Foursquare
import pickle
with open("NorthYork_foursquare_dataset.txt", "wb") as fp:   #Pickling
    pickle.dump(NorthYork_foursquare_dataset, fp)
print('Received Data from Internet is Saved to Computer.')

Received Data from Internet is Saved to Computer.


In [70]:
with open("NorthYork_foursquare_dataset.txt", "rb") as fp:   # Unpickling
    NorthYork_foursquare_dataset = pickle.load(fp)
# print(type(NorthYork_foursquare_dataset))
# Scarborough_foursquare_dataset

### Cleaning the RAW Data Received from Foursquare Database

In [71]:
# This function is created to connect to the saved list which is the received database. It will extract each venue 
# for every neighborhood inside the database

def get_venue_dataset(foursquare_dataset):
    result_df = pd.DataFrame(columns = ['Postal Code', 'Neighborhood', 
                                           'Neighborhood Latitude', 'Neighborhood Longitude',
                                          'Venue', 'Venue Summary', 'Venue Category', 'Distance'])
    # print(result_df)
    
    for neigh_dict in foursquare_dataset:
        postal_code = neigh_dict['Postal Code']; neigh = neigh_dict['Neighborhood(s)']
        lat = neigh_dict['Latitude']; lng = neigh_dict['Longitude']
        print('Number of Venuse in Coordination "{}" Posal Code and "{}" Negihborhood(s) is:'.format(postal_code, neigh))
        print(len(neigh_dict['Crawling_result']))
        
        for venue_dict in neigh_dict['Crawling_result']:
            summary = venue_dict['reasons']['items'][0]['summary']
            name = venue_dict['venue']['name']
            dist = venue_dict['venue']['location']['distance']
            cat =  venue_dict['venue']['categories'][0]['name']
            
            
            # print({'Postal Code': postal_code, 'Neighborhood': neigh, 
            #                   'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
            #                   'Venue': name, 'Venue Summary': summary, 
            #                   'Venue Category': cat, 'Distance': dist})
            
            result_df = result_df.append({'Postal Code': postal_code, 'Neighborhood': neigh, 
                              'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
                              'Venue': name, 'Venue Summary': summary, 
                              'Venue Category': cat, 'Distance': dist}, ignore_index = True)
            # print(result_df)
    
    return(result_df)

In [74]:
NorthYork_venues = get_venue_dataset(NorthYork_foursquare_dataset)


Number of Venuse in Coordination "M2H" Posal Code and "Hillcrest Village" Negihborhood(s) is:
22
Number of Venuse in Coordination "M2J" Posal Code and "Fairview, Henry Farm, Oriole" Negihborhood(s) is:
44
Number of Venuse in Coordination "M2K" Posal Code and "Bayview Village" Negihborhood(s) is:
14
Number of Venuse in Coordination "M2L" Posal Code and "Silver Hills, York Mills" Negihborhood(s) is:
4
Number of Venuse in Coordination "M2M" Posal Code and "Newtonbrook, Willowdale" Negihborhood(s) is:
32
Number of Venuse in Coordination "M2N" Posal Code and "Willowdale South" Negihborhood(s) is:
100
Number of Venuse in Coordination "M2P" Posal Code and "York Mills West" Negihborhood(s) is:
17
Number of Venuse in Coordination "M2R" Posal Code and "Willowdale West" Negihborhood(s) is:
11
Number of Venuse in Coordination "M3A" Posal Code and "Parkwoods" Negihborhood(s) is:
29
Number of Venuse in Coordination "M3B" Posal Code and "Don Mills North" Negihborhood(s) is:
31
Number of Venuse in Coo

In [79]:
NorthYork_venues.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
0,M2H,Hillcrest Village,43.803762,-79.363452,고려삼계탕 Korean Ginseng Chicken Soup & Bibimbap,This spot is popular,Korean Restaurant,754
1,M2H,Hillcrest Village,43.803762,-79.363452,Tastee,This spot is popular,Bakery,692
2,M2H,Hillcrest Village,43.803762,-79.363452,Cummer Park,This spot is popular,Park,776
3,M2H,Hillcrest Village,43.803762,-79.363452,Galati,This spot is popular,Grocery Store,815
4,M2H,Hillcrest Village,43.803762,-79.363452,Tim Hortons,This spot is popular,Coffee Shop,731


### Saving a Cleaned Version of DataFrame as the Results from Foursquare

In [80]:
NorthYork_venues.to_csv('NorthYork_venues.csv')

### Loading Data from File (Saved "Foursquare " DataFrame for Venues)

In [81]:
NorthYork_venues = pd.read_csv('NorthYork_venues.csv')

### Summary Information about Neighborhoods inside "North York"

In [83]:
neigh_list = list(NorthYork_venues['Neighborhood'].unique())
print('Number of Neighborhoods inside Scarborough:')
print(len(neigh_list))
print('List of Neighborhoods inside Scarborough:')
neigh_list

Number of Neighborhoods inside Scarborough:
24
List of Neighborhoods inside Scarborough:


['Hillcrest Village',
 'Fairview, Henry Farm, Oriole',
 'Bayview Village',
 'Silver Hills, York Mills',
 'Newtonbrook, Willowdale',
 'Willowdale South',
 'York Mills West',
 'Willowdale West',
 'Parkwoods',
 'Don Mills North',
 'Don Mills South, Flemingdon Park',
 'Bathurst Manor, Downsview North, Wilson Heights',
 'Northwood Park, York University',
 'CFB Toronto, Downsview East',
 'Downsview West',
 'Downsview Central',
 'Downsview Northwest',
 'Victoria Village',
 'Bedford Park, Lawrence Manor East',
 'Lawrence Heights, Lawrence Manor',
 'Glencairn',
 'Downsview, North Park, Upwood Park',
 'Humber Summit',
 'Emery, Humberlea']

In [84]:

neigh_venue_summary = NorthYork_venues.groupby('Neighborhood').count()
neigh_venue_summary.drop(columns = ['Unnamed: 0']).head()

Unnamed: 0_level_0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Bathurst Manor, Downsview North, Wilson Heights",27,27,27,27,27,27,27
Bayview Village,14,14,14,14,14,14,14
"Bedford Park, Lawrence Manor East",38,38,38,38,38,38,38
"CFB Toronto, Downsview East",20,20,20,20,20,20,20
Don Mills North,31,31,31,31,31,31,31


In [85]:
print('There are {} uniques categories.'.format(len(NorthYork_venues['Venue Category'].unique())))

print('Here is the list of different categories:')
list(NorthYork_venues['Venue Category'].unique())

There are 152 uniques categories.
Here is the list of different categories:


['Korean Restaurant',
 'Bakery',
 'Park',
 'Grocery Store',
 'Coffee Shop',
 'Bank',
 'Pizza Place',
 'Sandwich Place',
 'Fast Food Restaurant',
 'Pharmacy',
 'Housing Development',
 'Chinese Restaurant',
 'Ice Cream Shop',
 'Shopping Mall',
 'Recreation Center',
 'Pool',
 'Residential Building (Apartment / Condo)',
 'Diner',
 'Convenience Store',
 'Toy / Game Store',
 'Movie Theater',
 'Burger Joint',
 'Tea Room',
 'Electronics Store',
 'American Restaurant',
 'Candy Store',
 'Department Store',
 'Salon / Barbershop',
 'Juice Bar',
 'Smoothie Shop',
 'Clothing Store',
 'Theater',
 'Caribbean Restaurant',
 'Japanese Restaurant',
 'Food Court',
 'Supermarket',
 'Restaurant',
 'Cosmetics Shop',
 'Liquor Store',
 'Video Game Store',
 'Beer Store',
 'Café',
 'Skating Rink',
 'Skate Park',
 'Trail',
 'Asian Restaurant',
 'Hookah Bar',
 'Middle Eastern Restaurant',
 'Dessert Shop',
 'Fried Chicken Joint',
 'Hot Dog Joint',
 'Indian Restaurant',
 'Ramen Restaurant',
 'Vape Store',
 'Halal Res

### One-hot Encoding the "categroies" Column into Every Unique Categorical Feature.

In [87]:
# one hot encoding
NorthYork_onehot = pd.get_dummies(data = NorthYork_venues, drop_first  = False, 
                              prefix = "", prefix_sep = "", columns = ['Venue Category'])
NorthYork_onehot.head(2)

Unnamed: 0.1,Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Distance,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Beer Store,Bike Shop,Boutique,Bowling Alley,Breakfast Spot,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Stop,Butcher,Cafeteria,Café,Candy Store,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Community Center,Convenience Store,Cosmetics Shop,Creperie,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Event Space,Falafel Restaurant,Fast Food Restaurant,Fireworks Store,Fish & Chips Shop,Food & Drink Shop,Food Court,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,General Entertainment,Gift Shop,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Halal Restaurant,History Museum,Hockey Arena,Hookah Bar,Hot Dog Joint,Hotel,Housing Development,Ice Cream Shop,Indian Restaurant,Indonesian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Jewelry Store,Juice Bar,Karaoke Bar,Kitchen Supply Store,Korean Restaurant,Latin American Restaurant,Laundry Service,Liquor Store,Lounge,Massage Studio,Mediterranean Restaurant,Men's Store,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Movie Theater,Nightclub,Office,Optical Shop,Other Repair Shop,Paper / Office Supplies Store,Park,Pet Store,Pharmacy,Photography Lab,Pizza Place,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Recreation Center,Rental Car Location,Residential Building (Apartment / Condo),Restaurant,Road,Salad Place,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shop & Service,Shopping Mall,Skate Park,Skating Rink,Ski Area,Ski Chalet,Smoothie Shop,Snack Place,Soccer Field,Spa,Sporting Goods Shop,Sports Bar,Sports Club,Steakhouse,Storage Facility,Supermarket,Sushi Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Vape Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,0,M2H,Hillcrest Village,43.803762,-79.363452,고려삼계탕 Korean Ginseng Chicken Soup & Bibimbap,This spot is popular,754,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,M2H,Hillcrest Village,43.803762,-79.363452,Tastee,This spot is popular,692,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Manually Selecting Related Features for the Wholesaler

In [88]:
# This list is created manually 
important_list_of_features = [
 'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 
    'American Restaurant', 'Asian Restaurant', 'Bakery',  'Beer Store', 'Breakfast Spot', 'Burger Joint', 'Burrito Place', 'Cafeteria', 'Caribbean Restaurant',  'Chinese Restaurant', 'Comfort Food Restaurant', 'Dim Sum Restaurant', 'Diner', 'Eastern European Restaurant', 'Empanada Restaurant', 'Falafel Restaurant', 'Fast Food Restaurant', 'Food Court', 'French Restaurant', 'Greek Restaurant', 'Halal Restaurant', 'Indian Restaurant', 'Indonesian Restaurant', 'Japanese Restaurant',  'Korean Restaurant', 'Ramen Restaurant', 
    'Vietnamese Restaurant']

In [89]:
NorthYork_onehot = NorthYork_onehot[important_list_of_features].drop(
    columns = ['Neighborhood Latitude', 'Neighborhood Longitude']).groupby(
    'Neighborhood').sum()


NorthYork_onehot.head()

Unnamed: 0_level_0,American Restaurant,Asian Restaurant,Bakery,Beer Store,Breakfast Spot,Burger Joint,Burrito Place,Cafeteria,Caribbean Restaurant,Chinese Restaurant,Comfort Food Restaurant,Dim Sum Restaurant,Diner,Eastern European Restaurant,Empanada Restaurant,Falafel Restaurant,Fast Food Restaurant,Food Court,French Restaurant,Greek Restaurant,Halal Restaurant,Indian Restaurant,Indonesian Restaurant,Japanese Restaurant,Korean Restaurant,Ramen Restaurant,Vietnamese Restaurant
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
"Bathurst Manor, Downsview North, Wilson Heights",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Bayview Village,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0
"Bedford Park, Lawrence Manor East",1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,1,0,1,0,0,0,0,0
"CFB Toronto, Downsview East",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Don Mills North,0,1,0,0,1,2,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,3,0,0,0


### Integrating Different Restaurants (Assuming Different Resaturants Use the Same Raw Groceries)

In [90]:
feat_name_list = list(NorthYork_onehot.columns)
restaurant_list = []


for counter, value in enumerate(feat_name_list):
    if value.find('Restaurant') != (-1):
        restaurant_list.append(value)
        
NorthYork_onehot['Total Restaurants'] = NorthYork_onehot[restaurant_list].sum(axis = 1)
NorthYork_onehot = NorthYork_onehot.drop(columns = restaurant_list)


feat_name_list = list(NorthYork_onehot.columns)
joint_list = []


for counter, value in enumerate(feat_name_list):
    if value.find('Joint') != (-1):
        joint_list.append(value)
        
NorthYork_onehot['Total Joints'] = NorthYork_onehot[joint_list].sum(axis = 1)
NorthYork_onehot = NorthYork_onehot.drop(columns = joint_list)

In [91]:
NorthYork_onehot

Unnamed: 0_level_0,Bakery,Beer Store,Breakfast Spot,Burrito Place,Cafeteria,Diner,Food Court,Total Restaurants,Total Joints
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"Bathurst Manor, Downsview North, Wilson Heights",0,0,0,0,0,1,0,1,0
Bayview Village,0,0,0,0,0,0,0,4,0
"Bedford Park, Lawrence Manor East",1,0,0,0,0,0,0,6,0
"CFB Toronto, Downsview East",0,0,0,0,0,0,0,1,0
Don Mills North,0,0,1,0,1,1,0,6,2
"Don Mills South, Flemingdon Park",0,2,0,0,0,0,0,8,1
Downsview Central,0,0,0,0,0,0,0,2,0
Downsview Northwest,0,1,0,0,0,0,0,7,0
Downsview West,0,0,0,0,0,0,0,1,0
"Downsview, North Park, Upwood Park",1,0,0,0,0,0,0,2,0


### Run k-means to Cluster Neighborhoods into 5 Clusters

In [117]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# run k-means clustering
kmeans = KMeans(n_clusters = 5, random_state = 0).fit(NorthYork_onehot)

In [115]:
# Showing Centers of Each Cluster
means_df = pd.DataFrame(kmeans.cluster_centers_)
means_df.columns = NorthYork_onehot.columns
means_df.index = ['G1','G2','G3','G4','G5']
means_df['Total Sum'] = means_df.sum(axis = 1)
means_df.sort_values(axis = 0, by = ['Total Sum'], ascending=False)

Unnamed: 0,Bakery,Beer Store,Breakfast Spot,Burrito Place,Cafeteria,Diner,Food Court,Total Restaurants,Total Joints,Total Sum
G2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,27.0,1.0,31.0
G3,0.5,1.0,0.0,0.0,0.0,0.0,0.25,7.75,0.5,10.0
G4,0.2,0.0,0.2,0.0,0.2,0.2,0.0,5.8,0.6,7.2
G5,0.4,0.0,0.0,0.0,0.0,0.2,0.0,2.8,0.0,3.4
G1,0.333333,2.775558e-17,0.0,0.0,0.0,0.111111,0.0,0.666667,0.0,1.111111


### Result: Best Group is G2; Second Best Group is G3; Third Best Group is G4;

### Inserting "kmeans.labels_" into the Original DataFrame. Finding the Corresponding Group for Each Neighborhood.

In [139]:
neigh_venue_summary = NorthYork_venues.groupby('Neighborhood').count()
neigh_venue_summary.drop(columns = ['Unnamed: 0']).head()
neigh_venue_summary.reset_index(inplace = True)
neigh_venue_summary["Group"] = kmeans.labels_ + 1
#NorthYork_venues
#NorthYork_data.reset_index(inplace = True)
#orthYork_data.drop(columns = 'index', inplace = True)
#neigh_summary = pd.DataFrame([neigh_venue_summary.index, 1 + kmeans.labels_])
#neigh_venue_summary.columns = ['Neighborhood', 'Group']
neigh_summary = neigh_venue_summary[['Neighborhood','Group']]
neigh_summary

Unnamed: 0,Neighborhood,Group
0,"Bathurst Manor, Downsview North, Wilson Heights",1
1,Bayview Village,5
2,"Bedford Park, Lawrence Manor East",4
3,"CFB Toronto, Downsview East",1
4,Don Mills North,4
5,"Don Mills South, Flemingdon Park",3
6,Downsview Central,5
7,Downsview Northwest,3
8,Downsview West,1
9,"Downsview, North Park, Upwood Park",5


### Deducing Results: Best Neighborhood is

In [140]:
neigh_summary[neigh_summary['Group'] == 2]

Unnamed: 0,Neighborhood,Group
21,Willowdale South,2


In [141]:
name_of_neigh = list(neigh_summary[neigh_summary['Group'] == 2]['Neighborhood'])[0]
NorthYork_venues[NorthYork_venues['Neighborhood'] == name_of_neigh].iloc[0,1:5].to_dict()

{'Postal Code': 'M2N',
 'Neighborhood': 'Willowdale South',
 'Neighborhood Latitude': 43.7701199,
 'Neighborhood Longitude': -79.40849279999998}

## Thank You 