In [1]:
#conda install -c conda-forge geopy

In [55]:
# Importing required module 
from geopy.geocoders import Nominatim 
import pandas as pd

import numpy as np

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [56]:
import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans



<b>Import the data sets</b>

In [159]:
# imports all zip codes of Tokyo
tokyo_zip = pd.read_csv("./data/tokyo_zip_latlong.csv", index_col = 0)
tokyo_zip.head()

Unnamed: 0,zip_code,都,区,町,Prefecture,District,Area,lat,long
0,1000000,東京都,千代田区,以下に掲載がない場合,TOKYO TO,CHIYODA KU,IKANIKEISAIGANAIBAAI,,
1,1020072,東京都,千代田区,飯田橋,TOKYO TO,CHIYODA KU,IIDABASHI,,
2,1020082,東京都,千代田区,一番町,TOKYO TO,CHIYODA KU,ICHIBANCHO,35.729056,139.378416
3,1010032,東京都,千代田区,岩本町,TOKYO TO,CHIYODA KU,IWAMOTOCHO,35.6956,139.775379
4,1010047,東京都,千代田区,内神田,TOKYO TO,CHIYODA KU,UCHIKANDA,35.691038,139.76729


In [160]:
tokyo_venues = pd.read_csv("./data/tokyo_venues.csv")
tokyo_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,ICHIBANCHO,35.729056,139.378416,Kidoya (喜奴屋),35.730465,139.379067,Ramen Restaurant
1,ICHIBANCHO,35.729056,139.378416,7-Eleven (セブンイレブン 立川天王橋店),35.727325,139.382386,Convenience Store
2,ICHIBANCHO,35.729056,139.378416,7-Eleven (セブンイレブン 立川一番町4丁目店),35.731071,139.382957,Convenience Store
3,ICHIBANCHO,35.729056,139.378416,Lawson (ローソン 一番町四丁目店),35.731581,139.380197,Convenience Store
4,ICHIBANCHO,35.729056,139.378416,つかさ食堂,35.728742,139.377039,Sake Bar


In [161]:
# convert the venue data into one hot encoding
Tokyo_onehot = pd.get_dummies(tokyo_venues[['Venue Category']], prefix="", prefix_sep="")

# add area column back to dataframe
Tokyo_onehot["Area"] = tokyo_venues["Neighborhood"]
Tokyo_onehot.set_index("Area", inplace = True)
Tokyo_onehot.reset_index(inplace = True)

#grouping rows by area and by taking the mean of the frequency of occurrence of each category
Tokyo_grouped = Tokyo_onehot.groupby('Area').mean().reset_index()
Tokyo_grouped.head()

Unnamed: 0,Area,ATM,Acai House,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airfield,Airport,American Restaurant,...,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yakitori Restaurant,Yoga Studio,Yoshoku Restaurant,Yunnan Restaurant,Zoo,Zoo Exhibit
0,ABURADAI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ADACHI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AIHARAMACHI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AJIRO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AKABANE,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.0,0.0,0.0


In [162]:
# merge the data
zip_neighbor = tokyo_zip.merge(Tokyo_grouped, on = "Area")
zip_neighbor.dropna(inplace = True)

In [163]:
zip_neighbor.head()

Unnamed: 0,zip_code,都,区,町,Prefecture,District,Area,lat,long,ATM,...,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yakitori Restaurant,Yoga Studio,Yoshoku Restaurant,Yunnan Restaurant,Zoo,Zoo Exhibit
0,1020082,東京都,千代田区,一番町,TOKYO TO,CHIYODA KU,ICHIBANCHO,35.729056,139.378416,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1900033,東京都,立川市,一番町,TOKYO TO,TACHIKAWA SHI,ICHIBANCHO,35.729056,139.378416,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1010032,東京都,千代田区,岩本町,TOKYO TO,CHIYODA KU,IWAMOTOCHO,35.6956,139.775379,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
3,1010047,東京都,千代田区,内神田,TOKYO TO,CHIYODA KU,UCHIKANDA,35.691038,139.76729,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.04,0.0,0.0,0.0
4,1000011,東京都,千代田区,内幸町,TOKYO TO,CHIYODA KU,UCHISAIWAICHO,35.669426,139.75546,0.0,...,0.01,0.01,0.0,0.0,0.03,0.0,0.03,0.0,0.0,0.0


In [164]:
# separate the data into campus in Shibuya data and the others
shibuya_campus_zip = "1500002"

campus_zip_shibuya = zip_neighbor.query('zip_code == {}'.format(shibuya_campus_zip))
zip_neighbor = zip_neighbor.query('zip_code != {}'.format(shibuya_campus_zip))

print("We have {} campuses data and {} areas data.".format(campus_zip_neighbor.shape[0], zip_neighbor.shape[0]))

We have 1 campuses data and 1033 areas data.


<b>Plot geographic coordinates in a map</b>

In [165]:
#obtain the location of Tokyo
address = 'Tokyo'

geolocator = Nominatim(user_agent="tk_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude


# create map of Tokyo using latitude and longitude values
map_tokyo = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map to visualize the areas
for lat, lng, label in zip(zip_neighbor['lat'], zip_neighbor['long'], zip_neighbor['Area']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tokyo)  

# add markers to map to visualize Shibuya campus
for lat, lng, label in zip(campus_zip_shibuya['lat'], campus_zip_shibuya['long'], campus_zip_shibuya['Area']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#cc4b31',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tokyo)  
    

map_tokyo

<b>Comparison of the areas by the cosine similarity.</b>

In [166]:
# function to compute the cosine similarity
def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [167]:
#make vectors
drop_col = ["都", "区", "町", "Prefecture", "District", "Area", "lat", "long"]

zip_vector = zip_neighbor.drop(drop_col, axis = 1)

zip_vector_T = zip_vector.set_index("zip_code").T
zip_vector_T.head()

zip_code,1020082,1900033,1010032,1010047,1000011,1010044,1010062,1020094,1020073,1010021,...,1000103,1000104,1000212,1000511,1001212,1001102,1001213,1001622,1001623,1001511
ATM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acai House,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Accessories Store,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adult Boutique,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Afghan Restaurant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [171]:
campus_zip_vector = campus_zip_shibuya.drop(drop_col, axis = 1)

campus_zip_vector_T = campus_zip_vector.set_index("zip_code").T
campus_zip_vector_T.head()

zip_code,1500002
ATM,0.0
Acai House,0.01
Accessories Store,0.0
Adult Boutique,0.0
Afghan Restaurant,0.0


In [172]:
campus_vec = np.array(campus_zip_vector_T[1500002])

# compare the campus vector with the other vectors and store the similarity score in a dict
cos_sim_dict = {}
for zip_code in zip_vector_T.columns:
    comp_vec = np.array(zip_vector_T[zip_code])
    score = cos_sim(campus_vec, comp_vec)
    cos_sim_dict[str(zip_code)] = score

# re-roder the score in descending order and store them in a list
cos_score_sorted = sorted(cos_sim_dict.items(), key=lambda x:x[1], reverse = True)

  return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


In [173]:
# a function to retrieve zip codes from the list
def retrieve_zip(top_n, score_list):
    zip_list = []
    for i in range(top_n):
        zip_list.append(score_list[i][0])
    
    return zip_list

In [174]:
# retrieve top 10 most similar areas to the campus
similar_area_zip = retrieve_zip(10, cos_score_sorted)

similar_area = zip_neighbor.query('zip_code == {}'.format(similar_area_zip))
similar_area

Unnamed: 0,zip_code,都,区,町,Prefecture,District,Area,lat,long,ATM,...,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yakitori Restaurant,Yoga Studio,Yoshoku Restaurant,Yunnan Restaurant,Zoo,Zoo Exhibit
10,1000001,東京都,千代田区,千代田,TOKYO TO,CHIYODA KU,CHIYODA,35.681839,139.76188,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,1000006,東京都,千代田区,有楽町,TOKYO TO,CHIYODA KU,YURAKUCHO,35.674579,139.764846,0.0,...,0.0,0.0,0.0,0.0,0.01,0.01,0.02,0.0,0.0,0.0
48,1070062,東京都,港区,南青山,TOKYO TO,MINATO KU,MINAMIAOYAMA,35.659007,139.717332,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0
67,1620815,東京都,新宿区,筑土八幡町,TOKYO TO,SHINJUKU KU,TSUKUDOHACHIMANCHO,35.703513,139.741119,0.0,...,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
99,1620043,東京都,新宿区,早稲田　南町,TOKYO TO,SHINJUKU KU,WASEDA MINAMICHO,35.702653,139.577244,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0
182,1350014,東京都,江東区,石島,TOKYO TO,KOTO KU,ISHIJIMA,35.655052,139.699472,0.0,...,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0
249,1520035,東京都,目黒区,自由が丘,TOKYO TO,MEGURO KU,JIYUGAOKA,35.607538,139.668828,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
327,1540024,東京都,世田谷区,三軒茶屋,TOKYO TO,SETAGAYA KU,SANGENJAYA,35.642721,139.669912,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
444,1700014,東京都,豊島区,池袋　（１丁目）,TOKYO TO,TOSHIMA KU,IKEBUKURO (1-CHOME),35.730103,139.711884,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0
719,1800003,東京都,武蔵野市,吉祥寺　南町,TOKYO TO,MUSASHINO SHI,KICHIJOJI MINAMICHO,35.701596,139.579748,0.0,...,0.0,0.01,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.0


<b> Plot all the campus and candidate places in a map</b>

In [178]:
# collect data of all campuses
other_campus_zip = ["1520035", "1800004", "1800005", "1600004"]
other_campus_sites = zip_neighbor.query('zip_code == {}'.format(full_campus_zip))
full_campus_sites = pd.concat([campus_zip_shibuya, other_campus_sites])
full_campus_sites

Unnamed: 0,zip_code,都,区,町,Prefecture,District,Area,lat,long,ATM,...,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yakitori Restaurant,Yoga Studio,Yoshoku Restaurant,Yunnan Restaurant,Zoo,Zoo Exhibit
368,1500002,東京都,渋谷区,渋谷,TOKYO TO,SHIBUYA KU,SHIBUYA,35.66013,139.707191,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
94,1600004,東京都,新宿区,四谷,TOKYO TO,SHINJUKU KU,YOTSUYA,35.684861,139.73079,0.0,...,0.0,0.0,0.0,0.0,0.046154,0.0,0.015385,0.0,0.0,0.0
249,1520035,東京都,目黒区,自由が丘,TOKYO TO,MEGURO KU,JIYUGAOKA,35.607538,139.668828,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
717,1800004,東京都,武蔵野市,吉祥寺　本町,TOKYO TO,MUSASHINO SHI,KICHIJOJI HONCHO,35.704327,139.577541,0.0,...,0.0,0.01,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
722,1800005,東京都,武蔵野市,御殿山,TOKYO TO,MUSASHINO SHI,GOTENYAMA,35.702393,139.575435,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.02


In [179]:
campus_candidate_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map to visualize the areas
for lat, lng, label in zip(similar_area['lat'], similar_area['long'], similar_area['Area']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(campus_candidate_map)  

# add markers to map to visualize the campuses
for lat, lng, label in zip(full_campus_sites['lat'], full_campus_sites['long'], full_campus_sites['Area']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#cc4b31',
        fill_opacity=0.7,
        parse_html=False).add_to(campus_candidate_map)  
    

campus_candidate_map