# Building datasets

We build datasets for the pjoject here. We already downloaded Japanese zip codes data from Japan post from [Japan Post](https://www.post.japanpost.jp/zipcode/download.html). The data is saved as tokyo_postal_code.xlsx.

In [1]:
#conda install -c conda-forge geopy
# Importing required module 
from geopy.geocoders import Nominatim 
import pandas as pd

In [2]:
import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [3]:
#Import all the postal codes of Tokyo
zip_all = pd.read_excel("tokyo_postal_code.xlsx", header = None, names = ["zip_code", "都", "区", "町", "Prefecture", "District", "Area"])
zip_all.shape

(4005, 7)

In [4]:
zip_all.head()

Unnamed: 0,zip_code,都,区,町,Prefecture,District,Area
0,1000000,東京都,千代田区,以下に掲載がない場合,TOKYO TO,CHIYODA KU,IKANIKEISAIGANAIBAAI
1,1020072,東京都,千代田区,飯田橋,TOKYO TO,CHIYODA KU,IIDABASHI
2,1020082,東京都,千代田区,一番町,TOKYO TO,CHIYODA KU,ICHIBANCHO
3,1010032,東京都,千代田区,岩本町,TOKYO TO,CHIYODA KU,IWAMOTOCHO
4,1010047,東京都,千代田区,内神田,TOKYO TO,CHIYODA KU,UCHIKANDA


In [5]:
# append geographical coordinate

zip_all["lat"] = None
zip_all["long"] = None
zip_all.head()

Unnamed: 0,zip_code,都,区,町,Prefecture,District,Area,lat,long
0,1000000,東京都,千代田区,以下に掲載がない場合,TOKYO TO,CHIYODA KU,IKANIKEISAIGANAIBAAI,,
1,1020072,東京都,千代田区,飯田橋,TOKYO TO,CHIYODA KU,IIDABASHI,,
2,1020082,東京都,千代田区,一番町,TOKYO TO,CHIYODA KU,ICHIBANCHO,,
3,1010032,東京都,千代田区,岩本町,TOKYO TO,CHIYODA KU,IWAMOTOCHO,,
4,1010047,東京都,千代田区,内神田,TOKYO TO,CHIYODA KU,UCHIKANDA,,


In [158]:
# fetch geographical coordinate by Geopy

for i, code, district, area in zip(range(zip_all.shape[0]), zip_all["zip_code"], zip_all["District"], zip_all["Area"]):
    print("converting row {}".format(i))
    
    #use new agent everytime to evade timeout
    geolocator = Nominatim(user_agent="tokyo-coder" + str(i))
    
    # save the data in the dataframe
    location = geolocator.geocode("{}, {}, Tokyo".format(code, area, district))
    if location:
        zip_all.loc[i, "lat"] = location.latitude
        zip_all.loc[i, "long"] = location.longitude


converting row 0
converting row 1
converting row 2
converting row 3
converting row 4
converting row 5
converting row 6
converting row 7
converting row 8
converting row 9
converting row 10
converting row 11
converting row 12
converting row 13
converting row 14
converting row 15
converting row 16
converting row 17
converting row 18
converting row 19
converting row 20
converting row 21
converting row 22
converting row 23
converting row 24
converting row 25
converting row 26
converting row 27
converting row 28
converting row 29
converting row 30
converting row 31
converting row 32
converting row 33
converting row 34
converting row 35
converting row 36
converting row 37
converting row 38
converting row 39
converting row 40
converting row 41
converting row 42
converting row 43
converting row 44
converting row 45
converting row 46
converting row 47
converting row 48
converting row 49
converting row 50
converting row 51
converting row 52
converting row 53
converting row 54
converting row 55
co

In [9]:
#save the current data

#zip_all.to_csv("tokyo_zip_latlong.csv")

In [4]:
#load the data

#zip_all = pd.read_csv("tokyo_zip_latlong.csv", index_col = 0)
#zip_all.shape

(4005, 9)

In [5]:
# remove rows which do not have lat and long info

zip_filtered = zip_all.dropna()
print("{} geographical coordinates are obtained successfully.".format(zip_filtered.shape[0]))

1035 geographical coordinates are obtained successfully.


In [6]:
zip_filtered

Unnamed: 0,zip_code,都,区,町,Prefecture,District,Area,lat,long
2,1020082,東京都,千代田区,一番町,TOKYO TO,CHIYODA KU,ICHIBANCHO,35.729056,139.378416
3,1010032,東京都,千代田区,岩本町,TOKYO TO,CHIYODA KU,IWAMOTOCHO,35.695600,139.775379
4,1010047,東京都,千代田区,内神田,TOKYO TO,CHIYODA KU,UCHIKANDA,35.691038,139.767290
5,1000011,東京都,千代田区,内幸町,TOKYO TO,CHIYODA KU,UCHISAIWAICHO,35.669426,139.755460
45,1010044,東京都,千代田区,鍛冶町,TOKYO TO,CHIYODA KU,KAJICHO,35.691689,139.771942
...,...,...,...,...,...,...,...,...,...
3990,1001102,東京都,三宅島　三宅村,伊豆,TOKYO TO,MIYAKEJIMA MIYAKE MURA,IZU,31.999927,139.999227
3991,1001213,東京都,三宅島　三宅村,雄山,TOKYO TO,MIYAKEJIMA MIYAKE MURA,OYAMA,35.748624,139.702435
3998,1001622,東京都,八丈島　八丈町,末吉,TOKYO TO,HACHIJOJIMA HACHIJO MACHI,SUEYOSHI,35.658664,139.723526
3999,1001623,東京都,八丈島　八丈町,中之郷,TOKYO TO,HACHIJOJIMA HACHIJO MACHI,NAKANOGO,33.065879,139.813648


Fetch venues info by Folium

In [5]:
#Folium credentials

CLIENT_ID = '' #Foursquare ID ## Removed for sequrity reason
CLIENT_SECRET = '' # your Foursquare Secret ## Removed for sequrity reason
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: XF1O1VAJQ1PVM0JZDGZP5DLGC0STTBPDGLDWVKDADMWWLKC4
CLIENT_SECRET:BUI4S2QZWXPLJCZC5BJON0UD2TIIXN30BVR43FL3JGSR40RD


In [14]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    # inside the function, venues in all neighborhoods in our data are obtained iteratively.
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
        
        except KeyError:
            venues_list.append([(
                name, 
                lat, 
                lng,
                None,
                None,
                None,
                None)])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [16]:
Tokyo_venues = getNearbyVenues(names = zip_filtered["zip_code"],
                               latitudes = zip_filtered["lat"],
                               longitudes = zip_filtered["long"])

1020082
1010032
1010047
1000011
1010044
1010062
1020094
1020073
1010021
1000001
1010031
1000003
1010003
1020093
1020071
1000006
1020085
1040044
1040042
1040031
1040061
1040033
1040041
1040052
1040045
1040051
1040055
1030016
1040032
1040043
1040028
1060041
1050022
1080022
1070061
1050014
1050023
1050011
1050012
1080072
1080071
1050004
1350091
1080074
1060031
1050003
1070062
1060046
1600007
1690072
1620825
1600001
1600021
1620834
1600017
1600016
1620065
1600015
1690075
1620815
1620808
1620067
1610035
1620835
1610031
1690073
1620851
1620836
1620055
1600004
1600011
1620056
1620043
1120012
1120003
1120002
1120004
1120006
1120005
1120014
1120011
1130022
1130024
1130031
1130001
1120001
1130021
1130033
1130023
1120015
1130032
1130034
1100006
1110032
1110053
1100008
1110024
1100013
1100005
1110034
1100014
1110022
1110051
1110056
1110042
1110043
1100004
1110031
1100016
1110054
1110035
1110021
1100003
1110023
1110033
1100015
1110036
1100011
1110041
1100001
1110052
1100012
1300001
1300011
1310045


In [21]:
#Tokyo_venues.rename(columns={"Neighborhood": "zip_code"}, inplace = True)
#Tokyo_venues.to_csv("tokyo_venues_zip500.csv", index = False)

In [24]:
Tokyo_venues.shape

(38553, 7)