In [1]:
import json
import pandas as pd
import numpy as np
import csv
import time

# converting business.json to business.csv
### importing from json file

In [3]:
# Load business.json (one JSON object per line) into a list of dicts and time it.
start = time.time()
print("Executing the code ...\n")
listOfDicts_business = []
with open('/Users/kemalm/Desktop/yelp_dataset/business.json', encoding='utf-8') as f:
    # extend() pulls lines from the file one at a time and parses each as JSON.
    listOfDicts_business.extend(json.loads(raw_line) for raw_line in f)
counter = len(listOfDicts_business)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 192609 dictionaries.
Execution time:  3.6180918216705322  seconds.


In [4]:
# Shell out to count the lines of the raw file (one JSON object per line is expected).
!wc -l /Users/kemalm/Desktop/yelp_dataset/business.json

  192609 /Users/kemalm/Desktop/yelp_dataset/business.json


### naming a list of attributes

In [2]:
# Keys that each business record is expected to contain.
attr_arr = np.array([
    'business_id',
    'name',
    'address',
    'city',
    'state',
    'postal_code',
    'latitude',
    'longitude',
    'stars',
    'review_count',
    'is_open',
    'attributes',
    'categories',
    'hours',
])
print(attr_arr, type(attr_arr))

['business_id' 'name' 'address' 'city' 'state' 'postal_code' 'latitude'
 'longitude' 'stars' 'review_count' 'is_open' 'attributes' 'categories'
 'hours'] <class 'numpy.ndarray'>


<h3> Checking whether every JSON object read from the <i style="color:blue">business.json</i> file contains all the keys listed in the Yelp dataset documentation </h3> 

In [6]:
# Verify that every business record carries all expected keys.
# df_containsfield[i, j] is 1.0 when record i has key attr_arr[j], otherwise 0.0.
expected_keys = attr_arr.tolist()
df_containsfield = np.zeros((len(listOfDicts_business), len(expected_keys)))

for i, record in enumerate(listOfDicts_business):
    # Plain dict membership gives the same flags as a per-record np.isin call.
    df_containsfield[i, :] = [key in record for key in expected_keys]
print(df_containsfield.shape[0], " number of records\n")
# `object` replaces the `np.object` alias, which NumPy deprecated in 1.20 and removed in 1.24.
tkeys_counter = np.zeros((attr_arr.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(attr_arr)
# Column sums give, per key, the number of records that contain it.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

192609  number of records

Key associated with its frequency: 
 [['business_id' 192609.0]
 ['name' 192609.0]
 ['address' 192609.0]
 ['city' 192609.0]
 ['state' 192609.0]
 ['postal_code' 192609.0]
 ['latitude' 192609.0]
 ['longitude' 192609.0]
 ['stars' 192609.0]
 ['review_count' 192609.0]
 ['is_open' 192609.0]
 ['attributes' 192609.0]
 ['categories' 192609.0]
 ['hours' 192609.0]]


<h3> number of null values per column</h3> 

In [7]:
# For each expected key, count the lines of business.json that contain the text "<key>":null.
# Every iteration shells out to grep, so the file is scanned once per key.
start = time.time()
for a in list(attr_arr):
    # Echo the key name, then print the number of matching lines.
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/business.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

business_id
       0
name
       0
address
       0
city
       0
state
       0
postal_code
       0
latitude
       0
longitude
       0
stars
       0
review_count
       0
is_open
       0
attributes
   28836
categories
     482
hours
   44830


<h3> Creating a <i style="color:blue"> business.csv </i> file and writing data to it. </h3> 

In [6]:
class mydict(dict):
    """dict subclass whose string form is its JSON serialization."""

    def __str__(self):
        # json.dumps produces JSON syntax (double quotes, null/true/false)
        # instead of the Python repr a plain dict would give.
        serialized = json.dumps(self)
        return serialized

In [15]:
# Write the business records to a tab-delimited file (header row + one row per record).
# The nested 'attributes' and 'hours' objects are stored as JSON text; a null value becomes "{}".
start = time.time()
print("Executing the code ...\n")
# newline='' is what the csv module asks for on files it writes to: it stops the
# text layer from translating the row terminators and any newlines inside fields.
with open('/Users/kemalm/Desktop/yelp_dataset/business.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, list(attr_arr), delimiter='\t')
    writer.writeheader()
    counter = 0
    for dictObj in listOfDicts_business:
        # Shallow copy so the in-memory record keeps its nested dicts.
        tempDict = dict(dictObj)
        for nested_key in ('attributes', 'hours'):
            nested_value = tempDict.get(nested_key)
            tempDict[nested_key] = "{}" if nested_value is None else json.dumps(nested_value)
        writer.writerow(tempDict)
        counter += 1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 192609 rows
Execution time:  6.614404201507568  seconds.


# converting user.json to user.csv
### importing from json file

In [3]:
# Load user.json (one JSON object per line) into a list of dicts and time it.
start = time.time()
print("Executing the code ...\n")
listOfDicts_user = []
with open('/Users/kemalm/Desktop/yelp_dataset/user.json', encoding='utf-8') as f:
    counter = 0
    for line in f:
        listOfDicts_user.append(json.loads(line))
        counter += 1
# The timestamp must be stored in `end` (it was misspelled `endend`), otherwise the
# elapsed time below is computed from a stale or undefined `end`.
end = time.time()
print("Successfully appended {} rows".format(counter))
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 1637138 rows
Execution time:  42.027015209198  seconds.


In [5]:
# Shell out to count the lines of the raw file (one JSON object per line is expected).
!wc -l /Users/kemalm/Desktop/yelp_dataset/user.json

 1637138 /Users/kemalm/Desktop/yelp_dataset/user.json


In [4]:
# Number of parsed user records, shown as the cell's output.
len(listOfDicts_user)

1637138

### naming a list of attributes

In [8]:
# Keys that each user record is expected to contain.
arr_user = np.array([
    'user_id',
    'name',
    'review_count',
    'yelping_since',
    'useful',
    'funny',
    'cool',
    'elite',
    'friends',
    'fans',
    'average_stars',
    'compliment_hot',
    'compliment_more',
    'compliment_profile',
    'compliment_cute',
    'compliment_list',
    'compliment_note',
    'compliment_plain',
    'compliment_cool',
    'compliment_funny',
    'compliment_writer',
    'compliment_photos',
])

<h3> Checking whether every JSON object read from the <i style="color:blue">user.json</i> file contains all the keys listed in the Yelp dataset documentation </h3> 

In [9]:
# Verify that every user record carries all expected keys, and time the check.
# df_containsfield[i, j] is 1.0 when record i has key arr_user[j], otherwise 0.0.
start = time.time()
print("Executing the code ...\n")
expected_keys = arr_user.tolist()
df_containsfield = np.zeros((len(listOfDicts_user), len(expected_keys)))

for i, record in enumerate(listOfDicts_user):
    # Plain dict membership gives the same flags as a per-record np.isin call.
    df_containsfield[i, :] = [key in record for key in expected_keys]
print(df_containsfield.shape[0], " number of records\n")
# `object` replaces the `np.object` alias, which NumPy deprecated in 1.20 and removed in 1.24.
tkeys_counter = np.zeros((arr_user.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_user)
# Column sums give, per key, the number of records that contain it.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")


Executing the code ...

1637138  number of records

Key associated with its frequency: 
 [['user_id' 1637138.0]
 ['name' 1637138.0]
 ['review_count' 1637138.0]
 ['yelping_since' 1637138.0]
 ['useful' 1637138.0]
 ['funny' 1637138.0]
 ['cool' 1637138.0]
 ['elite' 1637138.0]
 ['friends' 1637138.0]
 ['fans' 1637138.0]
 ['average_stars' 1637138.0]
 ['compliment_hot' 1637138.0]
 ['compliment_more' 1637138.0]
 ['compliment_profile' 1637138.0]
 ['compliment_cute' 1637138.0]
 ['compliment_list' 1637138.0]
 ['compliment_note' 1637138.0]
 ['compliment_plain' 1637138.0]
 ['compliment_cool' 1637138.0]
 ['compliment_funny' 1637138.0]
 ['compliment_writer' 1637138.0]
 ['compliment_photos' 1637138.0]]
Execution time:  111.81420087814331  seconds.


<h3> number of null values per column</h3> 
<h5 style="color:red;"> WARNING! The following method is very slow for very large datasets (user.json). </h5> 
<h5 style="color:red;"> Therefore, it should not be run more than once. </h5> 

In [9]:
# For each expected key, count the lines of user.json that contain the text "<key>":null.
# Every iteration shells out to grep, so the file is scanned once per key.
start = time.time()
for a in list(arr_user):
    # Echo the key name, then print the number of matching lines.
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/user.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

user_id
       0
name
       0
review_count
       0
yelping_since
       0
useful
       0
funny
       0
cool
       0
elite
       0
friends
       0
fans
       0
average_stars
       0
compliment_hot
       0
compliment_more
       0
compliment_profile
       0
compliment_cute
       0
compliment_list
       0
compliment_note
       0
compliment_plain
       0
compliment_cool
       0
compliment_funny
       0
compliment_writer
       0
compliment_photos
       0
Execution time:  963.2632689476013  seconds.


<h3> Creating a <i style="color:blue"> user.csv </i> file and writing data to it. </h3> 

In [24]:
# Write the user records to a tab-delimited file (header row + one row per record).
start = time.time()
print("Executing the code ...\n")
# Define the column list here: no other cell in this notebook assigns `user_cols`,
# so the cell would otherwise fail with a NameError on a fresh kernel.
user_cols = list(arr_user)
# newline='' is what the csv module asks for on files it writes to: it stops the
# text layer from translating the row terminators and any newlines inside fields.
with open('/Users/kemalm/Desktop/yelp_dataset/user.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, user_cols, delimiter='\t')
    writer.writeheader()
    writer.writerows(listOfDicts_user)
    counter = len(listOfDicts_user)
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 1637138 rows
Execution time:  75.70107102394104


In [47]:
# Compare the line counts of the JSON input and the CSV output.
# NOTE(review): these paths are relative, unlike the absolute paths used in the other
# cells, so the result depends on the kernel's working directory.
!wc -l yelp_dataset/user.json
!echo "User.csv has one more row used as a header."
!wc -l yelp_dataset/user.csv


 1637138 yelp_dataset/user.json
User.csv has one more row used as a header.
 1637139 yelp_dataset/user.csv


# converting review.json to review.csv
### importing from json file

In [4]:
# Load review.json (one JSON object per line) into a list of dicts and time it.
start = time.time()
print("Executing the code ...\n")
listOfDicts_review = []
with open('/Users/kemalm/Desktop/yelp_dataset/review.json', encoding='utf-8') as f:
    # extend() pulls lines from the file one at a time and parses each as JSON.
    listOfDicts_review.extend(json.loads(raw_line) for raw_line in f)
    counter = len(listOfDicts_review)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 6685900 dictionaries.
Execution time:  71.14869093894958  seconds.


In [6]:
# Shell out to count the lines of the raw file (one JSON object per line is expected).
!wc -l /Users/kemalm/Desktop/yelp_dataset/review.json

 6685900 /Users/kemalm/Desktop/yelp_dataset/review.json


### naming a list of attributes

In [11]:
# Keys that each review record is expected to contain.
arr_review = np.array([
    'review_id',
    'user_id',
    'business_id',
    'stars',
    'useful',
    'funny',
    'cool',
    'text',
    'date',
])

<h3> Checking whether every JSON object read from the <i style="color:blue">review.json</i> file contains all the keys listed in the Yelp dataset documentation </h3> 

In [10]:
# Verify that every review record carries all expected keys, and time the check.
# df_containsfield[i, j] is 1.0 when record i has key arr_review[j], otherwise 0.0.
start = time.time()
print("Executing the code ...\n")
expected_keys = arr_review.tolist()
df_containsfield = np.zeros((len(listOfDicts_review), len(expected_keys)))

for i, record in enumerate(listOfDicts_review):
    # Plain dict membership gives the same flags as a per-record np.isin call.
    df_containsfield[i, :] = [key in record for key in expected_keys]
print(df_containsfield.shape[0], " number of records\n")
# `object` replaces the `np.object` alias, which NumPy deprecated in 1.20 and removed in 1.24.
tkeys_counter = np.zeros((arr_review.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_review)
# Column sums give, per key, the number of records that contain it.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

6685900  number of records

Key associated with its frequency: 
 [['review_id' 6685900.0]
 ['user_id' 6685900.0]
 ['business_id' 6685900.0]
 ['stars' 6685900.0]
 ['useful' 6685900.0]
 ['funny' 6685900.0]
 ['cool' 6685900.0]
 ['text' 6685900.0]
 ['date' 6685900.0]]
Execution time:  299.6732749938965  seconds.


<h3> number of null values per column</h3> 
<h5 style="color:red;"> WARNING! The following method is very slow for very large datasets (review.json). </h5> 
<h5 style="color:red;"> Therefore, it should not be run more than once. </h5> 

In [13]:
# For each expected key, count the lines of review.json that contain the text "<key>":null.
# Every iteration shells out to grep, so the file is scanned once per key.
start = time.time()
for a in list(arr_review):
    # Echo the key name, then print the number of matching lines.
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/review.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

review_id
       0
user_id
       0
business_id
       0
stars
       0
useful
       0
funny
       0
cool
       0
text
       0
date
       0
Execution time:  913.4484198093414  seconds.


In [3]:
# Column order for the CSV header, taken element by element from arr_review.
review_cols = [column for column in arr_review]
print(review_cols)

['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']


<h3> Creating a <i style="color:blue"> review.csv </i> file and writing data to it. </h3> 

In [4]:
# Write the review records to a tab-delimited file (header row + one row per record).
start = time.time()
print("Executing the code ...\n")
# newline='' is what the csv module asks for on files it writes to: it stops the
# text layer from translating the row terminators and any newlines inside fields.
with open('/Users/kemalm/Desktop/yelp_dataset/review.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, review_cols, delimiter='\t')
    writer.writeheader()
    writer.writerows(listOfDicts_review)
    counter = len(listOfDicts_review)
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")



Executing the code ...

Successfully written 6685900 rows
Execution time:  232.80819010734558  seconds.


# converting checkin.json to checkin.csv
### importing from json file

In [7]:
# Load checkin.json (one JSON object per line) into a list of dicts and time it.
start = time.time()
print("Executing the code ...\n")
listOfDicts_checkin = []
with open('/Users/kemalm/Desktop/yelp_dataset/checkin.json', encoding='utf-8') as f:
    # extend() pulls lines from the file one at a time and parses each as JSON.
    listOfDicts_checkin.extend(json.loads(raw_line) for raw_line in f)
    counter = len(listOfDicts_checkin)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 161950 dictionaries.
Execution time:  2.0320558547973633  seconds.


In [3]:
# Shell out to count the lines of the raw file (one JSON object per line is expected).
!wc -l /Users/kemalm/Desktop/yelp_dataset/checkin.json

  161950 /Users/kemalm/Desktop/yelp_dataset/checkin.json


### naming a list of attributes

In [15]:
# Keys that each check-in record is expected to contain.
arr_checkin = np.array([
    'business_id',
    'date',
])

<h3> Checking whether every JSON object read from the <i style="color:blue">checkin.json</i> file contains all the keys listed in the Yelp dataset documentation </h3> 

In [12]:
# Verify that every check-in record carries all expected keys, and time the check.
# df_containsfield[i, j] is 1.0 when record i has key arr_checkin[j], otherwise 0.0.
start = time.time()
print("Executing the code ...\n")
expected_keys = arr_checkin.tolist()
df_containsfield = np.zeros((len(listOfDicts_checkin), len(expected_keys)))

for i, record in enumerate(listOfDicts_checkin):
    # Plain dict membership gives the same flags as a per-record np.isin call.
    df_containsfield[i, :] = [key in record for key in expected_keys]
print(df_containsfield.shape[0], " number of records\n")
# `object` replaces the `np.object` alias, which NumPy deprecated in 1.20 and removed in 1.24.
tkeys_counter = np.zeros((arr_checkin.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_checkin)
# Column sums give, per key, the number of records that contain it.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

161950  number of records

Key associated with its frequency: 
 [['business_id' 161950.0]
 ['date' 161950.0]]
Execution time:  2.568455934524536  seconds.


<h3> number of null values per column</h3> 

In [16]:
# For each expected key, count the lines of checkin.json that contain the text "<key>":null.
# Every iteration shells out to grep, so the file is scanned once per key.
start = time.time()
for a in list(arr_checkin):
    # Echo the key name, then print the number of matching lines.
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/checkin.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

business_id
       0
date
       0
Execution time:  14.164305925369263  seconds.


In [13]:
# Column order for the CSV header, taken element by element from arr_checkin.
checkin_cols = [column for column in arr_checkin]
print(checkin_cols)

['business_id', 'date']


<h3> Creating a <i style="color:blue"> checkin.csv </i> file and writing data to it. </h3> 

In [6]:
# Write the check-in records to a tab-delimited file (header row + one row per record).
start = time.time()
print("Executing the code ...\n")
# newline='' is what the csv module asks for on files it writes to: it stops the
# text layer from translating the row terminators and any newlines inside fields.
with open('/Users/kemalm/Desktop/yelp_dataset/checkin.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, checkin_cols, delimiter='\t')
    writer.writeheader()
    writer.writerows(listOfDicts_checkin)
    counter = len(listOfDicts_checkin)
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 161950 rows
Execution time:  9.046382665634155  seconds.


In [8]:
# Compare the line counts of the JSON input and the CSV output.
# NOTE(review): these paths are relative, unlike the absolute paths used in the other
# cells, so the result depends on the kernel's working directory.
!wc -l yelp_dataset/checkin.json
!wc -l yelp_dataset/checkin.csv


  161950 yelp_dataset/checkin.json
  161951 yelp_dataset/checkin.csv


# converting tip.json to tip.csv
### importing from json file

In [3]:
# Load tip.json (one JSON object per line) into a list of dicts and time it.
start = time.time()
print("Executing the code ...\n")
listOfDicts_tip = []
with open('/Users/kemalm/Desktop/yelp_dataset/tip.json', encoding='utf-8') as f:
    # extend() pulls lines from the file one at a time and parses each as JSON.
    listOfDicts_tip.extend(json.loads(raw_line) for raw_line in f)
    counter = len(listOfDicts_tip)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 1223094 dictionaries.
Execution time:  6.717769145965576  seconds.


In [6]:
# Shell out to count the lines of the raw file (one JSON object per line is expected).
!wc -l /Users/kemalm/Desktop/yelp_dataset/tip.json

 1223094 /Users/kemalm/Desktop/yelp_dataset/tip.json


### naming a list of attributes

In [17]:
# Keys that each tip record is expected to contain.
arr_tip = np.array([
    'user_id',
    'business_id',
    'text',
    'date',
    'compliment_count',
])

<h3> Checking whether every JSON object read from the <i style="color:blue">tip.json</i> file contains all the keys listed in the Yelp dataset documentation </h3> 

In [10]:
# Verify that every tip record carries all expected keys, and time the check.
# df_containsfield[i, j] is 1.0 when record i has key arr_tip[j], otherwise 0.0.
start = time.time()
print("Executing the code ...\n")
expected_keys = arr_tip.tolist()
df_containsfield = np.zeros((len(listOfDicts_tip), len(expected_keys)))

for i, record in enumerate(listOfDicts_tip):
    # Plain dict membership gives the same flags as a per-record np.isin call.
    df_containsfield[i, :] = [key in record for key in expected_keys]
print(df_containsfield.shape[0], " number of records\n")
# `object` replaces the `np.object` alias, which NumPy deprecated in 1.20 and removed in 1.24.
tkeys_counter = np.zeros((arr_tip.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_tip)
# Column sums give, per key, the number of records that contain it.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

1223094  number of records

Key associated with its frequency: 
 [['user_id' 1223094.0]
 ['business_id' 1223094.0]
 ['text' 1223094.0]
 ['date' 1223094.0]
 ['compliment_count' 1223094.0]]
Execution time:  30.554124116897583  seconds.


<h3> number of null values per column</h3> 

In [18]:
start = time.time()
for a in list(arr_tip):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/tip.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

user_id
       0
business_id
       0
text
       0
date
       0
compliment_count
       0
Execution time:  25.752610683441162  seconds.


In [9]:
# Column order for the CSV header, taken element by element from arr_tip.
tip_cols = [column for column in arr_tip]
print(tip_cols)

['user_id', 'business_id', 'text', 'date', 'compliment_count']


<h3> Creating a <i style="color:blue"> tip.csv </i> file and writing data to it. </h3> 

In [16]:
# Write the tip records to a tab-delimited file (header row + one row per record).
start = time.time()
print("Executing the code ...\n")
# newline='' is what the csv module asks for on files it writes to: it stops the
# text layer from translating the row terminators and any newlines inside fields.
with open('/Users/kemalm/Desktop/yelp_dataset/tip.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, tip_cols, delimiter='\t')
    writer.writeheader()
    writer.writerows(listOfDicts_tip)
    counter = len(listOfDicts_tip)
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 1223094 rows
Execution time:  8.033058166503906  seconds.


# converting photo.json to photo.csv
### importing from json file

In [6]:
# Load photo.json (one JSON object per line) into a list of dicts and time it.
start = time.time()
print("Executing the code ...\n")
listOfDicts_photo = []
with open('/Users/kemalm/Desktop/yelp_dataset/photo.json', encoding='utf-8') as f:
    # extend() pulls lines from the file one at a time and parses each as JSON.
    listOfDicts_photo.extend(json.loads(raw_line) for raw_line in f)
    counter = len(listOfDicts_photo)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 200000 dictionaries.
Execution time:  0.9235949516296387  seconds.


In [2]:
# Shell out to count the lines of the raw file (one JSON object per line is expected).
!wc -l /Users/kemalm/Desktop/yelp_dataset/photo.json

  200000 /Users/kemalm/Desktop/yelp_dataset/photo.json


### naming a list of attributes

In [20]:
# Keys that each photo record is expected to contain.
arr_photo = np.array([
    'caption',
    'photo_id',
    'business_id',
    'label',
])

<h3> Checking whether every JSON object read from the <i style="color:blue">photo.json</i> file contains all the keys listed in the Yelp dataset documentation </h3> 

In [10]:
# Verify that every photo record carries all expected keys, and time the check.
# df_containsfield[i, j] is 1.0 when record i has key arr_photo[j], otherwise 0.0.
start = time.time()
print("Executing the code ...\n")
expected_keys = arr_photo.tolist()
df_containsfield = np.zeros((len(listOfDicts_photo), len(expected_keys)))

for i, record in enumerate(listOfDicts_photo):
    # Plain dict membership gives the same flags as a per-record np.isin call.
    df_containsfield[i, :] = [key in record for key in expected_keys]
print(df_containsfield.shape[0], " number of records\n")
# `object` replaces the `np.object` alias, which NumPy deprecated in 1.20 and removed in 1.24.
tkeys_counter = np.zeros((arr_photo.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_photo)
# Column sums give, per key, the number of records that contain it.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

200000  number of records

Key associated with its frequency: 
 [['caption' 200000.0]
 ['photo_id' 200000.0]
 ['business_id' 200000.0]
 ['label' 200000.0]]
Execution time:  4.147678852081299  seconds.


<h3> number of null values per column</h3> 

In [21]:
# For each expected key, count the lines of photo.json that contain the text "<key>":null.
# Every iteration shells out to grep, so the file is scanned once per key.
start = time.time()
for a in list(arr_photo):
    # Echo the key name, then print the number of matching lines.
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/photo.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

caption
       0
photo_id
       0
business_id
       0
label
       0
Execution time:  3.233721971511841  seconds.


In [9]:
# Column order for the CSV header, taken element by element from arr_photo.
photo_cols = [column for column in arr_photo]
print(photo_cols)

['caption', 'photo_id', 'business_id', 'label']


<h3> Creating a <i style="color:blue"> photo.csv </i> file and writing data to it. </h3> 

In [5]:
# Write the photo records to a tab-delimited file (header row + one row per record).
start = time.time()
print("Executing the code ...\n")
# newline='' is what the csv module asks for on files it writes to: it stops the
# text layer from translating the row terminators and any newlines inside fields.
with open('/Users/kemalm/Desktop/yelp_dataset/photo.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, photo_cols, delimiter='\t')
    writer.writeheader()
    writer.writerows(listOfDicts_photo)
    counter = len(listOfDicts_photo)
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 200000 rows
Execution time:  0.9482808113098145  seconds.


# Python-to-PostgreSQL client

In [2]:
import psycopg2

In [3]:
# Open the PostgreSQL connection and bind it to `conn`.
# NOTE(review): the password is hard-coded in the connection string; consider moving it
# to an environment variable or a password file before sharing this notebook.
try:
    conn = psycopg2.connect("dbname='yelpDB' user='postgres' host='localhost' password='P0$tgre$QL'")
except psycopg2.Error as exc:
    # Report the reason and re-raise: swallowing the error would leave `conn`
    # undefined, and any later use of it would fail with an unrelated NameError.
    print("I am unable to connect to the database")
    print(exc)
    raise

In [8]:
# Open a cursor on the existing connection.
cur = conn.cursor()
# Disabled query: the 10 (city, state) pairs with the most open businesses.
#cur.execute("""select city,state, count(business_id)
#                from Businesses
#                where is_open = 1
#                group by city,state
#                order by 3 desc
#                limit 10""")
#recordsDB = cur.fetchall()

In [30]:
# Fetch nine descriptive columns for every row of the Businesses table into memory.
businesses_query = """select name, address, city, state, latitude, longitude, categories, is_open, hours
                from Businesses"""
cur = conn.cursor()
cur.execute(businesses_query)
# fetchall() returns all remaining rows of the result set.
recordsDB = cur.fetchall()

# Google Places: Sending HTTP Requests 

In [63]:
import requests
from scipy.spatial.distance import pdist
from geopy.distance import geodesic
import numpy as np
import random as rn

In [64]:
# Read the Google Maps API key from a local file; the key is the first line.
# Reading the file in Python instead of shelling out to `cat` is portable and makes
# a missing or unreadable file raise an exception in this cell.
with open('/Users/kemalm/Desktop/gmAPI.txt', encoding='utf-8') as key_file:
    key_content = key_file.read().splitlines()
# Strip stray whitespace so it does not end up inside request URLs.
api_key = key_content[0].strip()

In [65]:
# Map each column name to a positional index: the database fields first (0-8),
# then their 'obt_'-prefixed counterparts (9-17), then the distance column (18).
fields = ['name', 'address', 'city', 'state', 'latitude', 'longitude', 'categories', 'is_open', 'hours']
# Build the prefixed names with plain Python strings: this avoids the private
# np.core namespace and keeps the dict keys as ordinary `str` objects.
obt_fields = ['obt_' + field for field in fields]
# `k` is kept as an ndarray under its original name.
k = np.array(obt_fields)

indices = list(range(len(fields) * 2))
mapDictIndexes = dict(zip(fields + obt_fields, indices))
mapDictIndexes['diff_distance_in_meters'] = len(fields) * 2
print(mapDictIndexes)

{'name': 0, 'address': 1, 'city': 2, 'state': 3, 'latitude': 4, 'longitude': 5, 'categories': 6, 'is_open': 7, 'hours': 8, 'obt_name': 9, 'obt_address': 10, 'obt_city': 11, 'obt_state': 12, 'obt_latitude': 13, 'obt_longitude': 14, 'obt_categories': 15, 'obt_is_open': 16, 'obt_hours': 17, 'diff_distance_in_meters': 18}


In [26]:
# Try out the distance computation on two points at latitude 0, 180 degrees of longitude apart.
def _geodesic_km(u, v):
    """Return the geodesic distance between points u and v, in kilometers."""
    return geodesic(u, v).kilometers

coordinates = np.array([[0, 0], [0, 180]])
# pdist applies the metric to every pair of rows of the coordinates matrix.
m_dist = pdist(coordinates, _geodesic_km)


In [59]:
# Probe the Places "nearby search" endpoint with one hand-picked location and keyword.
smpl_lat = 33.5221294
smpl_lng = -112.0181866
nearby_search_template = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={},{}&radius=500&keyword={}&key={}"
url_api_place_nearbySearch = nearby_search_template.format(
    smpl_lat, smpl_lng, 'Arizona Biltmore Golf Club', api_key)
response_api_place_nearbySearch = requests.get(url_api_place_nearbySearch)

In [60]:
# Print the parsed response body, re-serialized as a single JSON string.
print(json.dumps(response_api_place_nearbySearch.json()))

{"html_attributions": [], "results": [{"geometry": {"location": {"lat": 33.5175972, "lng": -112.0213628}, "viewport": {"northeast": {"lat": 33.5220429, "lng": -112.0195567201073}, "southwest": {"lat": 33.5161153, "lng": -112.0222563798927}}}, "icon": "https://maps.gstatic.com/mapfiles/place_api/icons/golf-71.png", "id": "4f875490889efc8934301f8e02a335a0b908f81f", "name": "Arizona Biltmore Golf Club", "opening_hours": {"open_now": true}, "photos": [{"height": 2988, "html_attributions": ["<a href=\"https://maps.google.com/maps/contrib/112214149588074350919/photos\">Rodolfo Concepcion</a>"], "photo_reference": "CmRaAAAAusRoo-LbrwqaaaN2MjOjbvkIvfwdas5ojS2L3f9v_wbOrQMr-bso5CZor3XbYUZbLgtW3t8uBCRH2jxbl5uO_-H_4Don1rnlNsNiFm23m2OecwtMq4vw_2gbttX6emFcEhBioqy9PTeeDIQXtLEpTAC7GhRE5Dh_E8m1zn2ewTjwmTpyURSx2Q", "width": 5312}], "place_id": "ChIJ47o1_EENK4cRCeK-yfYA-V8", "plus_code": {"compound_code": "GX9H+2F Phoenix, Arizona, USA", "global_code": "8559GX9H+2F"}, "rating": 4.3, "reference": "ChIJ47o

In [56]:
# Probe the Places "nearby search" endpoint with a second hand-picked location and keyword.
smpl_lat = 33.654815
smpl_lng = -112.188568
nearby_search_template = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={},{}&radius=500&keyword={}&key={}"
url_api_place_nearbySearch = nearby_search_template.format(
    smpl_lat, smpl_lng, 'Vita Bella Fine Day Spa', api_key)
response_api_place_nearbySearch = requests.get(url_api_place_nearbySearch)

In [57]:
# Display the parsed JSON body of the response as the cell's output.
response_api_place_nearbySearch.json()

{'html_attributions': [], 'results': [], 'status': 'ZERO_RESULTS'}

In [82]:
# For a random sample of 100 database records, query the Google Places "nearby search"
# endpoint (500 m radius around the stored coordinates, keyword = business name) and
# store the database values next to the values obtained from the API, plus the
# distance in meters between the two locations.
NEARBY_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
DB_FIELDS = ('name', 'address', 'city', 'state', 'latitude', 'longitude',
             'categories', 'is_open', 'hours')

start = time.time()
print("Executing the code ...\n")
listOfDicts = list()
counter = 0
for record in rn.sample(recordsDB, 100):

    # Fields of a record from yelpDB database
    dictObj = {field: record[mapDictIndexes[field]] for field in DB_FIELDS}
    business_name = dictObj['name']
    business_latitude = dictObj['latitude']
    business_longitude = dictObj['longitude']

    # Let requests build the query string, so that names containing '&', '#', '+'
    # or spaces are percent-encoded instead of corrupting the URL.
    query_params = {
        'location': '{},{}'.format(business_latitude, business_longitude),
        'radius': 500,
        'keyword': business_name,
        'key': api_key,
    }
    # The timeout keeps one stalled request from hanging the whole loop.
    response_nearbySearch = requests.get(NEARBY_SEARCH_URL, params=query_params, timeout=30)
    # converting to json (type:dict)
    response_nearbySearchJSON = response_nearbySearch.json()
    status = response_nearbySearchJSON.get('status')
    results = response_nearbySearchJSON.get('results') or []

    if status == 'OK' and results:
        # Only the first result is compared against the database record.
        best_match = results[0]
        dictObj['obt_name'] = best_match['name']

        # 'vicinity' is split at its last comma into address and city; a value
        # without a comma is kept whole as the address instead of raising.
        vicinity = best_match.get('vicinity', '')
        if ',' in vicinity:
            obt_add, obt_ct = (part.strip() for part in vicinity.rsplit(',', 1))
        else:
            obt_add, obt_ct = vicinity.strip(), None
        dictObj['obt_address'] = obt_add
        dictObj['obt_city'] = obt_ct
        dictObj['obt_state'] = '...'

        obt_location = best_match['geometry']['location']
        obt_lat, obt_lng = obt_location['lat'], obt_location['lng']
        dictObj['obt_latitude'] = obt_lat
        dictObj['obt_longitude'] = obt_lng

        # Geodesic distance between the stored and the obtained coordinates.
        # Calling geodesic directly yields a plain float, with no 1-element array to convert.
        distance_km = geodesic((float(business_latitude), float(business_longitude)),
                               (obt_lat, obt_lng)).kilometers
        dictObj['diff_distance_in_meters'] = distance_km * 1000.0

        # Store the API's place types under 'obt_categories' so that the database
        # value in 'categories' is no longer overwritten.
        dictObj['obt_categories'] = best_match.get('types')
        dictObj['obt_is_open'] = 'Not obtained'
        dictObj['obt_hours'] = 'Not obtained'
    else:
        dictObj['obt_name'] = None
        dictObj['obt_address'] = None
        dictObj['obt_city'] = None
        dictObj['obt_state'] = None
        dictObj['obt_latitude'] = np.nan
        dictObj['obt_longitude'] = np.nan
        dictObj['diff_distance_in_meters'] = np.nan
        dictObj['obt_categories'] = None
        dictObj['obt_is_open'] = None
        dictObj['obt_hours'] = None
    print("Status: ", status)
    listOfDicts.append(dictObj)
    counter += 1
    end = time.time()
    print("Record num. {}".format(counter))
    print("Execution time: ", end - start, " seconds.")
print("Successfully appended {} dictionaries".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Status:  OK
Record num. 1
Execution time:  0.7021129131317139  seconds.
Status:  OK
Record num. 2
Execution time:  1.4191508293151855  seconds.
Status:  OK
Record num. 3
Execution time:  2.3403608798980713  seconds.
Status:  OK
Record num. 4
Execution time:  2.9542388916015625  seconds.
Status:  OK
Record num. 5
Execution time:  3.8762879371643066  seconds.
Status:  OK
Record num. 6
Execution time:  4.491046905517578  seconds.
Status:  OK
Record num. 7
Execution time:  5.2076098918914795  seconds.
Status:  OK
Record num. 8
Execution time:  6.129266023635864  seconds.
Status:  OK
Record num. 9
Execution time:  6.846045970916748  seconds.
Status:  OK
Record num. 10
Execution time:  7.684054851531982  seconds.
Status:  OK
Record num. 11
Execution time:  8.284544944763184  seconds.
Status:  OK
Record num. 12
Execution time:  8.893998861312866  seconds.
Status:  OK
Record num. 13
Execution time:  9.713221788406372  seconds.
Status:  OK
Record num. 14
Execution time: 

In [83]:
# Build a DataFrame with one row per dictionary in listOfDicts.
df_dataset = pd.DataFrame(listOfDicts)

In [86]:
# Preview the first rows, with each stored column next to its 'obt_' counterpart.
comparison_cols = [
    'name', 'obt_name',
    'address', 'obt_address',
    'city', 'obt_city',
    'latitude', 'longitude',
    'obt_latitude', 'obt_longitude',
    'diff_distance_in_meters',
]
df_dataset[comparison_cols].head()

Unnamed: 0,name,obt_name,address,obt_address,city,obt_city,latitude,longitude,obt_latitude,obt_longitude,diff_distance_in_meters
0,Bond Street Wines,Bond Street Wines,605 Providence Rd,605 Providence Rd,Charlotte,Charlotte,35.201695,-80.824522,35.201689,-80.824522,0.6989489
1,Mazda of South Charlotte,Mazda of South Charlotte,10515 Cadillac St,10515 Cadillac St,Pineville,Pineville,35.097067,-80.882105,35.096992,-80.881727,35.50711
2,Best Western Ville-Marie Montreal Hotel & Suites,Best Western Ville-Marie Montreal Hotel & Suites,3407 Peel St,3407 Peel St,Montreal,Montreal,45.502181,-73.576738,45.502181,-73.576738,1.11065e-09
3,Revolution Fitness Evolved,Revolution Fitness Evolved,"3065 E Patrick Ln, Ste 2",3065 E Patrick Ln,Las Vegas,Las Vegas,36.07861,-115.108479,36.078136,-115.108525,52.68739
4,Starbucks,Starbucks,300 Front Street W,300 Front St W #1,Toronto,Toronto,43.644103,-79.389446,43.643979,-79.389468,13.88991
