In [1]:
import json
import pandas as pd
import numpy as np
import csv
import time

# converting business.json to business.csv
### importing from json file

In [3]:
start = time.time()
print("Executing the code ...\n")
listOfDicts_business = []
counter =0
with open('/Users/kemalm/Desktop/yelp_dataset/business.json',encoding='utf-8') as f:
    for line in f:
        listOfDicts_business.append(json.loads(line))
        counter+=1
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 192609 dictionaries.
Execution time:  3.6180918216705322  seconds.


In [4]:
!wc -l /Users/kemalm/Desktop/yelp_dataset/business.json

  192609 /Users/kemalm/Desktop/yelp_dataset/business.json


### naming a list of attributes

In [2]:
attr_arr = np.array(['business_id', 'name', 'address', 'city', 'state', 
                     'postal_code', 'latitude', 'longitude', 'stars', 
                     'review_count', 'is_open', 'attributes', 'categories', 'hours'])
print(attr_arr, type(attr_arr))

['business_id' 'name' 'address' 'city' 'state' 'postal_code' 'latitude'
 'longitude' 'stars' 'review_count' 'is_open' 'attributes' 'categories'
 'hours'] <class 'numpy.ndarray'>


<h3> checking if all json objects, we obtained from <i style="color:blue">business.json </i> file, actually contain all keys, that yelp dataset documentation claims they do </h3> 

In [6]:
df_containsfield= np.zeros((len(listOfDicts_business),len(attr_arr)))

for i in range(0,len(listOfDicts_business)):
    df_containsfield[i,:] = np.isin(attr_arr, np.array(list(listOfDicts_business[i].keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
tkeys_counter = np.zeros((attr_arr.shape[0],2), dtype=np.object)
tkeys_counter[:,0] = np.array(attr_arr)
tkeys_counter[:,1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

192609  number of records

Key associated with its frequency: 
 [['business_id' 192609.0]
 ['name' 192609.0]
 ['address' 192609.0]
 ['city' 192609.0]
 ['state' 192609.0]
 ['postal_code' 192609.0]
 ['latitude' 192609.0]
 ['longitude' 192609.0]
 ['stars' 192609.0]
 ['review_count' 192609.0]
 ['is_open' 192609.0]
 ['attributes' 192609.0]
 ['categories' 192609.0]
 ['hours' 192609.0]]


<h3> number of null values per column</h3> 

In [7]:
start = time.time()
for a in list(attr_arr):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/business.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

business_id
       0
name
       0
address
       0
city
       0
state
       0
postal_code
       0
latitude
       0
longitude
       0
stars
       0
review_count
       0
is_open
       0
attributes
   28836
categories
     482
hours
   44830


<h3> Creating a <i style="color:blue"> business.csv </i> file and writing data to it. </h3> 

In [6]:
class mydict(dict):
        def __str__(self):
            return json.dumps(self)

In [15]:
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/business.csv','w',encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file,list(attr_arr), delimiter='\t' )
    writer.writeheader()
    counter=0
    for dictObj in listOfDicts_business:
        tempDict=dict(dictObj)    
        if tempDict.get('attributes') is not None:
            tempDict['attributes'] = mydict(tempDict['attributes']).__str__()
        else:
            tempDict['attributes']="{}"
            
        if tempDict.get('hours') is not None:
            tempDict['hours'] = mydict(tempDict['hours']).__str__()
        else:
            tempDict['hours']="{}"
        writer.writerow(tempDict)
        counter+=1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 192609 rows
Execution time:  6.614404201507568  seconds.


# converting user.json to user.csv
### importing from json file

In [3]:
start = time.time()
print("Executing the code ...\n")
listOfDicts_user = []
with open('/Users/kemalm/Desktop/yelp_dataset/user.json',encoding='utf-8') as f:
    counter=0
    for line in f:
        listOfDicts_user.append(json.loads(line))
        counter+=1
endend  = time.time()
print("Successfully appended {} rows".format(counter))
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 1637138 rows
Execution time:  42.027015209198  seconds.


In [5]:
!wc -l /Users/kemalm/Desktop/yelp_dataset/user.json

 1637138 /Users/kemalm/Desktop/yelp_dataset/user.json


In [4]:
len(listOfDicts_user)

1637138

### naming a list of attributes

In [8]:
arr_user = np.array(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 
                     'funny', 'cool', 'elite', 'friends', 'fans', 
                     'average_stars', 'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute', 
                     'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool', 'compliment_funny', 
                     'compliment_writer', 'compliment_photos'])

<h3> checking if all json objects, we obtained from <i style="color:blue">user.json </i> file, actually contain all keys, that yelp dataset documentation claims they do </h3> 

In [9]:
start = time.time()
print("Executing the code ...\n")
df_containsfield= np.zeros((len(listOfDicts_user),len(arr_user)))

for i in range(0,len(listOfDicts_user)):
    df_containsfield[i,:] = np.isin(arr_user, np.array(list(listOfDicts_user[i].keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
tkeys_counter = np.zeros((arr_user.shape[0],2), dtype=np.object)
tkeys_counter[:,0] = np.array(arr_user)
tkeys_counter[:,1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end=time.time()
print("Execution time: ", end - start, " seconds.")


Executing the code ...

1637138  number of records

Key associated with its frequency: 
 [['user_id' 1637138.0]
 ['name' 1637138.0]
 ['review_count' 1637138.0]
 ['yelping_since' 1637138.0]
 ['useful' 1637138.0]
 ['funny' 1637138.0]
 ['cool' 1637138.0]
 ['elite' 1637138.0]
 ['friends' 1637138.0]
 ['fans' 1637138.0]
 ['average_stars' 1637138.0]
 ['compliment_hot' 1637138.0]
 ['compliment_more' 1637138.0]
 ['compliment_profile' 1637138.0]
 ['compliment_cute' 1637138.0]
 ['compliment_list' 1637138.0]
 ['compliment_note' 1637138.0]
 ['compliment_plain' 1637138.0]
 ['compliment_cool' 1637138.0]
 ['compliment_funny' 1637138.0]
 ['compliment_writer' 1637138.0]
 ['compliment_photos' 1637138.0]]
Execution time:  111.81420087814331  seconds.


<h3> number of null values per column</h3> 
<h5 style="color:red;"> WARNING! Following method works very slow for very large datasets (user.json). </h5> 
<h5 style="color:red;"> Therefore, it shouldn't be run more than once. </h5> 

In [9]:
start = time.time()
for a in list(arr_user):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/user.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

user_id
       0
name
       0
review_count
       0
yelping_since
       0
useful
       0
funny
       0
cool
       0
elite
       0
friends
       0
fans
       0
average_stars
       0
compliment_hot
       0
compliment_more
       0
compliment_profile
       0
compliment_cute
       0
compliment_list
       0
compliment_note
       0
compliment_plain
       0
compliment_cool
       0
compliment_funny
       0
compliment_writer
       0
compliment_photos
       0
Execution time:  963.2632689476013  seconds.


<h3> Creating a <i style="color:blue"> user.csv </i> file and writing data to it. </h3> 

In [24]:
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/user.csv','w',encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file,user_cols, delimiter='\t' )
    writer.writeheader()
    counter=0
    for dictObj in listOfDicts_user:
        writer.writerow(dictObj)
        counter+=1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 1637138 rows
Execution time:  75.70107102394104


In [47]:
!wc -l yelp_dataset/user.json
!echo "User.csv has one more row used as a header."
!wc -l yelp_dataset/user.csv


 1637138 yelp_dataset/user.json
User.csv has one more row used as a header.
 1637139 yelp_dataset/user.csv


# converting review.json to review.csv
### importing from json file

In [4]:
start = time.time()
print("Executing the code ...\n")
listOfDicts_review = []
with open('/Users/kemalm/Desktop/yelp_dataset/review.json',encoding='utf-8') as f:
    counter=0
    for line in f:
        listOfDicts_review.append(json.loads(line))
        counter+=1
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 6685900 dictionaries.
Execution time:  71.14869093894958  seconds.


In [6]:
!wc -l /Users/kemalm/Desktop/yelp_dataset/review.json

 6685900 /Users/kemalm/Desktop/yelp_dataset/review.json


### naming a list of attributes

In [11]:
arr_review= np.array(['review_id', 'user_id', 'business_id', 'stars', 'useful',
                      'funny', 'cool', 'text', 'date'])

<h3> checking if all json objects, we obtained from <i style="color:blue">review.json </i> file, actually contain all keys, that yelp dataset documentation claims they do </h3> 

In [10]:
start = time.time()
print("Executing the code ...\n")
df_containsfield= np.zeros((len(listOfDicts_review),len(arr_review)))

for i in range(0,len(listOfDicts_review)):
    df_containsfield[i,:] = np.isin(arr_review, np.array(list(listOfDicts_review[i].keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
tkeys_counter = np.zeros((arr_review.shape[0],2), dtype=np.object)
tkeys_counter[:,0] = np.array(arr_review)
tkeys_counter[:,1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end=time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

6685900  number of records

Key associated with its frequency: 
 [['review_id' 6685900.0]
 ['user_id' 6685900.0]
 ['business_id' 6685900.0]
 ['stars' 6685900.0]
 ['useful' 6685900.0]
 ['funny' 6685900.0]
 ['cool' 6685900.0]
 ['text' 6685900.0]
 ['date' 6685900.0]]
Execution time:  299.6732749938965  seconds.


<h3> number of null values per column</h3> 
<h5 style="color:red;"> WARNING! Following method works very slow for very large datasets (user.json). </h5> 
<h5 style="color:red;"> Therefore, it shouldn't be run more than once. </h5> 

In [13]:
start = time.time()
for a in list(arr_review):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/review.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

review_id
       0
user_id
       0
business_id
       0
stars
       0
useful
       0
funny
       0
cool
       0
text
       0
date
       0
Execution time:  913.4484198093414  seconds.


In [3]:
review_cols = list(arr_review)
print(review_cols)

['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']


<h3> Creating a <i style="color:blue"> review.csv </i> file and writing data to it. </h3> 

In [4]:
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/review.csv','w',encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file,review_cols, delimiter='\t' )
    writer.writeheader()
    counter=0
    for dictObj in listOfDicts_review:
        writer.writerow(dictObj)
        counter+=1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")



Executing the code ...

Successfully written 6685900 rows
Execution time:  232.80819010734558  seconds.


# converting checkin.json to checkin.csv
### importing from json file

In [7]:
start = time.time()
print("Executing the code ...\n")
listOfDicts_checkin = []
with open('/Users/kemalm/Desktop/yelp_dataset/checkin.json',encoding='utf-8') as f:
    counter=0
    for line in f:
        listOfDicts_checkin.append(json.loads(line))
        counter+=1
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 161950 dictionaries.
Execution time:  2.0320558547973633  seconds.


In [3]:
!wc -l /Users/kemalm/Desktop/yelp_dataset/checkin.json

  161950 /Users/kemalm/Desktop/yelp_dataset/checkin.json


### naming a list of attributes

In [15]:
arr_checkin = np.array(['business_id', 'date'])

<h3> checking if all json objects, we obtained from <i style="color:blue">checkin.json </i> file, actually contain all keys, that yelp dataset documentation claims they do </h3> 

In [12]:
start = time.time()
print("Executing the code ...\n")
df_containsfield= np.zeros((len(listOfDicts_checkin),len(arr_checkin)))

for i in range(0,len(listOfDicts_checkin)):
    df_containsfield[i,:] = np.isin(arr_checkin, np.array(list(listOfDicts_checkin[i].keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
tkeys_counter = np.zeros((arr_checkin.shape[0],2), dtype=np.object)
tkeys_counter[:,0] = np.array(arr_checkin)
tkeys_counter[:,1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end=time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

161950  number of records

Key associated with its frequency: 
 [['business_id' 161950.0]
 ['date' 161950.0]]
Execution time:  2.568455934524536  seconds.


<h3> number of null values per column</h3> 

In [16]:
start = time.time()
for a in list(arr_checkin):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/checkin.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

business_id
       0
date
       0
Execution time:  14.164305925369263  seconds.


In [13]:
checkin_cols = list(arr_checkin)
print(checkin_cols)

['business_id', 'date']


<h3> Creating a <i style="color:blue"> checkin.csv </i> file and writing data to it. </h3> 

In [6]:
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/checkin.csv','w',encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file,checkin_cols, delimiter='\t' )
    writer.writeheader()
    counter=0
    for dictObj in listOfDicts_checkin:
        writer.writerow(dictObj)
        counter+=1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 161950 rows
Execution time:  9.046382665634155  seconds.


In [8]:
!wc -l yelp_dataset/checkin.json
!wc -l yelp_dataset/checkin.csv


  161950 yelp_dataset/checkin.json
  161951 yelp_dataset/checkin.csv


# converting tip.json to tip.csv
### importing from json file

In [3]:
start = time.time()
print("Executing the code ...\n")
listOfDicts_tip = []
with open('/Users/kemalm/Desktop/yelp_dataset/tip.json',encoding='utf-8') as f:
    counter=0
    for line in f:
        listOfDicts_tip.append(json.loads(line))
        counter+=1
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 1223094 dictionaries.
Execution time:  6.717769145965576  seconds.


In [6]:
!wc -l /Users/kemalm/Desktop/yelp_dataset/tip.json

 1223094 /Users/kemalm/Desktop/yelp_dataset/tip.json


### naming a list of attributes

In [17]:
arr_tip = np.array(['user_id', 'business_id', 'text', 'date', 'compliment_count'])

<h3> checking if all json objects, we obtained from <i style="color:blue">tip.json </i> file, actually contain all keys, that yelp dataset documentation claims they do </h3> 

In [10]:
start = time.time()
print("Executing the code ...\n")
df_containsfield= np.zeros((len(listOfDicts_tip),len(arr_tip)))

for i in range(0,len(listOfDicts_tip)):
    df_containsfield[i,:] = np.isin(arr_tip, np.array(list(listOfDicts_tip[i].keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
tkeys_counter = np.zeros((arr_tip.shape[0],2), dtype=np.object)
tkeys_counter[:,0] = np.array(arr_tip)
tkeys_counter[:,1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end=time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

1223094  number of records

Key associated with its frequency: 
 [['user_id' 1223094.0]
 ['business_id' 1223094.0]
 ['text' 1223094.0]
 ['date' 1223094.0]
 ['compliment_count' 1223094.0]]
Execution time:  30.554124116897583  seconds.


<h3> number of null values per column</h3> 

In [18]:
start = time.time()
for a in list(arr_tip):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/tip.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

user_id
       0
business_id
       0
text
       0
date
       0
compliment_count
       0
Execution time:  25.752610683441162  seconds.


In [9]:
tip_cols = list(arr_tip)
print(tip_cols)

['user_id', 'business_id', 'text', 'date', 'compliment_count']


<h3> Creating a <i style="color:blue"> tip.csv </i> file and writing data to it. </h3> 

In [16]:
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/tip.csv','w',encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file,tip_cols, delimiter='\t' )
    writer.writeheader()
    counter=0
    for dictObj in listOfDicts_tip:
        writer.writerow(dictObj)
        counter+=1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 1223094 rows
Execution time:  8.033058166503906  seconds.


# converting photo.json to photo.csv
### importing from json file

In [6]:
start = time.time()
print("Executing the code ...\n")
listOfDicts_photo = []
with open('/Users/kemalm/Desktop/yelp_dataset/photo.json',encoding='utf-8') as f:
    counter=0
    for line in f:
        listOfDicts_photo.append(json.loads(line))
        counter+=1
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 200000 dictionaries.
Execution time:  0.9235949516296387  seconds.


In [2]:
!wc -l /Users/kemalm/Desktop/yelp_dataset/photo.json

  200000 /Users/kemalm/Desktop/yelp_dataset/photo.json


### naming a list of attributes

In [20]:
arr_photo = np.array(['caption', 'photo_id', 'business_id', 'label'])

<h3> checking if all json objects, we obtained from <i style="color:blue">photo.json </i> file, actually contain all keys, that yelp dataset documentation claims they do </h3> 

In [10]:
start = time.time()
print("Executing the code ...\n")
df_containsfield= np.zeros((len(listOfDicts_photo),len(arr_photo)))

for i in range(0,len(listOfDicts_photo)):
    df_containsfield[i,:] = np.isin(arr_photo, np.array(list(listOfDicts_photo[i].keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
tkeys_counter = np.zeros((arr_photo.shape[0],2), dtype=np.object)
tkeys_counter[:,0] = np.array(arr_photo)
tkeys_counter[:,1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end=time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

200000  number of records

Key associated with its frequency: 
 [['caption' 200000.0]
 ['photo_id' 200000.0]
 ['business_id' 200000.0]
 ['label' 200000.0]]
Execution time:  4.147678852081299  seconds.


<h3> number of null values per column</h3> 

In [21]:
start = time.time()
for a in list(arr_photo):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/photo.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

caption
       0
photo_id
       0
business_id
       0
label
       0
Execution time:  3.233721971511841  seconds.


In [9]:
photo_cols = list(arr_photo)
print(photo_cols)

['caption', 'photo_id', 'business_id', 'label']


<h3> Creating a <i style="color:blue"> photo.csv </i> file and writing data to it. </h3> 

In [13]:
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/photo.csv','w',encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file,photo_cols, delimiter='\t' )
    writer.writeheader()
    counter=0
    for dictObj in listOfDicts_photo:
        writer.writerow(dictObj)
        counter+=1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...



NameError: name 'csv' is not defined

# Python-to-PostgreSQL client

In [1]:
import psycopg2
import pandas as pd

In [2]:
try:
    conn=psycopg2.connect("dbname='yelpDB' user='postgres' host='localhost' password='P0$tgre$QL'")
except:
    print("I am unable to connect to the database")

In [194]:
cur = conn.cursor()
#cur.execute("""select city,state, count(business_id)
#                from Businesses
#                where is_open = 1
#                group by city,state
#                order by 3 desc
#                limit 10""")
#recordsDB = cur.fetchall()

In [3]:
cur = conn.cursor()
cur.execute("""select business_id, name, address, city, state, latitude, longitude, categories, is_open
                from Businesses""")
recordsDB = cur.fetchall()

# Frequency

### US/Canada states/provinces by number of businesses

In [196]:
cur.execute("""SELECT state,  COUNT(business_id) AS Total_Count
               FROM Businesses
               WHERE is_open = 1
               GROUP BY state 
               ORDER BY 2 DESC""")
freqrecords = cur.fetchall()

In [197]:
pd.DataFrame(freqrecords, columns=['State','Total # of businesses']).head(10)

Unnamed: 0,State,Total # of businesses
0,AZ,46910
1,NV,29562
2,ON,26525
3,OH,12546
4,NC,12419
5,PA,9430
6,QC,7623
7,AB,6694
8,WI,4210
9,IL,1545


### Cities by number of businesses

In [17]:
cur.execute("""select city,state, count(business_id)
               from Businesses
               where is_open = 1
               group by city,state
               order by 3 desc""")
freqrecords = cur.fetchall()

In [18]:
pd.DataFrame(freqrecords, columns=['City','State','Total # of businesses']).head(10)

Unnamed: 0,City,State,Total # of businesses
0,Las Vegas,NV,23784
1,Phoenix,AZ,15471
2,Toronto,ON,14329
3,Charlotte,NC,7945
4,Scottsdale,AZ,7081
5,Calgary,AB,6445
6,Pittsburgh,PA,5736
7,Montréal,QC,5163
8,Mesa,AZ,5149
9,Henderson,NV,4026


# Google Places: Sending HTTP Requests 

In [6]:
import time
import requests
from scipy.spatial.distance import pdist
from geopy.distance import geodesic
import numpy as np
import random as rn
import json

In [7]:
key_content= !cat /Users/kemalm/Desktop/gmAPI.txt
api_key = key_content[0]

In [8]:
fields = ['business_id','name', 'address', 'city', 'state', 'latitude', 'longitude', 'categories', 'is_open', 'hours']
#k = np.core.defchararray.add(np.array(['obt_']),np.array(fields))

#print(k)
indices =[x for x in range(0,len(fields))]
#print(indices)
mapDictIndexes = dict(zip(fields,indices))
print(mapDictIndexes)

{'business_id': 0, 'name': 1, 'address': 2, 'city': 3, 'state': 4, 'latitude': 5, 'longitude': 6, 'categories': 7, 'is_open': 8, 'hours': 9}


In [19]:
coordinates = np.array([[0,0],
                        [ 0, 180]])# Using the geodesic distance function.
m_dist = pdist(coordinates, # Coordinates matrix or tuples list
               lambda u, v: geodesic(u, v).kilometers)


In [10]:
smpl_lat,smpl_lng =  33.5221294, -112.0181866
url_api_place_nearbySearch ="https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={},{}&radius=500&keyword={}&key={}".format(
smpl_lat,smpl_lng,'Arizona Biltmore Golf Club', api_key)
response_api_place_nearbySearch =requests.get(url_api_place_nearbySearch)

In [12]:
#print(json.dumps(response_api_place_nearbySearch.json()))

In [11]:
smpl_lat,smpl_lng =  33.626171,-111.915779
url_api_place_nearbySearch ="https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={},{}&radius=500&keyword={}&key={}".format(
smpl_lat,smpl_lng,'Precision Door Service', api_key)
response_api_place_nearbySearch =requests.get(url_api_place_nearbySearch)

In [109]:
smpl_address = '19420 N 59th Ave, Ste 13'.replace(' ','+')
url_geocoding = "https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}".format(smpl_address,api_key)
response_geocoding = requests.get(url_geocoding)
response_geocodingJSON= response_geocoding.json()
#print(json.dumps(response_geocodingJSON))

#frm_address = response_geocodingJSON['results'][0]['formatted_address']
#frm_address.rsplit(',',4)

In [8]:
smpl_address = '19420 N 59th Ave, Ste 13'.replace(' ','+')
url_geocoding = "https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}".format(smpl_address,api_key)
response_geocoding = requests.get(url_geocoding)
response_geocodingJSON= response_geocoding.json()

# Geocoding

In [1]:
import psycopg2
import numpy as np
import pandas as pd
import time
import requests
from scipy.spatial.distance import pdist
from geopy.distance import geodesic
import random as rn
import json

try:
    conn=psycopg2.connect("dbname='yelpDB' user='postgres' host='localhost' password='P0$tgre$QL'")
except:
    print("I am unable to connect to the database")
cur = conn.cursor()
cur.execute("""select business_id, name, address, city, state, latitude, longitude, categories
                from Businesses""")
recordsDB = cur.fetchall()

In [5]:
len(recordsDB)

192609

In [2]:
key_content= !cat /Users/kemalm/Desktop/gmAPI.txt
api_key = key_content[0]
fields = ['business_id','name', 'address', 'city', 'state', 'latitude', 'longitude', 'categories']
#k = np.core.defchararray.add(np.array(['obt_']),np.array(fields))

#print(k)
indices =[x for x in range(0,len(fields))]
#print(indices)
mapDictIndexes = dict(zip(fields,indices))
print(mapDictIndexes)

{'business_id': 0, 'name': 1, 'address': 2, 'city': 3, 'state': 4, 'latitude': 5, 'longitude': 6, 'categories': 7}


In [4]:
def GeocodingRequestHandling(record):
    dictObj=dict()
    
    business_name = record[mapDictIndexes['name']] 
    business_address = record[mapDictIndexes['address']]
    business_latitude = record[mapDictIndexes['latitude']]
    business_longitude = record[mapDictIndexes['longitude']]
    
    dictObj['business_id'] = record[mapDictIndexes['business_id']] 
    dictObj['name']= business_name
    dictObj['address']= business_address
    dictObj['city']= record[mapDictIndexes['city']]
    dictObj['state']= record[mapDictIndexes['state']]
    dictObj['latitude']= business_latitude
    dictObj['longitude']= business_longitude
    dictObj['categories']= record[mapDictIndexes['categories']]
    #dictObj['is_open']= record[mapDictIndexes['is_open']]
     
    if(business_address !=''):
        url_geocoding ="https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}".format(
        business_address.replace(' ','+'),api_key)
        resp =requests.get(url_geocoding)
        resp_dict= resp.json()
        dictObj['geocoding_status'] = resp_dict['status']
        if(dictObj['geocoding_status']=='OK'):
            GeocodingFunc(dictObj,resp_dict)
    
    url_invgeocoding = "https://maps.googleapis.com/maps/api/geocode/json?latlng={},{}&key={}".format(
        business_latitude,business_longitude,api_key)
    resp =requests.get(url_invgeocoding)
    resp_dict= resp.json()
    dictObj['invgeocoding_status'] = resp_dict['status']
    if(dictObj['invgeocoding_status']=='OK'):
        InvGeocodingFunc(dictObj, resp_dict)
    
    return dictObj

In [5]:
def GeocodingFunc(dictObj,resp_dict):
    dictObj['latitude_from_address']= resp_dict['results'][0]['geometry']['location']['lat']
    dictObj['longitude_from_address']= resp_dict['results'][0]['geometry']['location']['lng']
    #dictObj['formatted_address_from_address'] = response_placeSearchJSON['results'][0]['formatted_address']
    coordinates=np.array([[dictObj['latitude'], dictObj['longitude']],
                          [dictObj['latitude_from_address'],dictObj['longitude_from_address']]])    
    m_dist = pdist(coordinates, # Coordinates matrix or tuples list
           lambda u, v: geodesic(u, v).kilometers)
    dictObj['dist_diff']= float(m_dist)* 1000.0
    dictObj['place_id_from_address']= resp_dict['results'][0]['place_id']

In [6]:
def InvGeocodingFunc(dictObj,resp_dict):
    dictObj['formatted_address_from_coord'] = resp_dict['results'][0]['formatted_address']    
    dictObj['address_components_from_coord'] = resp_dict['results'][0]['address_components']  
    dictObj['place_id_from_coord']= resp_dict['results'][0]['place_id']    

In [7]:
import requests
import multiprocessing
import time as time

start = time.time()
counter = 0
sample = rn.sample(recordsDB,3000)
listOfDicts=[]
for i in range(0,len(sample),100):
    with multiprocessing.Pool( processes=multiprocessing.cpu_count()) as pool:
        listOfDicts+=pool.map(GeocodingRequestHandling, sample[i:i+100])
    counter+=100   
    print("Appended 100 rows. In total {}".format(counter))
    end = time.time()
    print("Execution time: ", end - start, " seconds.")

Appended 100 rows. In total 100
Execution time:  15.504426956176758  seconds.
Appended 100 rows. In total 200
Execution time:  30.028430938720703  seconds.
Appended 100 rows. In total 300
Execution time:  45.61023211479187  seconds.
Appended 100 rows. In total 400
Execution time:  61.22262406349182  seconds.
Appended 100 rows. In total 500
Execution time:  76.96387195587158  seconds.
Appended 100 rows. In total 600
Execution time:  92.58970808982849  seconds.
Appended 100 rows. In total 700
Execution time:  107.43603587150574  seconds.
Appended 100 rows. In total 800
Execution time:  120.93393421173096  seconds.
Appended 100 rows. In total 900
Execution time:  136.52965903282166  seconds.
Appended 100 rows. In total 1000
Execution time:  151.4632158279419  seconds.
Appended 100 rows. In total 1100
Execution time:  167.63475513458252  seconds.
Appended 100 rows. In total 1200
Execution time:  183.08225107192993  seconds.
Appended 100 rows. In total 1300
Execution time:  198.390354871749

In [18]:
len(listOfDicts)

3000

In [25]:
frm_listOfDicts = []
counter = 0
for el in listOfDicts:
    dictObj = dict(el)
    street_name, route, city, state,country = "","","","",""
    for sub_el in el['address_components_from_coord']:
        if('street_number' in sub_el['types']):
            street_name = sub_el['long_name']
        if('route' in sub_el['types']):
            route = sub_el['long_name']
        if('locality' in sub_el['types'] ):
            city = sub_el['long_name']
        if('administrative_area_level_1' in sub_el['types'] ):
            state =  sub_el['short_name']
        if('country' in sub_el['types'] ):
            country =  sub_el['short_name']
    if(street_name != "" and route != ""):
        dictObj['address_from_coord'] = street_name+' '+route
    else:
        dictObj['address_from_coord'] = street_name+route      
    dictObj['city_from_coord'] = city
    dictObj['state_from_coord'] = state
    dictObj['country_from_coord'] = country     
    frm_listOfDicts.append(dictObj)
    counter+=1
print("Appended {} dictionaries".format(counter))    

Appended 3000 dictionaries


In [26]:
df_sampleGEO = pd.DataFrame(frm_listOfDicts)

In [27]:
df_sampleGEO.shape

(3000, 21)

In [129]:
#Ignoring address components
cols_GEO = [ 'business_id',
            'name',
            'address',
            'city',
            'state',
            'address_from_coord',
            'city_from_coord',
            'state_from_coord',
            'country_from_coord', 
            'formatted_address_from_coord',      
            'latitude',
            'latitude_from_address',
            'longitude',
            'longitude_from_address',
            'dist_diff',   
            'categories',
            'place_id_from_address',
            'place_id_from_coord',
            'geocoding_status',
            'invgeocoding_status',
]

In [130]:
len(cols_GEO)

20

In [35]:
df_sampleGEO[cols_GEO].to_csv(path_or_buf='/Users/kemalm/Desktop/Google Maps Data Matching/geocodingSampleFormatted.csv',index=False)

In [3]:
df_test = pd.read_csv('/Users/kemalm/Desktop/Google Maps Data Matching/geocodingSampleFormatted.csv')
df_test.loc[ df_test['address'].isnull(), 'address'] = ''
df_test.loc[ df_test['address_from_coord'].isnull(), 'address_from_coord'] = ''
df_test.loc[ df_test['city_from_coord'].isnull(), 'city_from_coord'] = ''

In [4]:
df_test.count()

business_id                     3000
name                            3000
address                         3000
city                            3000
state                           3000
geocoding_status                2894
latitude                        3000
latitude_from_address           2869
longitude                       3000
longitude_from_address          2869
dist_diff                       2869
invgeocoding_status             3000
address_from_coord              3000
city_from_coord                 3000
state_from_coord                3000
country_from_coord              3000
formatted_address_from_coord    3000
categories                      2993
place_id_from_address           2869
place_id_from_coord             3000
dtype: int64

In [5]:
#df_sampleGEO[cols_GEO].count() == df_test.count()

In [6]:
df_test[~df_test['dist_diff'].isnull()].sort_values(by=['dist_diff']).head(5)

Unnamed: 0,business_id,name,address,city,state,geocoding_status,latitude,latitude_from_address,longitude,longitude_from_address,dist_diff,invgeocoding_status,address_from_coord,city_from_coord,state_from_coord,country_from_coord,formatted_address_from_coord,categories,place_id_from_address,place_id_from_coord
0,9AA2fmkKTBB53k5NjHUZEA,EnergyXChange,698 Queen Street E,Toronto,ON,OK,43.658897,43.658897,-79.350264,-79.350264,0.0,OK,,Toronto,ON,CA,"698 Queen Street East, Floor 2nd, Toronto, ON ...","Chiropractors, Active Life, Fitness & Instruct...",ChIJlV4sRGzL1IkRnJRrAlUIDLU,ChIJUyNWRGzL1IkR0EP41bIbIho
1486,cydzZL5xw01hmSY1hU-sRQ,Hookah Hotspot,"2600 Central Ave, Unit H",Charlotte,NC,OK,35.218815,35.218815,-80.798997,-80.798997,0.0,OK,2600 Central Avenue,Charlotte,NC,US,"2600 Central Ave, Charlotte, NC 28205, USA","Hookah Bars, Bars, Restaurants, Cafes, Nightli...",EjJVbml0IEgsIDI2MDAgQ2VudHJhbCBBdmUsIENoYXJsb3...,ChIJ6ZBfffcfVIgR-0sZIeJtvlk
1480,eFoKqhHf-jxwKVB69iEbrw,Punto Gelato,1070 Street Clair Avenue W,Toronto,ON,OK,43.678958,43.678958,-79.439269,-79.439269,0.0,OK,1070 Saint Clair Avenue West,Toronto,ON,CA,"1070 St Clair Ave W, Toronto, ON M6E 1A5, Canada","Gelato, Food",ChIJ5eL_Jnc0K4gR3cXyR6IwVlo,ChIJ5eL_Jnc0K4gR3cXyR6IwVlo
1476,beBdNA6cEuDLvrhTi_EjKQ,McDonald's,501 N Mattis,Champaign,IL,OK,40.120572,40.120572,-88.277423,-88.277423,0.0,OK,501 North Mattis Avenue,Champaign,IL,US,"501 N Mattis Ave, Champaign, IL 61821, USA","Fast Food, Restaurants, Burgers, Coffee & Tea,...",ChIJRwYlpI_QDIgRz5FPWhH93C0,ChIJRwYlpI_QDIgRz5FPWhH93C0
1474,qBRlDFifJORikBqTWkt5DA,Red Brick Pizza,3765 S Gilbert Rd,Gilbert,AZ,OK,33.281721,33.281721,-111.788975,-111.788975,0.0,OK,85249 Crossroads Towne Center,Chandler,AZ,US,"85249 Phoenix-Wickenburg Hwy, Chandler, AZ 852...","Pizza, Restaurants",ChIJW115XQqrK4cR1foUrf3og70,ChIJudUVXQqrK4cRIVosatoqyBs


In [7]:
print("""Min value for calc_distance: {} meters
Max value for calc_distance: {} meters
Mean value for calc_distance: {} meters
Median for calc_distance: {} meters""".format(
                            df_test['dist_diff'].min(), 
                            df_test['dist_diff'].max(), 
                            df_test['dist_diff'].mean(),
                            df_test['dist_diff'].median(),


))

Min value for calc_distance: 0.0 meters
Max value for calc_distance: 6891676.510079709 meters
Mean value for calc_distance: 110513.04258306714 meters
Median for calc_distance: 6.524573901023411 meters


In [8]:
df_test.dist_diff.describe()

count    2.869000e+03
mean     1.105130e+05
std      5.221751e+05
min      0.000000e+00
25%      0.000000e+00
50%      6.524574e+00
75%      4.917491e+01
max      6.891677e+06
Name: dist_diff, dtype: float64

In [9]:
ranges = [0,15]
temp_var = 15
while temp_var <= int(df_test['dist_diff'].max()):
    temp_var *=2
    ranges.append(temp_var) 
print(int(df_test['dist_diff'].max()))
print(ranges)
print(len(ranges))
k = df_test[~df_test['dist_diff'].isnull()]

6891676
[0, 15, 30, 60, 120, 240, 480, 960, 1920, 3840, 7680, 15360, 30720, 61440, 122880, 245760, 491520, 983040, 1966080, 3932160, 7864320]
21


In [10]:
k.groupby(pd.cut(
                k['dist_diff'], 
                ranges)).count()['business_id']

dist_diff
(0, 15]               982
(15, 30]              243
(30, 60]              249
(60, 120]             179
(120, 240]            126
(240, 480]             49
(480, 960]             13
(960, 1920]            19
(1920, 3840]           11
(3840, 7680]           15
(7680, 15360]          21
(15360, 30720]         15
(30720, 61440]          9
(61440, 122880]         1
(122880, 245760]        6
(245760, 491520]       18
(491520, 983040]       59
(983040, 1966080]      42
(1966080, 3932160]     63
(3932160, 7864320]      4
Name: business_id, dtype: int64

In [15]:
QUANTILES = df_test.dist_diff.quantile([0.25,0.5,0.75])
Q1, Q3 = QUANTILES[0.25], QUANTILES[0.75]
print(Q1,Q3)
IQR = Q3 - Q1

0.0 49.17490654771142


In [16]:
np.array(np.bitwise_or(df_test.dist_diff <= Q1-IQR*1.5, df_test.dist_diff  >= Q3+IQR*1.5 )).astype(np.int64).sum()


467

In [17]:
df_test[np.bitwise_or(df_test.dist_diff <= Q1-IQR*1.5, df_test.dist_diff  >= Q3+IQR*1.5 )].sort_values(by=['dist_diff'])

Unnamed: 0,business_id,name,address,city,state,geocoding_status,latitude,latitude_from_address,longitude,longitude_from_address,dist_diff,invgeocoding_status,address_from_coord,city_from_coord,state_from_coord,country_from_coord,formatted_address_from_coord,categories,place_id_from_address,place_id_from_coord
1271,TpO8XDs9v19OwBsPUcPlFQ,Birdtown Crossfit,"13000 Athens Ave, Ste 300-A",Lakewood,OH,OK,41.472569,41.473677,-81.780698,-81.780667,1.230063e+02,OK,13001 Athens Avenue,Lakewood,OH,US,"13001 Athens Ave, Lakewood, OH 44107, USA","Active Life, Trainers, Gyms, Fitness & Instruc...",EjRTdGUgMzAwLUEsIDEzMDAwIEF0aGVucyBBdmUsIExha2...,ChIJVzc8IwLyMIgR7p_omu4BAvw
340,LVQg38jxGhhBSGRwyw__nA,Lululemon Athletica,"3500 S Las Vegas Blvd, Unit F01A",Las Vegas,NV,OK,36.118889,36.119132,-115.173517,-115.174850,1.230337e+02,OK,3500 South Las Vegas Boulevard,Las Vegas,NV,US,"3500 S Las Vegas Blvd, Las Vegas, NV 89109, USA","Shopping, Women's Clothing, Men's Clothing, Sp...",ChIJo057qDzEyIAR7dbaMDGDxCQ,ChIJK7HbJiPEyIAR1wHMB7HXmp8
2335,LVjo-nTZt7FoOMBJfiKs3A,Starbucks,"230 W. Huron Rd, K18",Cleveland,OH,OK,41.498278,41.497166,-81.693849,-81.694064,1.248164e+02,OK,230 West Huron Road,Cleveland,OH,US,"230 W Huron Rd, Cleveland, OH 44113, USA","Restaurants, Coffee & Tea, Food, Fast Food",Ei1LMTgsIDIzMCBXIEh1cm9uIFJkLCBDbGV2ZWxhbmQsIE...,ChIJ25px3IHwMIgRtC7G8ipzyB4
1694,UlAGWuc1iQpLccjZhTqo6Q,Teuscher Chocolates,3663 Las Vegas Blvd S,Las Vegas,NV,OK,36.110732,36.109604,-115.172236,-115.172308,1.253831e+02,OK,3687 South Las Vegas Boulevard,Las Vegas,NV,US,"3687 S Las Vegas Blvd, Las Vegas, NV 89109, USA","Chocolatiers & Shops, Food, Candy Stores, Spec...",ChIJE8LdFTHEyIARjDq06fQW05o,ChIJn3y63zDEyIARPH0NoRqxVyc
1444,DtWpR5z1WYTcuDXr1_9N_Q,Chi Foot Spa,"2040 S Alma School Rd, Ste 24",Chandler,AZ,OK,33.274859,33.274685,-111.860705,-111.859374,1.255138e+02,OK,2040 South Alma School Road,Chandler,AZ,US,"2040 S Alma School Rd, Chandler, AZ 85286, USA","Massage Therapy, Reflexology, Massage, Day Spa...",EjIyNCwgMjA0MCBTIEFsbWEgU2Nob29sIFJkLCBDaGFuZG...,ChIJTa2BgoEAK4cR9zI4gpcMRAQ
688,Yw5OEZP1MHIkRNBG_gXupg,Mathnasium of Arcadia,"4730 E Indian School Rd, Ste 107",Phoenix,AZ,OK,33.495917,33.494974,-111.979264,-111.980020,1.259540e+02,OK,4730 East Indian School Road,Phoenix,AZ,US,"4730 E Indian School Rd, Phoenix, AZ 85018, USA","Education, Tutoring Centers",ChIJ4XPW7lwMK4cRKT4d6PqiRUY,ChIJJ8So_1wMK4cR4QHLOsMpug8
1376,08n9bygXHb837Fm74Bvqxw,The Pump House,575 Herrons Ferry Rd,Rock Hill,SC,OK,34.983887,34.984923,-80.973208,-80.973789,1.266224e+02,OK,576 Herrons Ferry Road,Rock Hill,SC,US,"576 Herrons Ferry Road, Rock Hill, SC 29730, USA","Nightlife, Southern, American (New), Bars, Res...",ChIJYyMese2FVogRTLtazbCVO18,ChIJS3U-LeyFVogRBrC33EF8IP8
2680,mfnflEg0UTbnPQThgu8MTw,Woody's Your Way Yogurt,20330 N Cave Creek Rd,Phoenix,AZ,OK,33.670389,33.670797,-112.031015,-112.032295,1.269939e+02,OK,20235 North Cave Creek Road,Phoenix,AZ,US,"20235 N Cave Creek Rd, Phoenix, AZ 85024, USA","Ice Cream & Frozen Yogurt, Food",ChIJeRw_cTFwK4cRzuQqVuvlTCw,ChIJq6rqSzFwK4cRVtzGD-jWa1E
2280,_D3_0nNkk0y1mt2RhO65mQ,Earnhardt's Chrysler Jeep,577 E Baseline Rd,Tempe,AZ,OK,33.377508,33.378042,-111.933108,-111.934315,1.269978e+02,OK,,Tempe,AZ,US,"577 E Baseline Rd, at San Marquis, Tempe, AZ 8...","Auto Repair, Car Dealers, Automotive",ChIJ7Tt3HRkGK4cRkpahYkuDIhQ,ChIJARlObBkGK4cRnIzmnvWTwUQ
2341,3Gihlnww_C1bao7zKkj2gA,Poppa Maize,"15414 N 19th Ave, Ste P",Phoenix,AZ,OK,33.626933,33.625903,-112.100731,-112.100127,1.272334e+02,OK,1921 West Waltann Lane,Phoenix,AZ,US,"1921 W Waltann Ln, Phoenix, AZ 85023, USA","Specialty Food, Food",Ei9TdGUgUCwgMTU0MTQgTiAxOXRoIEF2ZSwgUGhvZW5peC...,ChIJbatYjotuK4cR9AIcvrUpluM


In [18]:
df_test.country_from_coord.unique()

array(['CA', 'US'], dtype=object)

In [19]:
df_test.state_from_coord.unique()

array(['ON', 'QC', 'AZ', 'NC', 'OH', 'AB', 'NV', 'PA', 'WI', 'IL', 'SC',
       'NY'], dtype=object)

In [20]:
print("""Address (null count): {}
Address (empty count): {}
City (null count): {}
City (empty count): {}
State (null count): {}
State (empty count): {}
--------------------------------------------
Address from coordinates (null count): {}
Address from coordinates (empty count): {}
City from coordinates (null count): {}
City from coordinates (empty count): {}
State from coordinates (null count): {}
State from coordinates (empty count): {}
""".format(
df_test [df_test.address.isnull()]['business_id'].count(),
df_test [df_test.address == '']['business_id'].count(),      
df_test [df_test.city.isnull()]['business_id'].count(),
df_test [df_test.city == '']['business_id'].count(),     
df_test [df_test.state.isnull()]['business_id'].count(),
df_test [df_test.state == '']['business_id'].count(),     

df_test [df_test.address_from_coord.isnull()]['business_id'].count(),
df_test [df_test.address_from_coord == '']['business_id'].count(),      
df_test [df_test.city_from_coord.isnull()]['business_id'].count(),
df_test [df_test.city_from_coord == '']['business_id'].count(),     
df_test [df_test.state_from_coord.isnull()]['business_id'].count(),
df_test [df_test.state_from_coord == '']['business_id'].count()   
     ))

Address (null count): 0
Address (empty count): 106
City (null count): 0
City (empty count): 0
State (null count): 0
State (empty count): 0
--------------------------------------------
Address from coordinates (null count): 0
Address from coordinates (empty count): 111
City from coordinates (null count): 0
City from coordinates (empty count): 6
State from coordinates (null count): 0
State from coordinates (empty count): 0



## Matching cities and states

In [11]:
df_fltStates = pd.DataFrame (df_test[np.bitwise_and(df_test.state != '', df_test.state_from_coord != '')][['business_id','state','state_from_coord']])

In [12]:
df_fltStates.shape

(3000, 3)

In [13]:
df_fltCities= pd.DataFrame (df_test[np.bitwise_and(df_test.city != '', df_test.city_from_coord != '')][['business_id','city','city_from_coord']])

In [14]:
df_fltCities.shape

(2994, 3)

In [15]:
print("""Cities match: {} out of {}
States match: {} out of {}
Filtered out null and empty values.""" .format(
df_fltCities[df_fltCities.city == df_fltCities.city_from_coord ].business_id.count(), df_fltCities.shape[0],
df_fltStates[df_fltStates.state == df_fltStates.state_from_coord ].business_id.count(), df_fltStates.shape[0]
))

Cities match: 2791 out of 2994
States match: 3000 out of 3000
Filtered out null and empty values.


In [16]:
df_fltAddresses= pd.DataFrame(df_test[np.bitwise_and(df_test.address != '', df_test.address_from_coord != '')][['business_id','address','address_from_coord']])

In [17]:
from fuzzywuzzy import fuzz
scores = np.zeros((df_fltAddresses.shape[0],1))

In [18]:
scores.shape

(2805, 1)

In [19]:
for idx, row in enumerate(df_fltAddresses.itertuples(index=False)):
    addr=row.__getattribute__('address')
    addr_c = row.__getattribute__('address_from_coord')
    score = fuzz.token_set_ratio(addr,addr_c)
    scores[idx,0] = score
df_fltAddresses['addr_score']= scores

In [20]:
df_fltAddresses.sort_values(by=['addr_score'],ascending=False).head(10)

Unnamed: 0,business_id,address,address_from_coord,addr_score
482,fWwZ-uAZQGhSo7VE9snDxw,"6435 Dixie Road, Unit 1",6435 Dixie Road,100.0
231,Q35IklisnaQP1wKXTUOJ2Q,7710 South Autoplex Loop,7710 South Autoplex Loop,100.0
1806,nYXZqFAnZ51_5SJvy9lx2w,100 McCaul Street,100 McCaul Street,100.0
1826,BmF7g1ygaBuad-iAlQeSFw,"Etobicoke General Hospital, 101 Humber College...",101 Humber College Boulevard,100.0
1838,rGTCx9RWqBQKOD9fn0Bk7A,3200 Rue Jean-Yves,3200 Rue Jean-Yves,100.0
1847,G0J88j2hHXSKmy1nMIZcDA,477 Roncesvalles Avenue,477 Roncesvalles Avenue,100.0
224,IqxhMxYSQTg8BytCkaGczA,8050 Chamilly Rue,8050 Rue de Chamilly,100.0
1850,b6IjUr2azLh0AsWDjus4zg,141 Berkeley Street,141 Berkeley Street,100.0
222,TaR4FZDEIyk9w2xSeRQo5Q,698 6E Avenue,698 6e Avenue,100.0
1852,_MOZSg1ajEvkubHXM3aDFg,585 College Street,585 College Street,100.0


In [21]:
df_with_scores= pd.DataFrame(pd.merge(df_test, df_fltAddresses[['business_id','addr_score']], on='business_id' ,how='left'))

In [22]:
df_with_scores.shape

(3000, 21)

In [33]:
df_with_scores.sort_values(by=['addr_score'], ascending=False)[ [ 'business_id',
            'name',
            'address',
            'city',
            'state',
            'address_from_coord',
            'addr_score',
            'city_from_coord',
            'state_from_coord',
            'country_from_coord', 
            'formatted_address_from_coord',      
            'latitude',
            'latitude_from_address',
            'longitude',
            'longitude_from_address',
            'dist_diff',   
            'categories',
            'place_id_from_address',
            'place_id_from_coord',
            'geocoding_status',
            'invgeocoding_status'
            ]].to_csv(path_or_buf='/Users/kemalm/Desktop/Google Maps Data Matching/geocodingSampleScores.csv',index=False)

In [44]:
df_with_scores[~df_with_scores.isnull()].count() # for validation

business_id                     3000
name                            3000
address                         3000
city                            3000
state                           3000
geocoding_status                2894
latitude                        3000
latitude_from_address           2869
longitude                       3000
longitude_from_address          2869
dist_diff                       2869
invgeocoding_status             3000
address_from_coord              3000
city_from_coord                 3000
state_from_coord                3000
country_from_coord              3000
formatted_address_from_coord    3000
categories                      2993
place_id_from_address           2869
place_id_from_coord             3000
addr_score                      2805
dtype: int64

### Make a split

In [48]:
testing_df = pd.read_csv('/Users/kemalm/Desktop/Google Maps Data Matching/geocodingSampleScores.csv')
testing_df.loc[ testing_df['address'].isnull(), 'address'] = ''
testing_df.loc[ testing_df['address_from_coord'].isnull(), 'address_from_coord'] = ''
testing_df.loc[ testing_df['city_from_coord'].isnull(), 'city_from_coord'] = ''


In [58]:
testing_df[~testing_df.isnull()].count()

business_id                     3000
name                            3000
address                         3000
city                            3000
state                           3000
address_from_coord              3000
addr_score                      2805
city_from_coord                 3000
state_from_coord                3000
country_from_coord              3000
formatted_address_from_coord    3000
latitude                        3000
latitude_from_address           2869
longitude                       3000
longitude_from_address          2869
dist_diff                       2869
categories                      2993
place_id_from_address           2869
place_id_from_coord             3000
geocoding_status                2894
invgeocoding_status             3000
dtype: int64

In [59]:
testing_df.shape

(3000, 21)

In [60]:
testing_df.iloc[1]

business_id                                                Q35IklisnaQP1wKXTUOJ2Q
name                                                                        Midas
address                                                  7710 South Autoplex Loop
city                                                                        Tempe
state                                                                          AZ
address_from_coord                                       7710 South Autoplex Loop
addr_score                                                                    100
city_from_coord                                                             Tempe
state_from_coord                                                               AZ
country_from_coord                                                             US
formatted_address_from_coord           7710 S Autoplex Loop, Tempe, AZ 85284, USA
latitude                                                                  33.3486
latitude_from_ad

# Testing Place Search

In [38]:
url_placeSearch ="https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input={}&inputtype=textquery&locationbias=circle:100@{},{}&key={}&fields=place_id,name,type,permanently_closed".format(
    'Meatball House'.replace(' ','%'),45.4884,-73.5682, api_key)
       
response_placeSearch =requests.get(url_placeSearch)
response_placeSearchJSON= response_placeSearch.json()

In [40]:
response_placeSearchJSON

{'candidates': [{'name': 'Meatball House',
   'place_id': 'ChIJLd5qjmQayUwRPy5fx6lhk2c',
   'types': ['restaurant', 'point_of_interest', 'food', 'establishment']}],
 'status': 'OK'}

In [37]:
url_placeDetails = 'https://maps.googleapis.com/maps/api/place/details/json?placeid={}&fields=name,types&key={}'.format(
'ChIJI2drjmQayUwRzU5g3kukboY',api_key)
resp = requests.get(url_placeDetails)
resp_dict = resp.json()
print(resp_dict)

{'html_attributions': [], 'result': {'name': '1752 Rue Notre-Dame Ouest', 'types': ['street_address']}, 'status': 'OK'}


# Obtaining info from Find Place (Places API)

In [24]:
testing_df.shape

(3000, 21)

In [25]:
flt_df_with_scores = pd.DataFrame(testing_df[np.bitwise_and(~testing_df.dist_diff.isnull(),~testing_df.addr_score.isnull())][testing_df.columns])

In [26]:
flt_df_with_scores.shape

(2781, 21)

In [27]:
flt_df_with_scores[(flt_df_with_scores.dist_diff <= 100) & (flt_df_with_scores.addr_score>=70) ][flt_df_with_scores.columns].shape

(2130, 21)

In [28]:
flt_list_of_dicts = flt_df_with_scores[(flt_df_with_scores.dist_diff <= 100) & (flt_df_with_scores.addr_score>=70) ][flt_df_with_scores.columns].to_dict('records')

In [29]:
len(flt_list_of_dicts), type(flt_list_of_dicts)

(2130, list)

In [30]:
def PlaceSearchHandlingRequest(record):
    dictObj=dict(record)
    
    b_name = record['name']
    b_lat = record['latitude']
    b_long = record['longitude']

    url_placeSearch ="https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input={}&inputtype=textquery&locationbias=circle:100@{},{}&key={}&fields=place_id,name,type".format(
    b_name.replace(' ','%'),b_lat,b_long , api_key)
       
    resp_ps =requests.get(url_placeSearch)
    resp_psDict= resp_ps.json()
       
    dictObj['placeSearch_status'] = resp_psDict['status']  # 1
    if(dictObj['placeSearch_status']=='OK'):
        ObtainData(dictObj,resp_psDict)            
    return dictObj

In [31]:
def ObtainData(dictObj, resp_psDict):
    dictObj['business_place_id'] = resp_psDict['candidates'][0]['place_id'] #2
    dictObj['name_from_location']= resp_psDict['candidates'][0]['name']  #3
    listTypes = resp_psDict['candidates'][0]['types']
    strTypes =", ".join(listTypes)
    dictObj['categories_from_location']= strTypes   #4

In [32]:
import requests
import multiprocessing
import time as time

start = time.time()
counter = 0

dicts_with_even_name_types=[]
for i in range(0,len(flt_list_of_dicts),30):
    with multiprocessing.Pool( processes=multiprocessing.cpu_count()) as pool:
        dicts_with_even_name_types+=pool.map(PlaceSearchHandlingRequest, flt_list_of_dicts[i:i+30])
    counter+=30   
    print("Appended 30 rows. In total {}".format(counter))
    end = time.time()
    print("Execution time: ", end - start, " seconds.")

Appended 30 rows. In total 30
Execution time:  4.082468032836914  seconds.
Appended 30 rows. In total 60
Execution time:  7.191589117050171  seconds.
Appended 30 rows. In total 90
Execution time:  10.234119176864624  seconds.
Appended 30 rows. In total 120
Execution time:  13.071210861206055  seconds.
Appended 30 rows. In total 150
Execution time:  15.689855098724365  seconds.
Appended 30 rows. In total 180
Execution time:  18.40503215789795  seconds.
Appended 30 rows. In total 210
Execution time:  21.947098970413208  seconds.
Appended 30 rows. In total 240
Execution time:  25.286725997924805  seconds.
Appended 30 rows. In total 270
Execution time:  28.119951009750366  seconds.
Appended 30 rows. In total 300
Execution time:  31.05910015106201  seconds.
Appended 30 rows. In total 330
Execution time:  34.19987392425537  seconds.
Appended 30 rows. In total 360
Execution time:  37.36716318130493  seconds.
Appended 30 rows. In total 390
Execution time:  40.29800200462341  seconds.
Appended 

In [33]:
len(dicts_with_even_name_types)

2130

In [39]:
savePlaceSearchForEveryCase = pd.DataFrame(dicts_with_even_name_types)[['business_id',
            'name',
            'name_from_location',
            'address',
            'city',
            'state',
            'address_from_coord',
            'addr_score',
            'city_from_coord',
            'state_from_coord',
            'country_from_coord', 
            'formatted_address_from_coord',      
            'latitude',
            'latitude_from_address',
            'longitude',
            'longitude_from_address',
            'dist_diff',   
            'categories',
            'categories_from_location',
            'place_id_from_address',
            'place_id_from_coord',
            'business_place_id',
            'geocoding_status',
            'invgeocoding_status',                                                                        
            'placeSearch_status'                                                    
]]

In [41]:
savePlaceSearchForEveryCase[~savePlaceSearchForEveryCase.isnull()].count()

business_id                     2130
name                            2130
name_from_location              1971
address                         2130
city                            2130
state                           2130
address_from_coord              2130
addr_score                      2130
city_from_coord                 2130
state_from_coord                2130
country_from_coord              2130
formatted_address_from_coord    2130
latitude                        2130
latitude_from_address           2130
longitude                       2130
longitude_from_address          2130
dist_diff                       2130
categories                      2126
categories_from_location        1971
place_id_from_address           2130
place_id_from_coord             2130
business_place_id               1971
geocoding_status                2130
invgeocoding_status             2130
placeSearch_status              2130
dtype: int64

In [65]:
savePlaceSearchForEveryCase.shape

(2130, 25)

In [43]:
savePlaceSearchForEveryCase.to_csv(path_or_buf='/Users/kemalm/Desktop/Google Maps Data Matching/spasi_da_se_ne_gubi.csv',index=False)

In [66]:
merged_df = pd.DataFrame(        pd.merge(testing_df, 
         savePlaceSearchForEveryCase[['business_id','business_place_id','name_from_location','categories_from_location','placeSearch_status']],
         on='business_id',
         how='left'))
         

In [67]:
merged_df.shape

(3000, 25)

In [71]:
merged_df[merged_df.placeSearch_status =='OK'].shape

(1971, 25)

In [74]:
merged_df[['business_id',
            'name',
            'name_from_location',
            'address',
            'city',
            'state',
            'address_from_coord',
            'addr_score',
            'city_from_coord',
            'state_from_coord',
            'country_from_coord', 
            'formatted_address_from_coord',      
            'latitude',
            'latitude_from_address',
            'longitude',
            'longitude_from_address',
            'dist_diff',   
            'categories',
            'categories_from_location',
            'place_id_from_address',
            'place_id_from_coord',
            'business_place_id',
            'geocoding_status',
            'invgeocoding_status',                                                                        
            'placeSearch_status'                                                    
]].to_csv(path_or_buf='/Users/kemalm/Desktop/Google Maps Data Matching/mrgSample.csv',index=False)

In [73]:
merged_df[~merged_df.isnull()].count()

business_id                     3000
name                            3000
address                         3000
city                            3000
state                           3000
address_from_coord              3000
addr_score                      2805
city_from_coord                 3000
state_from_coord                3000
country_from_coord              3000
formatted_address_from_coord    3000
latitude                        3000
latitude_from_address           2869
longitude                       3000
longitude_from_address          2869
dist_diff                       2869
categories                      2993
place_id_from_address           2869
place_id_from_coord             3000
geocoding_status                2894
invgeocoding_status             3000
business_place_id               1971
name_from_location              1971
categories_from_location        1971
placeSearch_status              2130
dtype: int64

# Final Dataset (?)

In [75]:
testing_mrgDataset = pd.read_csv('/Users/kemalm/Desktop/Google Maps Data Matching/mrgSample.csv')
testing_mrgDataset.loc[ testing_mrgDataset['address'].isnull(), 'address'] = ''
testing_mrgDataset.loc[ testing_mrgDataset['address_from_coord'].isnull(), 'address_from_coord'] = ''
testing_mrgDataset.loc[ testing_mrgDataset['city_from_coord'].isnull(), 'city_from_coord'] = ''

In [76]:
testing_mrgDataset[~testing_mrgDataset.isnull()].count()

business_id                     3000
name                            3000
name_from_location              1971
address                         3000
city                            3000
state                           3000
address_from_coord              3000
addr_score                      2805
city_from_coord                 3000
state_from_coord                3000
country_from_coord              3000
formatted_address_from_coord    3000
latitude                        3000
latitude_from_address           2869
longitude                       3000
longitude_from_address          2869
dist_diff                       2869
categories                      2993
categories_from_location        1971
place_id_from_address           2869
place_id_from_coord             3000
business_place_id               1971
geocoding_status                2894
invgeocoding_status             3000
placeSearch_status              2130
dtype: int64

# Identify suspicous patterns

In [68]:
import psycopg2
import numpy as np
import pandas as pd
try:
    conn=psycopg2.connect("dbname='yelpDB' user='postgres' host='localhost' password='P0$tgre$QL'")
except:
    print("I am unable to connect to the database")
cur = conn.cursor()
cur.execute("""SELECT state,  COUNT(business_id) AS total_Count
               FROM Businesses
               GROUP BY state 
               ORDER BY 2 DESC""")
statesQuery = cur.fetchall()

In [69]:
dct_states_prov_terr = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming',
        'AB': 'Alberta',
        'BC': 'British Columbia',
        'MB': 'Manitoba',
        'NB': 'New Brunswick',
        'NL': 'Newfoundland and Labrador',
        'NT': 'Northwest Territories',
        'NS': 'Nova Scotia',
        'NU': 'Nunavut',
        'ON': 'Ontario',
        'PE': 'Prince Edward Island',
        'QC': 'Quebec',
        'SK': 'Saskatchewan',
        'YT': 'Yukon'
}

In [70]:
df_statesQuery = pd.DataFrame(statesQuery, columns=['state','total_count'])

In [71]:
df_statesQuery.head()

Unnamed: 0,state,total_count
0,AZ,56686
1,NV,36312
2,ON,33412
3,NC,14720
4,OH,14697


In [72]:
df_statesQuery['state_exists'] = df_statesQuery['state'].isin(dct_states_prov_terr.keys())

In [73]:
#state_exist = np.array([ (el[0], int(el[0] in dct_states_prov_terr.keys())) for el in df_statesQuery])

In [74]:
df_statesQuery[df_statesQuery['state_exists'] == False]

Unnamed: 0,state,total_count,state_exists
14,XGM,4,False
19,XWY,2,False
25,DUR,1,False
27,DOW,1,False
29,BAS,1,False
31,CON,1,False
33,XGL,1,False


### Get corresponding records

In [79]:
rpl_str = str(list(df_statesQuery[df_statesQuery['state_exists'] == False]['state'].values)).replace('[','').replace(']','')

cur.execute("""SELECT *
         FROM Businesses
         WHERE state IN ({})""".format(rpl_str))
all_recs =cur.fetchall()


In [81]:
all_recs

[('svMJjPd4l_Zb_MoxejYZvw',
  'Zoom Printing',
  '1136 Center Street, Suite 442',
  'Thornhill',
  'DUR',
  'L4J 3M8',
  43.8085625817,
  -79.4638055695,
  3.5,
  3,
  1,
  {},
  'Professional Services, Advertising, Printing Services, Print Media, Mass Media, Local Services',
  {'Friday': '9:0-17:0',
   'Monday': '9:0-17:0',
   'Tuesday': '9:0-17:0',
   'Thursday': '9:0-17:0',
   'Wednesday': '9:0-17:0'}),
 ('d4qoXn1Rqt47LLTDA3bAwQ',
  'Thayer David Ice Cream Shop',
  '8 York St',
  'Bath',
  'BAS',
  'BA1 1NG',
  43.6406456,
  -79.380939,
  4.0,
  4,
  1,
  {'RestaurantsPriceRange2': '1'},
  'Food, Ice Cream & Frozen Yogurt',
  {}),
 ('8_GNJU3EPar9VkPzJvoC3w',
  'Bean & Brush Family Art Café',
  'The Old Sorting Office, 12 Hayfield Street',
  'Sale',
  'XGM',
  'M33 7XW',
  42.9960594,
  -89.568889,
  4.0,
  4,
  1,
  {'WiFi': "u'free'",
   'Caters': 'True',
   'BikeParking': 'True',
   'OutdoorSeating': 'True',
   'BusinessParking': "{'garage': False, 'street': False, 'validated': Fa

In [86]:
df_suspBus= pd.DataFrame(all_recs,columns=["business_id",
                                "name",
                                "address",
                                "city",
                                "state",
                                "postal_code",
                                "latitude",
                                "longitude",
                                "stars",
                                "review_count",
                                "is_open",
                                "attributes",
                                "categories",
                                "hours"])

In [88]:
df_suspBus

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,svMJjPd4l_Zb_MoxejYZvw,Zoom Printing,"1136 Center Street, Suite 442",Thornhill,DUR,L4J 3M8,43.808563,-79.463806,3.5,3,1,{},"Professional Services, Advertising, Printing S...","{'Friday': '9:0-17:0', 'Monday': '9:0-17:0', '..."
1,d4qoXn1Rqt47LLTDA3bAwQ,Thayer David Ice Cream Shop,8 York St,Bath,BAS,BA1 1NG,43.640646,-79.380939,4.0,4,1,{'RestaurantsPriceRange2': '1'},"Food, Ice Cream & Frozen Yogurt",{}
2,8_GNJU3EPar9VkPzJvoC3w,Bean & Brush Family Art Café,"The Old Sorting Office, 12 Hayfield Street",Sale,XGM,M33 7XW,42.996059,-89.568889,4.0,4,1,"{'WiFi': 'u'free'', 'Caters': 'True', 'BikePar...","Arts & Crafts, Shopping, Coffee & Tea, Food","{'Friday': '8:30-19:0', 'Monday': '8:30-19:0',..."
3,44xjnQMwAQjgZ80MW5z-Gg,No. 37 Sandwich Bar,37 Monk Bridge Road,Leeds,XWY,LS6 4EP,45.456999,-73.59525,4.5,3,1,"{'WiFi': ''no'', 'Alcohol': 'u'none'', 'Outdoo...","Bakeries, Food, Desserts, Restaurants, Sandwiches","{'Friday': '7:0-15:0', 'Monday': '7:0-15:0', '..."
4,eeEcf7XXAGClqdUCwnwRfg,The Old Lifeboat House,"The Cove, Coverack Helston",Church Cove,CON,TR12 6SX,35.532021,-80.851682,3.5,3,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsDel...","British, Restaurants",{}
5,JNZeVq9jr9AWURmnM-Yxig,Total Gardening and Landscaping,,Bury,XGM,BL8 4DR,42.996059,-89.568889,5.0,3,1,"{'ByAppointmentOnly': 'False', 'BusinessAccept...","Home Services, Landscaping, Tree Services, Gar...","{'Friday': '0:0-0:0', 'Monday': '0:0-0:0', 'Su..."
6,6dhkHf-CFHr7C8wj-qopCQ,Paper Cutz,"Gorebrook Works, Pinkbank Lane",Manchester,XGM,M12 5GH,42.996059,-89.568889,2.5,3,1,"{'BusinessParking': '{'garage': False, 'valida...","Art Supplies, Arts & Crafts, Shopping",{}
7,FByZsT1Sob5Vf1AYJFPxPg,Desi Masala,61 Queen Street,Leeds,XWY,LS27 8EB,43.652821,-79.376345,4.5,5,1,"{'WiFi': ''no'', 'HasTV': 'True', 'Alcohol': '...","Indian, Pakistani, Restaurants",{}
8,xjR-PII302WyyNRfpcowDg,Moxon's Fishmongers,110 Islington High Street,London,XGL,N1 8EG,43.645355,-79.524467,4.5,3,1,"{'BusinessParking': '{'garage': False, 'street...","Specialty Food, Food, Seafood Markets","{'Friday': '9:0-19:30', 'Tuesday': '9:0-19:30'..."
9,ZsL7FUkaWdyQnDoYB6XpSA,Happy Gathering Resturant Oldham,,Oldham,XGM,OL2 6PX,42.996059,-89.568889,4.0,3,1,{},"Chinese, Restaurants",{}


In [87]:
df_suspBus.to_csv(path_or_buf='/Users/kemalm/Desktop/suspBus.csv',index=False)