In [1]:
import json
import pandas as pd
import numpy as np
import csv
import time

# converting business.json to business.csv
### importing from json file

In [3]:
# Read business.json line by line; every line is parsed on its own with json.loads.
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/business.json', encoding='utf-8') as f:
    listOfDicts_business = [json.loads(line) for line in f]
# One dictionary is produced per line read, so the list length is the record count.
counter = len(listOfDicts_business)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 192609 dictionaries.
Execution time:  3.6180918216705322  seconds.


In [4]:
# Shell cross-check: line count of business.json as reported by wc.
!wc -l /Users/kemalm/Desktop/yelp_dataset/business.json

  192609 /Users/kemalm/Desktop/yelp_dataset/business.json


### naming a list of attributes

In [2]:
# Names of the business fields; reused below as the keys to look for and as CSV columns.
attr_arr = np.array([
    'business_id',
    'name',
    'address',
    'city',
    'state',
    'postal_code',
    'latitude',
    'longitude',
    'stars',
    'review_count',
    'is_open',
    'attributes',
    'categories',
    'hours',
])
print(attr_arr, type(attr_arr))

['business_id' 'name' 'address' 'city' 'state' 'postal_code' 'latitude'
 'longitude' 'stars' 'review_count' 'is_open' 'attributes' 'categories'
 'hours'] <class 'numpy.ndarray'>


<h3> Checking that every JSON object read from the <i style="color:blue">business.json</i> file actually contains all the keys that the Yelp dataset documentation says it should </h3>

In [6]:
# Count, for every expected key, how many business records actually contain it.
# df_containsfield[i, j] is 1.0 when record i has the key attr_arr[j], otherwise 0.0.
df_containsfield = np.zeros((len(listOfDicts_business), len(attr_arr)))

for i, record in enumerate(listOfDicts_business):
    df_containsfield[i, :] = np.isin(attr_arr, np.array(list(record.keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
# dtype=object (the builtin) is required here: the np.object alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
tkeys_counter = np.zeros((attr_arr.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(attr_arr)
# Column sums give the number of records that contain each key.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

192609  number of records

Key associated with its frequency: 
 [['business_id' 192609.0]
 ['name' 192609.0]
 ['address' 192609.0]
 ['city' 192609.0]
 ['state' 192609.0]
 ['postal_code' 192609.0]
 ['latitude' 192609.0]
 ['longitude' 192609.0]
 ['stars' 192609.0]
 ['review_count' 192609.0]
 ['is_open' 192609.0]
 ['attributes' 192609.0]
 ['categories' 192609.0]
 ['hours' 192609.0]]


<h3> number of null values per column</h3> 

In [7]:
# For each expected key, echo the key and then the number of lines of
# business.json that contain the literal text "<key>":null (grep piped into wc -l).
# NOTE(review): the pattern has no whitespace after the colon and is matched anywhere
# in the line, so this presumably relies on compact serialisation - confirm.
start = time.time()
for a in list(attr_arr):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/business.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

business_id
       0
name
       0
address
       0
city
       0
state
       0
postal_code
       0
latitude
       0
longitude
       0
stars
       0
review_count
       0
is_open
       0
attributes
   28836
categories
     482
hours
   44830


<h3> Creating a <i style="color:blue"> business.csv </i> file and writing data to it. </h3> 

In [6]:
class mydict(dict):
    """A dict whose string form is its JSON serialisation (via json.dumps)."""

    def __str__(self):
        """Return the mapping encoded as a JSON string."""
        serialized = json.dumps(self)
        return serialized

In [15]:
# Write every business record to a tab-separated file with a header row.
start = time.time()
print("Executing the code ...\n")
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation requires for file objects handed to a writer.
with open('/Users/kemalm/Desktop/yelp_dataset/business.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, list(attr_arr), delimiter='\t')
    writer.writeheader()
    counter = 0
    for dictObj in listOfDicts_business:
        # Shallow copy so the loaded record is not modified by the substitutions below.
        tempDict = dict(dictObj)
        # 'attributes' and 'hours' are stored as JSON text; "{}" stands in for a
        # missing or null value. json.dumps is called directly so this cell does
        # not depend on a helper class defined in another cell.
        for nested_key in ('attributes', 'hours'):
            nested_value = tempDict.get(nested_key)
            if nested_value is not None:
                tempDict[nested_key] = json.dumps(nested_value)
            else:
                tempDict[nested_key] = "{}"
        writer.writerow(tempDict)
        counter += 1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 192609 rows
Execution time:  6.614404201507568  seconds.


# converting user.json to user.csv
### importing from json file

In [3]:
# Read user.json line by line; every line is parsed on its own with json.loads.
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/user.json', encoding='utf-8') as f:
    listOfDicts_user = [json.loads(line) for line in f]
# One dictionary is produced per line read, so the list length is the record count.
counter = len(listOfDicts_user)
# The stop time must be bound to `end`, because the elapsed time printed below is
# end - start; any other name leaves `end` stale (or undefined on a fresh kernel).
end = time.time()
print("Successfully appended {} rows".format(counter))
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 1637138 rows
Execution time:  42.027015209198  seconds.


In [5]:
# Shell cross-check: line count of user.json as reported by wc.
!wc -l /Users/kemalm/Desktop/yelp_dataset/user.json

 1637138 /Users/kemalm/Desktop/yelp_dataset/user.json


In [4]:
# Number of user records held in memory.
len(listOfDicts_user)

1637138

### naming a list of attributes

In [8]:
# Names of the user fields, grouped by kind for readability.
arr_user = np.array([
    # identity and activity
    'user_id', 'name', 'review_count', 'yelping_since',
    # vote counters
    'useful', 'funny', 'cool',
    # social
    'elite', 'friends', 'fans',
    'average_stars',
    # compliment counters
    'compliment_hot', 'compliment_more', 'compliment_profile',
    'compliment_cute', 'compliment_list', 'compliment_note',
    'compliment_plain', 'compliment_cool', 'compliment_funny',
    'compliment_writer', 'compliment_photos',
])

<h3> Checking that every JSON object read from the <i style="color:blue">user.json</i> file actually contains all the keys that the Yelp dataset documentation says it should </h3>

In [9]:
# Count, for every expected key, how many user records actually contain it.
start = time.time()
print("Executing the code ...\n")
# df_containsfield[i, j] is 1.0 when record i has the key arr_user[j], otherwise 0.0.
df_containsfield = np.zeros((len(listOfDicts_user), len(arr_user)))

for i, record in enumerate(listOfDicts_user):
    df_containsfield[i, :] = np.isin(arr_user, np.array(list(record.keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
# dtype=object (the builtin) is required here: the np.object alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
tkeys_counter = np.zeros((arr_user.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_user)
# Column sums give the number of records that contain each key.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")


Executing the code ...

1637138  number of records

Key associated with its frequency: 
 [['user_id' 1637138.0]
 ['name' 1637138.0]
 ['review_count' 1637138.0]
 ['yelping_since' 1637138.0]
 ['useful' 1637138.0]
 ['funny' 1637138.0]
 ['cool' 1637138.0]
 ['elite' 1637138.0]
 ['friends' 1637138.0]
 ['fans' 1637138.0]
 ['average_stars' 1637138.0]
 ['compliment_hot' 1637138.0]
 ['compliment_more' 1637138.0]
 ['compliment_profile' 1637138.0]
 ['compliment_cute' 1637138.0]
 ['compliment_list' 1637138.0]
 ['compliment_note' 1637138.0]
 ['compliment_plain' 1637138.0]
 ['compliment_cool' 1637138.0]
 ['compliment_funny' 1637138.0]
 ['compliment_writer' 1637138.0]
 ['compliment_photos' 1637138.0]]
Execution time:  111.81420087814331  seconds.


<h3> number of null values per column</h3> 
<h5 style="color:red;"> WARNING! The following method is very slow for very large datasets (user.json). </h5> 
<h5 style="color:red;"> Therefore, avoid running it more than once. </h5> 

In [9]:
# For each expected key, echo the key and then the number of lines of
# user.json that contain the literal text "<key>":null (grep piped into wc -l).
# The file is scanned once per key, which is why this cell takes so long.
# NOTE(review): the pattern has no whitespace after the colon and is matched anywhere
# in the line, so this presumably relies on compact serialisation - confirm.
start = time.time()
for a in list(arr_user):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/user.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

user_id
       0
name
       0
review_count
       0
yelping_since
       0
useful
       0
funny
       0
cool
       0
elite
       0
friends
       0
fans
       0
average_stars
       0
compliment_hot
       0
compliment_more
       0
compliment_profile
       0
compliment_cute
       0
compliment_list
       0
compliment_note
       0
compliment_plain
       0
compliment_cool
       0
compliment_funny
       0
compliment_writer
       0
compliment_photos
       0
Execution time:  963.2632689476013  seconds.


<h3> Creating a <i style="color:blue"> user.csv </i> file and writing data to it. </h3> 

In [24]:
# Write every user record to a tab-separated file with a header row.
start = time.time()
print("Executing the code ...\n")
# Derive the column list from arr_user here, so the cell does not rely on a
# `user_cols` variable left over from an earlier kernel session.
user_cols = arr_user.tolist()
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation requires for file objects handed to a writer.
with open('/Users/kemalm/Desktop/yelp_dataset/user.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, user_cols, delimiter='\t')
    writer.writeheader()
    counter = 0
    for dictObj in listOfDicts_user:
        writer.writerow(dictObj)
        counter += 1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 1637138 rows
Execution time:  75.70107102394104


In [47]:
# Compare the line counts of the JSON input and the CSV output.
# NOTE(review): these paths are relative, unlike the absolute paths used by the
# cells that read and write the files, so the result depends on the working directory.
!wc -l yelp_dataset/user.json
!echo "User.csv has one more row used as a header."
!wc -l yelp_dataset/user.csv


 1637138 yelp_dataset/user.json
User.csv has one more row used as a header.
 1637139 yelp_dataset/user.csv


# converting review.json to review.csv
### importing from json file

In [4]:
# Read review.json line by line; every line is parsed on its own with json.loads.
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/review.json', encoding='utf-8') as f:
    listOfDicts_review = [json.loads(line) for line in f]
# One dictionary is produced per line read, so the list length is the record count.
counter = len(listOfDicts_review)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 6685900 dictionaries.
Execution time:  71.14869093894958  seconds.


In [6]:
# Shell cross-check: line count of review.json as reported by wc.
!wc -l /Users/kemalm/Desktop/yelp_dataset/review.json

 6685900 /Users/kemalm/Desktop/yelp_dataset/review.json


### naming a list of attributes

In [11]:
# Names of the review fields.
arr_review = np.array([
    'review_id',
    'user_id',
    'business_id',
    'stars',
    'useful',
    'funny',
    'cool',
    'text',
    'date',
])

<h3> Checking that every JSON object read from the <i style="color:blue">review.json</i> file actually contains all the keys that the Yelp dataset documentation says it should </h3>

In [10]:
# Count, for every expected key, how many review records actually contain it.
start = time.time()
print("Executing the code ...\n")
# df_containsfield[i, j] is 1.0 when record i has the key arr_review[j], otherwise 0.0.
df_containsfield = np.zeros((len(listOfDicts_review), len(arr_review)))

for i, record in enumerate(listOfDicts_review):
    df_containsfield[i, :] = np.isin(arr_review, np.array(list(record.keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
# dtype=object (the builtin) is required here: the np.object alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
tkeys_counter = np.zeros((arr_review.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_review)
# Column sums give the number of records that contain each key.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

6685900  number of records

Key associated with its frequency: 
 [['review_id' 6685900.0]
 ['user_id' 6685900.0]
 ['business_id' 6685900.0]
 ['stars' 6685900.0]
 ['useful' 6685900.0]
 ['funny' 6685900.0]
 ['cool' 6685900.0]
 ['text' 6685900.0]
 ['date' 6685900.0]]
Execution time:  299.6732749938965  seconds.


<h3> number of null values per column</h3> 
<h5 style="color:red;"> WARNING! The following method is very slow for very large datasets (review.json). </h5> 
<h5 style="color:red;"> Therefore, avoid running it more than once. </h5> 

In [13]:
# For each expected key, echo the key and then the number of lines of
# review.json that contain the literal text "<key>":null (grep piped into wc -l).
# The file is scanned once per key, which is why this cell takes so long.
# NOTE(review): the pattern is matched anywhere in the line, so a review whose text
# happens to contain that character sequence would presumably be counted too - confirm.
start = time.time()
for a in list(arr_review):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/review.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

review_id
       0
user_id
       0
business_id
       0
stars
       0
useful
       0
funny
       0
cool
       0
text
       0
date
       0
Execution time:  913.4484198093414  seconds.


In [3]:
# Column names for review.csv as native Python strings.
# .tolist() converts the NumPy string scalars to plain str, so the printed list looks
# the same on every NumPy version (NumPy 2 prints np.str_(...) for list(array)).
review_cols = arr_review.tolist()
print(review_cols)

['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']


<h3> Creating a <i style="color:blue"> review.csv </i> file and writing data to it. </h3> 

In [4]:
# Write every review record to a tab-separated file with a header row.
start = time.time()
print("Executing the code ...\n")
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation requires; otherwise line breaks embedded in quoted fields
# can be translated by the platform on write.
with open('/Users/kemalm/Desktop/yelp_dataset/review.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, review_cols, delimiter='\t')
    writer.writeheader()
    counter = 0
    for dictObj in listOfDicts_review:
        writer.writerow(dictObj)
        counter += 1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")



Executing the code ...

Successfully written 6685900 rows
Execution time:  232.80819010734558  seconds.


# converting checkin.json to checkin.csv
### importing from json file

In [7]:
# Read checkin.json line by line; every line is parsed on its own with json.loads.
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/checkin.json', encoding='utf-8') as f:
    listOfDicts_checkin = [json.loads(line) for line in f]
# One dictionary is produced per line read, so the list length is the record count.
counter = len(listOfDicts_checkin)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 161950 dictionaries.
Execution time:  2.0320558547973633  seconds.


In [3]:
# Shell cross-check: line count of checkin.json as reported by wc.
!wc -l /Users/kemalm/Desktop/yelp_dataset/checkin.json

  161950 /Users/kemalm/Desktop/yelp_dataset/checkin.json


### naming a list of attributes

In [15]:
# Names of the check-in fields.
arr_checkin = np.array([
    'business_id',
    'date',
])

<h3> Checking that every JSON object read from the <i style="color:blue">checkin.json</i> file actually contains all the keys that the Yelp dataset documentation says it should </h3>

In [12]:
# Count, for every expected key, how many check-in records actually contain it.
start = time.time()
print("Executing the code ...\n")
# df_containsfield[i, j] is 1.0 when record i has the key arr_checkin[j], otherwise 0.0.
df_containsfield = np.zeros((len(listOfDicts_checkin), len(arr_checkin)))

for i, record in enumerate(listOfDicts_checkin):
    df_containsfield[i, :] = np.isin(arr_checkin, np.array(list(record.keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
# dtype=object (the builtin) is required here: the np.object alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
tkeys_counter = np.zeros((arr_checkin.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_checkin)
# Column sums give the number of records that contain each key.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

161950  number of records

Key associated with its frequency: 
 [['business_id' 161950.0]
 ['date' 161950.0]]
Execution time:  2.568455934524536  seconds.


<h3> number of null values per column</h3> 

In [16]:
# For each expected key, echo the key and then the number of lines of
# checkin.json that contain the literal text "<key>":null (grep piped into wc -l).
# NOTE(review): the pattern has no whitespace after the colon, so this presumably
# relies on the file being compactly serialised - confirm.
start = time.time()
for a in list(arr_checkin):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/checkin.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

business_id
       0
date
       0
Execution time:  14.164305925369263  seconds.


In [13]:
# Column names for checkin.csv as native Python strings.
# .tolist() converts the NumPy string scalars to plain str, so the printed list looks
# the same on every NumPy version (NumPy 2 prints np.str_(...) for list(array)).
checkin_cols = arr_checkin.tolist()
print(checkin_cols)

['business_id', 'date']


<h3> Creating a <i style="color:blue"> checkin.csv </i> file and writing data to it. </h3> 

In [6]:
# Write every check-in record to a tab-separated file with a header row.
start = time.time()
print("Executing the code ...\n")
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation requires for file objects handed to a writer.
with open('/Users/kemalm/Desktop/yelp_dataset/checkin.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, checkin_cols, delimiter='\t')
    writer.writeheader()
    counter = 0
    for dictObj in listOfDicts_checkin:
        writer.writerow(dictObj)
        counter += 1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 161950 rows
Execution time:  9.046382665634155  seconds.


In [8]:
# Compare the line counts of the JSON input and the CSV output.
# NOTE(review): these paths are relative, unlike the absolute paths used by the
# cells that read and write the files, so the result depends on the working directory.
!wc -l yelp_dataset/checkin.json
!wc -l yelp_dataset/checkin.csv


  161950 yelp_dataset/checkin.json
  161951 yelp_dataset/checkin.csv


# converting tip.json to tip.csv
### importing from json file

In [3]:
# Read tip.json line by line; every line is parsed on its own with json.loads.
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/tip.json', encoding='utf-8') as f:
    listOfDicts_tip = [json.loads(line) for line in f]
# One dictionary is produced per line read, so the list length is the record count.
counter = len(listOfDicts_tip)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 1223094 dictionaries.
Execution time:  6.717769145965576  seconds.


In [6]:
# Shell cross-check: line count of tip.json as reported by wc.
!wc -l /Users/kemalm/Desktop/yelp_dataset/tip.json

 1223094 /Users/kemalm/Desktop/yelp_dataset/tip.json


### naming a list of attributes

In [17]:
# Names of the tip fields.
arr_tip = np.array([
    'user_id',
    'business_id',
    'text',
    'date',
    'compliment_count',
])

<h3> Checking that every JSON object read from the <i style="color:blue">tip.json</i> file actually contains all the keys that the Yelp dataset documentation says it should </h3>

In [10]:
# Count, for every expected key, how many tip records actually contain it.
start = time.time()
print("Executing the code ...\n")
# df_containsfield[i, j] is 1.0 when record i has the key arr_tip[j], otherwise 0.0.
df_containsfield = np.zeros((len(listOfDicts_tip), len(arr_tip)))

for i, record in enumerate(listOfDicts_tip):
    df_containsfield[i, :] = np.isin(arr_tip, np.array(list(record.keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
# dtype=object (the builtin) is required here: the np.object alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
tkeys_counter = np.zeros((arr_tip.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_tip)
# Column sums give the number of records that contain each key.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

1223094  number of records

Key associated with its frequency: 
 [['user_id' 1223094.0]
 ['business_id' 1223094.0]
 ['text' 1223094.0]
 ['date' 1223094.0]
 ['compliment_count' 1223094.0]]
Execution time:  30.554124116897583  seconds.


<h3> number of null values per column</h3> 

In [18]:
# For each expected key, echo the key and then the number of lines of
# tip.json that contain the literal text "<key>":null (grep piped into wc -l).
# NOTE(review): the pattern is matched anywhere in the line, so a tip whose text
# happens to contain that character sequence would presumably be counted too - confirm.
start = time.time()
for a in list(arr_tip):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/tip.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

user_id
       0
business_id
       0
text
       0
date
       0
compliment_count
       0
Execution time:  25.752610683441162  seconds.


In [9]:
# Column names for tip.csv as native Python strings.
# .tolist() converts the NumPy string scalars to plain str, so the printed list looks
# the same on every NumPy version (NumPy 2 prints np.str_(...) for list(array)).
tip_cols = arr_tip.tolist()
print(tip_cols)

['user_id', 'business_id', 'text', 'date', 'compliment_count']


<h3> Creating a <i style="color:blue"> tip.csv </i> file and writing data to it. </h3> 

In [16]:
# Write every tip record to a tab-separated file with a header row.
start = time.time()
print("Executing the code ...\n")
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation requires; otherwise line breaks embedded in quoted fields
# can be translated by the platform on write.
with open('/Users/kemalm/Desktop/yelp_dataset/tip.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, tip_cols, delimiter='\t')
    writer.writeheader()
    counter = 0
    for dictObj in listOfDicts_tip:
        writer.writerow(dictObj)
        counter += 1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 1223094 rows
Execution time:  8.033058166503906  seconds.


# converting photo.json to photo.csv
### importing from json file

In [6]:
# Read photo.json line by line; every line is parsed on its own with json.loads.
start = time.time()
print("Executing the code ...\n")
with open('/Users/kemalm/Desktop/yelp_dataset/photo.json', encoding='utf-8') as f:
    listOfDicts_photo = [json.loads(line) for line in f]
# One dictionary is produced per line read, so the list length is the record count.
counter = len(listOfDicts_photo)
print("Successfully appended {} dictionaries.".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully appended 200000 dictionaries.
Execution time:  0.9235949516296387  seconds.


In [2]:
# Shell cross-check: line count of photo.json as reported by wc.
!wc -l /Users/kemalm/Desktop/yelp_dataset/photo.json

  200000 /Users/kemalm/Desktop/yelp_dataset/photo.json


### naming a list of attributes

In [20]:
# Names of the photo fields.
arr_photo = np.array([
    'caption',
    'photo_id',
    'business_id',
    'label',
])

<h3> Checking that every JSON object read from the <i style="color:blue">photo.json</i> file actually contains all the keys that the Yelp dataset documentation says it should </h3>

In [10]:
# Count, for every expected key, how many photo records actually contain it.
start = time.time()
print("Executing the code ...\n")
# df_containsfield[i, j] is 1.0 when record i has the key arr_photo[j], otherwise 0.0.
df_containsfield = np.zeros((len(listOfDicts_photo), len(arr_photo)))

for i, record in enumerate(listOfDicts_photo):
    df_containsfield[i, :] = np.isin(arr_photo, np.array(list(record.keys()))).astype(np.int64)
print(df_containsfield.shape[0], " number of records\n")
# dtype=object (the builtin) is required here: the np.object alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
tkeys_counter = np.zeros((arr_photo.shape[0], 2), dtype=object)
tkeys_counter[:, 0] = np.array(arr_photo)
# Column sums give the number of records that contain each key.
tkeys_counter[:, 1] = df_containsfield.sum(axis=0)
print("Key associated with its frequency: \n", tkeys_counter)

end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

200000  number of records

Key associated with its frequency: 
 [['caption' 200000.0]
 ['photo_id' 200000.0]
 ['business_id' 200000.0]
 ['label' 200000.0]]
Execution time:  4.147678852081299  seconds.


<h3> number of null values per column</h3> 

In [21]:
# For each expected key, echo the key and then the number of lines of
# photo.json that contain the literal text "<key>":null (grep piped into wc -l).
# NOTE(review): the pattern has no whitespace after the colon, so this presumably
# relies on the file being compactly serialised - confirm.
start = time.time()
for a in list(arr_photo):
    !echo $a
    !grep -e "\"$a\":null" /Users/kemalm/Desktop/yelp_dataset/photo.json | wc -l
end = time.time()
print("Execution time: ", end - start, " seconds.")

caption
       0
photo_id
       0
business_id
       0
label
       0
Execution time:  3.233721971511841  seconds.


In [9]:
# Column names for photo.csv as native Python strings.
# .tolist() converts the NumPy string scalars to plain str, so the printed list looks
# the same on every NumPy version (NumPy 2 prints np.str_(...) for list(array)).
photo_cols = arr_photo.tolist()
print(photo_cols)

['caption', 'photo_id', 'business_id', 'label']


<h3> Creating a <i style="color:blue"> photo.csv </i> file and writing data to it. </h3> 

In [5]:
# Write every photo record to a tab-separated file with a header row.
start = time.time()
print("Executing the code ...\n")
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation requires for file objects handed to a writer.
with open('/Users/kemalm/Desktop/yelp_dataset/photo.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, photo_cols, delimiter='\t')
    writer.writeheader()
    counter = 0
    for dictObj in listOfDicts_photo:
        writer.writerow(dictObj)
        counter += 1
print("Successfully written {} rows".format(counter))
end = time.time()
print("Execution time: ", end - start, " seconds.")

Executing the code ...

Successfully written 200000 rows
Execution time:  0.9482808113098145  seconds.


## linux null validation (business.json)