In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [11]:
def read_partial_json_file(filename, max_entries=0, encoding='utf-8'):
    """Read a JSON Lines file (one JSON document per line) into a list.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    max_entries : int, optional
        Maximum number of records to return. ``0`` (the default) reads the
        whole file. A negative value returns an empty list.
    encoding : str, optional
        Text encoding used to open the file.

    Returns
    -------
    list
        The decoded JSON values, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    json_array = []
    with open(filename, 'r', encoding=encoding) as file:
        for line in file:
            # A limit of 0 means "no limit"; otherwise stop as soon as the
            # requested number of records has been collected.
            if max_entries != 0 and len(json_array) >= max_entries:
                break
            # Blank lines (e.g. a trailing newline at end of file) are not
            # JSON documents; skip them instead of letting json.loads raise.
            if not line.strip():
                continue
            json_array.append(json.loads(line))
    return json_array

def add_missing_keys(json_array):
    """Ensure every record has the review fields used downstream.

    Each dict in ``json_array`` is modified in place: any of ``stars``,
    ``useful``, ``funny``, ``cool`` or ``text`` that is absent is added with
    a placeholder default, and a message naming the key is printed.

    Numeric fields default to ``0``. ``text`` defaults to the empty string
    so the column stays string-typed instead of mixing in an integer.

    Parameters
    ----------
    json_array : list of dict
        Records to complete.

    Returns
    -------
    list of dict
        The same list object that was passed in.
    """
    # Placeholder values; insertion order matches the order keys are checked.
    defaults = {'stars': 0, 'useful': 0, 'funny': 0, 'cool': 0, 'text': ''}
    for obj in json_array:
        for key, default in defaults.items():
            if key not in obj:
                obj[key] = default
                print("Key {} not found in json".format(key))
    return json_array

def remove_keys(json_array, keys_to_remove):
    """Delete the given keys from every record in ``json_array``.

    Records are modified in place; keys that a record does not contain are
    ignored. The same list object is returned.
    """
    for record in json_array:
        for unwanted in keys_to_remove:
            if unwanted in record:
                del record[unwanted]
    return json_array

def ConvertJSONFileToDataFrame(filename, max_entries=1000, encoding='utf-8'):
    """Load a JSON Lines review file into a pandas DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON Lines file to read.
    max_entries : int, optional
        Passed to ``read_partial_json_file``; defaults to 1000.
    encoding : str, optional
        Text encoding used to open the file.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the ``business_id``, ``user_id``, ``date``
        and ``review_id`` columns removed when present.
    """
    # Load the raw records.
    json_array = read_partial_json_file(filename, max_entries, encoding)
    # Fill in missing review fields with placeholder defaults; a better
    # heuristic for these values still has to be worked out.
    json_array = add_missing_keys(json_array)
    df = pd.DataFrame(json_array)
    columns_to_remove = ['business_id', 'user_id', 'date', 'review_id']
    # errors='ignore' keeps this from raising KeyError when the input is
    # empty (no columns at all) or the records lack one of these fields.
    return df.drop(columns=columns_to_remove, errors='ignore')

In [12]:

# Review file to load, given as a path relative to the working directory.
filename = 'yelp_academic_dataset_review.json'
# max_entries is left at its default, so at most 1000 entries are loaded.
df = ConvertJSONFileToDataFrame(filename)
# Preview the first 10 rows of the resulting frame.
df.head(10)

Unnamed: 0,stars,useful,funny,cool,text
0,3.0,0,0,0,"If you decide to eat here, just be aware it is..."
1,5.0,1,0,1,I've taken a lot of spin classes over the year...
2,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...
3,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo..."
4,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...
5,1.0,1,2,1,I am a long term frequent customer of this est...
6,5.0,0,2,0,Loved this tour! I grabbed a groupon and the p...
7,5.0,2,0,0,Amazingly amazing wings and homemade bleu chee...
8,3.0,1,1,0,This easter instead of going to Lopez Lake we ...
9,3.0,0,0,0,Had a party of 6 here for hibachi. Our waitres...
