In [119]:
import collections
import collections.abc
import csv
import json

import pandas as pd

In [3]:
def readjson():
    """Load the font settings from the sample JSON configuration file.

    Reads ``../dataset/yelp/test.json`` as a single JSON document and pulls
    out three values from it.

    Returns:
        tuple: ``(fontFamily, fontSize, BaseSettings.font)`` as found in the file.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if the document is a JSON object that lacks one of the
            expected keys.
    """
    # Decode explicitly as UTF-8. Without the `encoding` argument the platform
    # default is used (gbk on the original author's machine), which fails as
    # soon as the file contains non-ASCII text such as Chinese characters.
    # The `with` block guarantees the handle is closed even if parsing fails.
    with open("../dataset/yelp/test.json", encoding="utf-8") as f:
        file = json.load(f)

    family = file["fontFamily"]
    size = file["fontSize"]
    # Nested structures are read by chaining subscripts.
    basesetting = file["BaseSettings"]["font"]

    return (family, size, basesetting)

In [69]:
def get_superset_of_column_names_from_file(json_file_path):
    """Read a JSON-lines dataset file and return the superset of column names.

    Each non-blank line is parsed as one JSON object, flattened with
    ``get_column_names``, and its flattened keys are merged into the result.

    Args:
        json_file_path: path of a file holding one JSON object per line.

    Returns:
        set: every flattened key that appears in at least one line. Being a
        set, it carries no meaningful ordering.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    column_names = set()
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(json_file_path, encoding="utf-8") as fin:
        for line in fin:
            # A blank line (e.g. a trailing empty line) is not a record;
            # json.loads would raise on it.
            if not line.strip():
                continue
            line_contents = json.loads(line)
            # Updating a set from a dict adds the dict's keys.
            column_names.update(get_column_names(line_contents))
    return column_names

In [126]:
def get_column_names(line_contents, parent_key=''):
    """Flatten a nested dict into a single-level dict with dotted key names.

    Example:
        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        will return: {'a.b': 2, 'a.c': 3}
    The keys of the result are the column names for the eventual csv file.

    Args:
        line_contents: mapping to flatten; values that are themselves mutable
            mappings are flattened recursively.
        parent_key: dotted prefix accumulated from enclosing levels; empty at
            the top level.

    Returns:
        dict: flattened key -> leaf value. A nested mapping that is empty
        contributes no entries.
    """
    flattened = {}
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        # The ABCs live in collections.abc; the old `collections.MutableMapping`
        # alias was removed in Python 3.10.
        if isinstance(v, collections.abc.MutableMapping):
            flattened.update(get_column_names(v, column_name))
        else:
            flattened[column_name] = v
    return flattened

In [127]:
# Collect every flattened column name that occurs in ./test.json.
# The result is a set, so it has no meaningful ordering of its own.
column_names = get_superset_of_column_names_from_file("./test.json")

In [129]:
# Write the CSV header row. `newline=''` lets the csv module control line
# endings itself; the explicit UTF-8 encoding keeps the output independent of
# the platform's default encoding.
with open('./test.csv', 'w', newline='', encoding='utf-8') as fout:
    csv_file = csv.writer(fout)
    # The header follows the iteration order of `column_names`; data rows must
    # be built by iterating the same object to stay aligned with it.
    csv_file.writerow(list(column_names))

In [103]:
def get_nested_value(d, key):
    """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.

    Example:
        d = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        key = 'a.b'
        will return: 2

    Args:
        d: the (possibly nested) mapping to look in.
        key: dotted path, one segment per nesting level.

    Returns:
        The value found at the path, or None when any segment is missing or
        when the path runs through a value that is not a mapping (for example
        a None or a plain string where a nested dict was expected).
    """
    current = d
    for part in key.split('.'):
        # Without the mapping check, `part not in current` would raise on None
        # and would silently do a substring test on a string.
        if not isinstance(current, collections.abc.Mapping) or part not in current:
            return None
        current = current[part]
    return current

In [104]:
def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict.

    Args:
        line_contents: one parsed record (nested dict).
        column_names: iterable of flattened (dotted) column names; the row is
            produced in this iteration order.

    Returns:
        list of str: one cell per column name. A value that is missing or None
        becomes the empty string; anything else is formatted with
        ``'{0}'.format``.
    """
    row = []
    for column_name in column_names:
        line_value = get_nested_value(line_contents, column_name)
        # No special text branch: the Python 2 `unicode` type does not exist on
        # Python 3, and encoding a str to bytes before formatting would put a
        # "b'...'" repr into the cell. The csv writer handles str directly.
        row.append('' if line_value is None else '{0}'.format(line_value))
    return row

In [120]:
# Inspect the collected column names.
column_names

{'address',
 'attributes.Alcohol',
 'attributes.Ambience',
 'attributes.BikeParking',
 'attributes.BusinessAcceptsBitcoin',
 'attributes.BusinessAcceptsCreditCards',
 'attributes.BusinessParking',
 'attributes.ByAppointmentOnly',
 'attributes.Caters',
 'attributes.DogsAllowed',
 'attributes.GoodForKids',
 'attributes.GoodForMeal',
 'attributes.HappyHour',
 'attributes.HasTV',
 'attributes.NoiseLevel',
 'attributes.OutdoorSeating',
 'attributes.RestaurantsAttire',
 'attributes.RestaurantsDelivery',
 'attributes.RestaurantsGoodForGroups',
 'attributes.RestaurantsPriceRange2',
 'attributes.RestaurantsReservations',
 'attributes.RestaurantsTableService',
 'attributes.RestaurantsTakeOut',
 'attributes.WheelchairAccessible',
 'attributes.WiFi',
 'business_id',
 'categories',
 'city',
 'hours.Friday',
 'hours.Monday',
 'hours.Saturday',
 'hours.Sunday',
 'hours.Thursday',
 'hours.Tuesday',
 'hours.Wednesday',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 's

In [117]:
# Debug walk-through: for every record in ./test.json, print the parsed
# record, then each column name with the value looked up for it, and finally
# the assembled row of strings.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open("./test.json", encoding="utf-8") as fin:
    for line in fin:
        # Skip blank lines; json.loads would raise on them.
        if not line.strip():
            continue
        line_contents = json.loads(line)
        row = []
        print(line_contents)
        print('======'*4)
        for column_name in column_names:
            print(column_name)
            line_value = get_nested_value(
                line_contents,
                column_name,
            )
            print(line_value)
            print('======'*4)
            # Missing or None values become empty cells.
            if line_value is not None:
                row.append('{0}'.format(line_value))
            else:
                row.append('')
        print(row)

{'business_id': 'tCbdrRPZA0oiIYSmHG3J0w', 'name': 'Flying Elephants at PDX', 'address': '7000 NE Airport Way', 'city': 'Portland', 'state': 'OR', 'postal_code': '97218', 'latitude': 45.5889058992, 'longitude': -122.5933307507, 'stars': 4.0, 'review_count': 126, 'is_open': 1, 'attributes': {'RestaurantsTakeOut': 'True', 'RestaurantsAttire': "u'casual'", 'GoodForKids': 'True', 'BikeParking': 'False', 'OutdoorSeating': 'False', 'Ambience': "{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': True}", 'Caters': 'True', 'RestaurantsReservations': 'False', 'RestaurantsDelivery': 'False', 'HasTV': 'False', 'RestaurantsGoodForGroups': 'False', 'BusinessAcceptsCreditCards': 'True', 'NoiseLevel': "u'average'", 'ByAppointmentOnly': 'False', 'RestaurantsPriceRange2': '2', 'WiFi': "u'free'", 'BusinessParking': "{'garage': True, 'street': False, 'validated': False, 'lot': False, 'valet': False}", 'Al

In [94]:
# NOTE(review): `file` is not assigned at notebook level in any visible cell
# (the only visible assignment is the local variable inside readjson), so this
# presumably relies on state from an earlier kernel session — confirm.
# The output shown for this cell is a string holding a dict-style repr, not a
# nested dict.
file['attributes']['Ambience']

"{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': True}"