## JSON tutorial

In [1]:
import pandas as pd

In [2]:
url = "https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/data.json"
first_json = pd.read_json(url)
first_json.head()

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:00,0
1,5,2015-01-01 00:00:01,0
2,9,2015-01-01 00:00:02,0
3,6,2015-01-01 00:00:03,0
4,6,2015-01-01 00:00:04,0


In [7]:
first_json.to_json('json_columns.json', orient="columns")

In [3]:
first_json.to_json('json_index.json', orient="index")

In [9]:
second_json = pd.read_json('json_columns.json')
second_json.head()

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:00,0
1,5,2015-01-01 00:00:01,0
2,9,2015-01-01 00:00:02,0
3,6,2015-01-01 00:00:03,0
4,6,2015-01-01 00:00:04,0


In [8]:
third_json = pd.read_json('json_index.json')
third_json.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
integer,5,5,9,6,6,9,7,1,6,9,...,1,5,7,9,5,9,8,6,8,1
datetime,1420070400000,1420070401000,1420070402000,1420070403000,1420070404000,1420070405000,1420070406000,1420070407000,1420070408000,1420070409000,...,1420070490000,1420070491000,1420070492000,1420070493000,1420070494000,1420070495000,1420070496000,1420070497000,1420070498000,1420070499000
category,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


`read_json()` and `to_json()` works only with simple JSON. All arrays inside need to have arrays of same length.

Instead, for nested JSON files, use this...

In [20]:
import json
import pprint   # pprint is nice
from IPython.display import JSON    # but this is probably nicer

#load json object
with open('nested.json') as f:
    nested_json = json.load(f)
    
print(nested_json, '\n')
pprint.pprint(nested_json)
print('\n',type(nested_json))

{'article': [{'id': '01', 'language': 'JSON', 'edition': 'first', 'author': 'Allen'}, {'id': '02', 'language': 'Python', 'edition': 'second', 'author': 'Aditya Sharma'}], 'blog': [{'name': 'Datacamp', 'URL': 'datacamp.com'}]} 

{'article': [{'author': 'Allen',
              'edition': 'first',
              'id': '01',
              'language': 'JSON'},
             {'author': 'Aditya Sharma',
              'edition': 'second',
              'id': '02',
              'language': 'Python'}],
 'blog': [{'URL': 'datacamp.com', 'name': 'Datacamp'}]}

 <class 'dict'>


In [21]:
from pandas.io.json import json_normalize  
json_normalize(nested_json)

  json_normalize(nested_json)


Unnamed: 0,article,blog
0,"[{'id': '01', 'language': 'JSON', 'edition': '...","[{'name': 'Datacamp', 'URL': 'datacamp.com'}]"


In [22]:
article = json_normalize(nested_json,record_path ='article')
article.head()

  article = json_normalize(nested_json,record_path ='article')


Unnamed: 0,id,language,edition,author
0,1,JSON,first,Allen
1,2,Python,second,Aditya Sharma


In [23]:
# define json string
data = [{"state": "Florida", 
        "shortname": "FL",
        "info": {"governor": "Rick Scott"},
        "counties": [{"name": "Dade", "population": 12345},
                     {"name": "Broward", "population": 40000},
                     {"name": "Palm Beach", "population": 60000}]},
       {"state": "Ohio",
        "shortname": "OH",
        "info": {"governor": "John Kasich"},
        "counties": [{"name": "Summit", "population": 1234},
                     {"name": "Cuyahoga", "population": 1337}]}]

In [24]:
json_normalize(data)
json_normalize(data=data, record_path='counties', meta=['state', 'shortname', ['info', 'governor']])

  json_normalize(data)
  json_normalize(data=data, record_path='counties', meta=['state', 'shortname', ['info', 'governor']])


Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich
