# Working with JSON Schemas

In [2]:
#Import and load the file
import json

with open(r"C:\Users\nrmmw\Documents\Flatiron\dsc-working-with-known-json-schemas\ny_times_response.json", 'r') as f:
    data = json.load(f)

In [3]:
# Get datattype of data
print(type(data))
# Get the keys of the dictionary data
print(data.keys())

<class 'dict'>
dict_keys(['status', 'copyright', 'response'])


In [4]:
type(data['response']['docs']['headline'])
# Unaccessible because ['docs'] is a list

TypeError: list indices must be integers or slices, not str

In [5]:
docs = data['response']['docs']

print(f'`docs` is a data structure of {type(docs)}')
print(f"It has {len(docs)} elements")
print(f"The first element is of type {type(docs[0])}")

`docs` is a data structure of <class 'list'>
It has 9 elements
The first element is of type <class 'dict'>


In [6]:
# Access the list of dictionsaries under the headline key
# and get the main key
[doc['headline'] for doc in docs]

[{'main': "HIGGINS, SPENT $22,189.53.; Governor-Elect's Election Expenses -- Harrison $9,220.28.",
  'kicker': None,
  'content_kicker': None,
  'print_headline': None,
  'name': None,
  'seo': None,
  'sub': None},
 {'main': 'GARDEN BOUTS CANCELED; Mauriello Says He Could Not Be Ready on Nov. 3',
  'kicker': '1',
  'content_kicker': None,
  'print_headline': None,
  'name': None,
  'seo': None,
  'sub': None},
 {'main': 'Stock Drop Is Biggest in 2 Months--Margin Rise Held Factor in Lightest Trading of 1955',
  'kicker': '1',
  'content_kicker': None,
  'print_headline': None,
  'name': None,
  'seo': None,
  'sub': None},
 {'main': 'MUSIC OF THE WEEK',
  'kicker': None,
  'content_kicker': None,
  'print_headline': None,
  'name': None,
  'seo': None,
  'sub': None},
 {'main': 'Anacomp Inc. reports earnings for Qtr to March 31',
  'kicker': None,
  'content_kicker': None,
  'print_headline': None,
  'name': None,
  'seo': None,
  'sub': None},
 {'main': 'Brooklyn Routs Yeshiva',
  'ki

**Flattening Data (i.e. Breaking Out Nested Data)**

Let's say we want to create a list of dictionaries containing information about the documents contained in this JSON. It should contain the publication date (value associated with pub_date key), word count (value associated with word_count key), and both the 'main' and 'kicker' associated with the headline key. This list should be called doc_info_list.

Flattening means that each key is associated with a single string or number value.

In [7]:
docs[2]['headline']

{'main': 'Stock Drop Is Biggest in 2 Months--Margin Rise Held Factor in Lightest Trading of 1955',
 'kicker': '1',
 'content_kicker': None,
 'print_headline': None,
 'name': None,
 'seo': None,
 'sub': None}

In [8]:
def extract_headline_info(headline_dict):
    result = {}
    result['headline_main'] = headline_dict['main']
    result['headline_kicker'] = headline_dict['kicker']
    return result

extract_headline_info(docs[2]['headline'])

{'headline_main': 'Stock Drop Is Biggest in 2 Months--Margin Rise Held Factor in Lightest Trading of 1955',
 'headline_kicker': '1'}

Now let's write another function that calls that function, then adds the pub_date and word_count keys and values:

In [9]:
def extract_doc_info(doc):
    info = extract_headline_info(doc['headline'])
    info['pub_date'] = doc['pub_date']
    info['word_count'] = doc['word_count']
    return info

extract_doc_info(docs[2])

{'headline_main': 'Stock Drop Is Biggest in 2 Months--Margin Rise Held Factor in Lightest Trading of 1955',
 'headline_kicker': '1',
 'pub_date': '1955-05-15T00:00:00Z',
 'word_count': 823}

Now we can loop over the full list and create doc_info_list:

In [10]:
doc_info_list = []
for doc in docs:
    doc_info_list.append(extract_doc_info(doc))
doc_info_list

[{'headline_main': "HIGGINS, SPENT $22,189.53.; Governor-Elect's Election Expenses -- Harrison $9,220.28.",
  'headline_kicker': None,
  'pub_date': '1904-11-17T00:00:00Z',
  'word_count': 213},
 {'headline_main': 'GARDEN BOUTS CANCELED; Mauriello Says He Could Not Be Ready on Nov. 3',
  'headline_kicker': '1',
  'pub_date': '1944-10-23T00:00:00Z',
  'word_count': 149},
 {'headline_main': 'Stock Drop Is Biggest in 2 Months--Margin Rise Held Factor in Lightest Trading of 1955',
  'headline_kicker': '1',
  'pub_date': '1955-05-15T00:00:00Z',
  'word_count': 823},
 {'headline_main': 'MUSIC OF THE WEEK',
  'headline_kicker': None,
  'pub_date': '1904-11-06T00:00:00Z',
  'word_count': 2609},
 {'headline_main': 'Anacomp Inc. reports earnings for Qtr to March 31',
  'headline_kicker': None,
  'pub_date': '1992-05-06T00:00:00Z',
  'word_count': 129},
 {'headline_main': 'Brooklyn Routs Yeshiva',
  'headline_kicker': '1',
  'pub_date': '1972-12-24T00:00:00Z',
  'word_count': 144},
 {'headline_ma

**Transforming JSON to other Formats**

In [11]:
# To a dataframe
import pandas as pd
df = pd.DataFrame(data['response']['docs'])
df.shape

(9, 17)

In [12]:
df1 = pd.DataFrame(doc_info_list)
df1

Unnamed: 0,headline_main,headline_kicker,pub_date,word_count
0,"HIGGINS, SPENT $22,189.53.; Governor-Elect's E...",,1904-11-17T00:00:00Z,213
1,GARDEN BOUTS CANCELED; Mauriello Says He Could...,1.0,1944-10-23T00:00:00Z,149
2,Stock Drop Is Biggest in 2 Months--Margin Rise...,1.0,1955-05-15T00:00:00Z,823
3,MUSIC OF THE WEEK,,1904-11-06T00:00:00Z,2609
4,Anacomp Inc. reports earnings for Qtr to March 31,,1992-05-06T00:00:00Z,129
5,Brooklyn Routs Yeshiva,1.0,1972-12-24T00:00:00Z,144
6,Albuquerque Program Gives Drinkers a Lift,1.0,1972-12-25T00:00:00Z,151
7,Front Page 7 -- No Title,1.0,1944-10-24T00:00:00Z,29
8,UNIONS AND BUILDERS READY FOR LONG FIGHT; None...,,1904-08-06T00:00:00Z,883


In [13]:
df = pd.DataFrame(data['response']['docs'])
# Making new columns 'headline_main' and 'headline_kicker'
df['headline_main'] = df['headline'].apply(lambda headline_dict: headline_dict['main'])
df['headline_kicker'] = df['headline'].apply(lambda headline_dict: headline_dict['kicker'])

#Subsetting to get only the relevant columns
df = df[['headline_main', 'headline_kicker', 'pub_date', 'word_count']]
df

Unnamed: 0,headline_main,headline_kicker,pub_date,word_count
0,"HIGGINS, SPENT $22,189.53.; Governor-Elect's E...",,1904-11-17T00:00:00Z,213
1,GARDEN BOUTS CANCELED; Mauriello Says He Could...,1.0,1944-10-23T00:00:00Z,149
2,Stock Drop Is Biggest in 2 Months--Margin Rise...,1.0,1955-05-15T00:00:00Z,823
3,MUSIC OF THE WEEK,,1904-11-06T00:00:00Z,2609
4,Anacomp Inc. reports earnings for Qtr to March 31,,1992-05-06T00:00:00Z,129
5,Brooklyn Routs Yeshiva,1.0,1972-12-24T00:00:00Z,144
6,Albuquerque Program Gives Drinkers a Lift,1.0,1972-12-25T00:00:00Z,151
7,Front Page 7 -- No Title,1.0,1944-10-24T00:00:00Z,29
8,UNIONS AND BUILDERS READY FOR LONG FIGHT; None...,,1904-08-06T00:00:00Z,883


# Outputting to JSON

If we want to write the data back to JSON, we open the file in write mode.

We also use `json.dump` instead of json.load to write the python object into the JSON file

In [14]:
with open(r"C:\Users\nrmmw\Documents\Flatiron\dsc-working-with-known-json-schemas\doc_info_list.json", mode = 'w') as f:
    json.dump(doc_info_list, f)

# JSON Schemas Lab

Open the JSON file located at ny_times_movies.json, and use the json module to load the data into a variable called data.

In [16]:
with open(r"C:\Users\nrmmw\Documents\Flatiron\dsc-working-with-known-json-schemas-lab\ny_times_movies.json", "r") as f:
    data = json.load(f)

In [18]:
# Investigation
# Run this cell without changes
print("`data` has type", type(data))
print("The keys are", list(data.keys()))

`data` has type <class 'dict'>
The keys are ['status', 'copyright', 'has_more', 'num_results', 'results']


Create a variable results that contains the value associated with the 'results' key.

In [23]:
#Loading Results
results = data["results"]
results

[{'display_title': 'Can You Ever Forgive Me',
  'mpaa_rating': 'R',
  'critics_pick': 1,
  'byline': 'A.O. SCOTT',
  'headline': 'Review: Melissa McCarthy Is Criminally Good in â€˜Can You Ever Forgive Me?â€™',
  'summary_short': 'Marielle Heller directs a true story of literary fraud, set amid the bookstores and gay bars of early â€™90s Manhattan.',
  'publication_date': '2018-10-16',
  'opening_date': '2018-10-19',
  'date_updated': '2018-10-17 02:44:23',
  'link': {'type': 'article',
   'url': 'http://www.nytimes.com/2018/10/16/movies/can-you-ever-forgive-me-review-melissa-mccarthy.html',
   'suggested_link_text': 'Read the New York Times Review of Can You Ever Forgive Me'},
  'multimedia': {'type': 'mediumThreeByTwo210',
   'src': 'https://static01.nyt.com/images/2018/10/19/arts/19CANYOUEVER-1/19CANYOUEVER-1-mediumThreeByTwo210.jpg',
   'width': 210,
   'height': 140}},
 {'display_title': 'Charm City',
  'mpaa_rating': '',
  'critics_pick': 1,
  'byline': 'BEN KENIGSBERG',
  'headli

In [26]:
# Run this cell without changes
import pandas as pd
df = pd.DataFrame(results)
df

Unnamed: 0,display_title,mpaa_rating,critics_pick,byline,headline,summary_short,publication_date,opening_date,date_updated,link,multimedia
0,Can You Ever Forgive Me,R,1,A.O. SCOTT,Review: Melissa McCarthy Is Criminally Good in...,Marielle Heller directs a true story of litera...,2018-10-16,2018-10-19,2018-10-17 02:44:23,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."
1,Charm City,,1,BEN KENIGSBERG,Review: â€˜Charm Cityâ€™ Vividly Captures the ...,Marilyn Nessâ€™s documentary is dedicated to t...,2018-10-16,2018-04-22,2018-10-16 11:04:03,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."
2,Horn from the Heart: The Paul Butterfield Story,,1,GLENN KENNY,Review: Paul Butterfieldâ€™s Story Is Told in ...,A documentary explores the life of the blues m...,2018-10-16,2018-10-19,2018-10-16 11:04:04,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."
3,The Price of Everything,,0,A.O. SCOTT,Review: â€˜The Price of Everythingâ€™ Asks $56...,This documentary examines the global art marke...,2018-10-16,2018-10-19,2018-10-16 16:08:03,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."
4,Impulso,,0,BEN KENIGSBERG,Review: â€˜Impulsoâ€™ Goes Backstage With a Fl...,"This documentary follows RocÃ­o Molina, a cutt...",2018-10-16,,2018-10-16 11:04:03,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."
5,Watergate,,1,A.O. SCOTT,Review: â€˜Watergateâ€™ Shocks Anew With Its T...,Charles Ferguson delivers a comprehensive docu...,2018-10-11,2018-10-12,2018-10-17 02:44:21,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."
6,Barbara,,1,GLENN KENNY,"Review: In â€˜Barbara,â€™ a Fictional Biopic o...",Itâ€™s a film of scenes rather than of one uni...,2018-10-11,,2018-10-17 02:44:21,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."
7,Over the Limit,,1,JEANNETTE CATSOULIS,Review: A Russian Gymnast Goes â€˜Over the Lim...,Margarita Mamun endures injury and abuse in Ma...,2018-10-11,2018-10-05,2018-10-17 02:44:20,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."
8,The Kindergarten Teacher,R,1,JEANNETTE CATSOULIS,Review: The Disturbing Obsession of â€˜The Kin...,Maggie Gyllenhaal is riveting as a dissatisfie...,2018-10-11,2018-10-12,2018-10-17 02:44:19,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."
9,Classical Period,,1,BEN KENIGSBERG,"Review: In â€˜Classical Period,â€™ a Deep Dive...",This highly original feature is technically in...,2018-10-11,,2018-10-17 02:44:18,"{'type': 'article', 'url': 'http://www.nytimes...","{'type': 'mediumThreeByTwo210', 'src': 'https:..."


**Data Analysis**

In [28]:
# Run this cell without changes
data['num_results']

20

In [30]:
# How many results are in the file?
print(f"There are {df.shape[0]} rows")

There are 20 rows


In [39]:
# How many unique critics are there?
unique_critics = df["byline"].unique()
print(f"There are {len(unique_critics)} unique critics in the NY Times")

There are 7 unique critics in the NY Times


In [38]:
# Run this cell without changes
assert len(unique_critics) == 7

**Flattening Data**

Create a list review_urls that contains the URL for each review. This can be found using the 'url' key nested under 'link'.

In [88]:
# df['link'] = df['link']['url']
df1 = df[["display_title"]].copy()
df1

Unnamed: 0,display_title
0,Can You Ever Forgive Me
1,Charm City
2,Horn from the Heart: The Paul Butterfield Story
3,The Price of Everything
4,Impulso
5,Watergate
6,Barbara
7,Over the Limit
8,The Kindergarten Teacher
9,Classical Period


In [90]:
df1['url'] = df['link'].apply(lambda x:x.get('url'))
df1

Unnamed: 0,display_title,url
0,Can You Ever Forgive Me,http://www.nytimes.com/2018/10/16/movies/can-y...
1,Charm City,http://www.nytimes.com/2018/10/16/movies/charm...
2,Horn from the Heart: The Paul Butterfield Story,http://www.nytimes.com/2018/10/16/movies/horn-...
3,The Price of Everything,http://www.nytimes.com/2018/10/16/movies/the-p...
4,Impulso,http://www.nytimes.com/2018/10/16/movies/impul...
5,Watergate,http://www.nytimes.com/2018/10/11/movies/water...
6,Barbara,http://www.nytimes.com/2018/10/11/movies/barba...
7,Over the Limit,http://www.nytimes.com/2018/10/11/movies/over-...
8,The Kindergarten Teacher,http://www.nytimes.com/2018/10/11/movies/the-k...
9,Classical Period,http://www.nytimes.com/2018/10/11/movies/class...


In [92]:
review_urls = []
for url in df1['url']:
    review_urls.append(url)
review_urls

['http://www.nytimes.com/2018/10/16/movies/can-you-ever-forgive-me-review-melissa-mccarthy.html',
 'http://www.nytimes.com/2018/10/16/movies/charm-city-review-baltimore.html',
 'http://www.nytimes.com/2018/10/16/movies/horn-from-the-heart-review-paul-butterfield.html',
 'http://www.nytimes.com/2018/10/16/movies/the-price-of-everything-review-documentary.html',
 'http://www.nytimes.com/2018/10/16/movies/impulso-review-documentary.html',
 'http://www.nytimes.com/2018/10/11/movies/watergate-review-documentary.html',
 'http://www.nytimes.com/2018/10/11/movies/barbara-review.html',
 'http://www.nytimes.com/2018/10/11/movies/over-the-limit-review.html',
 'http://www.nytimes.com/2018/10/11/movies/the-kindergarten-teacher-review.html',
 'http://www.nytimes.com/2018/10/11/movies/classical-period-review.html',
 'http://www.nytimes.com/2018/10/11/movies/bad-times-at-the-el-royale-review.html',
 'http://www.nytimes.com/2018/10/11/movies/beautiful-boy-review-steve-carell.html',
 'http://www.nytimes

In [93]:
# Run this cell without changes

# review_urls should be a list
assert type(review_urls) == list

# The length should be 20, same as the length of reviews
assert len(review_urls) == 20

# The data type contained should be string
assert type(review_urls[0]) == str and type(review_urls[-1]) == str

# Spot checking a specific value
assert review_urls[6] == 'http://www.nytimes.com/2018/10/11/movies/barbara-review.html'

All good