In [35]:
import html
import json
import pprint
import re
from collections import Counter

import pandas as pd
from bs4 import BeautifulSoup

# Path to the Washington Post collection file; it is read below one line at a
# time, each line being parsed as a separate JSON document.
PATH_TO_DATA = "../../data/raw/WashingtonPost.v4/data/TREC_Washington_Post_collection.v4.jl"
# Shared pretty-printer used to inspect nested records in later cells.
pp = pprint.PrettyPrinter(indent=1, compact=False)

### Exploring the structure of the dataset

In [31]:
# Count how often each top-level key occurs across all records, and how often
# each content-piece "type" (or "type+subtype" when a subtype is present) occurs.
top_level_attribs = Counter()
content_types_subtypes = Counter()

with open(PATH_TO_DATA, 'r', encoding='utf8') as fp:
    # Iterate over the file handle directly so lines are streamed one at a time;
    # fp.readlines() would materialize every line of the collection in memory.
    for line in fp:
        decoded_json = json.loads(line)

        top_level_attribs.update(decoded_json.keys())

        for content_piece in decoded_json["contents"]:
            # Entries of "contents" may be None; those carry nothing to count.
            if content_piece is None:
                continue
            if "subtype" in content_piece:
                key = "{}+{}".format(content_piece["type"], content_piece["subtype"])
            else:
                key = content_piece["type"]
            content_types_subtypes[key] += 1

# NOTE: decoded_json stays bound to the last record read; later cells use it
# as a sample article.

In [30]:
top_level_attribs.most_common()

[('id', 728626),
 ('article_url', 728626),
 ('title', 728626),
 ('author', 728626),
 ('contents', 728626),
 ('content', 671947),
 ('published_date', 517530),
 ('type', 517530),
 ('source', 517530),
 ('publish_date', 56679),
 ('orig-id', 2579)]

In [17]:
content_types_subtypes.most_common()

[('sanitized_html+paragraph', 14704450),
 ('date', 728623),
 ('title', 718741),
 ('kicker', 718464),
 ('byline', 670249),
 ('image', 556142),
 ('author_info', 399237),
 ('sanitized_html+subhead', 177310),
 ('sanitized_html+blockquote', 175624),
 ('tweet', 127961),
 ('sanitized_html+sublabel', 116428),
 ('video', 100727),
 ('gallery', 34713),
 ('list', 23278),
 ('sanitized_html+p1', 18030),
 ('sanitized_html+trailleft', 17351),
 ('sanitized_html+h2', 9984),
 ('sanitized_html+h3', 9551),
 ('instagram', 8269),
 ('deck', 7092),
 ('sanitized_html+liveblog-cf liveblog-entry-meta', 6806),
 ('sanitized_html+liveblog-entry-content', 6648),
 ('sanitized_html+dateline', 5265),
 ('sanitized_html+strong', 4560),
 ('sanitized_html+10pt-top', 4302),
 ('sanitized_html+table', 3125),
 ('sanitized_html+caption padding-left border-left', 2842),
 ('sanitized_html+correction', 2768),
 ('sanitized_html+digital-headgroup', 2442),
 ('sanitized_html+attachment', 2173),
 ('sanitized_html+p2', 2136),
 ('sanitize

In [22]:
pp.pprint(decoded_json['contents'])

[{'content': 'Opinions',
  'mime': 'text/plain',
  'section': '/opinions',
  'type': 'kicker'},
 {'content': 'The 10 worst things Trump did in 2020',
  'mime': 'text/plain',
  'type': 'title'},
 {'content': 'By Marc Thiessen', 'mime': 'text/plain', 'type': 'byline'},
 {'content': '2020-12-31T18:39:46.637Z', 'mime': 'text/plain', 'type': 'date'},
 {'content': 'This week, I offer my <a '
             'href="https://www.washingtonpost.com/opinions/2019/12/30/worst-things-trump-did/" '
             'target=_blank>annual </a>lists of the <a '
             'href="https://www.washingtonpost.com/opinions/2020/12/31/best-things-trump-did-2020/" '
             'target=_blank>10 best</a> and 10 worst things President Trump '
             'did this year. Since 2020 was such a horrible year, we’ll start '
             'with the worst things first:',
  'mime': 'text/html',
  'subtype': 'paragraph',
  'type': 'sanitized_html'},
 {'content': '<b>10. He pardoned war criminals</b>. Trump showed a flagra

Notes:
* The only top-level attributes that appear in every article are `id`, `article_url`, `title`, `author` and `contents`. Therefore, the `contents` attribute should be used (instead of `content`) when performing text mining.
* There are two top-level date attributes (denoting the publishing date of the article): `publish_date` and `published_date`. Neither appears in every article (and, judging by the counts above, some articles have neither), so use whichever one is available when a date is needed.
* Only content pieces of type `sanitized_html` will be mined, specifically those of subtypes `paragraph`, `correction`, `blockquote` and `p1`, based on prevalence and inspection.

### Testing out text extraction

In [36]:
def is_rel_type_subtype(contents_obj):
    """Tell whether a content piece is one of those selected for text mining.

    A piece qualifies when its "type" is "sanitized_html" and its "subtype" is
    one of 'paragraph', 'correction', 'blockquote' or 'p1'.

    Args:
        contents_obj: one entry of an article's "contents" list; a dict, or
            None (the list can contain None entries).

    Returns:
        bool: True for a qualifying piece. False otherwise, including for
        None/empty entries and pieces lacking a "type" or "subtype" key.
    """
    relevant_subtypes = ('paragraph', 'correction', 'blockquote', 'p1')
    # None (or empty) entries have no type at all, so they can never qualify.
    if not contents_obj:
        return False
    # .get() instead of indexing: a missing "subtype" must mean "not relevant",
    # not a KeyError.
    return (contents_obj.get("type") == "sanitized_html" and
            contents_obj.get("subtype") in relevant_subtypes)

def extract_plaintext(contents):
    """Build the plain-text body of an article from its "contents" list.

    The "content" of every piece accepted by is_rel_type_subtype is joined
    with newlines, markup tags are removed, and HTML character references
    are decoded.

    Args:
        contents: iterable of content-piece dicts (an article's "contents").

    Returns:
        str: the extracted text, one selected piece per line; an empty
        string when no piece is selected.
    """
    raw_html_pieces = [obj["content"] for obj in contents if is_rel_type_subtype(obj)]
    document = '\n'.join(raw_html_pieces)
    without_tags = re.sub(r'<.*?>', '', document)
    # Decode entities (&amp;, &nbsp;, ...) only after the tags are gone, so
    # escaped markup such as "&lt;b&gt;" ends up as literal text rather than
    # being mistaken for a tag and stripped.
    raw_text = html.unescape(without_tags)
    return raw_text

In [37]:
# NOTE: decoded_json is not defined in this cell. It is the last record left
# bound by the dataset scan loop earlier in the notebook, so that cell must
# have been run before this one.
print(extract_plaintext(decoded_json['contents']))

This week, I offer my annual lists of the 10 best and 10 worst things President Trump did this year. Since 2020 was such a horrible year, we’ll start with the worst things first:
10. He pardoned war criminals. Trump showed a flagrant disregard of the rule of law by pardoning Blackwater contractors who massacred unarmed Iraqi civilians, including innocent women and children.
9. He vetoed the bipartisan National Defense Authorization Act. Trump vetoed $741 billion in military spending and a 3 percent pay raise for our troops over an unrelated issue, and put Republicans who voted for it in the difficult position of having to choose whether to flip-flop or override his veto.
8. He ordered the drawdown of nearly all U.S. forces in Afghanistan and Iraq. Trump was apparently talked out of a complete withdrawal, but reducing to 2,500 troops in each country makes no strategic sense. Despite an ongoing terrorist threat, we will have fewer troops in Afghanistan or Iraq than we do in Spain.
7. He 