In [79]:
import json
import html
from copy import deepcopy
from bs4 import BeautifulSoup, Comment

In [80]:
# Path of the JSON export that the notebook opens and parses as its input
FILENAME = 'keyword-database-export.json'
# Parser name handed to every BeautifulSoup(...) call in this notebook
PARSER = 'html.parser'

In [81]:
# Reading json
# The encoding is stated explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII text
with open(FILENAME, encoding='utf-8') as file:
    content = file.read()

# whole_json is the parsed export; the following cells iterate over its .values()
whole_json = json.loads(content)

In [82]:
# Sorting definitions for tests
# Entries without a 'definition' key are skipped. Each remaining definition goes
# into `definitions` and into exactly one of the two groups, depending on
# whether it contains the 'OfficeDocumentSettings' marker.
definitions = []
word_definitions = []
standard_definitions = []

for entry in whole_json.values():
    if 'definition' not in entry:
        continue
    definition = entry['definition']
    definitions.append(definition)
    if 'OfficeDocumentSettings' in definition:
        word_definitions.append(definition)
    else:
        standard_definitions.append(definition)

print('all:', len(definitions))
print('word:', len(word_definitions))
print('standard:', len(standard_definitions))

all: 1505
word: 89
standard: 1416


In [83]:
# Extracting all possible tags
all_tags = set()

for definition in definitions:
    soup = BeautifulSoup(definition, PARSER)
    # find_all() without arguments yields every tag in the parsed definition
    all_tags.update(tag.name for tag in soup.find_all())

# sorted(): a set of strings can iterate in a different order after every
# kernel restart (string hashing is randomised per process), so sorting keeps
# the printed list reproducible and easy to compare between runs
print(' '.join(sorted(all_tags)))

em tbody strong u h3 h4 img tr div sub sup ul header p span li thead ol th td table h2 br a


In [84]:
# Extracting all possible tags attributes
all_attrs = set()

for definition in definitions:
    soup = BeautifulSoup(definition, PARSER)
    for tag in soup.find_all():
        # tag.attrs is a dict, so updating from it adds the attribute names
        all_attrs.update(tag.attrs)

# sorted(): a set of strings can iterate in a different order after every
# kernel restart (string hashing is randomised per process), so sorting keeps
# the printed list reproducible and easy to compare between runs
print(' '.join(sorted(all_attrs)))

id target src cellspacing height href class alt valign rel width type border title colspan scope style cellpadding start align lang


In [85]:
# Specify legal elements - everything else will be removed by the cleaning functions:
# tags missing from legal_tags are unwrapped, attributes missing from legal_attrs are deleted
legal_tags = all_tags - {'span'}    # spans don't make any difference without styles
# 'style' and 'class' are deliberately absent from the attributes that are kept
legal_attrs = {'rel', 'width', 'scope', 'height', 'src', 'href', 'alt', 'type', 'colspan'}

In [86]:
# Summary
# Every set is sorted before it is printed: the iteration order of a set of
# strings can change between kernel restarts, which would make this report
# differ from run to run even when the data is unchanged
print('all tags    :', ' '.join(sorted(all_tags)))
print('legal tags  :', ' '.join(sorted(legal_tags)))
print('illegal tags:', ' '.join(sorted(all_tags - legal_tags)))
print('-'*10)
print('all attrs    :', ' '.join(sorted(all_attrs)))
print('legal attrs  :', ' '.join(sorted(legal_attrs)))
print('illegal attrs:', ' '.join(sorted(all_attrs - legal_attrs)))

all tags    : em tbody strong u h3 h4 img tr div sub sup ul header p span li thead ol th td table h2 br a
legal tags  : em tbody strong p sup u li thead ol th h3 h4 td img table h2 tr div br sub ul header a
illegal tags: span
----------
all attrs    : id target src cellspacing height href class alt valign rel width type border title colspan scope style cellpadding start align lang
legal attrs  : type height rel href alt colspan src scope width
illegal attrs: border style id cellpadding lang start title class target align valign cellspacing


In [87]:
# cleaning code
def clean_xml(xml_text):
  """Strip a definition's markup down to the whitelisted tags and attributes.

  The text is parsed with BeautifulSoup; then comments are dropped, tags that
  are not in legal_tags are unwrapped, attributes that are not in legal_attrs
  are deleted and links whose href starts with '#' are unwrapped.

  Args:
    xml_text: markup of a single definition, as a string.

  Returns:
    The cleaned markup, serialised with the 'html' formatter, as a single
    line of text (no '\\n' characters).
  """
  xml = BeautifulSoup(xml_text, PARSER)
  remove_comments(xml)
  remove_illegal_tags(xml)
  remove_illegal_attrs(xml)
  remove_links_to_ids(xml)
  markup = xml.encode(formatter='html').decode('utf-8')
  # A line break counts as whitespace in HTML, so it is turned into a space
  # instead of being deleted: deleting it fuses the words on both sides of a
  # break that falls inside running text
  return markup.replace('\n', ' ')

def remove_comments(xml):
  """Detach every comment node from the parsed tree, in place.

  Args:
    xml: parsed document; it is searched with find_all(string=...).
  """
  def is_comment(text):
    return isinstance(text, Comment)

  for comment in xml.find_all(string=is_comment):
    comment.extract()

def remove_illegal_tags(xml):
  """Unwrap, in place, every tag whose name is missing from legal_tags.

  Args:
    xml: parsed document; all of its tags are inspected.
  """
  illegal_nodes = [node for node in xml.find_all() if node.name not in legal_tags]
  for node in illegal_nodes:
    node.unwrap()

def remove_illegal_attrs(xml):
  """Delete, in place, every attribute whose name is missing from legal_attrs.

  Args:
    xml: parsed document; the attributes of all of its tags are inspected.
  """
  for node in xml.find_all():
    # The names are collected first because node.attrs must not change size
    # while it is being iterated
    unwanted = [name for name in node.attrs if name not in legal_attrs]
    for name in unwanted:
      del node[name]

def remove_links_to_ids(xml):
  """Unwrap, in place, every <a> whose href starts with '#'.

  Links with any other href are left untouched. An <a> that has no href
  attribute is skipped as well: indexing attrs['href'] unconditionally would
  raise KeyError on such a tag and abort the whole cleaning run.

  Args:
    xml: parsed document; its <a> tags are inspected.
  """
  for link in xml.find_all('a'):
    # a missing (or empty) href is treated as "not an in-page link"
    href = link.attrs.get('href') or ''
    if href.startswith('#'):
      link.unwrap()

In [None]:
# test: clean the first definition without the 'OfficeDocumentSettings' marker and show the result
sample_cleaned = clean_xml(standard_definitions[0])
print(sample_cleaned)

In [None]:
# test: clean the first definition containing the 'OfficeDocumentSettings' marker and show the result
sample_cleaned = clean_xml(word_definitions[0])
print(sample_cleaned)

In [90]:
# Saving cleaned json
OUTPUT_FILENAME = 'keyword-cleaned.json'

# process: the deep copy leaves whole_json untouched while the definitions
# inside the copy are replaced by their cleaned versions
cleaned_json = deepcopy(whole_json)
for entry in (item for item in cleaned_json.values() if 'definition' in item):
    entry['definition'] = clean_xml(entry['definition'])

# save: serialise first, so the output file is only opened once the text is complete
json_text = json.dumps(cleaned_json, indent=2)
with open(OUTPUT_FILENAME, 'w') as file:
    file.write(json_text)

In [91]:
# Saving list of definitions in txt file - easy to search for errors
OUTPUT_FILENAME = 'cleaned-definitions-list.txt'

# process: clean every definition of the original (uncleaned) export
cleaned_definitions = [
    clean_xml(entry['definition'])
    for entry in whole_json.values()
    if 'definition' in entry
]

# save: definitions are separated by a blank line; empty results are left out.
# utf-8 is requested explicitly because the cleaned markup may still hold
# non-ASCII characters, which the platform's default encoding can fail to write
with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as file:
    for definition in cleaned_definitions:
        if definition:
            file.write(definition + '\n\n')