In [25]:
from bs4 import BeautifulSoup
import re

In [3]:
# Path to the saved HTML page that this notebook parses.
source_file = 'data/html/abduction.html'

In [12]:
# Parse the saved article into a BeautifulSoup tree; every extraction cell
# below queries `source_bs`.
with open(source_file, encoding='utf-8') as html_handle:
    source_bs = BeautifulSoup(html_handle, 'html.parser')

In [13]:
# Show the parsed document's top-level nodes to check the file loaded as expected.
source_bs.contents

[<div id="article">
 <div id="article-content">
 <!-- BEGIN ARTICLE HTML -->
 <div id="aueditable"><!--DO NOT MODIFY THIS LINE AND ABOVE-->
 <h1>Abduction</h1><div id="pubinfo"><em>First published Wed Mar 9, 2011; substantive revision Tue May 18, 2021</em></div>
 <div id="preamble">
 <p>
 In the philosophical literature, the term “abduction” is
 used in two related but different senses. In both senses, the term
 refers to some form of explanatory reasoning. However, in the
 historically first sense, it refers to the place of explanatory
 reasoning in <em>generating</em> hypotheses, while in the sense in
 which it is used most frequently in the modern literature it refers to
 the place of explanatory reasoning in <em>justifying</em> hypotheses.
 In the latter sense, abduction is also often called “Inference
 to the Best Explanation.”</p>
 <p>
 This entry is exclusively concerned with abduction in the modern
 sense, although there is a supplement on abduction in the historical
 sense, wh

In [19]:
# Collects every extracted field; the cells below fill it in key by key.
features = {}

In [23]:
# dates
# The pubinfo line reads e.g. "First published Wed Mar 9, 2011; substantive
# revision Tue May 18, 2021": the first date found is the publication date,
# the second (when present) the latest revision.
pattern = r'\w+ \w+ \d+, \d{4}'
pubinfo = source_bs.find('div', {'id': 'pubinfo'})
# A page without a pubinfo block yields no dates instead of raising
# AttributeError on None.
matches = re.findall(pattern, pubinfo.getText()) if pubinfo is not None else []
features['first_published'] = matches[0] if len(matches) > 0 else None
features['last_edit'] = matches[1] if len(matches) > 1 else None

In [88]:
# toc
# Each table-of-contents anchor becomes (target id, section number, title),
# e.g. ('DedIndAbd', '1.1', 'Deduction, induction, abduction'). Entries
# without a number get an empty string as section number.
toc = source_bs.find('div', {'id': 'toc'})

toc_links = []
if toc is not None:
    for item in toc.find_all('a'):
        href = item.get('href')
        # Only in-page links ("#SectionId") name a section of this article;
        # anchors without href or pointing elsewhere are skipped.
        if not href or not href.startswith('#'):
            continue
        # Collapse line breaks and indentation first: '.' does not match '\n',
        # so a title wrapped over two lines in the HTML would be cut short.
        label = ' '.join(item.text.split())
        # Leading run of digits and dots is the section number, the rest is
        # the title. Both groups may be empty, so the match never fails.
        match = re.match(r'([\d.]*)(.*)', label)
        toc_links.append((href[1:], match.group(1), match.group(2).strip()))

features['toc'] = toc_links

In [108]:
# articles
# For every TOC entry, locate the element carrying the anchor (matched on its
# `id` or `name` attribute) and store the text of its closest enclosing <div>.
# NOTE(review): when several anchors share one enclosing div, each of them
# stores that div's full text - confirm this is the intended granularity.
sections = []
for toc_link in features['toc']:
    link_id = toc_link[0]
    anchor = source_bs.find(lambda tag: tag.get('id') == link_id or tag.get('name') == link_id)
    content = anchor.find_parent('div') if anchor is not None else None
    # A TOC entry whose target (or enclosing div) is missing is skipped rather
    # than aborting the whole cell with AttributeError on None.
    if content is None:
        continue
    sections.append((link_id, content.text))

features['sections'] = sections


In [109]:
#bibliography
# Flatten each <li> of the bibliography block to a single line of text and
# number the entries from 0 in document order.
bibliography = source_bs.find('div', {'id': 'bibliography'})
# An article without a bibliography block produces an empty list, mirroring
# how a missing table of contents is handled.
bibliography_items = bibliography.find_all('li') if bibliography is not None else []
bibliography_enumerated = [(i, bibliography_item.text.replace('\n', ' ')) for i, bibliography_item in enumerate(bibliography_items)]
features['bibliography'] = bibliography_enumerated

In [110]:
# Show the collected features for a visual check.
display(features)

{'first_published': 'Wed Mar 9, 2011',
 'last_edit': 'Tue May 18, 2021',
 'toc': [('AbdGenIde', '1.', 'Abduction: The General Idea'),
  ('DedIndAbd', '1.1', 'Deduction, induction, abduction'),
  ('UbiAbd', '1.2', 'The ubiquity of abduction'),
  ('ExpAbd', '2.', 'Explicating Abduction'),
  ('StaAbd', '3.', 'The Status of Abduction'),
  ('Cri', '3.1', 'Criticisms'),
  ('Def', '3.2', 'Defenses'),
  ('AbdVerBayConThe', '4.', 'Abduction versus Bayesian Confirmation Theory'),
  ('Bib', '', 'Bibliography'),
  ('Aca', '', 'Academic Tools'),
  ('Oth', '', 'Other Internet Resources'),
  ('Rel', '', 'Related Entries')],
 'bibliography': [(0,
   'Achinstein, P., 2001. The Book of Evidence, Oxford: Oxford University Press.'),
  (1,
   'Adler, J., 1994. “Testimony, Trust, Knowing,” Journal of Philosophy, 91: 264–275.'),
  (2,
   'Bach, K. and Harnish, R., 1979. Linguistic Communication and Speech Acts, Cambridge MA: MIT Press.'),
  (3, 'Bird, A., 1998. Philosophy of Science, London: UCL Press.'),
  

In [111]:
import json
import os

# Write the features into a 'json' folder beside the source file,
# e.g. data/html/abduction.html -> data/html/json/abduction.json.
output_folder = os.path.join(os.path.dirname(source_file), 'json')
os.makedirs(output_folder, exist_ok=True)

# splitext swaps only the final extension; str.replace would rewrite every
# '.html' inside the name and leave any other extension untouched.
output_name = os.path.splitext(os.path.basename(source_file))[0] + '.json'
# Plain write mode suffices (nothing is read back); the encoding is pinned so
# the file does not depend on the platform default.
with open(os.path.join(output_folder, output_name), 'w', encoding='utf-8') as f:
    json.dump(features, f)