In [1]:
from bs4 import BeautifulSoup
import re
import os

from typing import Dict, Any

In [2]:
source_folder = os.path.join('data', 'html')
output_folder = os.path.join(source_folder, 'json')

In [3]:
features = {}

In [4]:
def populate_title(features: Dict[str, Any], bs: BeautifulSoup):
    features['title'] = bs.find('h1').text

def populate_authors(features: Dict[str, Any], bs: BeautifulSoup):
    features['authors'] = bs.find('a', {'target': 'other'}).text

In [5]:
from datefinder import parser

def populate_dates(features: Dict[str, Any], bs: BeautifulSoup):
    matches = re.findall(r'\w+ \w+ \d+, \d{4}', bs.find('div', {'id': 'pubinfo'}).getText())
    features['first_published'] = parser.parse(matches[0]) if len(matches) > 0 else None
    features['last_edit'] = parser.parse(matches[1]) if len(matches) > 1 else None

In [6]:
def populate_toc(features: Dict[str, Any], bs: BeautifulSoup):
    toc = bs.find('div', {'id': 'toc'})

    if toc:
        toc_items = toc.find_all('a')
        regex_matches = [re.match(r'([\d+|\.]*)(.*)', item.text) for item in toc_items]
        toc_links = [{'id': item.get('href')[1:], 'number': match.group(1), 'title': match.group(2).strip()} for item, match in zip(toc_items, regex_matches)]
    else:
        toc_links = []

    features['toc'] = toc_links

In [24]:
def populate_sections(features: Dict[str, Any], bs: BeautifulSoup):
    sections = []
    for toc_link in features['toc']:
        link_id = toc_link['id']
        title = toc_link['title']
        print(bs.select_one('#main-text'))
        content = bs.select_one('#main-text').select_one(lambda tag: tag.get_text() == title).find_parent('div')
        sections.append({'id': link_id, 'content': content.text[:200]})

    features['sections'] = sections

In [25]:
def populate_bibliography(features: Dict[str, Any], bs: BeautifulSoup):
    bibliography = bs.find('div', {'id': 'bibliography'})
    bibliography_items = bibliography.find_all('li')
    bibliography_enumerated = [{'index': i, 'content': bibliography_item.text.replace('\n', ' ')} for i, bibliography_item in enumerate(bibliography_items)]
    features['bibliography'] = bibliography_enumerated[:2]

In [26]:
def get_features(bs: BeautifulSoup) -> Dict[str, Any]:
    features = {}
    populate_title(features, bs)
    populate_authors(features, bs)
    populate_dates(features, bs)
    populate_toc(features, bs)
    populate_bibliography(features, bs)
    populate_sections(features, bs)

    return features

In [27]:

from tqdm.notebook import tqdm as log_progress
import json
from datetime import date, datetime
import logging


def json_serial(obj):
    """JSON serializer for objects not serializable by default json code"""

    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError ("Type %s not serializable" % type(obj))


os.makedirs(output_folder, exist_ok=True)

articles = []
for file in log_progress(os.listdir(source_folder)[:1]):
        if file.endswith('.html'):
            with open(os.path.join(source_folder, file), encoding='utf-8') as f:
                bs = BeautifulSoup(f, 'html.parser')
                try:
                    articles.append(get_features(bs))
                except Exception as e:
                     logging.error(f'{file}: {e}', exc_info=True)

with open(os.path.join(output_folder, 'sep_articles.json'), 'w+') as f:
    for i, item in enumerate(articles):
        f.write(json.dumps({"index":{"_index":"sep_articles","_id":i}}) + '\n')
        f.write(json.dumps(item, default=json_serial) + '\n')


  0%|          | 0/1 [00:00<?, ?it/s]

ERROR:root:reverse-mathematics.html: 'function' object has no attribute 'replace'
Traceback (most recent call last):
  File "/var/folders/2l/j18wprxj2196w3ysmqljcttc0000gn/T/ipykernel_8857/389258369.py", line 23, in <module>
    articles.append(get_features(bs))
                    ^^^^^^^^^^^^^^^^
  File "/var/folders/2l/j18wprxj2196w3ysmqljcttc0000gn/T/ipykernel_8857/4067765005.py", line 8, in get_features
    populate_sections(features, bs)
  File "/var/folders/2l/j18wprxj2196w3ysmqljcttc0000gn/T/ipykernel_8857/3679300399.py", line 7, in populate_sections
    content = bs.select_one('#main-text').select_one(lambda tag: tag.get('id') == link_id or tag.get('name') == link_id or tag.get_text() == title).find_parent('div')
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lukaszdygon/code/sep-tools/.venv/lib/python3.12/site-packages/bs4/element.py", line 2094, in select_one
 

<div id="main-text">
<h2 id="HistIntrReveMath">1. A Historical Introduction to Reverse Mathematics</h2>
<h3 id="MathNeceAxioMeth">1.1 Mathematical necessity and the axiomatic method</h3>
<p>
Reverse mathematics is a relatively young subfield of mathematical
logic, having made its start in the mid-1970s as an outgrowth of
 <a href="../recursive-functions/">computability theory</a>.
 In the field’s founding paper (Friedman 1975), Harvey Friedman
begins by asking</p>
<blockquote>
<p>
What are the proper axioms to use in carrying out proofs of particular
theorems, or bodies of theorems, in mathematics? What are those formal
systems which isolate the essential principles needed to prove them?
(1975: 235)</p>
</blockquote>
<p>
Friedman’s notion of the proper axioms to prove a given theorem
is one on which not only can the theorem can be proved from the
axioms, but the axioms can be proved from the theorem. Put another
way, the proper axioms are necessary in order to prove the theorem,
and no