In [17]:
# Answer span extraction from IDI txt files

# Let's test some answer span extraction approaches.
# First, using SpaCy.

# Load SpaCy


import spacy
nlp = spacy.load("en_core_web_lg")

import pandas as pd


In [None]:
import json, re

In [19]:
ls

AnswerSpanExtractor.ipynb       InsitutionalBooksSampler.ipynb
IDI_books_sample.csv            balanced_400_sample.csv
IDI_books_sample.tsv            encyclopedia.jsonl
IDI_extractor.ipynb             filtered_books_400_sample.csv
IDI_sample_1875-1924.csv        filtered_books_400_sample.tsv
[34mIDI_sample_1875-25[m[m/


In [20]:
# Load the sampled encyclopedia volume as a list of raw lines; each element
# keeps its trailing newline.
cyclopedia_txt_path = 'IDI_sample_1875-25/HN5BS8.txt'
with open(cyclopedia_txt_path, mode='r', encoding='utf-8') as f:
    lines = list(f)

Processed lines 1500 to 1551, 1411 tokens.
PEDALIA'CEE: see BIGNONIACEÆ. PEDALIER, n. pěďa-lēr: same as Pedalion, in music. PEDALION, n. pē-dā'li-on [Gr. pēdalion, a rudder]: in zool., a genus of Rotifera, family Floscularido. PEDALION, n. pē-dā'li-on [L. pedalis, pertaining to the foot]: in mus., a set of pedals acting upon strings, producing notes of a deep pitch, so constructed as to be capable of being used with a pianoforte. PEDANT, n. pěďănt [F. pédant-from It. and Sp. pedante, a pedant]: one who makes a vain and ostentatious display of his learning. PEDANTIC, a. pě-dăn'tik, or PEDAN TICAL, a. -ti-kal, vainly displaying or making a show of knowledge. PEDAN'TICALLY, ad. -. PEDANTRY, n. pěďăn-trì, a vain and offensive display of knowledge. Note. Said to be connected with L. pedărě, to foot it, to tramp about, as if a pedant was 'one who tramped about with children at his heels.' The word is most probably derived from Gr. paideuò, I instruct. PEDATE, a. pēď'āt [L. pedātus, footed-fr

In [40]:
def clean_encyclopedia_entry(text):
    """
    Clean phonetic transcriptions from encyclopedia entries.

    Strategy: find all-caps headword, find delimiter, remove junk in between.

    Each line of ``text`` is processed independently:

    * Lines that are empty or do not start with an uppercase letter are
      passed through unchanged.
    * Apostrophes that OCR inserted inside an all-caps word are removed
      (PHIL'LIPS -> PHILLIPS).
    * If an all-caps headword followed by lowercase text, "(" or "[" is
      found, the first delimiter after it is located. For a colon delimiter
      the text between headword and colon is dropped ("HEADWORD: ...").
      For a part-of-speech delimiter such as "n." the line is rebuilt as
      "HEADWORD, n. ..." and everything after the marker is kept.
    * Lines with no recognisable headword or delimiter keep their text
      (apart from the apostrophe repair).

    Args:
        text: one or more newline-separated entry lines.

    Returns:
        The cleaned text, with the same number of lines as the input.
    """

    # \b anchors the marker at the start of a word, so the tail of an ordinary
    # word that happens to end in one of these letters (e.g. "tun.") is not
    # mistaken for a part-of-speech abbreviation.
    pos_pattern = r'\b(?:n|v|a|adj|adv|prep|conj|interj|pron|pp|imp)\.'

    # Compiled once per call instead of once per line.
    ocr_break_re = re.compile(r"([A-Z]+)'+\s*([A-Z]+)")
    # Headword: uppercase letters, periods, spaces, commas, hyphens.
    # Stops (lazily) at the first lowercase letter, "(" or "[", or at a colon
    # that is followed by lowercase text.
    headword_re = re.compile(r'^([A-Z][A-Z.\s,\-]*?)(?=\s*[a-z\(\[]|:\s*[a-z])')
    # Group 1 is the whole delimiter; group 2 is set only for a POS marker.
    delimiter_re = re.compile(rf'(:\s*|({pos_pattern})\s*\[?)')

    cleaned = []

    for line in text.split('\n'):
        if not line or not line[0].isupper():
            cleaned.append(line)
            continue

        # Fix OCR breaks in headwords: PHIL' LIPS → PHILLIPS
        line = ocr_break_re.sub(r"\1\2", line)

        headword_match = headword_re.match(line)
        if headword_match:
            headword = headword_match.group(1).rstrip(', ')
            rest = line[headword_match.end():]

            # Find where definition starts: colon or POS marker
            delim_match = delimiter_re.search(rest)
            if delim_match:
                # Keep headword + delimiter + everything after
                delim = delim_match.group(1)
                after_delim = rest[delim_match.end():]
                if delim_match.group(2):
                    # POS marker: re-insert the comma separating it from the headword
                    line = headword + ', ' + delim + after_delim
                else:
                    line = headword + delim + after_delim

        cleaned.append(line)

    return '\n'.join(cleaned)


# Test: run the cleaner on a few sample entries and print the result for manual inspection
test_text = """PHILIPPSBURG, fe'lips-bûrch: town of Baden, on the right bank of the Rhine; anciently one of the most important fortresses on the Rhine, taken and retaken frequently by French, Germans, or Swedes. The fortifications were destroyed 1800. Pop. 2,500.

PHILIPS, filips, AMBROSE: 1675-1749; b. Shropshire, England.

PHIL'LIPS, JOHN, LL.D.: 1719, Dec. 6-1795, Apr. 21; b. Andover, Mass.: philanthropist. He graduated at Harvard College 1735

PENCIL, n. pěn'sil [OF. pincel, F. pinceau, a pencil from L. penicillus, a small tail, a painter's brush or penci -from peniculus, a little tail; penis, a tail: Sp. pincel]: thin strip or thread of plumbago or black lead, or other substance, generally inclosed in a cover of soft wood

PHILOMEL, n. fil'ō-měl, or PHILOME'LA, n. -mē´lă [Gr. Philomela, mythical daughter of King Pandion of Attica, who was changed into a nightingale, or, as some say, a swallow]: the nightingale. """

print(clean_encyclopedia_entry(test_text))

PHILIPPSBURG: town of Baden, on the right bank of the Rhine; anciently one of the most important fortresses on the Rhine, taken and retaken frequently by French, Germans, or Swedes. The fortifications were destroyed 1800. Pop. 2,500.

PHILIPS: 1675-1749; b. Shropshire, England.

PHILLIPS, JOHN, LL.D.: 1719, Dec. 6-1795, Apr. 21; b. Andover, Mass.: philanthropist. He graduated at Harvard College 1735

PENCIL, n. pěn'sil [OF. pincel, F. pinceau, a pencil from L. penicillus, a small tail, a painter's brush or penci -from peniculus, a little tail; penis, a tail: Sp. pincel]: thin strip or thread of plumbago or black lead, or other substance, generally inclosed in a cover of soft wood

PHILOMEL, n. fil'ō-měl, or PHILOMELA, n. -mē´lă [Gr. Philomela, mythical daughter of King Pandion of Attica, who was changed into a nightingale, or, as some say, a swallow]: the nightingale. 


In [38]:
def extract_headwords(text):
    """
    Extract headwords from encyclopedia text.

    A line is considered only if its first whitespace-separated word (after
    stripping trailing ``.,;:``) is all-caps. The headword is everything from
    the start of the line up to the first lowercase letter or terminating
    punctuation (``:``, ``[`` or ``(``), with surrounding whitespace and
    trailing commas/periods removed.

    The headword is then split on commas and each part is title-cased, except
    parts that contain a period and are at most 6 characters long, which are
    kept verbatim as abbreviations (e.g. "LL.D").

    Args:
        text: newline-separated encyclopedia text.

    Returns:
        A list of headword strings, one per qualifying line, in input order.
    """
    # Characters that end a headword in addition to any lowercase letter.
    terminators = ':[('
    headwords = []

    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue

        # Check if first word is all-caps
        first_word = line.split()[0].rstrip('.,;:')
        if not first_word.isupper():
            continue

        # Locate the end of the headword: first lowercase letter or terminator
        end = len(line)
        for idx, char in enumerate(line):
            if char.islower() or char in terminators:
                end = idx
                break

        headword = line[:end].strip().rstrip(',.')
        if not headword:
            continue

        # Title case, but preserve abbreviations
        titled = []
        for part in (p.strip() for p in headword.split(',')):
            is_abbreviation = '.' in part and len(part) <= 6
            titled.append(part if is_abbreviation else part.title())
        headwords.append(', '.join(titled))

    return headwords


# Test: print the headwords extracted from a few sample lines
test = """PEKING. PEKING, pē-king', or PEKIN, pē-kin' (i.e., Northern Capital): city, cap. of Chinese empire
PHILIPPSBURG: town of Baden, on the right bank of the Rhine
PHILIPS, AMBROSE: 1675-1749; b. Shropshire, England.
PHILLIPS, JOHN, LL.D.: 1719, Dec. 6-1795, Apr. 21
PENCIL, n. [OF. pincel]: thin strip of plumbago
This line doesn't start with caps so it's ignored."""

print(extract_headwords(test))

['Peking. Peking', 'Philippsburg', 'Philips, Ambrose', 'Phillips, John, LL.D', 'Pencil']


In [None]:
# Just a test.

In [51]:
len(lines)

17441

In [52]:
# Read chunks of lines, starting from line 2500.
# chunk_lines always begins with a '\n' placeholder, so a chunk is closed when
# it holds 9 text lines, or when it holds at least 3 text lines and the next
# line starts with an all-caps word (indicating a new section).
#
# Each chunk is cleaned, run through spaCy, fused with its headwords and stored
# as a dict; a random sample of the chunks is then appended to a JSONL file.
# NOTE: relies on `lines`, `nlp`, `json`, `clean_encyclopedia_entry`,
# `extract_headwords` and `condense_entities` already being defined in the
# kernel (`condense_entities` lives in a cell further down the notebook).
import random

SAMPLE_SIZE = 800  # maximum number of chunks kept for later analysis
SAMPLE_SEED = 42   # fixed seed so the same chunks are selected on every run

start_line = 2500
ctr = 0
json_objects = []
line_idx = start_line
while line_idx < len(lines):
    chunk_lines = ['\n']
    while line_idx < len(lines):
        line = lines[line_idx].strip()
        if not line:
            line_idx += 1
            continue
        # if the line is all caps, contains a hyphen, and ends with a period,
        #  it's likely a page header
        # like PEG-PEGMATITE. Ignore it
        if line.isupper() and '-' in line and line.endswith('.'):
            line_idx += 1
            continue
        chunk_lines.append(line)
        line_idx += 1
        if len(chunk_lines) >= 10:
            break
        if len(chunk_lines) > 3:
            next_line = lines[line_idx].strip() if line_idx < len(lines) else ''
            if next_line and next_line.split()[0].isupper():
                break

    # Only the placeholder is present: the rest of the input was blank lines
    # or page headers, so there is nothing to record and the input is exhausted.
    if len(chunk_lines) == 1:
        break

    chunk_text = '\n'.join(chunk_lines)

    # Clean the chunk text to remove phonetic transcriptions
    chunk_text = clean_encyclopedia_entry(chunk_text)

    # get a list of headwords in this chunk
    headwords = extract_headwords(chunk_text)

    doc = nlp(chunk_text)
    print(f"Processed lines {start_line} to {line_idx}, {len(doc)} tokens.")
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(chunk_text)
    print("Named Entities:", entities)
    print("Headwords:", headwords)
    fused = condense_entities(headwords, entities)
    print("Fused Entities:", fused)
    # json object to save for later comparison of other methods on this chunk
    chunk_data = {
        'start_line': start_line,
        'end_line': line_idx,
        'text': chunk_text.replace('\n', ' '),
        'entities': fused
    }

    start_line = line_idx
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)
    json_objects.append(chunk_data)

print('Done with selection.')
# Randomly select up to SAMPLE_SIZE chunks for later analysis.
# random.sample raises ValueError when asked for more items than exist, so the
# request is capped at the number of chunks actually produced.
sample_size = min(SAMPLE_SIZE, len(json_objects))
json_objects = random.Random(SAMPLE_SEED).sample(json_objects, sample_size)
# NOTE(review): the file is opened in append mode, so re-running this cell adds
# another batch of records to encyclopedia2.jsonl — confirm that is intended.
with open('encyclopedia2.jsonl', 'a', encoding='utf-8') as out_f:
    for chunk_data in json_objects:
        out_f.write(json.dumps(chunk_data) + '\n')

print(f'Wrote {sample_size} sampled chunks to encyclopedia2.jsonl')

Processed lines 2500 to 2505, 199 tokens.


13, 14, the greater and lesser sacro-ischiatic notches; 15, the tuberosity of the ischium; 16, its ramus; 17, the body of the os pubis;
18, its ramus; 19, the acetabulum; 20, the thyroid or obturator foramen. (From Wilson.) fourth or fifth month. At birth, the crest of the ilium, the bottom of the acetabulum, and the rami of the ischium and pubes, are still cartilaginous. At about the sixth or seventh year, these rami become completely ossified; next, the ilium is united to the ischium; and lastly, the pubes is joined to the other two in the acetabulum. The complete ossification of the bone, from the secondary centres in the crest of the ilium, the tuberosity of the ischium, etc., is not completed till about the 25th year.
Each os innominatum articulates with its fellow of the opposite side (through the intervention of the interosseous fibro-cartilage, which unites the two surfaces of the pubic
Named Entities: [('13', 'CARDINAL'), ('14', 'DAT

In [29]:
# pretty-print function that breaks long lines
def pretty_print(text, width=80):
    """Print ``text`` re-wrapped with ``textwrap.fill`` to lines of at most ``width`` columns."""
    from textwrap import fill

    print(fill(text, width=width))

pretty_print(chunk_text, width=80)

PEKING. PEKING, pē-king', or PEKIN, pē-kin' (i.e., Northern Cap ital): city,
cap. of Chinese empire since 1421; on a sandy plain, about 13 m. n.w. of the
Pei-ho; lat. 39° 54' 13' n., long. 116° 28′ 54'' e.; in the n. province of Chih-
le, nearly 100 m. from the sea, about 60 m. from the great Chinese wall, Pop.
estimated about 1,000,000; circuit of the walls, according to the latest
measurements, 20 m. These walls are of earth, with outer casing of brick, having
embrasures for musketry or ordnance every 50 ft. Those of the Tartar City have
an average height of 50, but in some places 61, ft. In thickness they vary from
57 to 22 ft. The walls of the Chinese City are only 30 ft. high, and 15 to 25
wide. The top, to which horsemen can ascend by a ramp or sloping way, is paved
with stone. At intervals of 60 yards are square towers or buttresses, projecting
outward from the walls 50 or 60 ft. Of the 16 gates which give access to the
city from the surrounding country, 9 belong to the Northern

In [47]:
def condense_entities(headwords, entities):
    """ Accepts a list of headwords (just strings),
    and a list of entities (tuples of (text, label)).

    It discards all headwords and entities that have less than four characters,
    discards entities that start with "a." or "n." and all-lowercase entities
    containing phonetic/diacritic characters, discards duplicates, and in cases
    where one entity/headword is a proper substring of another, keeps only the
    longest one.

    Entity texts are title-cased before comparison. When the same text occurs
    more than once (as several entities, or as an entity and a headword), it is
    kept once with the label of its first occurrence among the entities.

    It returns a list of (text, label) tuples in first-seen order (entities
    first, then headwords); headwords not matched to any entity are returned
    with label None.
    """

    diacritics_pattern = r'[ˈˌɑɔɛɪʊθðŋæʃʒɔ̃āēīōūáéíóúàèìòùäëïöüâêîôûç]'

    # text -> label. A dict keeps insertion order and guarantees one entry per
    # text, so identical strings can never eliminate each other below.
    labelled = {}

    for ent_text, ent_label in entities:
        if len(ent_text) < 4:
            continue
        # also discard entities that begin with 'a' or 'n' lowercase followed by period
        if re.match(r'^[an]\.', ent_text):
            continue
        # also discard entities that look like phonetic transcriptions with diacritics
        if re.search(diacritics_pattern, ent_text) and ent_text.islower():
            continue

        # transform ent_text to title case for comparison with headwords
        labelled.setdefault(ent_text.title(), ent_label)

    for hw in headwords:
        if len(hw) < 4:
            continue
        # Only added when no (title-cased) entity already carries this text
        labelled.setdefault(hw, None)

    # Remove proper substrings: drop a text only if a *different*, longer
    # text contains it.
    texts = list(labelled)
    condensed = [
        (candidate, labelled[candidate])
        for candidate in texts
        if not any(candidate != other and candidate in other for other in texts)
    ]

    return condensed