In [1]:
import glob
import re

import bs4
import nltk
import numpy as np
import pandas as pd

In [2]:
# Directory containing the corpus XML files; the path is relative, so it
# resolves against the notebook's working directory.
xmldir = "../texts/corpus-eliot-middlemarch-tei"

In [3]:
# Collect every XML file in `xmldir`; sorting makes the book order (and so
# every paragraph index derived from it) reproducible across runs.
xmls = sorted(glob.glob(f"{xmldir}/*.xml"))

# Read each file inside a context manager so its handle is closed right away
# instead of being left for the garbage collector.
books = []
for xml_path in xmls:
    with open(xml_path) as fh:
        books.append(fh.read())

# NOTE(review): no parser is named here, so BeautifulSoup uses whichever one
# is installed and results may differ between environments. Confirm which
# parser produced the recorded outputs before pinning one explicitly.
soups = [bs4.BeautifulSoup(book) for book in books]

Get all paragraphs

In [4]:
# Flatten the <p> elements of every parsed book into one list, keeping
# book order first and document order within each book.
paras = [
    paragraph
    for book_soup in soups
    for paragraph in book_soup.find_all('p')
]

For each paragraph, check whether there are any `<FID>` tags

In [5]:
# One row per paragraph: its position in `paras`, and whether it contains
# at least one <fid> element (an empty result list is falsy).
dparas = pd.DataFrame(
    {
        'Index': np.arange(len(paras)),
        'hasFID': [bool(para.find_all('fid')) for para in paras],
    }
)

In [6]:
# Count paragraphs with and without at least one <fid> element.
dparas.groupby('hasFID').size()

hasFID
False    3783
True      805
dtype: int64

Convenience functions:

* `dewrap` converts hard-wrapped strings to a single line
* `parse_paragraph` returns two data frames: (i) only the FID sentences in a paragraph, each with a `complete` flag marking whether it ends in `.`, `!`, or `?`, and (ii) all sentences in the paragraph. An FID sentence that is not `complete` is actually a subsentence, so the `complete` column is mostly useful for finding weird exceptions to sentence tokenization.

In [7]:
def dewrap(s):
    """Convert a hard-wrapped string to a single line.

    The string is split at newlines, each piece is stripped of leading and
    trailing whitespace, pieces that are empty after stripping are dropped,
    and the rest are joined with single spaces. Whitespace inside a line is
    left untouched.

    Parameters
    ----------
    s : str
        Text that may contain newline characters.

    Returns
    -------
    str
        The text on one line, with no leading or trailing whitespace.
    """
    # Strip *before* filtering: a whitespace-only line is truthy as-is, so
    # filtering first would let it through as '' and yield a double space.
    stripped_lines = (line.strip() for line in s.split('\n'))
    return ' '.join(line for line in stripped_lines if line)

def parse_paragraph(para):
    """Tokenize a paragraph into sentences, overall and inside its <fid> spans.

    Parameters
    ----------
    para
        A parsed paragraph element providing ``get_text()`` and
        ``find_all()`` (the notebook passes BeautifulSoup <p> tags).

    Returns
    -------
    dfid : pandas.DataFrame
        One row per sentence found inside any <fid> element. ``sentence``
        holds the text; ``complete`` is True when the sentence's last
        character is ``.``, ``!`` or ``?``.
    draw : pandas.DataFrame
        One row per sentence of the whole paragraph, in a ``sentence`` column.
    """
    terminal_marks = '.!?'
    split_sentences = nltk.tokenize.sent_tokenize

    # Each <fid> span is de-wrapped and tokenized on its own, then the
    # resulting sentences are pooled in document order.
    fid_sentences = []
    for fid in para.find_all('fid'):
        fid_sentences.extend(split_sentences(dewrap(fid.get_text())))

    # A sentence counts as complete when it ends in terminal punctuation;
    # anything else is a fragment produced by the tokenizer or the tagging.
    dfid = pd.DataFrame({
        'sentence': fid_sentences,
        'complete': [sentence[-1] in terminal_marks for sentence in fid_sentences],
    })

    draw = pd.DataFrame({
        'sentence': split_sentences(dewrap(para.get_text())),
    })

    return dfid, draw

Loop over every paragraph and parse the sentences in each one. For each sentence, check whether any FID sentence is a substring of it—i.e., **tag whether the given sentence is FID or not.** The loop also annotates each sentence with its paragraph's index, and summarizes the number of FID sentences in each paragraph. The `nComplete` and `nIncomplete` fields of the data frame `dparas` give the number of FID sentences that do and do not end in `.` / `!` / `?`. This is primarily useful for debugging / finding weird results.

In [8]:
# Tag every sentence of every paragraph as FID / not FID and record
# per-paragraph FID counts.
#
# The counts are gathered in plain lists and written to `dparas` once at the
# end: assigning with `.loc` inside `iterrows()` mutates the frame while it is
# being iterated and costs one scalar write per row.
all_sentences = []
n_fid = []
n_complete = []
for i, ix in zip(dparas.index, dparas['Index']):
    dfid, draw = parse_paragraph(paras[ix])

    # Materialise the FID sentences once per paragraph rather than once per
    # raw sentence inside the lambda.
    fid_sentences = list(dfid.sentence)

    # A raw sentence is FID when any FID sentence (or fragment) occurs in it.
    draw['isFID'] = draw.sentence.apply(
        lambda s: any(x in s for x in fid_sentences)
    )
    draw['paraIndex'] = i
    all_sentences.append(draw)

    n_fid.append(len(dfid))
    n_complete.append(int(sum(dfid.complete)))

dparas['nFID'] = n_fid
dparas['nComplete'] = n_complete
# FID sentences that do not end in '.', '!' or '?'.
dparas['nIncomplete'] = dparas.nFID - dparas.nComplete

In [9]:
# Stack the per-paragraph sentence frames into one frame. Each piece keeps
# its own row labels, so index values repeat across paragraphs.
# NOTE(review): this rebinds `all_sentences` from a list to a DataFrame, so
# the cell cannot be re-run on its own without re-running the loop cell first.
all_sentences = pd.concat(all_sentences)

In [10]:
# Count sentences tagged as FID versus not FID across all paragraphs.
all_sentences.groupby('isFID').size()

isFID
False    12686
True      2058
dtype: int64

# A weird example

In [11]:
# Paragraphs with more than one FID sentence of which more than one lacks
# terminal punctuation -- candidates for tokenization oddities.
dparas[(dparas.nFID > 1) & (dparas.nIncomplete > 1)]

Unnamed: 0,Index,hasFID,nFID,nComplete,nIncomplete
267,267,True,3,1,2
3878,3878,True,4,2,2
4260,4260,True,2,0,2
4336,4336,True,2,0,2


In [12]:
# Re-parse paragraph 267, the first row of the table above, for inspection.
# This rebinds the names `para`, `dfid` and `draw` at notebook level.
para = paras[267]
dfid, draw = parse_paragraph(para)

In [13]:
# De-wrapped text of each <fid> element in this paragraph, before sentence
# tokenization (the same expression parse_paragraph uses internally).
fids =  [dewrap(fid.get_text()) for fid in para.findAll('fid')]

In [14]:
# Text of the first <fid> span in the paragraph.
x = fids[0]

In [15]:
# Tokenize that span on its own; in the output below it comes back as three
# pieces, with a break after each parenthetical "?)".
nltk.tokenize.sent_tokenize(x)

['Mr. Casaubon was touched with an unknown delight (what man would not have been?)',
 'at this childlike unrestrained ardor: he was not surprised (what lover would have been?)',
 'that he should be the object of it.']

In [16]:
# Show the paragraph's markup to see where the <fid> element sits.
print(para)

<p>
Nevertheless before the evening was at an end she was very happy.  In
an hour's <i>tete-a-tete</i> with Mr. Casaubon she talked to him with more
freedom than she had ever felt before, even pouring out her joy at the
thought of devoting herself to him, and of learning how she might best
share and further all his great ends. <fid who="#C"> Mr. Casaubon was touched with an
unknown delight (what man would not have been?) at this childlike
unrestrained ardor: he was not surprised (what lover would have been?)
that he should be the object of it. </fid>
</p>


In [17]:
# Sentences obtained from the <fid> spans of this paragraph.
dfid.sentence.values

array(['Mr. Casaubon was touched with an unknown delight (what man would not have been?)',
       'at this childlike unrestrained ardor: he was not surprised (what lover would have been?)',
       'that he should be the object of it.'], dtype=object)

In [18]:
# Sentences obtained from the whole paragraph, for comparison with the
# FID-only sentences.
draw.sentence.values

array(['Nevertheless before the evening was at an end she was very happy.',
       "In an hour's tete-a-tete with Mr. Casaubon she talked to him with more freedom than she had ever felt before, even pouring out her joy at the thought of devoting herself to him, and of learning how she might best share and further all his great ends.",
       'Mr. Casaubon was touched with an unknown delight (what man would not have been?)',
       'at this childlike unrestrained ardor: he was not surprised (what lover would have been?)',
       'that he should be the object of it.'], dtype=object)