In [1]:
import spacy

In [46]:
import textacy
from bs4 import BeautifulSoup
from glob import glob
from collections import Counter

In [39]:
# Name of the spaCy model to load; kept in one place so it is easy to swap.
SPACY_MODEL = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL)

In [4]:
# List the corpus directory to see which episode files (u*.xml) are available.
%ls corpus-joyce-ulysses-tei

[0m[01;34manalysis[0m/             [01;32mu01_telemachus.xml[0m*       [01;32mu11_sirens.xml[0m*
CONTRIBUTING.md       [01;32mu02_nestor.xml[0m*           [01;32mu12_cyclops.xml[0m*
cross-references.xml  [01;32mu03_proteus.xml[0m*          [01;32mu13_nausica.xml[0m*
header.xml            [01;32mu04_calypso.xml[0m*          [01;32mu14_oxen.xml[0m*
Makefile              [01;32mu05_lotus-eaters.xml[0m*     [01;32mu15_circe.xml[0m*
persons.txt           [01;32mu06_hades.xml[0m*            [01;32mu16_eumaeus.xml[0m*
[01;34mpublic[0m/               [01;32mu07_aeolus.xml[0m*           [01;32mu17_ithaca.xml[0m*
README.md             [01;32mu08_lestrygonians.xml[0m*    [01;32mu18_penelope.xml[0m*
[01;34mschema[0m/               [01;32mu09_scylla.xml[0m*
[01;34mstylesheets[0m/          [01;32mu10_wandering_rocks.xml[0m*


In [25]:
def read_episode_xml(path):
    """Return the contents of the file at `path` minus its first character.

    The file is read inside a ``with`` block so its handle is closed as soon
    as the read finishes instead of lingering until garbage collection.
    The leading character is dropped exactly as before (presumably a
    byte-order mark sitting in front of the XML -- TODO confirm).
    """
    with open(path) as episodeFile:
        return episodeFile.read()[1:]


# One string of raw XML per episode file, ordered by filename.
episodesRaw = [read_episode_xml(f) for f in sorted(glob('corpus-joyce-ulysses-tei/u*.xml'))]

In [29]:
# Show the beginning of the first episode's raw XML as a sanity check.
PREVIEW_CHARS = 500
episodesRaw[0][:PREVIEW_CHARS]

'<div type="episode" n="01">\n<p rend="inset"><lb n="010000"/>I</p>\n<p><lb n="010001"/>Stately, plump Buck Mulligan came from the stairhead, bearing a\n<lb n="010002"/>bowl of lather on which a mirror and a razor lay crossed. A yellow\n<lb n="010003"/>dressinggown, ungirdled, was sustained gently behind him on the mild\n<lb n="010004"/>morning air. He held the bowl aloft and intoned:\n<lb n="010005"/><said who="bm">―<quote xml:lang="la">Introibo ad altare Dei.</quote></said></p>\n<p><lb n="010006"/>Hal'

In [34]:
# Parse each episode as XML and keep only its text content (markup dropped).
episodesText = []
for episodeXml in episodesRaw:
    soup = BeautifulSoup(episodeXml, features='xml')
    episodesText.append(soup.get_text())

In [35]:
# Show the beginning of the first episode's extracted text as a sanity check.
PREVIEW_CHARS = 500
episodesText[0][:PREVIEW_CHARS]

'\nI\nStately, plump Buck Mulligan came from the stairhead, bearing a\nbowl of lather on which a mirror and a razor lay crossed. A yellow\ndressinggown, ungirdled, was sustained gently behind him on the mild\nmorning air. He held the bowl aloft and intoned:\n―Introibo ad altare Dei.\nHalted, he peered down the dark winding stairs and called out\ncoarsely:\n―Come up, Kinch! Come up, you fearful jesuit!\nSolemnly he came forward and mounted the round gunrest. He faced\nabout and blessed gravely thrice the tow'

In [42]:
# Run the loaded spaCy pipeline over the text of the first episode.
firstEpisodeText = episodesText[0]
e1doc = nlp(firstEpisodeText)

In [80]:
# Extract n-grams of size 7 from the first episode's doc, keeping stop words
# (filter_stops=False) and passing filter_nums=True.
# NOTE(review): the variable is called "trigrams" but n is 7 here; the name is
# left as-is because a later cell reads `e1trigrams`.
e1trigrams = textacy.extract.ngrams(e1doc, 7, filter_stops=False, filter_nums=True)

In [81]:
# Keep only the text of each extracted n-gram so the strings can be counted.
e1trigramsStr = []
for span in e1trigrams:
    e1trigramsStr.append(span.text)

In [82]:
# Tally how often each n-gram string occurs and list them, most frequent first.
e1ngramCounts = Counter(e1trigramsStr)
e1ngramCounts.most_common()

[('graveclothes giving off an odour of wax', 2),
 ('giving off an odour of wax and', 2),
 ('off an odour of wax and rosewood', 2),
 ("won't we have a merry time", 2),
 ('plump Buck Mulligan came from the stairhead', 1),
 ('bowl of lather on which a mirror', 1),
 ('of lather on which a mirror and', 1),
 ('lather on which a mirror and a', 1),
 ('on which a mirror and a razor', 1),
 ('which a mirror and a razor lay', 1),
 ('a mirror and a razor lay crossed', 1),
 ('was sustained gently behind him on the', 1),
 ('sustained gently behind him on the mild', 1),
 ('He held the bowl aloft and intoned', 1),
 ('he peered down the dark winding stairs', 1),
 ('peered down the dark winding stairs and', 1),
 ('down the dark winding stairs and called', 1),
 ('the dark winding stairs and called out', 1),
 ('Solemnly he came forward and mounted the', 1),
 ('he came forward and mounted the round', 1),
 ('came forward and mounted the round gunrest', 1),
 ('about and blessed gravely thrice the tower', 1),


In [84]:
# Wrap the spaCy doc for episode 1 in a textacy Doc; the next cell calls
# its to_bag_of_terms() method.
e1tdoc = textacy.Doc(e1doc)

In [88]:
# Build a bag of terms for episode 1 covering n-gram sizes 1 through 9,
# keyed by strings (as_strings=True) and with named_entities switched off.
ngramSizes = tuple(range(1, 10))
e1terms = e1tdoc.to_bag_of_terms(
    ngrams=ngramSizes,
    as_strings=True,
    named_entities=False,
)

In [98]:
# List the (term, count) pairs from highest count to lowest; ties keep the
# order in which they appear in e1terms.
Counter(e1terms).most_common()

[('-PRON-', 912),
 ('the', 387),
 ('and', 217),
 ('a', 209),
 ('of', 162),
 ('say', 148),
 ('to', 141),
 ('have', 139),
 ('be', 137),
 ('in', 118),
 ('stephen', 92),
 ('mulligan', 83),
 ('buck', 73),
 ('buck mulligan', 71),
 ('on', 65),
 ('with', 65),
 ('not', 65),
 ('that', 53),
 ('-PRON- be', 46),
 ('from', 45),
 ('haines', 45),
 ('-PRON- say', 43),
 ('at', 40),
 ('in the', 40),
 ('-PRON- have', 39),
 ('out', 37),
 ('for', 37),
 ('of -PRON-', 36),
 ('up', 34),
 ('do', 34),
 ('of the', 32),
 ('come', 29),
 ('down', 29),
 ('to -PRON-', 28),
 ('stephen say', 28),
 ('mulligan say', 25),
 ('all', 24),
 ('over', 24),
 ('go', 24),
 ('by', 24),
 ('can', 23),
 ('―i', 23),
 ('ask', 23),
 ('from the', 23),
 ('make', 22),
 ('old', 22),
 ('will', 21),
 ('mother', 21),
 ('in -PRON-', 21),
 ('buck mulligan say', 21),
 ('voice', 20),
 ('do not', 20),
 ('again', 19),
 ('on the', 19),
 ('to the', 19),
 ('kinch', 18),
 ('what', 18),
 ('speak', 18),
 ('sea', 17),
 ('turn', 17),
 ('face', 16),
 ('about',

In [107]:
# Look up the term at index 50 in e1terms' iteration order.
e1termKeys = list(e1terms)
e1termKeys[50]

'fearful'