## Import Libraries

In [8]:
from collections import Counter
import pickle
import requests
import spacy
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

!pip list

print('All prereqs installed.')

Package                       VersionAll prereqs installed.

----------------------------- --------------------
alabaster                     0.7.12
anaconda-client               1.11.0
anaconda-navigator            2.3.2
anyio                         3.5.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arrow                         1.2.2
asgiref                       3.5.2
astroid                       2.11.7
astropy                       5.1
atomicwrites                  1.4.0
attrs                         22.1.0
Automat                       20.2.0
autopep8                      1.6.0
Babel                         2.11.0
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
backports.tempfile            1.0
backports.weakref             1.0.post1
bcrypt                        3.2.0
beautifulsoup4                4.11.1
binaryornot                   0.4.4
bitarray                      2.5.1
bkcharts       

## Read Pickle and define play_doc

In [9]:
# Load the pickled page content from disk.
# NOTE(review): pickle.load can execute arbitrary code, so only unpickle a
# file you created yourself or otherwise trust.
with open("midsummer.pkl", "rb") as pickle_file:
    html_content = pickle.load(pickle_file)

# Parse the content as HTML and keep only its text.
soup = BeautifulSoup(html_content, "html.parser")
text = soup.get_text()

# Run the small English spaCy pipeline over the text; the resulting
# play_doc is what the following cells analyse.
nlp = spacy.load("en_core_web_sm")
play_doc = nlp(text)

## Part-of-Speech Tagging
### I will use POS tagging to determine which part of speech each token is,
### and also use spacy.explain to give descriptive details about each tag.

In [10]:
# Print the fine-grained tag, the coarse part of speech, and spaCy's
# plain-English explanation of the tag for every token in the play.
for token in play_doc:
    print(
        f"""
    TOKEN: {token.text}
    =====
    TAG: {token.tag_:10} POS: {token.pos_}
    EXPLANATION: {spacy.explain(token.tag_)}"""
    )


    TOKEN: 


    =====
    TAG: _SP        POS: SPACE
    EXSPLANATION: whitespace

    TOKEN: Midsummer
    =====
    TAG: NNP        POS: PROPN
    EXSPLANATION: noun, proper singular

    TOKEN: Night
    =====
    TAG: NNP        POS: PROPN
    EXSPLANATION: noun, proper singular

    TOKEN: 's
    =====
    TAG: POS        POS: PART
    EXSPLANATION: possessive ending

    TOKEN: Dream
    =====
    TAG: NNP        POS: PROPN
    EXSPLANATION: noun, proper singular

    TOKEN: :
    =====
    TAG: :          POS: PUNCT
    EXSPLANATION: punctuation mark, colon or ellipsis

    TOKEN: Entire
    =====
    TAG: JJ         POS: ADJ
    EXSPLANATION: adjective (English), other noun-modifier (Chinese)

    TOKEN: Play
    =====
    TAG: NN         POS: NOUN
    EXSPLANATION: noun, singular or mass

    TOKEN: 
 






    =====
    TAG: _SP        POS: SPACE
    EXSPLANATION: whitespace

    TOKEN: A
    =====
    TAG: DT         POS: DET
    EXSPLANATION: determiner

    TOKEN: Mids

In [11]:
# Collect the tokens spaCy tagged as common nouns and as adjectives.
nouns = [tok for tok in play_doc if tok.pos_ == "NOUN"]
adjectives = [tok for tok in play_doc if tok.pos_ == "ADJ"]

# Display the nouns found in the play.
nouns

[Play,
 homepage,
 |,
 |,
 play,
 palace,
 HIPPOLYTA,
 hour,
 apace,
 days,
 moon,
 methinks,
 moon,
 wanes,
 desires,
 step,
 dame,
 dowager,
 man,
 revenue,
 HIPPOLYTA,
 days,
 night,
 nights,
 time,
 moon,
 night,
 solemnities,
 youth,
 merriments,
 pert,
 spirit,
 funerals,
 companion,
 pomp,
 Exit,
 sword,
 love,
 injuries,
 thee,
 key,
 pomp,
 triumph,
 revelling,
 duke,
 Thanks,
 news,
 vexation,
 complaint,
 child,
 daughter,
 lord,
 man,
 consent,
 duke,
 man,
 bosom,
 child,
 thou,
 rhymes,
 love,
 child,
 hast,
 moonlight,
 window,
 voice,
 verses,
 love,
 impression,
 fantasy,
 bracelets,
 hair,
 rings,
 gawds,
 conceits,
 trifles,
 nosegays,
 sweetmeats,
 messengers,
 prevailment,
 youth,
 hast,
 thou,
 daughter,
 heart,
 obedience,
 harshness,
 duke,
 grace,
 privilege,
 gentleman,
 death,
 law,
 case,
 maid,
 father,
 god,
 beauties,
 form,
 wax,
 power,
 figure,
 gentleman,
 kind,
 father,
 voice,
 worthier,
 father,
 look'd,
 eyes,
 eyes,
 judgment,
 look,
 grace,
 pow

In [12]:
# Display the adjective tokens collected from play_doc.
adjectives

[Entire,
 Entire,
 fair,
 nuptial,
 happy,
 slow,
 old,
 young,
 silver,
 New,
 bent,
 Athenian,
 nimble,
 melancholy,
 pale,
 renowned,
 good,
 Full,
 noble,
 gracious,
 thy,
 strong,
 cunning,
 due,
 stubborn,
 gracious,
 ancient,
 fair,
 yea,
 worthy,
 other,
 bold,
 worst,
 fair,
 shady,
 barren,
 faint,
 cold,
 fruitless,
 such,
 happy,
 rose,
 single,
 unwished,
 next,
 new,
 everlasting,
 single,
 sweet,
 certain,
 true,
 mine,
 more,
 more,
 beauteous,
 sweet,
 spotted,
 inconstant,
 much,
 full,
 private,
 fair,
 single,
 nuptial,
 pale,
 true,
 smooth,
 different,
 high,
 old,
 young,
 momentany,
 short,
 collied,
 quick,
 bright,
 true,
 customary,
 due,
 poor,
 good,
 great,
 remote,
 only,
 gentle,
 sharp,
 Athenian,
 good,
 strongest,
 best,
 golden,
 false,
 more,
 same,
 fair,
 fair,
 fair,
 happy,
 sweet,
 tuneable,
 green,
 hawthorn,
 favour,
 fair,
 sweet,
 such,
 such,
 silver,
 watery,
 liquid,
 bladed,
 faint,
 sweet,
 new,
 stranger,
 sweet,
 good,
 deep,
 adieu,

## Using displaCy for Visualization
### We will run a dependency parse and visualize named entities

In [23]:
# displacy is the only new name this cell needs: spacy is already imported
# and the nlp pipeline is already loaded earlier in the notebook, so neither
# is repeated here. Reloading the pipeline would also rebind nlp to a new
# object whose vocab is not the one play_doc was built with.
from spacy import displacy

# Highlight the named entities spaCy found in the play.
displacy.render(play_doc, style="ent")
# This is very interesting, but I think it will need some direction to be useful with Shakespearean language and names.

## Preprocessing functions
### Here we will build functions to process text and clean it for easier use. 

In [15]:
def is_token_allowed(token):
    """Return True for tokens worth keeping.

    A token is rejected when it is falsy, when its text is only whitespace,
    or when it is flagged as a stop word or as punctuation.
    """
    if not token:
        return False
    if not str(token).strip():
        return False
    return not (token.is_stop or token.is_punct)
def preprocess_token(token):
    """Return the token's lemma, stripped of surrounding whitespace and lower-cased."""
    lemma = token.lemma_
    return lemma.strip().lower()

# Normalise every token that passes the filter, keeping the original order.
complete_filtered_tokens = list(
    map(preprocess_token, filter(is_token_allowed, play_doc))
)

complete_filtered_tokens

['midsummer',
 'night',
 'dream',
 'entire',
 'play',
 'midsummer',
 'night',
 'dream',
 'shakespeare',
 'homepage',
 '|',
 'midsummer',
 'night',
 'dream',
 '|',
 'entire',
 'play',
 'act',
 'scene',
 'i.',
 'athens',
 'palace',
 'theseus',
 'enter',
 'theseus',
 'hippolyta',
 'philostrate',
 'attendants',
 'theseus',
 'fair',
 'hippolyta',
 'nuptial',
 'hour',
 'draws',
 'apace',
 'happy',
 'day',
 'bring',
 'moon',
 'o',
 'methink',
 'slow',
 'old',
 'moon',
 'wane',
 'linger',
 'desire',
 'like',
 'step',
 'dame',
 'dowager',
 'long',
 'wither',
 'young',
 'man',
 'revenue',
 'hippolyta',
 'day',
 'quickly',
 'steep',
 'night',
 'night',
 'quickly',
 'dream',
 'away',
 'time',
 'moon',
 'like',
 'silver',
 'bow',
 'new',
 'bent',
 'heaven',
 'shall',
 'behold',
 'night',
 'solemnity',
 'theseus',
 'philostrate',
 'stir',
 'athenian',
 'youth',
 'merriment',
 'awake',
 'pert',
 'nimble',
 'spirit',
 'mirth',
 'turn',
 'melancholy',
 'forth',
 'funeral',
 'pale',
 'companion',
 'pomp

## Rule Based Matching
### Using rule-based matching we can extract tokens and phrases based on patterns and grammatical features. I will attempt to extract all of the players
### based on UPPERCASE text.

In [16]:
from spacy.matcher import Matcher

# A single token written entirely in upper case, which is how the play text
# prints speaker names.
UPPERCASE_PATTERN = [{"IS_UPPER": True}]

# Register the pattern once, at definition time. Matcher.add appends to an
# existing key, so adding it inside extract_player would grow the rule list
# on every call.
matcher = Matcher(nlp.vocab)
matcher.add("uppercase", [UPPERCASE_PATTERN])


def extract_player(nlp_doc):
    """Yield the text of every all-uppercase token longer than one character.

    Single-letter matches such as I, O and A are skipped. Other upper-case
    tokens (for example ACT and SCENE) are still yielded.

    Args:
        nlp_doc: spaCy Doc to search.

    Yields:
        str: text of each matching token.
    """
    for _, start, end in matcher(nlp_doc):
        word = nlp_doc[start:end].text
        if len(word) > 1:
            yield word


for word in extract_player(play_doc):
    print(word)

ACT
SCENE
I.
THESEUS
THESEUS
HIPPOLYTA
PHILOSTRATE
THESEUS
HIPPOLYTA
THESEUS
PHILOSTRATE
EGEUS
HERMIA
LYSANDER
DEMETRIUS
EGEUS
THESEUS
EGEUS
THESEUS
HERMIA
THESEUS
HERMIA
THESEUS
HERMIA
THESEUS
HERMIA
THESEUS
DEMETRIUS
LYSANDER
EGEUS
LYSANDER
THESEUS
EGEUS
LYSANDER
HERMIA
LYSANDER
HERMIA
LYSANDER
HERMIA
LYSANDER
HERMIA
LYSANDER
HERMIA
LYSANDER
HERMIA
LYSANDER
HERMIA
LYSANDER
HELENA
HERMIA
HELENA
HERMIA
HELENA
HERMIA
HELENA
HERMIA
HELENA
HERMIA
HELENA
HERMIA
LYSANDER
HERMIA
LYSANDER
HERMIA
HELENA
SCENE
II
QUINCE
'S
QUINCE
SNUG
BOTTOM
FLUTE
SNOUT
STARVELING
QUINCE
BOTTOM
QUINCE
BOTTOM
QUINCE
BOTTOM
QUINCE
BOTTOM
QUINCE
BOTTOM
QUINCE
BOTTOM
QUINCE
FLUTE
QUINCE
FLUTE
QUINCE
FLUTE
QUINCE
BOTTOM
QUINCE
BOTTOM
QUINCE
STARVELING
QUINCE
SNOUT
QUINCE
SNUG
QUINCE
BOTTOM
QUINCE
ALL
BOTTOM
QUINCE
BOTTOM
QUINCE
BOTTOM
QUINCE
BOTTOM
QUINCE
BOTTOM
ACT
II
SCENE
I.
PUCK
PUCK
PUCK
PUCK
OBERON
TITANIA
OBERON
TITANIA
OBERON
TITANIA
OBERON
TITANIA
OBERON
TITANIA
OBERON
TITANIA
OBERON
TITANIA
TITANIA
OBERON


## Dependency Parsing
### This will help us define relationships between words


In [17]:
# For each token, report its fine-grained tag, the text of its syntactic
# head, and the dependency label linking it to that head.
for token in play_doc:
    token_report = f"""
TOKEN: {token.text}
=====
{token.tag_ = }
{token.head.text = }
{token.dep_ = }"""
    print(token_report)


TOKEN: 


=====
token.tag_ = '_SP'
token.head.text = 'Night'
token.dep_ = 'dep'

TOKEN: Midsummer
=====
token.tag_ = 'NNP'
token.head.text = 'Night'
token.dep_ = 'compound'

TOKEN: Night
=====
token.tag_ = 'NNP'
token.head.text = 'Dream'
token.dep_ = 'poss'

TOKEN: 's
=====
token.tag_ = 'POS'
token.head.text = 'Night'
token.dep_ = 'case'

TOKEN: Dream
=====
token.tag_ = 'NNP'
token.head.text = '|'
token.dep_ = 'nmod'

TOKEN: :
=====
token.tag_ = ':'
token.head.text = 'Dream'
token.dep_ = 'punct'

TOKEN: Entire
=====
token.tag_ = 'JJ'
token.head.text = 'Play'
token.dep_ = 'amod'

TOKEN: Play
=====
token.tag_ = 'NN'
token.head.text = 'Night'
token.dep_ = 'compound'

TOKEN: 
 






=====
token.tag_ = '_SP'
token.head.text = 'Play'
token.dep_ = 'dep'

TOKEN: A
=====
token.tag_ = 'DT'
token.head.text = 'Night'
token.dep_ = 'det'

TOKEN: Midsummer
=====
token.tag_ = 'NNP'
token.head.text = 'Night'
token.dep_ = 'compound'

TOKEN: Night
=====
token.tag_ = 'NNP'
token.head.text = 'homepage'
t

## Tree and Subtree Navigation
### We can use attributes to navigate and parse the tree

In [18]:
# spacy and the nlp pipeline are already available from earlier cells, so
# they are not imported or loaded again here.
# The two string pieces are joined by implicit concatenation, so the first
# must end with a space to keep "friends," and "And" apart.
last_line_text = (
    "Give me your hands, if we be friends, "
    "And Robin shall restore amends."
)
last_line = nlp(last_line_text)

# Look the token up by its text instead of repeating a hard-coded index.
friends = next(token for token in last_line if token.text == "friends")

# Extract children of `friends`
print([token.text for token in friends.children])

# Extract previous neighboring node of `friends`
print(friends.nbor(-1))

# Extract next neighboring node of `friends`
print(friends.nbor())

# Extract all tokens on the left of `friends`
print([token.text for token in friends.lefts])

# Extract tokens on the right of `friends`
print([token.text for token in friends.rights])

# Print subtree of `friends`
print(list(friends.subtree))


[]
be
,
[]
[]
[friends]


## Shallow Parsing
### This will allow us to use chunking to group adjacent tokens into phrases

In [19]:
# Print each noun phrase (noun chunk) spaCy identified in the play.
for noun_phrase in play_doc.noun_chunks:
    print(noun_phrase)

Entire Play
 





A Midsummer Night's Dream

Shakespeare homepage 
    | Midsummer Night's Dream
SCENE
The palace
THESEUS
Enter THESEUS
HIPPOLYTA
PHILOSTRATE
Attendants

THESEUS
fair Hippolyta
our nuptial hour
Draws
apace
four happy days
Another moon
O, methinks
how slow
This old moon wanes
she
my desires
a step-dame
a dowager
Long
a young man revenue
HIPPOLYTA

Four days
themselves
night
Four nights
the moon
heaven
the night
our solemnities
THESEUS
the Athenian youth
merriments
the pert and nimble spirit
mirth
funerals
The pale companion
our pomp
I
thee
my sword
thy love
thee injuries
I
we
thee
another key
pomp
triumph
revelling
EGEUS
HERMIA
LYSANDER
EGEUS

Happy
Theseus
our renowned duke
good Egeus
what
the news
thee
EGEUS
vexation
I
complaint
my child
Hermia
This man
my consent
her
This man
the bosom
my child
thou, Lysander
thou hast
her rhymes
interchanged love-tokens
my child
Thou hast
moonlight
her window sung
voice verses
love
the impression
her fantasy
bracelets
thy hair
rings

In [20]:
# Use textacy to pull out verb phrases: an auxiliary followed by a verb.
import textacy

patterns = [{"POS": "AUX"}, {"POS": "VERB"}]
about_play = textacy.make_spacy_doc(text, lang="en_core_web_sm")
verb_phrases = textacy.extract.token_matches(about_play, patterns=patterns)

# Show the text of every matched verb phrase.
for verb_phrase in verb_phrases:
    print(verb_phrase.text)

# Show the noun phrases as well, to see which nouns are involved.
for noun_phrase in about_play.noun_chunks:
    print(noun_phrase)

shall behold
may dispose
be advised
be held
do entreat
am made
may concern
may know
may befall
can endure
shall render
do estate
am beloved
'll avouch
must confess
have heard
have spoke
did lose
shall go
may extenuate--
must employ
did run
be engaged
did lay
do devour
did meet
was seen
have broke
is catching
should catch
should catch
being bated
'd give
would teach
shall see
will fly
will unfold
shall meet
must starve
am thought
do know
can transpose
is perjured
did melt
will go
are set
will ask
will move
will condole
could play
Shall break
Shall shine
must take
must love
shall play
may speak
may hide
must play
must play
may do
will roar
will roar
will make
should do
would fright
would shriek
would hang
should fright
would have
will aggravate
will roar
will roar
can play
shall see
must needs
will undertake
will discharge
will play
be dogged
will draw
will meet
may rehearse
do wander
must go
be gone
is passing
would have
shall have
were gone
have forsworn
be wedded
have overborne
are fa

## Named Entity Recognition
### We will use NER to classify text into categories. This could help improve keyword search.

In [21]:
# Report each named entity with its character span, its label, and
# spaCy's explanation of that label.
for ent in play_doc.ents:
    entity_report = f"""
{ent.text = }
{ent.start_char = }
{ent.end_char = }
{ent.label_ = }
spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}"""
    print(entity_report)


ent.text = "Midsummer Night's Dream"
ent.start_char = 2
ent.end_char = 25
ent.label_ = 'WORK_OF_ART'
spacy.explain('WORK_OF_ART') = Titles of books, songs, etc.

ent.text = 'Shakespeare'
ent.start_char = 73
ent.end_char = 84
ent.label_ = 'PERSON'
spacy.explain('PERSON') = People, including fictional

ent.text = "Midsummer Night's"
ent.start_char = 101
ent.end_char = 118
ent.label_ = 'ORG'
spacy.explain('ORG') = Companies, agencies, institutions, etc.

ent.text = 'Attendants'
ent.start_char = 235
ent.end_char = 245
ent.label_ = 'NORP'
spacy.explain('NORP') = Nationalities or religious or political groups

ent.text = 'four'
ent.start_char = 310
ent.end_char = 314
ent.label_ = 'CARDINAL'
spacy.explain('CARDINAL') = Numerals that do not fall under another type

ent.text = 'Four days'
ent.start_char = 506
ent.end_char = 515
ent.label_ = 'DATE'
spacy.explain('DATE') = Absolute or relative dates or periods

ent.text = 'night'
ent.start_char = 549
ent.end_char = 554
ent.label_ = 'TIME'
spacy.

In [22]:
# We will use NER to remove people's names from the play

def replace_person_names(token):
    """Return the token's text, or a redaction marker for PERSON entities.

    Args:
        token: spaCy Token (entities are expected to be merged into single
            tokens by the caller).

    Returns:
        str: "[REDACTED]" followed by the token's own trailing whitespace
        when the token carries an entity tag of type PERSON; otherwise the
        token's text with its trailing whitespace.
    """
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        # Reuse the token's own trailing whitespace rather than always adding
        # a space, so a name followed by punctuation gives "[REDACTED]," and
        # not "[REDACTED] ,".
        return "[REDACTED]" + token.whitespace_
    return token.text_with_ws


def redact_names(nlp_doc):
    """Return the document text with every PERSON entity redacted.

    Entities are merged into single tokens on a copy of the document, so the
    Doc passed in (and any cell that reuses it) keeps its original
    tokenization.

    Args:
        nlp_doc: spaCy Doc whose entities have been set.

    Returns:
        str: the text rebuilt token by token via replace_person_names.
    """
    # retokenize() edits a Doc in place, so do the merging on a copy.
    working_doc = nlp_doc.copy()
    with working_doc.retokenize() as retokenizer:
        for ent in working_doc.ents:
            retokenizer.merge(ent)
    return "".join(map(replace_person_names, working_doc))


# Redact the play and print the result.
redacted_play = redact_names(play_doc)
print(redacted_play)



Midsummer Night's Dream: Entire Play
 





A Midsummer Night's Dream

[REDACTED] homepage 
    | Midsummer Night's Dream 
    | Entire play

ACT I
SCENE I. Athens. The palace of THESEUS.

Enter THESEUS, HIPPOLYTA, PHILOSTRATE, and Attendants

THESEUS

Now, fair Hippolyta, our nuptial hour
Draws on apace; four happy days bring in
Another moon: but, O, methinks, how slow
This old moon wanes! she lingers my desires,
Like to a step-dame or a dowager
Long withering out a young man revenue.

HIPPOLYTA

Four days will quickly steep themselves in night;
Four nights will quickly dream away the time;
And then the moon, like to a silver bow
New-bent in heaven, shall behold the night
Of our solemnities.

THESEUS

Go, [REDACTED] ,
Stir up the Athenian youth to merriments;
[REDACTED] the pert and nimble spirit of mirth;
Turn melancholy forth to funerals;
The pale companion is not for our pomp.
Exit PHILOSTRATE
Hippolyta, I woo'd thee with my sword,
And won thy love, doing thee injuries;
But I wil

## Conclusion
### spaCy is a powerful tool. I am glad I chose this tutorial to follow. It introduced me to some new concepts and, even more, it allowed me to practice with the
### code. I really enjoy NLP and am excited to see how it continues to develop and improve.