## Tokenize and POS tag the text

In [1]:
import nltk
# from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

In [2]:
# https://www.battlefields.org/learn/articles/brief-overview-american-civil-war 
text = open("data.txt").read()
print(text)
print(len(text))

The Civil War is the central event in America's historical consciousness. While the Revolution of 1776-1783 created the United States, the Civil War of 1861-1865 determined what kind of nation it would be. The war resolved two fundamental questions left unresolved by the revolution: whether the United States was to be a dissolvable confederation of sovereign states or an indivisible nation with a sovereign national government; and whether this nation, born of a declaration that all men were created with an equal right to liberty, would continue to exist as the largest slaveholding country in the world.

Northern victory in the war preserved the United States as one nation and ended the institution of slavery that had divided the country from its beginning. But these achievements came at the cost of 625,000 lives--nearly as many American soldiers as died in all the other wars in which this country has fought combined. The American Civil War was the largest and most destructive conflict 

In [3]:
# convert to list of words
text_tokens = nltk.word_tokenize(text)
print(text_tokens)
print()
print(len(text_tokens))

['The', 'Civil', 'War', 'is', 'the', 'central', 'event', 'in', 'America', "'s", 'historical', 'consciousness', '.', 'While', 'the', 'Revolution', 'of', '1776-1783', 'created', 'the', 'United', 'States', ',', 'the', 'Civil', 'War', 'of', '1861-1865', 'determined', 'what', 'kind', 'of', 'nation', 'it', 'would', 'be', '.', 'The', 'war', 'resolved', 'two', 'fundamental', 'questions', 'left', 'unresolved', 'by', 'the', 'revolution', ':', 'whether', 'the', 'United', 'States', 'was', 'to', 'be', 'a', 'dissolvable', 'confederation', 'of', 'sovereign', 'states', 'or', 'an', 'indivisible', 'nation', 'with', 'a', 'sovereign', 'national', 'government', ';', 'and', 'whether', 'this', 'nation', ',', 'born', 'of', 'a', 'declaration', 'that', 'all', 'men', 'were', 'created', 'with', 'an', 'equal', 'right', 'to', 'liberty', ',', 'would', 'continue', 'to', 'exist', 'as', 'the', 'largest', 'slaveholding', 'country', 'in', 'the', 'world', '.', 'Northern', 'victory', 'in', 'the', 'war', 'preserved', 'the',

In [4]:
# POS tag on filtered text
text_tag = nltk.pos_tag(text_tokens)
print(text_tag)

[('The', 'DT'), ('Civil', 'NNP'), ('War', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('central', 'JJ'), ('event', 'NN'), ('in', 'IN'), ('America', 'NNP'), ("'s", 'POS'), ('historical', 'JJ'), ('consciousness', 'NN'), ('.', '.'), ('While', 'IN'), ('the', 'DT'), ('Revolution', 'NNP'), ('of', 'IN'), ('1776-1783', 'JJ'), ('created', 'VBD'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), (',', ','), ('the', 'DT'), ('Civil', 'NNP'), ('War', 'NNP'), ('of', 'IN'), ('1861-1865', 'JJ'), ('determined', 'VBD'), ('what', 'WP'), ('kind', 'NN'), ('of', 'IN'), ('nation', 'NN'), ('it', 'PRP'), ('would', 'MD'), ('be', 'VB'), ('.', '.'), ('The', 'DT'), ('war', 'NN'), ('resolved', 'VBD'), ('two', 'CD'), ('fundamental', 'JJ'), ('questions', 'NNS'), ('left', 'VBD'), ('unresolved', 'JJ'), ('by', 'IN'), ('the', 'DT'), ('revolution', 'NN'), (':', ':'), ('whether', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('was', 'VBD'), ('to', 'TO'), ('be', 'VB'), ('a', 'DT'), ('dissolvable', 'JJ'), ('con

## Remove non-Alpha

In [5]:
# remove non-alpha entries
# print num of removed entries
print(f"Before removing non alpha characters, length is {len(text_tag)}")
# alpha words
text_alpha = [word for word in text_tokens if word.isalpha()]
# alpha entries with tag
text_tag_alpha = [entries for entries in text_tag if entries[0].isalpha()]
print(f"After removing non alpha characters, length is {len(text_tag_alpha)}\n")
print(text_tag_alpha)

Before removing non alpha characters, length is 852
After removing non alpha characters, length is 759

[('The', 'DT'), ('Civil', 'NNP'), ('War', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('central', 'JJ'), ('event', 'NN'), ('in', 'IN'), ('America', 'NNP'), ('historical', 'JJ'), ('consciousness', 'NN'), ('While', 'IN'), ('the', 'DT'), ('Revolution', 'NNP'), ('of', 'IN'), ('created', 'VBD'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('the', 'DT'), ('Civil', 'NNP'), ('War', 'NNP'), ('of', 'IN'), ('determined', 'VBD'), ('what', 'WP'), ('kind', 'NN'), ('of', 'IN'), ('nation', 'NN'), ('it', 'PRP'), ('would', 'MD'), ('be', 'VB'), ('The', 'DT'), ('war', 'NN'), ('resolved', 'VBD'), ('two', 'CD'), ('fundamental', 'JJ'), ('questions', 'NNS'), ('left', 'VBD'), ('unresolved', 'JJ'), ('by', 'IN'), ('the', 'DT'), ('revolution', 'NN'), ('whether', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('was', 'VBD'), ('to', 'TO'), ('be', 'VB'), ('a', 'DT'), ('dissolvable', 'JJ'), ('conf

## Store normalized token and remove duplicates

In [6]:
# use Porter2 Stemmer
s_stemmer = SnowballStemmer(language='english')

In [7]:
# normalize and remove duplicates
normalized_text_alpha = [s_stemmer.stem(word) for word in text_alpha]
normalized_text_alpha_noDup = list(set(normalized_text_alpha))
# display difference
print(f"{len(normalized_text_alpha) - len(normalized_text_alpha_noDup)} duplicates removed\n")
print(len(normalized_text_alpha))
print()
print(normalized_text_alpha_noDup)

439 duplicates removed

759

['becaus', 'cold', 'recogn', 'an', 'chief', 'ineffect', 'cape', 'be', 'liberti', 'had', 'free', 'april', 'call', 'atlanta', 'discredit', 'onset', 'they', 'bloodi', 'invas', 'meantim', 'has', 'militia', 'nation', 'campaign', 'restor', 'arm', 'infrastructur', 'antietam', 'while', 'trigger', 'manassa', 'ulyss', 'central', 'was', 'keep', 'econom', 'presid', 'yet', 'thoma', 'junction', 'gettysburg', 'begin', 'mani', 'that', 'pain', 'mile', 'resist', 'surrend', 'war', 'join', 'i', 'institut', 'at', 'govern', 'may', 'won', 'base', 'two', 'pledg', 'along', 'cost', 'foreshadow', 'line', 'sumter', 'would', 'determin', 'unit', 'give', 'georgia', 'insurrect', 'territori', 'secess', 'largest', 'dissolv', 'fredericksburg', 'all', 'appalachian', 'democraci', 'lee', 'american', 'confeder', 'fear', 'davi', 'new', 'fatal', 'left', 'the', 'own', 'huge', 'sever', 'western', 'name', 'administr', 'slavehold', 'refus', 'establish', 'seced', 'subsequ', 'there', 'start', 'brought',

## Chunking using customized Grammar

In [8]:
# reference
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [9]:
"""
Grammar:
    1. his/her(PRP$)/the(DT)/a(LS) adj(JJ) noun(NN_)     {[<PRP$><DT><LS>]?<JJ>?<NN>+}
    2. cannot(MD) verb(VB)
    3. all(PDT) noun(NN_)
    4. be(VB_) doing(VBG)
    5. has(VB_) adv(RB) done(VBD)
    6. adv(RB) verb(VB|VBZ) {<RB>?[<VB><VBZ>]}
    7. was(VB_) taken(VBN)
    

"""
#seq = """
#    Chunk:
#    {<NNP>+}
#    {<NN>+}"""
seq = """
    Chunk:
    {<MD>?<VB>}
    {<PDT>?<JJ>?<NN>+}
    {(<VB>|<VBZ>)<VBG>}
    {(<VB>|<VBZ>)<RB>?<VBD>}
    {<VBD><VBN>}"""
chunker = nltk.RegexpParser(seq)

In [10]:
# print result
chunk_result = chunker.parse(text_tag_alpha)
for item in chunk_result:
    print(item, type(item))

('The', 'DT') <class 'tuple'>
('Civil', 'NNP') <class 'tuple'>
('War', 'NNP') <class 'tuple'>
('is', 'VBZ') <class 'tuple'>
('the', 'DT') <class 'tuple'>
(Chunk central/JJ event/NN) <class 'nltk.tree.Tree'>
('in', 'IN') <class 'tuple'>
('America', 'NNP') <class 'tuple'>
(Chunk historical/JJ consciousness/NN) <class 'nltk.tree.Tree'>
('While', 'IN') <class 'tuple'>
('the', 'DT') <class 'tuple'>
('Revolution', 'NNP') <class 'tuple'>
('of', 'IN') <class 'tuple'>
('created', 'VBD') <class 'tuple'>
('the', 'DT') <class 'tuple'>
('United', 'NNP') <class 'tuple'>
('States', 'NNPS') <class 'tuple'>
('the', 'DT') <class 'tuple'>
('Civil', 'NNP') <class 'tuple'>
('War', 'NNP') <class 'tuple'>
('of', 'IN') <class 'tuple'>
('determined', 'VBD') <class 'tuple'>
('what', 'WP') <class 'tuple'>
(Chunk kind/NN) <class 'nltk.tree.Tree'>
('of', 'IN') <class 'tuple'>
(Chunk nation/NN) <class 'nltk.tree.Tree'>
('it', 'PRP') <class 'tuple'>
(Chunk would/MD be/VB) <class 'nltk.tree.Tree'>
('The', 'DT') <clas

# Spacy

In [11]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

In [12]:
doc = nlp(text)

In [13]:
for chunk in doc.noun_chunks:
    print(f'{chunk.text:{25}} {chunk.root.text:{6}} {chunk.root.dep_:<{22}} {chunk.root.head.text}')

The Civil War             War    nsubj                  is
the central event         event  attr                   is
America's historical consciousness consciousness pobj                   in
the Revolution            Revolution nsubj                  created
the United States         States dobj                   created
the Civil War             War    appos                  States
what kind                 kind   attr                   be
nation                    nation pobj                   of
it                        it     nsubj                  be
The war                   war    nsubj                  resolved
two fundamental questions questions dobj                   resolved
the revolution            revolution pobj                   by
the United States         States nsubj                  was
a dissolvable confederation confederation attr                   be
sovereign states          states pobj                   of
an indivisible nation     nation conj               

In [14]:
#When doc is created, the parsing process is done and a parse tree is generated
displacy.render(doc, style = 'dep')

In [15]:
displacy.render(doc, style="ent")

In [16]:
# higher accuracy
!python -m spacy download en_core_web_trf
nlp = spacy.load('en_core_web_trf')

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-trf==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl (459.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_trf')


In [17]:
doc = nlp(text)
for chunk in doc.noun_chunks:
    print(f'{chunk.text:{25}} {chunk.root.text:{6}} {chunk.root.dep_:<{22}} {chunk.root.head.text}')

The Civil War             War    nsubj                  is
the central event         event  attr                   is
America's historical consciousness consciousness pobj                   in
the Revolution            Revolution nsubj                  created
the United States         States dobj                   created
the Civil War             War    nsubj                  determined
what kind                 kind   attr                   be
nation                    nation pobj                   of
it                        it     nsubj                  be
The war                   war    nsubj                  resolved
two fundamental questions questions dobj                   resolved
the revolution            revolution pobj                   by
the United States         States nsubj                  was
a dissolvable confederation confederation attr                   be
sovereign states          states pobj                   of
an indivisible nation     nation conj           

In [18]:
#When doc is created, the parsing process is done and a parse tree is generated
displacy.render(doc, style = 'dep')

In [19]:
displacy.render(doc, style="ent")