In [1]:
#web scraping and Processing(cleaning)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter

def cleanSentence(sentence):
    sentence = sentence.split(' ')
    sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
    sentence = [word for word in sentence if len(word) > 1 or (word.lower() == 'a' or word.lower() == 'i')]
    return sentence

def cleanInput(content):
    content = content.upper()
    content = re.sub('\n', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = Counter()
    ngrams_list = []
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams_list.extend(newNgrams)
        ngrams.update(newNgrams)
    return(ngrams)


content = str(
      urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(),
              'utf-8')
ngrams = getNgrams(content, 3)
print(ngrams)



In [2]:
import nltk

In [3]:
#POS Tagging
tagged = nltk.pos_tag(nltk.word_tokenize(content))
tagged

[('Called', 'VBN'),
 ('from', 'IN'),
 ('a', 'DT'),
 ('retirement', 'NN'),
 ('which', 'WDT'),
 ('I', 'PRP'),
 ('had', 'VBD'),
 ('supposed', 'VBN'),
 ('was', 'VBD'),
 ('to', 'TO'),
 ('continue', 'VB'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('residue', 'NN'),
 ('of', 'IN'),
 ('my', 'PRP$'),
 ('life', 'NN'),
 ('to', 'TO'),
 ('fill', 'VB'),
 ('the', 'DT'),
 ('chief', 'JJ'),
 ('executive', 'NN'),
 ('office', 'NN'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('great', 'JJ'),
 ('and', 'CC'),
 ('free', 'JJ'),
 ('nation', 'NN'),
 (',', ','),
 ('I', 'PRP'),
 ('appear', 'VBP'),
 ('before', 'IN'),
 ('you', 'PRP'),
 (',', ','),
 ('fellow-citizens', 'NNS'),
 (',', ','),
 ('to', 'TO'),
 ('take', 'VB'),
 ('the', 'DT'),
 ('oaths', 'NNS'),
 ('which', 'WDT'),
 ('the', 'DT'),
 ('Constitution', 'NNP'),
 ('prescribes', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('necessary', 'JJ'),
 ('qualification', 'NN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('performance', 'NN'),
 ('of', 'IN'),
 ('its', 'PRP$'),
 ('duties', 'NNS'),
 (';', ':'),
 

In [25]:
# Tokenize the content into sentences: sentences
sentences = nltk.sent_tokenize(content)

In [26]:
# Tokenize each sentence into words: token_sentences
token_sentences = [nltk.word_tokenize(sent) for sent in sentences]
print(token_sentences)



In [27]:
# Tag each tokenized sentence into parts of speech: pos_sentences
pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences] 
print(pos_sentences)



In [28]:
# Create the named entity chunks: chunked_sentences
chunked_sentences = nltk.ne_chunk_sents(pos_sentences, binary=True)
print(chunked_sentences)

<generator object ParserI.parse_sents.<locals>.<genexpr> at 0x000001C2B7E1EF48>


In [29]:
# Test for stems of the tree with 'NE' tags
for sent in chunked_sentences:
    for chunk in sent:
        if hasattr(chunk, "label") and chunk.label() == "NE":
            print(chunk)


(NE Roman/NNP)
(NE Roman/NNP)
(NE Beneficent/NNP Creator/NNP)
(NE United/NNP States/NNPS)
(NE American/NNP)
(NE Roman/NNP)
(NE Athens/NNP)
(NE Far/NNP)
(NE American/JJ)
(NE United/NNP States/NNPS)
(NE United/NNP States/NNPS)
(NE Mr./NNP Jefferson/NNP)
(NE Congress/NNP)
(NE United/NNP States/NNPS)
(NE Congress/NNP)
(NE Congress/NNP)
(NE Congress/NNP)
(NE Congress/NNP)
(NE State/NNP)
(NE Union/NNP)
(NE United/NNP States/NNPS)
(NE Congress/NNP)
(NE Mr./NNP Madison/NNP)
(NE Upward/NNP)
(NE States/NNPS)
(NE Mr./NNP Jefferson/NNP)
(NE Armies/NNPS)
(NE Navy/NNP)
(NE United/NNP States/NNPS)
(NE Europe/NNP)
(NE Roman/NNP Emperor/NNP)
(NE Caesar/NNP)
(NE Roman/NNP)
(NE Treasury/NNP)
(NE Treasury/NNP)
(NE Treasury/NNP Department/NNP)
(NE Treasury/NNP)
(NE Congress/NNP)
(NE Mr./NNP Jefferson/NNP)
(NE Executive/NNP)
(NE Executive/NNP)
(NE Congress/NNP)
(NE Parliament/NNP)
(NE Senate/NNP)
(NE House/NNP)
(NE United/NNP States/NNPS)
(NE American/JJ)
(NE British/JJ)
(NE England/NNP)
(NE American/JJ)
(N

In [37]:
# Import spacy
import spacy

In [39]:
# Instantiate the English model: nlp
nlp = spacy.load('en_core_web_sm', tagger=False, parser=False, matcher=False)


In [40]:
# Create a new document: doc
doc = nlp(content)

In [41]:
# Print all of the found entities and their labels
for ent in doc.ents:
    print(ent.label_, ent.text)


LAW Constitution
LOC Roman
DATE two thousand years
PERSON Roman
PERSON Magistrate
DATE a few months
ORG Administration
ORG Almighty Power
LAW Constitution
ORG the Beneficent Creator
LAW The Constitution of the United States
NORP American
NORP Roman
NORP democrat
GPE Athens
LAW Constitution
ORG Government
NORP American
GPE the United States
LAW Constitution
LAW Constitution
CARDINAL one
LAW Constitution
CARDINAL one
CARDINAL one
CARDINAL one
LAW the Constitution of the United States
ORDINAL first
EVENT Convention
NORP republicans
DATE the day
ORG the Federal Government
ORG Government
DATE some years past
LAW Constitution
ORDINAL second
PERSON Jefferson
GPE States
CARDINAL one
LAW Constitution
NORP republican
LAW Constitution
ORDINAL second
LAW Constitution
CARDINAL one
LAW Constitution
ORG the Congress of the United States
LAW Constitution
ORG Legislature
ORG Legislature
LAW Constitution
ORG Executive
CARDINAL two-thirds
ORG Congress
CARDINAL one
CARDINAL one
LAW Constitution
ORG State


In [42]:
#Printing Named Entity labels
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Constitution 221 233 LAW
Roman 581 586 LOC
two thousand years 952 970 DATE
Roman 1027 1032 PERSON
Magistrate 1253 1263 PERSON
a few months 1703 1715 DATE
Administration 1821 1835 ORG
Almighty Power 2401 2415 ORG
Constitution 2620 2632 LAW
the Beneficent Creator 3654 3676 ORG
The Constitution of the United States 3838 3875 LAW
American 4405 4413 NORP
Roman 4612 4617 NORP
democrat 4702 4710 NORP
Athens 4714 4720 GPE
Constitution 5343 5355 LAW
Government 5638 5648 ORG
American 5688 5696 NORP
the United States 6008 6025 GPE
Constitution 6479 6491 LAW
Constitution 7063 7075 LAW
one 7241 7244 CARDINAL
Constitution 7529 7541 LAW
one 7773 7776 CARDINAL
one 7961 7964 CARDINAL
one 8099 8102 CARDINAL
the Constitution of the United States 8169 8206 LAW
first 8207 8212 ORDINAL
Convention 8240 8250 EVENT
republicans 8289 8300 NORP
the day 8304 8311 DATE
the Federal Government 8378 8400 ORG
Government 8780 8790 ORG
some years past 9002 9017 DATE
Constitution 9645 9657 LAW
second 9809 9815 ORDINAL
Jef