In [1]:
""" NLPIA Chapter 2 Section 2.1 Code Listings and Snippets """
import pandas as pd


sentence = "Thomas Jefferson began building Monticello at the age of 26."
sentence.split()
# ['Thomas', 'Jefferson', 'began', 'building', 'Monticello', 'at', 'the', 'age', 'of', '26.']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [2]:
str.split(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [3]:
import numpy as np
# Tokenize on whitespace, then build the vocabulary: the distinct tokens in sorted order.
token_sequence = sentence.split()
vocab = list(set(token_sequence))
vocab.sort()
', '.join(vocab)

'26., Jefferson, Monticello, Thomas, age, at, began, building, of, the'

In [5]:
# One row per token of the sentence, one column per vocabulary entry, all zeros to start.
num_tokens, vocab_size = len(token_sequence), len(vocab)
onehot_vectors = np.zeros(shape=(num_tokens, vocab_size), dtype=int)

In [6]:
print(num_tokens)
print(vocab_size)

10
10


In [7]:
onehot_vectors

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [8]:
# Look up each word's column in a dict built once, instead of re-scanning the
# sorted vocabulary list with vocab.index() for every token.
column_of = {word: col for col, word in enumerate(vocab)}
# Row = position of the token in the sentence, column = that token's slot in vocab.
for row, word in enumerate(token_sequence):
    onehot_vectors[row, column_of[word]] = 1
' '.join(vocab)

'26. Jefferson Monticello Thomas age at began building of the'

In [9]:
onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [10]:
import pandas as pd
pd.DataFrame(onehot_vectors, columns=vocab)

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


In [11]:
# Same one-hot table as a DataFrame, with the vocabulary words as column labels.
df = pd.DataFrame(onehot_vectors, columns=vocab)
# Replace every cell equal to 0 with an empty string so only the 1 entries stand out when displayed.
df[df == 0] =''
df

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,,,,1.0,,,,,,
1,,1.0,,,,,,,,
2,,,,,,,1.0,,,
3,,,,,,,,1.0,,
4,,,1.0,,,,,,,
5,,,,,,1.0,,,,
6,,,,,,,,,,1.0
7,,,,,1.0,,,,,
8,,,,,,,,,1.0,
9,1.0,,,,,,,,,


In [12]:
# Bag of words as a plain dict: every distinct whitespace-separated token of the sentence maps to 1.
sentence_bow = dict.fromkeys(sentence.split(), 1)

sorted(sentence_bow.items())

[('26.', 1),
 ('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1)]

In [13]:
# A slightly better data structure: a one-row DataFrame (row label "sent") with one column per token
sentence = "Thomas Jefferson began building Monticello at the age of 26."
token_flags = pd.Series({token: 1 for token in sentence.split()})
df = pd.DataFrame(token_flags, columns=['sent']).T
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent,1,1,1,1,1,1,1,1,1,1


In [15]:
# And a pandas dataframe is great for holding multiple texts (sentences, tweets, or documents)
# Each entry below is one newline-terminated sentence; they are concatenated into a single string.
sentences = "".join([
    "Thomas Jefferson began building Monticello at the age of 26. \n",
    "Construction was done mostly by local masons and carpenters.\n",
    "He moved into the South Pavilion in 1770.\n",
    "Turning Monticello into a neoclassical masterpiece was Jefferson's obession.\n",
])
sentences

"Thomas Jefferson began building Monticello at the age of 26. \nConstruction was done mostly by local masons and carpenters.\nHe moved into the South Pavilion in 1770.\nTurning Monticello into a neoclassical masterpiece was Jefferson's obession.\n"

In [16]:
# One bag-of-words dict per line of `sentences`, keyed 'sent0', 'sent1', ...
# `sentences` ends with "\n", so split('\n') also yields a final empty piece;
# that piece becomes an empty dict and therefore an all-zero last row.
corpus = {
    'sent{}'.format(i): {tok: 1 for tok in sent.split()}
    for i, sent in enumerate(sentences.split('\n'))
}
# Tokens absent from a sentence come out as NaN; turn them into integer 0s and
# transpose so that each row is a sentence and each column a token.
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T

In [17]:
df

Unnamed: 0,1770.,26.,Construction,He,Jefferson,Jefferson's,Monticello,Pavilion,South,Thomas,...,local,masons,masterpiece,mostly,moved,neoclassical,obession.,of,the,was
sent0,0,1,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,1,1,0
sent1,0,0,1,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,1
sent2,1,0,0,1,0,0,0,1,1,0,...,0,0,0,0,1,0,0,0,1,0
sent3,0,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,1,1,0,0,1
sent4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df[df.columns[:10]]

Unnamed: 0,1770.,26.,Construction,He,Jefferson,Jefferson's,Monticello,Pavilion,South,Thomas
sent0,0,1,0,0,1,0,1,0,0,1
sent1,0,0,1,0,0,0,0,0,0,0
sent2,1,0,0,1,0,0,0,1,1,0
sent3,0,0,0,0,0,1,1,0,0,0
sent4,0,0,0,0,0,0,0,0,0,0


In [19]:
# Dot product of two small vectors.
# Use numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0, where it raises AttributeError.
v1 = np.array([1, 2, 3])
v2 = np.array([2, 3, 4])
v1.dot(v2)

20

In [20]:
(v1*v2).sum()

20

In [21]:
sum([x1*x2 for x1, x2 in zip(v1, v2)])

20

In [22]:
df = df.T

In [23]:
df

Unnamed: 0,sent0,sent1,sent2,sent3,sent4
1770.,0,0,1,0,0
26.,1,0,0,0,0
Construction,0,1,0,0,0
He,0,0,1,0,0
Jefferson,1,0,0,0,0
Jefferson's,0,0,0,1,0
Monticello,1,0,0,1,0
Pavilion,0,0,1,0,0
South,0,0,1,0,0
Thomas,1,0,0,0,0


In [24]:
df.sent0.dot(df.sent1)

0

In [25]:
df.sent0.dot(df.sent2)

1

In [26]:
df.sent0.dot(df.sent3)

1

In [27]:
import re
sentence = "Thomas Jefferson began building Monticello at the age of 26."
# Split on runs of dashes, whitespace and common punctuation. The sentence ends
# with a separator, so the last element of the result is an empty string.
separators = re.compile(r'[-\s.,;!?]+')
tokens = separators.split(sentence)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [28]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."
pattern = re.compile(r'([-\s.,;!?])+')
tokens = pattern.split(sentence)
# Because the pattern has a capturing group, split() returns the separators
# too; skip those and any empty strings, printing only the words.
for piece in tokens:
    if not piece or piece in '- \t\n.,;!?':
        continue
    print(piece)

Thomas
Jefferson
began
building
Monticello
at
the
age
of
26


In [30]:
from nltk.tokenize import RegexpTokenizer
# Alternatives are tried left to right: a run of word characters, then a dollar
# amount such as "$3.88", then any other run of non-whitespace characters.
# The "$" has to be escaped: a bare "$" is the end-of-string anchor, so the
# currency alternative could never match and dollar amounts fell through to the
# catch-all \S+, which also swallows any punctuation attached to them.
tokenizer = RegexpTokenizer(r'\w+|\$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [31]:
from nltk.tokenize import TreebankWordTokenizer
sentence = "Thomas Jefferson began building Monticello at the age of 26."
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [32]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."
pattern = re.compile(r'([-\s.,;!?])+')
# Keep only the real words: drop empty strings and the separator characters
# that the capturing group makes split() return.
tokens = []
for piece in pattern.split(sentence):
    if piece and piece not in '- \t\n.,;!?':
        tokens.append(piece)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [33]:
from nltk.util import ngrams
list(ngrams(tokens, 2))

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [34]:
list(ngrams(tokens, 3))

[('Thomas', 'Jefferson', 'began'),
 ('Jefferson', 'began', 'building'),
 ('began', 'building', 'Monticello'),
 ('building', 'Monticello', 'at'),
 ('Monticello', 'at', 'the'),
 ('at', 'the', 'age'),
 ('the', 'age', 'of'),
 ('age', 'of', '26')]

In [35]:
two_grams = list(ngrams(tokens, 2))
[" ".join(x) for x in two_grams]

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monticello',
 'Monticello at',
 'at the',
 'the age',
 'age of',
 'of 26']

In [36]:
# A tiny hand-written stop word list, used to filter an example token list.
stop_words = ["a", "an", "the", "on", "of", "off", "this", "is"]
tokens = ["the", "house", "is", "on", "fire"]
tokens_without_stopwords = list(filter(lambda tok: tok not in stop_words, tokens))
print(tokens_without_stopwords)

['house', 'fire']


In [37]:
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [38]:
len(stop_words)

179

In [39]:
stop_words[:7]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours']

In [40]:
# Print the stop words that are exactly one character long.
for sw in stop_words:
    if len(sw) != 1:
        continue
    print(sw)

i
a
s
t
d
m
o
y


In [41]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
len(sklearn_stop_words)

318

In [42]:
type(sklearn_stop_words)

frozenset

In [43]:
# Materialize the stop word frozenset as a list (in the set's iteration order).
sk = list(sklearn_stop_words)
sk

['himself',
 'hundred',
 'through',
 'again',
 'such',
 'hers',
 'how',
 'where',
 'my',
 'ltd',
 'has',
 'only',
 'its',
 'either',
 'together',
 'most',
 'into',
 'somewhere',
 'cry',
 'amongst',
 'thru',
 'sixty',
 'after',
 'of',
 'get',
 'hereby',
 'made',
 'upon',
 'serious',
 'without',
 'until',
 'thereafter',
 'ie',
 'nobody',
 'thin',
 'eight',
 'enough',
 'thick',
 'who',
 'nothing',
 'because',
 'empty',
 'former',
 'have',
 'i',
 'one',
 'elsewhere',
 'twenty',
 'fire',
 'so',
 'other',
 'with',
 'these',
 'four',
 'afterwards',
 'alone',
 'them',
 'against',
 'the',
 'becoming',
 'meanwhile',
 'move',
 'although',
 'else',
 'in',
 'mine',
 'moreover',
 'nine',
 'two',
 'etc',
 'over',
 'as',
 'seems',
 'whereafter',
 'amount',
 'before',
 'fifty',
 'his',
 'less',
 'me',
 'everything',
 'front',
 'namely',
 'neither',
 'below',
 're',
 'were',
 'much',
 'he',
 'otherwise',
 'by',
 'be',
 'however',
 'noone',
 'on',
 'this',
 'off',
 'anything',
 'both',
 'themselves',
 'p

In [44]:
len(stop_words)

179

In [45]:
# Count how many of the scikit-learn stop words also appear in the NLTK list.
intersection = sum(1 for x in sk if x in stop_words)
intersection

119

In [46]:
# Case folding: lower-case every token.
tokens = ['House', 'Visitor', 'Center']
normalized_tokens = list(map(str.lower, tokens))
print(normalized_tokens)

['house', 'visitor', 'center']


In [47]:
def stem(phrase):
    """Crudely stem every word of ``phrase``.

    The phrase is lower-cased and split on whitespace. A single trailing "s"
    is dropped from each word unless the word ends in "ss", and leading or
    trailing apostrophes are then stripped.

    Args:
        phrase: Text to stem.

    Returns:
        The stemmed words joined by single spaces.
    """
    stems = []
    for word in phrase.lower().split():
        # Group 1 is the word minus its optional final "s"; the ".*ss"
        # alternative comes first so a double-s ending is kept whole.
        root = re.match('^(.*ss|.*?)(s)?$', word).group(1)
        stems.append(root.strip("'"))
    return ' '.join(stems)

In [48]:
stem("houses")

'house'

In [49]:
stem("Doctor House's calls")

'doctor house call'

In [50]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
' '.join([stemmer.stem(w).strip("'") for w in "dish washer's waseded dishes".split()])

'dish washer wased dish'

In [51]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [52]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("better")

'better'

In [54]:
lemmatizer.lemmatize("better", pos = "a")

'good'

In [55]:
lemmatizer.lemmatize("good", pos = "a")

'good'

In [56]:
lemmatizer.lemmatize("goods", pos = "a")

'goods'

In [57]:
lemmatizer.lemmatize("goods", pos = "n")

'good'

In [58]:
lemmatizer.lemmatize("goodness", pos = "n")

'goodness'

In [59]:
lemmatizer.lemmatize("best", pos = "a")

'best'

In [60]:
stemmer.stem('goodness')

'good'

In [62]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()
sa.lexicon

{'$:': -1.5,
 '%)': -0.4,
 '%-)': -1.5,
 '&-:': -0.4,
 '&:': -0.7,
 "( '}{' )": 1.6,
 '(%': -0.9,
 "('-:": 2.2,
 "(':": 2.3,
 '((-:': 2.1,
 '(*': 1.1,
 '(-%': -0.7,
 '(-*': 1.3,
 '(-:': 1.6,
 '(-:0': 2.8,
 '(-:<': -0.4,
 '(-:o': 1.5,
 '(-:O': 1.5,
 '(-:{': -0.1,
 '(-:|>*': 1.9,
 '(-;': 1.3,
 '(-;|': 2.1,
 '(8': 2.6,
 '(:': 2.2,
 '(:0': 2.4,
 '(:<': -0.2,
 '(:o': 2.5,
 '(:O': 2.5,
 '(;': 1.1,
 '(;<': 0.3,
 '(=': 2.2,
 '(?:': 2.1,
 '(^:': 1.5,
 '(^;': 1.5,
 '(^;0': 2.0,
 '(^;o': 1.9,
 '(o:': 1.6,
 ")':": -2.0,
 ")-':": -2.1,
 ')-:': -2.1,
 ')-:<': -2.2,
 ')-:{': -2.1,
 '):': -1.8,
 '):<': -1.9,
 '):{': -2.3,
 ');<': -2.6,
 '*)': 0.6,
 '*-)': 0.3,
 '*-:': 2.1,
 '*-;': 2.4,
 '*:': 1.9,
 '*<|:-)': 1.6,
 '*\\0/*': 2.3,
 '*^:': 1.6,
 ',-:': 1.2,
 "---'-;-{@": 2.3,
 '--<--<@': 2.2,
 '.-:': -1.2,
 '..###-:': -1.7,
 '..###:': -1.9,
 '/-:': -1.3,
 '/:': -1.3,
 '/:<': -1.4,
 '/=': -0.9,
 '/^:': -1.0,
 '/o:': -1.4,
 '0-8': 0.1,
 '0-|': -1.2,
 '0:)': 1.9,
 '0:-)': 1.4,
 '0:-3': 1.5,
 '0:03': 1.9,
 '

In [63]:
[(tok, score) for tok, score in sa.lexicon.items() if " " in tok]

[("( '}{' )", 1.6),
 ("can't stand", -2.0),
 ('fed up', -1.8),
 ('screwed up', -1.5)]

In [64]:
sa.polarity_scores(text = "Python is very readable and it's great for NLP")

{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'compound': 0.6249}

In [65]:
sa.polarity_scores(text = "Python is not a bad choice for most applications")

{'neg': 0.0, 'neu': 0.711, 'pos': 0.289, 'compound': 0.431}

In [66]:
corpus = ["Absolutely perfect! Love it! :-) :-) :-)",
"Horrible! Completely useless. :(",
"It was OK. Some good and some bad things."]

In [67]:
# Print each document prefixed by its signed compound sentiment score.
for doc in corpus:
    compound = sa.polarity_scores(doc)['compound']
    print('{:+}: {}'.format(compound, doc))

+0.9428: Absolutely perfect! Love it! :-) :-) :-)
-0.8768: Horrible! Completely useless. :(
+0.3254: It was OK. Some good and some bad things.
