In [1]:
""" NLPIA Chapter 2 Section 2.1 Code Listings and Snippets """
import pandas as pd


sentence = "Thomas Jefferson began building Monticello at the age of 26."
sentence.split()
# ['Thomas', 'Jefferson', 'began', 'building', 'Monticello', 'at', 'the', 'age', 'of', 'twenty-six.']

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [2]:
str.split(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [3]:
import numpy as np
token_sequence = str.split(sentence)
vocab = sorted(set(token_sequence))
', '.join(vocab)

'26., Jefferson, Monticello, Thomas, age, at, began, building, of, the'

In [4]:
num_tokens = len(token_sequence)
vocab_size = len(vocab)
onehot_vectors = np.zeros((num_tokens, vocab_size), int)

In [5]:
for i, word in enumerate(token_sequence):
    onehot_vectors[i, vocab.index(word)] = 1
' '.join(vocab)

'26. Jefferson Monticello Thomas age at began building of the'

In [6]:
onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [7]:
import pandas as pd
pd.DataFrame(onehot_vectors, columns=vocab)

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


In [8]:
df = pd.DataFrame(onehot_vectors, columns=vocab)
df[df == 0] =''
df

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,,,,1.0,,,,,,
1,,1.0,,,,,,,,
2,,,,,,,1.0,,,
3,,,,,,,,1.0,,
4,,,1.0,,,,,,,
5,,,,,,1.0,,,,
6,,,,,,,,,,1.0
7,,,,,1.0,,,,,
8,,,,,,,,,1.0,
9,1.0,,,,,,,,,


In [9]:
sentence_bow = {}
# As you can see, this simple Python function already does a decent job tokenizing the example sentence. A couple more vanilla python statements and you can create numerical vector representations for each word.
for token in sentence.split():
    sentence_bow[token] = 1

sorted(sentence_bow.items())

[('26.', 1),
 ('Jefferson', 1),
 ('Monticello', 1),
 ('Thomas', 1),
 ('age', 1),
 ('at', 1),
 ('began', 1),
 ('building', 1),
 ('of', 1),
 ('the', 1)]

In [10]:
# A slightly better data structure
sentence = "Thomas Jefferson began building Monticello at the age of 26."
df = pd.DataFrame(pd.Series(dict([(token, 1) for token in sentence.split()])), columns=['sent']).T
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent,1,1,1,1,1,1,1,1,1,1


In [11]:
# And a pandas dataframe is great for holding multiple texts (sentences, tweets, or documents)
sentences  = "Thomas Jefferson began building Monticello at the age of 26. \n"
sentences += "Construction was done mostly by local masons and carpenters.\n"
sentences += "He moved into the South Pavilion in 1770.\n" 
sentences += "Turning Monticello into a neoclassical masterpiece was Jefferson's obession.\n"
sentences

"Thomas Jefferson began building Monticello at the age of 26. \nConstruction was done mostly by local masons and carpenters.\nHe moved into the South Pavilion in 1770.\nTurning Monticello into a neoclassical masterpiece was Jefferson's obession.\n"

In [12]:
corpus = {}
for i, sent in enumerate(sentences.split('\n')):
    corpus['sent{}'.format(i)] = dict((tok, 1) for tok in sent.split())
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T

In [13]:
df[df.columns[:10]]

Unnamed: 0,1770.,26.,Construction,He,Jefferson,Jefferson's,Monticello,Pavilion,South,Thomas
sent0,0,1,0,0,1,0,1,0,0,1
sent1,0,0,1,0,0,0,0,0,0,0
sent2,1,0,0,1,0,0,0,1,1,0
sent3,0,0,0,0,0,1,1,0,0,0
sent4,0,0,0,0,0,0,0,0,0,0


In [14]:
v1 = pd.np.array([1, 2, 3])
v2 = pd.np.array([2, 3, 4])
v1.dot(v2)

20

In [15]:
(v1*v2).sum()

20

In [16]:
sum([x1*x2 for x1, x2 in zip(v1, v2)])

20

In [17]:
df = df.T

In [18]:
df.sent0.dot(df.sent1)

0

In [19]:
df.sent0.dot(df.sent2)

1

In [20]:
df.sent0.dot(df.sent3)

1

In [21]:
import re
sentence = "Thomas Jefferson began building Monticello at the age of 26."
tokens = re.split(r'[-\s.,;!?]+', sentence)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [22]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."
pattern = re.compile(r'([-\s.,;!?])+')
tokens = pattern.split(sentence)
for x in tokens:
    if(x and x not in '- \t\n.,;!?'):
        print(x)

Thomas
Jefferson
began
building
Monticello
at
the
age
of
26


In [26]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [27]:
from nltk.tokenize import TreebankWordTokenizer
sentence = "Thomas Jefferson began building Monticello at the age of 26."
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '.']

In [29]:
sentence = "Thomas Jefferson began building Monticello at the age of 26."
pattern = re.compile(r'([-\s.,;!?])+')
tokens = pattern.split(sentence)
tokens = [x for x in tokens if(x and x not in '- \t\n.,;!?')]
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [30]:
from nltk.util import ngrams
list(ngrams(tokens, 2))

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [31]:
list(ngrams(tokens, 3))

[('Thomas', 'Jefferson', 'began'),
 ('Jefferson', 'began', 'building'),
 ('began', 'building', 'Monticello'),
 ('building', 'Monticello', 'at'),
 ('Monticello', 'at', 'the'),
 ('at', 'the', 'age'),
 ('the', 'age', 'of'),
 ('age', 'of', '26')]

In [33]:
two_grams = list(ngrams(tokens, 2))
[" ".join(x) for x in two_grams]

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monticello',
 'Monticello at',
 'at the',
 'the age',
 'age of',
 'of 26']

In [34]:
stop_words = ["a", "an", "the", "on", "of", "off", "this", "is"]
tokens = ["the", "house", "is", "on", "fire"]
tokens_without_stopwords = [x for x in tokens if x not in stop_words]
print(tokens_without_stopwords)

['house', 'fire']


In [35]:
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [36]:
len(stop_words)

179

In [37]:
stop_words[:7]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours']

In [38]:
for sw in stop_words:
    if(len(sw) == 1):
        print(sw)

i
a
s
t
d
m
o
y


In [39]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
len(sklearn_stop_words)

318

In [45]:
type(sklearn_stop_words)

frozenset

In [47]:
sk = [x for x in sklearn_stop_words]
sk

['only',
 'move',
 'or',
 'therein',
 'became',
 'around',
 'mostly',
 'afterwards',
 'latterly',
 'via',
 'done',
 'by',
 'go',
 'hereafter',
 'of',
 'anything',
 'off',
 'can',
 'something',
 'sometimes',
 'they',
 'too',
 'well',
 'wherein',
 'with',
 'find',
 'behind',
 'ever',
 'whereafter',
 'amoungst',
 'that',
 'because',
 'nowhere',
 'myself',
 'beyond',
 'not',
 'latter',
 'himself',
 'already',
 'co',
 'your',
 'anyhow',
 'forty',
 'itself',
 'yourselves',
 'here',
 'put',
 'between',
 'again',
 'un',
 'detail',
 'six',
 'cry',
 'thereby',
 'must',
 'both',
 'anyone',
 'whole',
 'none',
 'interest',
 'hundred',
 'nevertheless',
 'so',
 'ten',
 'twenty',
 'namely',
 'has',
 'under',
 'whether',
 'nor',
 'thereupon',
 'into',
 'against',
 'indeed',
 'call',
 'seemed',
 'until',
 'after',
 'alone',
 'much',
 'everywhere',
 'side',
 'sixty',
 'whence',
 'up',
 'before',
 'four',
 'whatever',
 'his',
 'who',
 'seem',
 'more',
 're',
 'than',
 'towards',
 'very',
 'yourself',
 'wh

In [40]:
len(stop_words)

179

In [48]:
intersection = 0
for x in sk:
    if x in stop_words:
        intersection = intersection + 1
intersection

119

In [49]:
tokens = ['House', 'Visitor', 'Center']
normalized_tokens = [x.lower() for x in tokens]
print(normalized_tokens)

['house', 'visitor', 'center']


In [57]:
def stem(phrase):
    return ' '.join([re.findall('^(.*ss|.*?)(s)?$', word)[0][0].strip("'") for word in phrase.lower().split()])

In [58]:
stem("houses")

'house'

In [59]:
stem("Doctor House's calls")

'doctor house call'

In [51]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
' '.join([stemmer.stem(w).strip("'") for w in "dish washer's waseded dishes".split()])

'dish washer wased dish'

In [60]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\USER/nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [64]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("better")

'better'

In [65]:
lemmatizer.lemmatizer("better", pos = "a")

AttributeError: 'WordNetLemmatizer' object has no attribute 'lemmatizer'

In [66]:
lemmatizer.lemmatize("good", pos = "a")

'good'

In [67]:
lemmatizer.lemmatize("goods", pos = "a")

'goods'

In [68]:
lemmatizer.lemmatize("goods", pos = "n")

'good'

In [69]:
lemmatizer.lemmatize("goodness", pos = "n")

'goodness'

In [70]:
lemmatizer.lemmatize("best", pos = "a")

'best'

In [71]:
stemmer.stem('goodness')

'good'

In [73]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()
sa.lexicon

{'$:': -1.5,
 '%)': -0.4,
 '%-)': -1.5,
 '&-:': -0.4,
 '&:': -0.7,
 "( '}{' )": 1.6,
 '(%': -0.9,
 "('-:": 2.2,
 "(':": 2.3,
 '((-:': 2.1,
 '(*': 1.1,
 '(-%': -0.7,
 '(-*': 1.3,
 '(-:': 1.6,
 '(-:0': 2.8,
 '(-:<': -0.4,
 '(-:o': 1.5,
 '(-:O': 1.5,
 '(-:{': -0.1,
 '(-:|>*': 1.9,
 '(-;': 1.3,
 '(-;|': 2.1,
 '(8': 2.6,
 '(:': 2.2,
 '(:0': 2.4,
 '(:<': -0.2,
 '(:o': 2.5,
 '(:O': 2.5,
 '(;': 1.1,
 '(;<': 0.3,
 '(=': 2.2,
 '(?:': 2.1,
 '(^:': 1.5,
 '(^;': 1.5,
 '(^;0': 2.0,
 '(^;o': 1.9,
 '(o:': 1.6,
 ")':": -2.0,
 ")-':": -2.1,
 ')-:': -2.1,
 ')-:<': -2.2,
 ')-:{': -2.1,
 '):': -1.8,
 '):<': -1.9,
 '):{': -2.3,
 ');<': -2.6,
 '*)': 0.6,
 '*-)': 0.3,
 '*-:': 2.1,
 '*-;': 2.4,
 '*:': 1.9,
 '*<|:-)': 1.6,
 '*\\0/*': 2.3,
 '*^:': 1.6,
 ',-:': 1.2,
 "---'-;-{@": 2.3,
 '--<--<@': 2.2,
 '.-:': -1.2,
 '..###-:': -1.7,
 '..###:': -1.9,
 '/-:': -1.3,
 '/:': -1.3,
 '/:<': -1.4,
 '/=': -0.9,
 '/^:': -1.0,
 '/o:': -1.4,
 '0-8': 0.1,
 '0-|': -1.2,
 '0:)': 1.9,
 '0:-)': 1.4,
 '0:-3': 1.5,
 '0:03': 1.9,
 '

In [75]:
[(tok, score) for tok, score in sa.lexicon.items() if " " in tok]

[("( '}{' )", 1.6),
 ("can't stand", -2.0),
 ('fed up', -1.8),
 ('screwed up', -1.5)]

In [76]:
sa.polarity_scores(text = "Python is very readable and it's great for NLP")

{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'compound': 0.6249}

In [77]:
sa.polarity_scores(text = "Python is not a bad choice for most applications")

{'neg': 0.0, 'neu': 0.711, 'pos': 0.289, 'compound': 0.431}

In [78]:
corpus = ["Absolutely perfect! Love it! :-) :-) :-)",
"Horrible! Completely useless. :(",
"It was OK. Some good and some bad things."]

In [79]:
for doc in corpus:
    scores = sa.polarity_scores(doc)
    print('{:+}: {}'.format(scores['compound'], doc))

+0.9428: Absolutely perfect! Love it! :-) :-) :-)
-0.8768: Horrible! Completely useless. :(
+0.3254: It was OK. Some good and some bad things.


In [1]:
""" Section 2.3 code listings from NLPIA """

' Section 2.3 code listings from NLPIA '

In [2]:
import pandas as pd
pd.options.display.max_colwidth = 40  # default: 50
pd.options.display.width = 75  # default: 80
pd.options.display.max_columns = 12  # default: 0

In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()
sa.lexicon

{'$:': -1.5,
 '%)': -0.4,
 '%-)': -1.5,
 '&-:': -0.4,
 '&:': -0.7,
 "( '}{' )": 1.6,
 '(%': -0.9,
 "('-:": 2.2,
 "(':": 2.3,
 '((-:': 2.1,
 '(*': 1.1,
 '(-%': -0.7,
 '(-*': 1.3,
 '(-:': 1.6,
 '(-:0': 2.8,
 '(-:<': -0.4,
 '(-:o': 1.5,
 '(-:O': 1.5,
 '(-:{': -0.1,
 '(-:|>*': 1.9,
 '(-;': 1.3,
 '(-;|': 2.1,
 '(8': 2.6,
 '(:': 2.2,
 '(:0': 2.4,
 '(:<': -0.2,
 '(:o': 2.5,
 '(:O': 2.5,
 '(;': 1.1,
 '(;<': 0.3,
 '(=': 2.2,
 '(?:': 2.1,
 '(^:': 1.5,
 '(^;': 1.5,
 '(^;0': 2.0,
 '(^;o': 1.9,
 '(o:': 1.6,
 ")':": -2.0,
 ")-':": -2.1,
 ')-:': -2.1,
 ')-:<': -2.2,
 ')-:{': -2.1,
 '):': -1.8,
 '):<': -1.9,
 '):{': -2.3,
 ');<': -2.6,
 '*)': 0.6,
 '*-)': 0.3,
 '*-:': 2.1,
 '*-;': 2.4,
 '*:': 1.9,
 '*<|:-)': 1.6,
 '*\\0/*': 2.3,
 '*^:': 1.6,
 ',-:': 1.2,
 "---'-;-{@": 2.3,
 '--<--<@': 2.2,
 '.-:': -1.2,
 '..###-:': -1.7,
 '..###:': -1.9,
 '/-:': -1.3,
 '/:': -1.3,
 '/:<': -1.4,
 '/=': -0.9,
 '/^:': -1.0,
 '/o:': -1.4,
 '0-8': 0.1,
 '0-|': -1.2,
 '0:)': 1.9,
 '0:-)': 1.4,
 '0:-3': 1.5,
 '0:03': 1.9,
 '

In [4]:
[(tok, score) for tok, score in sa.lexicon.items() if " " in tok]

[("( '}{' )", 1.6),
 ("can't stand", -2.0),
 ('fed up', -1.8),
 ('screwed up', -1.5)]

In [5]:
sa.polarity_scores(text=\
... "Python is very readable and it's great for NLP.")

{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'compound': 0.6249}

In [6]:
sa.polarity_scores(text=\
... "Python is not a bad choice for most applications.")

{'neg': 0.0, 'neu': 0.711, 'pos': 0.289, 'compound': 0.431}

In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer  # noqa
sa = SentimentIntensityAnalyzer()
sa.lexicon

{'$:': -1.5,
 '%)': -0.4,
 '%-)': -1.5,
 '&-:': -0.4,
 '&:': -0.7,
 "( '}{' )": 1.6,
 '(%': -0.9,
 "('-:": 2.2,
 "(':": 2.3,
 '((-:': 2.1,
 '(*': 1.1,
 '(-%': -0.7,
 '(-*': 1.3,
 '(-:': 1.6,
 '(-:0': 2.8,
 '(-:<': -0.4,
 '(-:o': 1.5,
 '(-:O': 1.5,
 '(-:{': -0.1,
 '(-:|>*': 1.9,
 '(-;': 1.3,
 '(-;|': 2.1,
 '(8': 2.6,
 '(:': 2.2,
 '(:0': 2.4,
 '(:<': -0.2,
 '(:o': 2.5,
 '(:O': 2.5,
 '(;': 1.1,
 '(;<': 0.3,
 '(=': 2.2,
 '(?:': 2.1,
 '(^:': 1.5,
 '(^;': 1.5,
 '(^;0': 2.0,
 '(^;o': 1.9,
 '(o:': 1.6,
 ")':": -2.0,
 ")-':": -2.1,
 ')-:': -2.1,
 ')-:<': -2.2,
 ')-:{': -2.1,
 '):': -1.8,
 '):<': -1.9,
 '):{': -2.3,
 ');<': -2.6,
 '*)': 0.6,
 '*-)': 0.3,
 '*-:': 2.1,
 '*-;': 2.4,
 '*:': 1.9,
 '*<|:-)': 1.6,
 '*\\0/*': 2.3,
 '*^:': 1.6,
 ',-:': 1.2,
 "---'-;-{@": 2.3,
 '--<--<@': 2.2,
 '.-:': -1.2,
 '..###-:': -1.7,
 '..###:': -1.9,
 '/-:': -1.3,
 '/:': -1.3,
 '/:<': -1.4,
 '/=': -0.9,
 '/^:': -1.0,
 '/o:': -1.4,
 '0-8': 0.1,
 '0-|': -1.2,
 '0:)': 1.9,
 '0:-)': 1.4,
 '0:-3': 1.5,
 '0:03': 1.9,
 '

In [10]:
corpus = ["Absolutely perfect! Love it! :-) :-) :-)",
          "Horrible! Completely useless. :(",
          "It was OK. Some good and some bad things."]
for doc in corpus:
    scores = sa.polarity_scores(doc)
    print('{:+}: {}'.format(scores['compound'], doc))

+0.9428: Absolutely perfect! Love it! :-) :-) :-)
-0.8768: Horrible! Completely useless. :(
+0.3254: It was OK. Some good and some bad things.


In [30]:
with open('nlpia_movieReviewSnippets_GroundTruth.txt', 'r', encoding="utf-8") as f:
    raw_data = []
    for line in f:
        raw_data.append(line.split("\t"))

In [31]:
pd.DataFrame(raw_data).head()

Unnamed: 0,0,1,2
0,id,sentiment,text\n
1,1,2.26666666667,The Rock is destined to be the 21st ...
2,2,3.53333333333,The gorgeously elaborate continuatio...
3,3,-0.6,Effective but too tepid biopic\n
4,4,1.46666666667,If you sometimes like to go to the m...


In [32]:
type(raw_data)

list

In [66]:
for i in range(0, len(raw_data)):
    if(i>0):
        raw_data[i][1] = float(raw_data[i][1])
    raw_data[i][2] = raw_data[i][2].replace("\n", "")
pd.DataFrame(raw_data).head()

Unnamed: 0,0,1,2
0,id,sentiment,text
1,1,2.26667,The Rock is destined to be the 21st ...
2,2,3.53333,The gorgeously elaborate continuatio...
3,3,-0.6,Effective but too tepid biopic
4,4,1.46667,If you sometimes like to go to the m...


In [72]:
movies = pd.DataFrame(raw_data)
movies.columns= raw_data[0]
movies = movies.drop(movies.index[0])
movies.head()

Unnamed: 0,id,sentiment,text
1,1,2.26667,The Rock is destined to be the 21st ...
2,2,3.53333,The gorgeously elaborate continuatio...
3,3,-0.6,Effective but too tepid biopic
4,4,1.46667,If you sometimes like to go to the m...
5,5,1.73333,"Emerges as something rare, an issue ..."


In [73]:
movies.describe()

Unnamed: 0,id,sentiment,text
count,10605,10605.0,10605
unique,10605,1417.0,10603
top,2731,2.0,'Stock up on silver bullets for dire...
freq,1,151.0,2


In [80]:
movies["sentiment"].describe()

count     10605.0
unique     1417.0
top           2.0
freq        151.0
Name: sentiment, dtype: float64

In [81]:
movies["sentiment"].astype(float).describe()

count    10605.000000
mean         0.004831
std          1.922050
min         -3.875000
25%         -1.769231
50%         -0.080000
75%          1.833333
max          3.941176
Name: sentiment, dtype: float64

In [99]:
import pandas as pd
pd.set_option('display.width', 75)
from nltk.tokenize import casual_tokenize
bags_of_words = []
from collections import Counter

In [100]:
for text in movies.text:
    bags_of_words.append(Counter(casual_tokenize(text)))

In [101]:
len(bags_of_words)

10605

In [104]:
bags_of_words[0]

Counter({'The': 1,
         'Rock': 1,
         'is': 1,
         'destined': 1,
         'to': 2,
         'be': 1,
         'the': 1,
         '21st': 1,
         "Century's": 1,
         'new': 1,
         "'": 4,
         'Conan': 1,
         'and': 1,
         'that': 1,
         "he's": 1,
         'going': 1,
         'make': 1,
         'a': 1,
         'splash': 1,
         'even': 1,
         'greater': 1,
         'than': 1,
         'Arnold': 1,
         'Schwarzenegger': 1,
         ',': 1,
         'Jean': 1,
         'Claud': 1,
         'Van': 1,
         'Damme': 1,
         'or': 1,
         'Steven': 1,
         'Segal': 1,
         '.': 1})

In [105]:
type(bags_of_words[0])

collections.Counter

In [112]:
type(bags_of_words)

list

In [114]:
pd.DataFrame.from_records(bags_of_words[0:5])

Unnamed: 0,',",",.,/,21st,Arnold,...,too,trilogy,vision,words,writer,you
0,4.0,1.0,1.0,,1.0,1.0,...,,,,,,
1,4.0,,4.0,1.0,,,...,,1.0,1.0,1.0,1.0,
2,,,,,,,...,1.0,,,,,
3,,1.0,1.0,,,,...,,,,,,1.0
4,,1.0,1.0,,,,...,,,,,,


In [120]:
pd.DataFrame.from_records(bags_of_words[0:5]).fillna(0).astype(int)

Unnamed: 0,',",",.,/,21st,Arnold,...,too,trilogy,vision,words,writer,you
0,4,1,1,0,1,1,...,0,0,0,0,0,0
1,4,0,4,1,0,0,...,0,1,1,1,1,0
2,0,0,0,0,0,0,...,1,0,0,0,0,0
3,0,1,1,0,0,0,...,0,0,0,0,0,1
4,0,1,1,0,0,0,...,0,0,0,0,0,0


In [118]:
for i in range(0, 5):
    print(len(bags_of_words[i]))

33
34
5
18
21


In [119]:
df_bows = pd.DataFrame.from_records(bags_of_words)

In [122]:
df_bows.head()

Unnamed: 0,!,"""",#,$,%,&,...,zoning,zzzzzzzzz,½,élan,–,’
0,,,,,,,...,,,,,,
1,,,,,,,...,,,,,,
2,,,,,,,...,,,,,,
3,,,,,,,...,,,,,,
4,,,,,,,...,,,,,,


In [125]:
df_bows = df_bows.fillna(0).astype(int)

In [126]:
df_bows.shape

(10605, 20756)

In [184]:
movies.sentiment.shape

(10605,)

In [130]:
from sklearn.naive_bayes import MultinomialNB  # noqa
nb = MultinomialNB()
nb = nb.fit(df_bows, movies.sentiment > 0)
movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4
movies['error'] = (movies.predicted_sentiment - movies.sentiment).abs()
movies.error.mean().round(1)
# 2.4
movies['sentiment_ispositive'] = (movies.sentiment > 0).astype(int)
movies['predicted_ispositive'] = (movies.predicted_sentiment > 0).astype(int)
movies['sentiment predicted_sentiment sentiment_ispositive predicted_ispos'
       .split()].head(8)

Unnamed: 0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispos
1,2.26667,4,1,1
2,3.53333,4,1,1
3,-0.6,-4,0,0
4,1.46667,4,1,1
5,1.73333,4,1,1
6,2.53333,4,1,1
7,2.46667,4,1,1
8,1.26667,-4,1,0


In [131]:
(movies.predicted_ispositive == movies.sentiment_ispositive).sum() / len(movies)

0.9344648750589345

In [153]:
with open('nlpia_amazonReviewSnippets_GroundTruth.txt', 'r', encoding="utf-8") as f:
    product_data = []
    for line in f:
        product_data.append(line.split("\t"))

In [163]:
for i in range(0, len(product_data)):
    if(i>0):
        product_data[i][1] = float(product_data[i][1])
    product_data[i][2] = product_data[i][2].replace("\n", "")
products = pd.DataFrame(product_data)
products.columns= product_data[0]
products = products.drop(products.index[0])
pd.DataFrame(products).head()

Unnamed: 0,id,sentiment,text
1,1_1,-0.9,troubleshooting ad-2500 and ad-2600 ...
2,1_2,-0.15,"repost from january 13, 2004 with a ..."
3,1_3,-0.2,does your apex dvd player only play ...
4,1_4,-0.1,or does it play audio and video but ...
5,1_5,-0.5,before you try to return the player ...


In [164]:
len(products)

3708

In [165]:
bags_of_words = []
for text in products.text:
    bags_of_words.append(Counter(casual_tokenize(text)))

In [168]:
len(bags_of_words)

3708

In [169]:
df_product_bows = pd.DataFrame.from_records(bags_of_words)
df_product_bows = df_product_bows.fillna(0).astype(int)
df_all_bows = df_bows.append(df_product_bows)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [170]:
df_product_bows.shape

(3708, 5442)

In [172]:
df_bows.shape

(10605, 20756)

In [171]:
df_all_bows.shape

(14313, 23057)

In [174]:
df_all_bows.columns

Index(['!', '"', '#', '#38', '$', '%', '&', ''', '(', '(8',
       ...
       'zoomed', 'zooming', 'zooms', 'zx', 'zzzzzzzzz', '~', '½', 'élan',
       '–', '’'],
      dtype='object', length=23057)

In [175]:
df_product_bows2 = df_all_bows.iloc[len(movies):][df_bows.columns]
df_product_bows2.shape

(3708, 20756)

In [179]:
products['ispos'] = (products.sentiment > 0).astype(int)
products['ispos'].head()

1    0
2    0
3    0
4    0
5    0
Name: ispos, dtype: int32

In [181]:
products.sentiment.head()

1    -0.9
2   -0.15
3    -0.2
4    -0.1
5    -0.5
Name: sentiment, dtype: object

In [185]:
df_product_bows2.shape

(3708, 20756)

In [187]:
df_product_bows2 = df_product_bows2.fillna(0).astype(int)

In [188]:
nb.predict(df_product_bows2.values).astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [190]:
products['pred'] = nb.predict(df_product_bows2.values).astype(int)

In [191]:
(products.pred == products.ispos).sum() / len(products)

0.557982740021575