In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [10]:
# Fetch the pre-trained Punkt models that nltk.word_tokenize needs
# in order to split text into tokens.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/montygash/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Two short example sentences used by the tokenizing / tagging cells below.
# NOTE(review): the labels are inverted here -- the cheerful sentence is
# stored in `ex_bad` and the angry one in `ex_good`. The values are left
# untouched because the recorded outputs below depend on them; the names are
# reassigned the right way round before the VADER scoring cells.
ex_bad, ex_good = (
    "This is so fun! Yay!",
    "I hate this so much.",
)

In [15]:
# Split each example sentence into word and punctuation tokens with NLTK.
token_bad, token_good = (
    nltk.word_tokenize(sentence) for sentence in (ex_bad, ex_good)
)

# Render both token lists, one after the other.
display(token_bad, token_good)

['This', 'is', 'so', 'fun', '!', 'Yay', '!']

['I', 'hate', 'this', 'so', 'much', '.']

In [17]:
# Fetch the averaged-perceptron model used by nltk.pos_tag for
# part-of-speech tagging.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/montygash/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [18]:
# Tag every token with its part of speech; each result is a list of
# (token, tag) pairs with tags such as DT, VBZ, RB.
prt_speech_bad, prt_speech_good = map(nltk.pos_tag, (token_bad, token_good))

# Render both tagged sentences, one after the other.
display(prt_speech_bad, prt_speech_good)

[('This', 'DT'),
 ('is', 'VBZ'),
 ('so', 'RB'),
 ('fun', 'JJ'),
 ('!', '.'),
 ('Yay', 'NN'),
 ('!', '.')]

[('I', 'PRP'),
 ('hate', 'VBP'),
 ('this', 'DT'),
 ('so', 'RB'),
 ('much', 'JJ'),
 ('.', '.')]

In [20]:
# Fetch the maximum-entropy named-entity chunker model used by
# nltk.chunk.ne_chunk.
nltk.download("maxent_ne_chunker")

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/montygash/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [22]:
# Fetch the 'words' corpus as well; it is downloaded here as a companion
# resource for the named-entity chunker used in the next cell.
nltk.download("words")

[nltk_data] Downloading package words to /Users/montygash/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [23]:
# Group the POS-tagged tokens into chunks.
# Why: nltk.chunk.ne_chunk looks for named entities (people, organisations,
# places, ...) and wraps each one in a labelled subtree, so downstream code
# can treat a multi-word entity as a single unit. Neither example sentence
# contains a named entity, so both results print as flat (S ...) trees.
chunk_bad, chunk_good = (
    nltk.chunk.ne_chunk(tagged) for tagged in (prt_speech_bad, prt_speech_good)
)

# Pretty-print each resulting tree.
chunk_bad.pprint()
chunk_good.pprint()

(S This/DT is/VBZ so/RB fun/JJ !/. Yay/NN !/.)
(S I/PRP hate/VBP this/DT so/RB much/JJ ./.)


#### VADER Sentiment Scoring
(Valence Aware Dictionary and sEntiment Reasoner)

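VADER looks up every word of the sentence in its lexicon and assigns each one a positive, negative, or neutral value.
It then sums those values and returns the aggregate scores.
This approach DOES NOT account for the wider relationships between the words, beyond a few built-in rules (e.g. for negation and intensifiers).
Stop words ('and', 'the', 'or', ...) are not removed, but they carry no positive or negative value of their own.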

In [25]:
# Uncomment to install tqdm into this kernel's environment if it is missing:
# %pip install tqdm



In [27]:
from nltk.sentiment import SentimentIntensityAnalyzer

# if we want to track progress with loops:
from tqdm.notebook import tqdm



In [29]:
# Fetch the VADER sentiment lexicon that SentimentIntensityAnalyzer
# reads its word scores from.
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/montygash/nltk_data...


True

In [30]:
# Create the VADER analyzer that is reused by the scoring cells below.
sia = SentimentIntensityAnalyzer()

In [31]:
# Echo the analyzer to confirm it was constructed (shows its default repr).
sia

<nltk.sentiment.vader.SentimentIntensityAnalyzer at 0x17d4a5f10>

Before running `polarity_scores`, I am going to download some other NLTK corpora.

In [32]:
# Extra NLTK corpora / lexicons fetched in this cell.
# 'vader_lexicon' is deliberately not repeated here: it is already downloaded
# in its own cell before the analyzer is created, so a second request only
# reports "already up-to-date".
EXTRA_NLTK_PACKAGES = (
    'movie_reviews',
    'subjectivity',
    'opinion_lexicon',
    'sentiwordnet',
    'twitter_samples',
    'stopwords',
)

# Build the full list of results first (instead of a lazy all(...)) so every
# package is attempted even if an earlier download fails. The cell then
# evaluates to True only when *all* downloads succeeded, rather than
# reflecting just the last one.
all([nltk.download(package) for package in EXTRA_NLTK_PACKAGES])

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/montygash/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package subjectivity to
[nltk_data]     /Users/montygash/nltk_data...
[nltk_data]   Unzipping corpora/subjectivity.zip.
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/montygash/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/montygash/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/montygash/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/montygash/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/montygash/nltk_data...
[nltk_data]   Package

True

In [35]:
# Re-bind the example sentences so that each label matches its sentiment:
# the cheerful sentence is the "good" one, the angry sentence the "bad" one.
ex_good, ex_bad = (
    "This is so fun! Yay!",
    "I hate this so much.",
)

In [36]:
# Run VADER on the negative example.
# The result is a dict: 'neg', 'neu' and 'pos' each lie in [0, 1],
# while 'compound' lies in [-1, 1].
sia.polarity_scores(text=ex_bad)

{'neg': 0.552, 'neu': 0.448, 'pos': 0.0, 'compound': -0.5719}

In [37]:
# Same scoring call for the positive example, for comparison.
sia.polarity_scores(text=ex_good)

{'neg': 0.0, 'neu': 0.261, 'pos': 0.739, 'compound': 0.8592}