# Sentiment Analysis

In [1]:
import nltk
# nltk.download()  # Run if first time using nltk on this machine

# Example documents to analyze, gathered into a corpus
corpus = [
    'Sushi is some of my favorite food, but I am not a fan of complicated rolls',
    'I loved the show, it was so amazing!',
    'I hated everything about that book.  The writing was awful, and I hated the characters',
]

# Individual names for each document, in corpus order
document1, document2, document3 = corpus

### Sentiment Analysis w/ NLTK

**Note:** No text preprocessing is needed with this package; the raw sentences are passed directly to the VADER analyzer.

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Score each raw sentence with VADER and print every polarity component
sid = SentimentIntensityAnalyzer()
for sentence in corpus:
    print(sentence)
    scores = sid.polarity_scores(sentence)
    for label, value in scores.items():
        print(label, value)
    # Blank line between documents
    print()

Sushi is some of my favorite food, but I am not a fan of complicated rolls
neg 0.149
neu 0.73
pos 0.122
compound -0.1136

I loved the show, it was so amazing!
neg 0.0
neu 0.356
pos 0.644
compound 0.8767

I hated everything about that book.  The writing was awful, and I hated the characters
neg 0.533
neu 0.467
pos 0.0
compound -0.9081





### Sentiment Analysis w/ dictionaries

$$Sentiment = \frac{(Positive\ Words - Negative\ Words)}{Total\ Words}$$

The positive and negative word lists come from the [Sentiment Lexicon](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon) compiled by Professor Bing Liu of the University of Illinois at Chicago.

In [3]:
# Importing the positive and negative dictionaries
def _load_word_set(path):
    """
    Reads a text file into a set holding each of its lines, stripped and lower-cased
    """
    # The context manager closes the file handle as soon as it has been read
    with open(path) as word_file:
        return set(line.strip().lower() for line in word_file)


positive_dict = _load_word_set('./data/positive-words.txt')
negative_dict = _load_word_set('./data/negative-words.txt')


def preprocess(document):
    """
    Normalizes, tokenizes, and removes stop words from documents

    Args:
        document: The text to process, as a string

    Returns:
        A list of lower-cased word tokens in their original order, with
        punctuation dropped and English stop words removed
    """
    # Converting all words to lower case
    normalized = document.lower()

    # Tokenizing sentence & removing punctuation
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenized = tokenizer.tokenize(normalized)

    # Loading the stop words once per call, as a set for constant-time lookups
    # (looking them up inside the comprehension would re-read the list for every token)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Filtering out stop words
    filtered_words = [word for word in tokenized if word not in stop_words]

    # Words in dictionary are not stemmed, so no stemming is applied here
    return filtered_words


# Quick check: display the tokens kept from the first document
preprocess(document1)

['sushi', 'favorite', 'food', 'fan', 'complicated', 'rolls']

In [4]:
def sentiment_analysis(document):
    """
    Performs sentiment analysis using the positive and negative word dictionaries

    The score is (positive words - negative words) / total words, counted
    over the tokens returned by preprocess()

    Args:
        document: The text to score, as a string

    Returns:
        A tuple of (sentiment, positive_words, negative_words, total_words).
        The sentiment is 0.0 when no words remain after preprocessing
    """
    # Normalizing, tokenizing, and removing stop words
    processed_document = preprocess(document)

    # Counting the number of positive/negative words in the dictionary
    positive_words = sum(word in positive_dict for word in processed_document)
    negative_words = sum(word in negative_dict for word in processed_document)
    total_words = len(processed_document)

    # Nothing left to score (empty text or only stop words): report a neutral
    # sentiment instead of dividing by zero
    if total_words == 0:
        return 0.0, positive_words, negative_words, total_words

    # Calculating the sentiment
    sentiment = (positive_words - negative_words) / total_words

    # Formatting the output
    return sentiment, positive_words, negative_words, total_words


# Iterating through the corpus to retrieve and report sentiment by document
for document in corpus:
    score, positive_count, negative_count, word_count = sentiment_analysis(document)
    print(document)

    # Each report line pairs its template with the value it displays
    report_lines = (
        ('Negative words: {}', negative_count),
        ('Positive words: {}', positive_count),
        ('Total words: {}', word_count),
        ('Sentiment: {}\n', score),
    )
    for template, value in report_lines:
        print(template.format(value))

Sushi is some of my favorite food, but I am not a fan of complicated rolls
Negative words: 1
Positive words: 1
Total words: 6
Sentiment: 0.0

I loved the show, it was so amazing!
Negative words: 0
Positive words: 2
Total words: 3
Sentiment: 0.6666666666666666

I hated everything about that book.  The writing was awful, and I hated the characters
Negative words: 3
Positive words: 0
Total words: 7
Sentiment: -0.42857142857142855

