In [1]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import math
import re
import nltk
import html
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import csv
import time

nltk.download('vader_lexicon')
nltk.download('sentiwordnet')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Raphael\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\Raphael\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [2]:
# test with pre-labelled existing data from Twitter
raw = pd.read_csv('./Data/twitter_corpus-master/full-corpus.csv', header=0)

In [3]:
raw.head()

Unnamed: 0,Topic,Sentiment,TweetId,TweetDate,TweetText
0,apple,positive,126415614616154112,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,126404574230740992,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,126402758403305474,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,126397179614068736,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,126395626979196928,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


In [4]:
raw.Sentiment.unique()

array(['positive', 'negative', 'neutral', 'irrelevant'], dtype=object)

In [5]:
test = raw[raw.Sentiment != 'irrelevant']
test.drop(['Topic', 'TweetId', 'TweetDate'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [6]:
test.loc[:, 'TweetText'] = test.loc[:, 'TweetText'].apply(lambda x: html.unescape(x))
test.loc[:, 'TweetText'] = test.loc[:, 'TweetText'].apply(lambda x: re.sub(r'(www\.|https?://).*?(\s|$)|@.*?(\s|$)|\$.*?(\s|$)|\d|\%|\\|/|-|_', ' ', x))
test.loc[:, 'TweetText'] = test.loc[:, 'TweetText'].apply(lambda x: re.sub(r'\s+', ' ', x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [7]:
test.head()

Unnamed: 0,Sentiment,TweetText
0,positive,Now all has to do is get swype on the iphone a...
1,positive,will be adding more carrier support to the iP...
2,positive,Hilarious video guy does a duet with 's Siri. ...
3,positive,you made it too easy for me to switch to iPho...
4,positive,I just realized that the reason I got into twi...


## Testing TextBlob

In [8]:
start = time.process_time()
test.loc[:50, 'TextBlob Sentiment Score'] = test.loc[:50, ['TweetText']].apply(lambda x: TextBlob(x[0], analyzer=NaiveBayesAnalyzer()).sentiment[1], axis=1)
time_TextBlob = time.process_time() - start

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [9]:
print(time_TextBlob)

532.0


In [10]:
test.head()

Unnamed: 0,Sentiment,TweetText,TextBlob Sentiment Score
0,positive,Now all has to do is get swype on the iphone a...,0.4072
1,positive,will be adding more carrier support to the iP...,0.696821
2,positive,Hilarious video guy does a duet with 's Siri. ...,0.856974
3,positive,you made it too easy for me to switch to iPho...,0.637182
4,positive,I just realized that the reason I got into twi...,0.398939


In [11]:
def get_class_TextBlob(x):
    if x >= 0.6:
        return "positive"
    elif x <= 0.4:
        return "negative"
    else:
        return "neutral"

test.loc[:, 'TextBlob Sentiment'] = test.loc[:, ['TextBlob Sentiment Score']].apply(lambda x: get_class_TextBlob(x[0]), axis=1)

In [12]:
test.loc[:, 'TextBlob Match'] = np.where(test.loc[:, 'TextBlob Sentiment'] == test.loc[:, 'Sentiment'], 'Yes', 'No')

In [13]:
TextBlob_accuracy = (test['TextBlob Match'][:50] == 'Yes').sum() / 50
print(TextBlob_accuracy)

0.48


## Testing VADER

In [14]:
sia = SentimentIntensityAnalyzer()

In [15]:
start = time.process_time() 
test.loc[:, 'VADER Sentiment Score'] = test.loc[:, ['TweetText']].apply(lambda x: sia.polarity_scores(x[0])['compound'], axis=1)
time_VADAR = time.process_time() - start

In [16]:
def get_class_VADER(x):
    if x >= 0.3:
        return "positive"
    elif x <= -0.3:
        return "negative"
    else:
        return "neutral"

test.loc[:, 'VADER Sentiment'] = test.loc[:, ['VADER Sentiment Score']].apply(lambda x: get_class_VADER(x[0]), axis=1)

In [17]:
test.loc[:, 'VADER Match'] = np.where(test.loc[:, 'VADER Sentiment'] == test.loc[:, 'Sentiment'], 'Yes', 'No')

In [18]:
VADER_accuracy = (test['VADER Match'] == 'Yes').sum() / len(test)
print(VADER_accuracy)

0.6016355140186916


## Testing model trained by Naive Bayes

In [19]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

In [20]:
def extract_features(word_list):
    return dict([(word, True) for word in word_list])

In [21]:
if __name__=='__main__':
   # Load positive and negative reviews  
    positive_fileids = movie_reviews.fileids('pos')
    negative_fileids = movie_reviews.fileids('neg')

In [22]:
features_positive = [(extract_features(movie_reviews.words(fileids=[f])), 'Positive') for f in positive_fileids]
features_negative = [(extract_features(movie_reviews.words(fileids=[f])), 'Negative') for f in negative_fileids]

In [23]:
# Split the data into train and test (80/20)
threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(features_positive))
threshold_negative = int(threshold_factor * len(features_negative))

In [24]:
features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]

In [25]:
# Train a Naive Bayes classifier
classifier = NaiveBayesClassifier.train(features_train)
print("\nAccuracy of the classifier:", nltk.classify.util.accuracy(classifier, features_test))


Accuracy of the classifier: 0.735


In [38]:
start = time.process_time() 
temp = []
for x in test["TweetText"]:
    probdist = classifier.prob_classify(extract_features(x.split()))
    pred_sentiment = probdist.max()
    probability = probdist.prob(pred_sentiment)
    temp.append([pred_sentiment, probability])
time_NB = time.process_time() - start
    
sentiments = []
for x in temp:
    if (x[0] == 'Positive') and (x[1] > 0.7):
        sentiment = 'positive'
    elif (x[0] == 'Negative') and (x[1] > 0.7):
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    sentiments.append(sentiment)

In [39]:
sentiments_df = pd.DataFrame({'NB Sentiment':sentiments})
test['NB Sentiment'] = sentiments_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [40]:
test.loc[:, 'NB Match'] = np.where(test.loc[:, 'NB Sentiment'] == test.loc[:, 'Sentiment'], 'Yes', 'No')

In [41]:
NB_accuracy = (test['NB Match'] == 'Yes').sum() / len(test)
print(NB_accuracy)

0.41296728971962615


## Testing using SentiWordNet

In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from nltk import word_tokenize, pos_tag
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
from nltk.stem.wordnet import WordNetLemmatizer

def get_sentiword_score(message):
        """
            takes a message and performs following operations:
            1) tokenize
            2) POS tagging
            3) reduce text to nouns, verbs, adjectives, adverbs
            4) lemmatize the words
            for each selected tag, if more than one sense exists, performs word sense disambiguation
            using lesk algorithm and finally returns positivity score, negativity score from
            sentiwordnet lexicon
        """

        tokens = word_tokenize(message)
        pos = pos_tag(tokens)
        lemmatizer = WordNetLemmatizer()
        selected_tags = list()
        scores = list()

        for i in range(len(pos)):
            if pos[i][1].startswith('J'):
                selected_tags.append((lemmatizer.lemmatize(pos[i][0], 'a'), 'a'))
            elif pos[i][1].startswith('V'):
                selected_tags.append((lemmatizer.lemmatize(pos[i][0], 'v'), 'v'))
            elif pos[i][1].startswith('N'):
                selected_tags.append((lemmatizer.lemmatize(pos[i][0], 'n'), 'n'))
            elif pos[i][1].startswith('R'):
                selected_tags.append((lemmatizer.lemmatize(pos[i][0], 'r'), 'r'))

        # score list: [(sense name, pos score, neg score)]
        for i in range(len(selected_tags)):
            senses = list(swn.senti_synsets(selected_tags[i][0], selected_tags[i][1]))
            if len(senses) == 1:
                scores.append((senses[0].synset.name(), senses[0].pos_score(), senses[0].neg_score()))
            elif len(senses) > 1:
                sense = lesk(tokens, selected_tags[i][0], selected_tags[i][1])
                if sense is None:
                    # take average score of all original senses
                    pos_score = 0
                    neg_score = 0
                    for i in senses:
                        pos_score += i.pos_score()
                        neg_score += i.neg_score()
                    scores.append((senses[0].synset.name(), pos_score/len(senses), neg_score/len(senses)))
                else:
                    sense = swn.senti_synset(sense.name())
                    scores.append((sense.synset.name(), sense.pos_score(), sense.neg_score()))

        """
            there are a number of ways for aggregating sentiment scores
            1) sum up all scores
            2) average all scores (or only for non zero scores)
            3) (1) or (2) but only for adjectives
            4) if pos score greater than neg score +1 vote else -1 vote
            here we are summing up the positive and negative scores to be used by classifier.
            whenever we encounter a negative word, we reverse the positive and negative score.
        """

        # collected from word stat financial dictionary
        negation_words = list(open('Lexicon/lexicon_negation_words.txt').read().split())

        # final_score = 0
        # counter = 1
        # for score in scores:
        #     if any(score[0].startswith(x) for x in negation_words):
        #         counter *= -1
        #     else:
        #         if score[1] > score[2]:
        #             final_score += counter*score[1]
        #         elif score[1] < score[2]:
        #             final_score -= counter*score[2]

        counter = 1
        pos_score = 0
        neg_score = 0
        for score in scores:
            if any(score[0].startswith(x) for x in negation_words):
                counter *= -1
            else:
                if counter == 1:
                    pos_score += score[1]
                    neg_score += score[2]
                elif counter == -1:
                    pos_score += score[2]
                    neg_score += score[1]

        final_score = [pos_score, neg_score]
        return final_score

In [43]:
start = time.process_time() 
scores = []
for x in test['TweetText']:
    scores.append(get_sentiword_score(x))
time_SWN = time.process_time() - start

In [45]:
ss = []
for x in scores:
    if x[0] + x[1] == 0:
        ss.append('neutral')
    elif (x[0] / (x[0] + x[1])) > 0.6:
        ss.append('positive')
    elif (x[1] / (x[0] + x[1])) > 0.6:
        ss.append('negative')
    else: 
        ss.append('neutral')

In [46]:
ss_df = pd.DataFrame({'SWN Sentiment':ss})
test['SWN Sentiment'] = ss_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [47]:
test.loc[:, 'SWN Match'] = np.where(test.loc[:, 'SWN Sentiment'] == test.loc[:, 'Sentiment'], 'Yes', 'No')

In [48]:
SWN_accuracy = (test['SWN Match'] == 'Yes').sum() / len(test)
print(SWN_accuracy)

0.3279789719626168


## Comparison

In [49]:
time_per_text_TextBlob = time_TextBlob / 50
time_per_text_VADAR = time_VADAR / len(test)
time_per_text_NB = time_NB / len(test)
time_per_text_SWN = time_SWN / len(test)

In [50]:
print("Average time taken per tweet (TextBlob): ", time_per_text_TextBlob)
print("Average time taken per tweet (VADER): ", time_per_text_VADAR)
print("Average time taken per tweet (NB): ", time_per_text_NB)
print("Average time taken per tweet (SWN): ", time_per_text_SWN)

Average time taken per tweet (TextBlob):  10.64
Average time taken per tweet (VADER):  0.0005476051401869158
Average time taken per tweet (NB):  0.00010039427570093458
Average time taken per tweet (SWN):  0.009167822721962617


In [51]:
print("TextBlob accuracy: ", TextBlob_accuracy)
print("VADER accuracy: ", VADER_accuracy)
print("NB accuracy: ", NB_accuracy)
print("SWN accuracy: ", SWN_accuracy)

TextBlob accuracy:  0.48
VADER accuracy:  0.6016355140186916
NB accuracy:  0.41296728971962615
SWN accuracy:  0.3279789719626168


## Test VADAR with stock market lexicon

Would have a lower accuracy as it uses a specific lexicon that is not specific to the test set

Hence, do not need to include this in the comparison (but can compare VADAR without the lexicon using correlation with the market price)

In [52]:
# stock market lexicon
stock_lex = pd.read_csv('./Lexicon/stock_lex.csv')

In [53]:
stock_lex['sentiment'] = (stock_lex['Aff_Score'] + stock_lex['Neg_Score'])/2
stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))
stock_lex = {k:v for k,v in stock_lex.items() if len(k.split(' '))==1}
stock_lex_scaled = {}
for k, v in stock_lex.items():
    if v > 0:
        stock_lex_scaled[k] = v / max(stock_lex.values()) * 4
    else:
        stock_lex_scaled[k] = v / min(stock_lex.values()) * -4

In [54]:
# Loughran and McDonald
positive = []
with open('./Lexicon/lm_positive.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        positive.append(row[0].strip())
    
negative = []
with open('./Lexicon/lm_negative.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        entry = row[0].strip().split(" ")
        if len(entry) > 1:
            negative.extend(entry)
        else:
            negative.append(entry[0])

In [55]:
final_lex = {}
# final_lex.update({word:2.0 for word in positive})
# final_lex.update({word:-2.0 for word in negative})
final_lex.update(stock_lex_scaled)
final_lex.update(sia.lexicon)
sia.lexicon = final_lex

In [56]:
test.loc[:, 'VADER_lex Sentiment Score'] = test.loc[:, ['TweetText']].apply(lambda x: sia.polarity_scores(x[0])['compound'], axis=1)

In [57]:
def get_class_VADER_lex(x):
    if x >= 0.3:
        return "positive"
    elif x <= -0.3:
        return "negative"
    else:
        return "neutral"

test.loc[:, 'VADER_lex Sentiment'] = test.loc[:, ['VADER_lex Sentiment Score']].apply(lambda x: get_class_VADER_lex(x[0]), axis=1)

In [58]:
test.loc[:, 'VADER_lex Match'] = np.where(test.loc[:, 'VADER_lex Sentiment'] == test.loc[:, 'Sentiment'], 'Yes', 'No')

In [59]:
VADER_lex_accuracy = (test['VADER_lex Match'] == 'Yes').sum() / len(test)
print(VADER_lex_accuracy)

0.47079439252336447
