In [317]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import math
import re
import nltk
import html
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import csv
import time

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Raphael\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [160]:
# test with pre-labelled existing data from Twitter
raw = pd.read_csv('./Data/twitter_corpus-master/full-corpus.csv', header=0)

In [161]:
raw.head()

Unnamed: 0,Topic,Sentiment,TweetId,TweetDate,TweetText
0,apple,positive,126415614616154112,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,126404574230740992,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,126402758403305474,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,126397179614068736,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,126395626979196928,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


In [162]:
raw.Sentiment.unique()

array(['positive', 'negative', 'neutral', 'irrelevant'], dtype=object)

In [163]:
test = raw[raw.Sentiment != 'irrelevant']
test.drop(['Topic', 'TweetId', 'TweetDate'], inplace=True, axis=1)

In [164]:
test.loc[:, 'TweetText'] = test.loc[:, 'TweetText'].apply(lambda x: html.unescape(x))
test.loc[:, 'TweetText'] = test.loc[:, 'TweetText'].apply(lambda x: re.sub(r'(www\.|https?://).*?(\s|$)|@.*?(\s|$)|\$.*?(\s|$)|\d|\%|\\|/|-|_', ' ', x))
test.loc[:, 'TweetText'] = test.loc[:, 'TweetText'].apply(lambda x: re.sub(r'\s+', ' ', x))

In [165]:
test.head()

Unnamed: 0,Sentiment,TweetText
0,positive,Now all has to do is get swype on the iphone a...
1,positive,will be adding more carrier support to the iP...
2,positive,Hilarious video guy does a duet with 's Siri. ...
3,positive,you made it too easy for me to switch to iPho...
4,positive,I just realized that the reason I got into twi...


## Testing TextBlob

In [320]:
start = time.process_time()    
test.loc[:50, 'TextBlob Sentiment Score'] = test.loc[:50, ['TweetText']].apply(lambda x: TextBlob(x[0], analyzer=NaiveBayesAnalyzer()).sentiment[1], axis=1)
time_TextBlob = time.process_time() - start

KeyboardInterrupt: 

In [321]:
print(time_TextBlob)

303.09375


In [167]:
test.head()

Unnamed: 0,Sentiment,TweetText,TextBlob Sentiment Score
0,positive,Now all has to do is get swype on the iphone a...,0.4072
1,positive,will be adding more carrier support to the iP...,0.696821
2,positive,Hilarious video guy does a duet with 's Siri. ...,0.856974
3,positive,you made it too easy for me to switch to iPho...,0.637182
4,positive,I just realized that the reason I got into twi...,0.398939


In [301]:
def get_class_TextBlob(x):
    if x >= 0.6:
        return "positive"
    elif x <= 0.4:
        return "negative"
    else:
        return "neutral"

test.loc[:, 'TextBlob Sentiment'] = test.loc[:, ['TextBlob Sentiment Score']].apply(lambda x: get_class_TextBlob(x[0]), axis=1)

In [302]:
test.loc[:, 'TextBlob Match'] = np.where(test.loc[:, 'TextBlob Sentiment'] == test.loc[:, 'Sentiment'], 'Yes', 'No')

In [304]:
TextBlob_accuracy = (test['TextBlob Match'][:50] == 'Yes').sum() / 50
print(TextBlob_accuracy)

0.48


## Testing VADER

In [174]:
sia = SentimentIntensityAnalyzer()

In [335]:
start = time.process_time() 
test.loc[:, 'VADER Sentiment Score'] = test.loc[:, ['TweetText']].apply(lambda x: sia.polarity_scores(x[0])['compound'], axis=1)
time_VADAR = time.process_time() - start

In [289]:
def get_class_VADER(x):
    if x >= 0.3:
        return "positive"
    elif x <= -0.3:
        return "negative"
    else:
        return "neutral"

test.loc[:, 'VADER Sentiment'] = test.loc[:, ['VADER Sentiment Score']].apply(lambda x: get_class_VADER(x[0]), axis=1)

In [290]:
test.loc[:, 'VADER Match'] = np.where(test.loc[:, 'VADER Sentiment'] == test.loc[:, 'Sentiment'], 'Yes', 'No')

In [291]:
VADER_accuracy = (test['VADER Match'] == 'Yes').sum() / len(test)
print(VADER_accuracy)

0.6016355140186916


## Comparison

In [338]:
time_per_text_TextBlob = time_TextBlob / 50
time_per_text_VADAR = time_VADAR / len(test)

In [331]:
print("TextBlob accuracy: ", TextBlob_accuracy)
print("VADER accuracy: ", VADER_accuracy)

TextBlob accuracy:  0.48
VADER accuracy:  0.6016355140186916


In [339]:
print("Average time taken per tweet (TextBlob): ", time_per_text_TextBlob)
print("Average time taken per tweet (VADER): ", time_per_text_VADAR)

Average time taken per tweet (TextBlob):  0.070625
Average time taken per tweet (VADER):  0.0002692391939252336


## Test VADAR with stock market lexicon

Would have a lower accuracy as it uses a specific lexicon that is not specific to the test set

Hence, do not need to include this in the comparison (but can compare VADAR without the lexicon using correlation with the market price)

In [325]:
# stock market lexicon
stock_lex = pd.read_csv('./Lexicon/stock_lex.csv')

In [181]:
stock_lex['sentiment'] = (stock_lex['Aff_Score'] + stock_lex['Neg_Score'])/2
stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))
stock_lex = {k:v for k,v in stock_lex.items() if len(k.split(' '))==1}
stock_lex_scaled = {}
for k, v in stock_lex.items():
    if v > 0:
        stock_lex_scaled[k] = v / max(stock_lex.values()) * 4
    else:
        stock_lex_scaled[k] = v / min(stock_lex.values()) * -4

In [184]:
# Loughran and McDonald
positive = []
with open('./Lexicon/lm_positive.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        positive.append(row[0].strip())
    
negative = []
with open('./Lexicon/lm_negative.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        entry = row[0].strip().split(" ")
        if len(entry) > 1:
            negative.extend(entry)
        else:
            negative.append(entry[0])

In [185]:
final_lex = {}
# final_lex.update({word:2.0 for word in positive})
# final_lex.update({word:-2.0 for word in negative})
final_lex.update(stock_lex_scaled)
final_lex.update(sia.lexicon)
sia.lexicon = final_lex

In [186]:
test.loc[:, 'VADER_lex Sentiment Score'] = test.loc[:, ['TweetText']].apply(lambda x: sia.polarity_scores(x[0])['compound'], axis=1)

In [314]:
def get_class_VADER_lex(x):
    if x >= 0.3:
        return "positive"
    elif x <= -0.3:
        return "negative"
    else:
        return "neutral"

test.loc[:, 'VADER_lex Sentiment'] = test.loc[:, ['VADER_lex Sentiment Score']].apply(lambda x: get_class_VADER_lex(x[0]), axis=1)

In [315]:
test.loc[:, 'VADER_lex Match'] = np.where(test.loc[:, 'VADER_lex Sentiment'] == test.loc[:, 'Sentiment'], 'Yes', 'No')

In [316]:
VADER_lex_accuracy = (test['VADER_lex Match'] == 'Yes').sum() / len(test)
print(VADER_lex_accuracy)

0.47079439252336447
