In [1]:
# import standard libraries
import numpy as np
import pandas as pd
import os
import random
from sklearn import preprocessing

In [2]:
# Move into the raw-data folder: three levels up from the current working
# directory, then data/raw. The path is built from os.getcwd() at run time,
# so running this cell again resolves against the already-changed directory.
raw_data_dir = os.path.join(os.getcwd(), "..", "..", "..", "data", "raw")
os.chdir(raw_data_dir)

In [3]:
# load the raw tweet file from the current working directory,
# decoding it with the ISO-8859-1 codec
raw_csv_name = "tweet_product_company.csv"
raw_csv_encoding = "ISO-8859-1"
df = pd.read_csv(raw_csv_name, encoding=raw_csv_encoding)

In [4]:
# peek at the first five rows
df.head(n=5)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [5]:
# peek at the last five rows
df.tail(n=5)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,,No emotion toward brand or product


In [6]:
# number of missing values in each column
df.isna().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

In [7]:
# get the shape as (number of rows, number of columns)
df.shape

(9093, 3)

In [8]:
# give the columns short names; the names are assigned by position,
# left to right, so the list must match the column order of the file
short_column_names = ["tweet", "product", "emotion"]
df.columns = short_column_names

In [9]:
# list the distinct emotion labels (sorted) present in the data
np.unique(df["emotion"].values)

array(["I can't tell", 'Negative emotion',
       'No emotion toward brand or product', 'Positive emotion'],
      dtype=object)

In [10]:
# human-annotated emotion label of every tweet
emotions = df.loc[:, "emotion"]

In [11]:
# map the emotions to polarity values
# positive: 1
# neutral: 0  ("I can't tell" is counted as neutral too)
# negative: -1
polarity_by_emotion = {
    "I can't tell": 0,
    "No emotion toward brand or product": 0,
    "Positive emotion": 1,
    "Negative emotion": -1,
}
mapped_polarity = df["emotion"].map(polarity_by_emotion)

# Series.map yields NaN for any label that is not a key of the dict.
# Fail here, with the offending labels named, instead of producing a list
# that is shorter than the frame and only breaks when it is assigned as a
# column.
unmapped_rows = mapped_polarity.isnull()
if unmapped_rows.any():
    unexpected_labels = sorted(set(df.loc[unmapped_rows, "emotion"].astype(str)))
    raise ValueError(
        "unexpected emotion label(s): {}".format(unexpected_labels)
    )

# plain list of ints, one entry per row, in row order
actual_polarity = mapped_polarity.astype(int).tolist()

In [12]:
# store the mapped polarity values as a new column of the frame
df["actual_polarity"] = actual_polarity

In [13]:
# list the column labels of the frame
df.columns

Index(['tweet', 'product', 'emotion', 'actual_polarity'], dtype='object')

In [14]:
# use vaderSentiment for the lexicon-based approach
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# create the analyzer instance once and bind it to a notebook-level name
analyzer = SentimentIntensityAnalyzer()

In [15]:
# raw tweet text of every row (may contain missing values)
tweets = df.loc[:, "tweet"]

In [16]:
def _lexicon_label(text, threshold=0.05):
    """Return the VADER polarity label of one tweet.

    The compound score is bucketed with the cut-offs described at
    https://github.com/cjhutto/vaderSentiment#about-the-scoring

    Parameters
    ----------
    text : object
        The tweet. Anything that is not a ``str`` (e.g. a missing value)
        is labelled neutral.
    threshold : float
        Absolute compound score at or above which a tweet counts as
        positive / negative.

    Returns
    -------
    int
        1 (positive), 0 (neutral) or -1 (negative). Exactly one label is
        always returned.
    """
    if not isinstance(text, str):
        return 0
    compound = analyzer.polarity_scores(text)["compound"]
    if compound >= threshold:
        return 1
    if compound <= -threshold:
        return -1
    # everything strictly between the two cut-offs
    return 0


# Iterate over the values themselves rather than looking rows up by label,
# so the result has one entry per tweet, in row order, whatever the index is.
lexicon_polarity = [_lexicon_label(text) for text in tweets]

In [17]:
# store the lexicon-based labels as a new column of the frame
df["lexicon_polarity"] = lexicon_polarity

In [18]:
# list the column labels of the frame
df.columns

Index(['tweet', 'product', 'emotion', 'actual_polarity', 'lexicon_polarity'], dtype='object')

In [19]:
# show the first 20 rows of the frame
df.head(20)

Unnamed: 0,tweet,product,emotion,actual_polarity,lexicon_polarity
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,-1,-1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1,0
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,-1,1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1,1
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product,0,0
6,,,No emotion toward brand or product,0,0
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion,1,1
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion,1,1
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion,1,1


In [20]:
# Accuracy of the lexicon approach: share of rows whose lexicon label equals
# the annotated label. The two columns are compared element-wise in one go,
# which avoids a per-row label lookup and does not depend on the frame
# having a default 0..n-1 index.
matches = df["actual_polarity"] == df["lexicon_polarity"]
count = int(matches.sum())
# an empty frame has no defined accuracy; report nan instead of dividing by 0
accuracy = (count/len(df) * 100) if len(df) else float("nan")
print("Accuracy score with lexicon: {0:.2f}%".format(accuracy))

Accuracy score with lexicon: 53.43%


In [21]:
# save the dataframe for future use:
# switch to the "model" folder that sits next to the current working
# directory, then write the CSV there without the index column
model_dir = os.path.join(os.getcwd(), "..", "model")
os.chdir(model_dir)
df.to_csv("3.0-sh-vader-sentiment.csv", index=False)