In [1]:
# import standard libraries
import numpy as np
import pandas as pd
import os
import random
from sklearn import preprocessing

In [2]:
# Switch the working directory to <cwd>/../../../data/preprocessed so the
# relative file name used by the next read resolves there.
# NOTE: the target is computed from the current working directory, so
# re-running this cell resolves from wherever the previous run left it.
preprocessed_dir = os.path.join(os.getcwd(), "..", "..", "..", "data", "preprocessed")
os.chdir(preprocessed_dir)

In [3]:
# read the preprocessed tweets from the current working directory, decoding the file as ISO-8859-1
# NOTE(review): the last row shown by df.tail() contains mojibake — confirm ISO-8859-1 is the file's real encoding
df = pd.read_csv("1.1-sh-data-preprocessed.csv", encoding = "ISO-8859-1")

In [4]:
# preview the first rows of the frame (tweet text and its emotion label)
df.head()

Unnamed: 0,tweet,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,2
2,@swonderlin Can not wait for #iPad 2 also. The...,2
3,@sxsw I hope this year's festival isn't as cra...,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,2


In [5]:
# preview the last rows of the frame to check the end of the file loaded cleanly
df.tail()

Unnamed: 0,tweet,emotion
8595,Ipad everywhere. #SXSW {link},2
8596,"Wave, buzz... RT @mention We interrupt your re...",1
8597,"Google's Zeiger, a physician never reported po...",1
8598,Some Verizon iPhone customers complained their...,1
8599,ÃÂÃÂÃÂ¡ÃÂÃÂÃÂ ÃÂÃÂ¼_ÃÂÃÂÃÂÃ...,1


In [6]:
# number of missing values in each column
df.isna().sum()

tweet      0
emotion    0
dtype: int64

In [7]:
# (rows, columns) of the loaded frame
df.shape

(8600, 2)

In [8]:
# use VADER (vaderSentiment) for the lexicon-based approach;
# a single analyzer instance is created here and reused for every tweet
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [9]:
# the tweet text column, scored one entry at a time by the VADER analyzer
tweets = df["tweet"]

In [10]:
def _vader_label(text):
    """Map one tweet to a class label from VADER's compound score.

    Thresholds follow the scoring section of the VADER README:
    https://github.com/cjhutto/vaderSentiment#about-the-scoring

    Returns:
        2 when compound >= 0.05,
        0 when compound <= -0.05,
        1 otherwise (the band between the two thresholds).
        Non-string input is not scored and is labelled 1.
    """
    if not isinstance(text, str):
        return 1
    compound = analyzer.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return 2
    if compound <= -0.05:
        return 0
    # Terminal branch: every tweet receives exactly one label, so the result
    # always has len(tweets) entries and lines up with the frame's rows.
    return 1


# Iterate over the values rather than looking rows up by label, so this does
# not depend on the Series carrying a default 0..n-1 index.
lexicon_polarity = [_vader_label(tweet) for tweet in tweets]

In [11]:
# attach the VADER labels as a new column (a plain list is assigned positionally, one entry per row)
df["lexicon_polarity"] = lexicon_polarity

In [12]:
# confirm the new lexicon_polarity column is present alongside tweet and emotion
df.columns

Index(['tweet', 'emotion', 'lexicon_polarity'], dtype='object')

In [13]:
# eyeball the first 20 rows to compare the emotion label with the VADER label side by side
df.head(20)

Unnamed: 0,tweet,emotion,lexicon_polarity
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,0,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,2,2
2,@swonderlin Can not wait for #iPad 2 also. The...,2,1
3,@sxsw I hope this year's festival isn't as cra...,0,2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,2,2
5,@teachntech00 New iPad Apps For #SpeechTherapy...,1,1
6,"#SXSW is just starting, #CTIA is around the co...",2,2
7,Beautifully smart and simple idea RT @madebyma...,2,2
8,Counting down the days to #sxsw plus strong Ca...,2,2
9,Excited to meet the @samsungmobileus at #sxsw ...,2,2


In [14]:
# Share of rows where the VADER label equals the `emotion` column.
# The comparison is vectorised over the two columns of the same frame, so it
# avoids a Python-level loop and does not rely on a default 0..n-1 index.
matches = df["emotion"] == df["lexicon_polarity"]
# Keep `count` a plain int so the percentage is computed and formatted as before
count = int(matches.sum())
print("Accuracy score with lexicon: {0:.2f}%".format((count/len(df) * 100)))

Accuracy score with lexicon: 52.99%


In [15]:
# Confusion matrix of the `emotion` column (rows) against the VADER labels
# (columns); margins=True adds the "All" row and column totals.
actual_labels = df["emotion"].tolist()
predicted_labels = df["lexicon_polarity"].tolist()
y_actu = pd.Series(actual_labels, name='Actual')
y_pred = pd.Series(predicted_labels, name='Predicted')
df_confusion = pd.crosstab(
    index=y_actu,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
df_confusion

Predicted,0,1,2,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,207,155,183,545
1,617,2604,1954,5175
2,257,877,1746,2880
All,1081,3636,3883,8600


In [16]:
# save the dataframe for future use: switch to the sibling "model" directory
# of the current working directory and write the frame there without the index
model_dir = os.path.join(os.getcwd(), "..", "model")
os.chdir(model_dir)
df.to_csv("3.0-sh-vader-sentiment.csv", index=False)