In [1]:
# import standard libraries
import numpy as np
import pandas as pd
import os
import random
from sklearn import preprocessing, metrics

In [2]:
# Move into data/preprocessed, located three levels above the kernel's
# starting directory. The starting directory is captured only on the first
# run, so re-running this cell (after the cwd has already been changed)
# resolves to the same folder instead of climbing further up the tree.
START_DIR = globals().get("START_DIR", os.getcwd())
os.chdir(os.path.join(START_DIR, "..", "..", "..", "data", "preprocessed"))

In [3]:
# load the preprocessed tweets; the file is decoded as ISO-8859-1
df = pd.read_csv(
    "1.1-sh-data-preprocessed.csv",
    encoding="ISO-8859-1",
)

In [4]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,tweet,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,2
2,@swonderlin Can not wait for #iPad 2 also. The...,2
3,@sxsw I hope this year's festival isn't as cra...,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,2


In [5]:
# preview the last five rows
df.tail(n=5)

Unnamed: 0,tweet,emotion
8550,Ipad everywhere. #SXSW {link},2
8551,"Wave, buzz... RT @mention We interrupt your re...",1
8552,"Google's Zeiger, a physician never reported po...",1
8553,Some Verizon iPhone customers complained their...,1
8554,ÃÂÃÂÃÂ¡ÃÂÃÂÃÂ ÃÂÃÂ¼_ÃÂÃÂÃÂÃ...,1


In [6]:
# number of missing values in each column
df.isna().sum()

tweet      0
emotion    0
dtype: int64

In [7]:
# get the shape as (number of rows, number of columns)
df.shape

(8555, 2)

In [8]:
# Lexicon-based approach: score tweets with VADER (vaderSentiment package).
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single analyzer instance, reused for every tweet that gets scored.
analyzer = SentimentIntensityAnalyzer()

In [9]:
# the raw tweet text column, used as input for the lexicon scoring
tweets = df.loc[:, "tweet"]

In [10]:
# Cut-off on VADER's compound score that separates positive / neutral / negative.
# scoring: https://github.com/cjhutto/vaderSentiment#about-the-scoring
COMPOUND_THRESHOLD = 0.05


def polarity_label(text):
    """Classify one tweet with VADER's compound score.

    Parameters
    ----------
    text : object
        The tweet. Anything that is not a ``str`` is treated as neutral.

    Returns
    -------
    int
        2 if compound >= 0.05, 0 if compound <= -0.05, otherwise 1.
    """
    if not isinstance(text, str):
        return 1
    compound = analyzer.polarity_scores(text)["compound"]
    if compound >= COMPOUND_THRESHOLD:
        return 2
    if compound <= -COMPOUND_THRESHOLD:
        return 0
    # Everything in between (and any unexpected value) counts as neutral, so
    # exactly one label is produced per tweet and the list stays aligned with df.
    return 1


# Iterate over the values directly instead of looking rows up by label with a
# positional counter; this stays correct even if the index is not 0..n-1.
lexicon_polarity = [polarity_label(text) for text in tweets]

In [11]:
# attach the lexicon labels as a new column (one label per row)
df = df.assign(lexicon_polarity=lexicon_polarity)

In [12]:
# list the column labels to confirm 'lexicon_polarity' is now present
df.columns

Index(['tweet', 'emotion', 'lexicon_polarity'], dtype='object')

In [13]:
# compare the labelled emotion with the lexicon label on the first 20 rows
df.head(n=20)

Unnamed: 0,tweet,emotion,lexicon_polarity
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,0,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,2,2
2,@swonderlin Can not wait for #iPad 2 also. The...,2,1
3,@sxsw I hope this year's festival isn't as cra...,0,2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,2,2
5,@teachntech00 New iPad Apps For #SpeechTherapy...,1,1
6,"#SXSW is just starting, #CTIA is around the co...",2,2
7,Beautifully smart and simple idea RT @madebyma...,2,2
8,Counting down the days to #sxsw plus strong Ca...,2,2
9,Excited to meet the @samsungmobileus at #sxsw ...,2,2


In [14]:
# Build fresh, 0-based Series for the labelled and the lexicon-derived classes.
y_actu = pd.Series(df["emotion"].tolist(), name='Actual')
y_pred = pd.Series(df["lexicon_polarity"].tolist(), name='Predicted')

# Confusion matrix: rows are the actual class, columns the predicted class;
# margins=True appends the 'All' totals row and column.
df_confusion = pd.crosstab(
    index=y_actu,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
df_confusion

Predicted,0,1,2,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,206,154,183,543
1,609,2584,1951,5144
2,255,876,1737,2868
All,1070,3614,3871,8555


In [15]:
# overall share of tweets where the lexicon label equals the actual label
accuracy_pct = metrics.accuracy_score(y_actu, y_pred) * 100
print("Accuracy score with lexicon: {0:.2f}%".format(accuracy_pct))

Accuracy score with lexicon: 52.92%


In [16]:
# per-class precision / recall / f1 for the lexicon labels
report = metrics.classification_report(y_actu, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.19      0.38      0.26       543
           1       0.71      0.50      0.59      5144
           2       0.45      0.61      0.52      2868

   micro avg       0.53      0.53      0.53      8555
   macro avg       0.45      0.50      0.45      8555
weighted avg       0.59      0.53      0.54      8555



In [17]:
# Persist the frame (with the lexicon column) for later notebooks.
# The working directory is switched to the sibling "model" folder first,
# and the CSV is written there without the index column.
model_dir = os.path.join(os.getcwd(), "..", "model")
os.chdir(model_dir)
df.to_csv("3.0-sh-vader-sentiment.csv", index=False)

In [37]:
# inspect one hand-picked example row
row_id = 2943
print("Lexicon > ML: {}".format(df.at[row_id, "tweet"]))
print("Lexicon score is {}".format(df.at[row_id, "lexicon_polarity"]))
print("Actual score is {}".format(df.at[row_id, "emotion"]))

Lexicon > ML: To kick off #SXSWi @mention is giving away an iPad 2... Just visit the FB page to enter: {link} #SXSW
Lexicon score is 2
Actual score is 2


In [26]:
# inspect one hand-picked example row
row_id = 5899
print("Lexicon < ML: {}".format(df.at[row_id, "tweet"]))
print("Lexicon score is {}".format(df.at[row_id, "lexicon_polarity"]))
print("Actual score is {}".format(df.at[row_id, "emotion"]))

Lexicon < ML: RT @mention Make sure you are donating to the JAPANESE Red Cross for #japan: {link} #sxswcares #sxsw #quake | thank YOU!
Lexicon score is 2
Actual score is 1


In [28]:
# inspect one hand-picked example row
row_id = 4025
print("Lexicon = ML: {}".format(df.at[row_id, "tweet"]))
print("Lexicon score is {}".format(df.at[row_id, "lexicon_polarity"]))
print("Actual score is {}".format(df.at[row_id, "emotion"]))

Lexicon = ML: Next up @mention #sxswi: Your Mom Had An #iPad, Designing For Boomers #SXSW #sxswi
Lexicon score is 1
Actual score is 1


In [38]:
# inspect one hand-picked example row
row_id = 1769
print("Lexicon /= ML: {}".format(df.at[row_id, "tweet"]))
print("Lexicon score is {}".format(df.at[row_id, "lexicon_polarity"]))
print("Actual score is {}".format(df.at[row_id, "emotion"]))

Lexicon /= ML: so the iPad will be available while I'm in Austin for #sxsw -- this is major #GeekDilemma
Lexicon score is 1
Actual score is 2
