- Embedding layer enables us to convert each word into a fixed length vector of defined size. The resultant vector is a dense one with having real values instead of just 0’s and 1’s. The fixed length of word vectors helps us to represent words in a better way along with reduced dimensions. [https://medium.com/analytics-vidhya/understanding-embedding-layer-in-keras-bbe3ff1327ce](https://medium.com/analytics-vidhya/understanding-embedding-layer-in-keras-bbe3ff1327ce)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from google.colab import drive
import numpy as np
import itertools

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dados/sentiment/Tweets.csv')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
df.shape

(14640, 15)

In [5]:
df.groupby(['airline_sentiment']).size()

airline_sentiment
negative    9178
neutral     3099
positive    2363
dtype: int64

In [6]:
df = df[df['airline_sentiment_confidence'] > 0.8]

In [7]:
token = Tokenizer(num_words=100)
token.fit_on_texts(df['text'].values)

In [8]:
dict(itertools.islice(token.word_index.items(), 10))

{'to': 1,
 'the': 2,
 'i': 3,
 'a': 4,
 'for': 5,
 'you': 6,
 'flight': 7,
 'united': 8,
 'and': 9,
 'on': 10}

In [9]:
len(token.word_index)

12802

In [10]:
generated_tokens = token.texts_to_sequences(df['text'].values)
generated_tokens = pad_sequences(generated_tokens, padding='post', maxlen=100)

In [11]:
generated_tokens

array([[97, 62,  0, ...,  0,  0,  0],
       [97, 99,  1, ...,  0,  0,  0],
       [97,  9, 99, ...,  0,  0,  0],
       ...,
       [13, 98, 93, ...,  0,  0,  0],
       [13, 89,  1, ...,  0,  0,  0],
       [13,  6, 23, ...,  0,  0,  0]], dtype=int32)

In [12]:
generated_tokens.shape

(10459, 100)

In [13]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(df['airline_sentiment'])
y

array([1, 0, 0, ..., 0, 1, 0])

In [14]:
y = np_utils.to_categorical(y)
y

array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], dtype=float32)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(generated_tokens, y, 
                                                    test_size=0.3)
X_test

array([[12, 24, 24, ...,  0,  0,  0],
       [13, 71, 67, ...,  0,  0,  0],
       [ 8, 57, 23, ...,  0,  0,  0],
       ...,
       [16, 99, 22, ...,  0,  0,  0],
       [13, 62, 14, ...,  0,  0,  0],
       [18, 76,  6, ...,  0,  0,  0]], dtype=int32)

In [16]:
model = Sequential()
model.add(Embedding(input_dim=len(token.word_index), output_dim=128, 
                                  input_length=generated_tokens.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0, 
               activation='tanh', recurrent_activation='sigmoid',
               unroll=False, use_bias=True))
model.add(Dense(units=3, activation='softmax'))

In [17]:
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1638656   
                                                                 
 spatial_dropout1d (SpatialD  (None, 100, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 3)                 591       
                                                                 
Total params: 1,894,047
Trainable params: 1,894,047
Non-trainable params: 0
_________________________________________________________________


In [18]:
# batch_size: weights are updated after x registries pass by the net
model.fit(X_train, y_train, epochs=10, batch_size=30, verbose=True,
          validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f57e00dfdc0>

In [19]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Loss', loss)
print('Accuracy: ', accuracy)

Loss 0.828498363494873
Accuracy:  0.6937539577484131


In [20]:
pred = model.predict(X_test)
pred



array([[0.6944568 , 0.14397372, 0.16156946],
       [0.69445676, 0.14397372, 0.16156946],
       [0.69445676, 0.14397372, 0.16156946],
       ...,
       [0.6944568 , 0.14397372, 0.16156946],
       [0.69445676, 0.14397374, 0.16156946],
       [0.69445676, 0.14397374, 0.16156946]], dtype=float32)

# Vader

In [22]:
import nltk

In [23]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [36]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

In [25]:
sentiment_analyzer = SentimentIntensityAnalyzer()

In [26]:
phrase = 'I HATE this movie!!!'
score = sentiment_analyzer.polarity_scores(phrase)
score

{'neg': 0.726, 'neu': 0.274, 'pos': 0.0, 'compound': -0.7437}

- Compound is between -1 (negative) and 1 (positive)

In [27]:
phrase = 'I HATE this movie'
score = sentiment_analyzer.polarity_scores(phrase)
score

{'neg': 0.689, 'neu': 0.311, 'pos': 0.0, 'compound': -0.6633}

In [29]:
phrase = 'I hate this movie!!!'
score = sentiment_analyzer.polarity_scores(phrase)
score

{'neg': 0.696, 'neu': 0.304, 'pos': 0.0, 'compound': -0.6784}

In [31]:
phrase = 'This plot is very old'
score = sentiment_analyzer.polarity_scores(phrase)
score

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [33]:
phrase = ':/'
score = sentiment_analyzer.polarity_scores(phrase)
score

{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.34}

In [34]:
type(score)

dict

In [35]:
score['compound']

-0.34

In [37]:
phrase = TextBlob('The movie was awesome')
phrase.sentiment

Sentiment(polarity=1.0, subjectivity=1.0)

In [38]:
phrase = TextBlob('New York is great')
phrase.sentiment

Sentiment(polarity=0.4681818181818182, subjectivity=0.6022727272727273)

In [39]:
!pip install translate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting translate
  Downloading translate-3.6.1-py2.py3-none-any.whl (12 kB)
Collecting libretranslatepy==2.1.1
  Downloading libretranslatepy-2.1.1-py3-none-any.whl (3.2 kB)
Installing collected packages: libretranslatepy, translate
Successfully installed libretranslatepy-2.1.1 translate-3.6.1


In [40]:
from translate import Translator

In [41]:
translator = Translator(from_lang='pt', to_lang='en')
translation = translator.translate('Isto é uma caneta')
translation

'This is a pen.'

In [42]:
translation = translator.translate('O filme foi bom')
translation

'The movie was good'

In [43]:
score = sentiment_analyzer.polarity_scores(translation)
score

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

# Second Data

In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [67]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Dados/sentiment/Tweets2.csv')
df.head()

Unnamed: 0,id,local,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [68]:
df.shape

(74682, 4)

In [69]:
df.groupby(['sentiment']).size()

sentiment
Irrelevant    12990
Negative      22542
Neutral       18318
Positive      20832
dtype: int64

In [70]:
df.loc[df['sentiment'] == 'Irrelevant', 'sentiment'] = 'Neutral'

In [71]:
df.groupby(['sentiment']).size()

sentiment
Negative    22542
Neutral     31308
Positive    20832
dtype: int64

In [72]:
df = df.dropna(subset=['text'])

In [73]:
df.reset_index(drop=True, inplace=True)

In [74]:
df.shape

(73996, 4)

In [75]:
token = Tokenizer(num_words=100)
token.fit_on_texts(df['text'].values)

In [76]:
generated_tokens = token.texts_to_sequences(df['text'].values)
generated_tokens = pad_sequences(generated_tokens, padding='post', maxlen=100)

In [77]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(df['sentiment'])
y

array([2, 2, 2, ..., 2, 2, 2])

In [78]:
y = np_utils.to_categorical(y)
y

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(generated_tokens, y, 
                                                    test_size=0.4)
X_test

array([[ 2,  2, 29, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [46, 28, 52, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 2, 23,  8, ...,  0,  0,  0],
       [90, 73, 13, ...,  0,  0,  0]], dtype=int32)

In [80]:
model = Sequential()
model.add(Embedding(input_dim=len(token.word_index), output_dim=128, 
                                  input_length=generated_tokens.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(units=196, dropout=0.2, recurrent_dropout=0, 
               activation='tanh', recurrent_activation='sigmoid',
               unroll=False, use_bias=True))
model.add(Dense(units=3, activation='softmax'))

In [81]:
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 128)          4324224   
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 100, 128)         0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 196)               254800    
                                                                 
 dense_2 (Dense)             (None, 3)                 591       
                                                                 
Total params: 4,579,615
Trainable params: 4,579,615
Non-trainable params: 0
_________________________________________________________________


In [82]:
# batch_size: weights are updated after x registries pass by the net
model.fit(X_train, y_train, epochs=5, batch_size=500, verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f572223ffa0>

In [83]:
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: ', accuracy)

Accuracy:  0.41933849453926086


## Vader

In [84]:
sentiment_analyzer = SentimentIntensityAnalyzer()
df['Vader_Sentiment'] = ''

for i in range(len(df.index)):
    score = sentiment_analyzer.polarity_scores(df['text'].iloc[i])
    del score['compound']
    greater = max(score, key=score.get) # neg, pos, neu
    df.loc[i, 'Vader_Sentiment'] = greater

In [85]:
df.groupby(['Vader_Sentiment']).size()

Vader_Sentiment
neg     3660
neu    65581
pos     4755
dtype: int64

In [87]:
df.loc[df['Vader_Sentiment'] == 'neu', 'Vader_Sentiment'] = 'Neutral'
df.loc[df['Vader_Sentiment'] == 'neg', 'Vader_Sentiment'] = 'Negative'
df.loc[df['Vader_Sentiment'] == 'pos', 'Vader_Sentiment'] = 'Positive'

In [88]:
df.groupby(['Vader_Sentiment']).size()

Vader_Sentiment
Negative     3660
Neutral     65581
Positive     4755
dtype: int64

In [89]:
y_pred = df['Vader_Sentiment']
y_test = df['sentiment']
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 2004, 19902,   452],
       [ 1122, 28384,  1477],
       [  534, 17295,  2826]])

In [90]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.44886210065408944