In [72]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [73]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Pichau\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [74]:
tw = pd.read_csv('Tweets2.csv')
tw.head(3)

Unnamed: 0,id,local,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...


In [75]:
tw.groupby(['sentiment']).size()

sentiment
Irrelevant    12990
Negative      22542
Neutral       18318
Positive      20832
dtype: int64

In [76]:
tw.loc[tw['sentiment']=='Irrelevant','sentiment']='Neutral'
tw = tw.dropna(subset=['text'])
tw.reset_index(drop=True,inplace=True)
tw.shape

(73996, 4)

# Supervisinado

In [77]:
token = Tokenizer(num_words=100)
token.fit_on_texts(tw['text'].values)

In [78]:
x = token.texts_to_sequences(tw['text'].values)
x = pad_sequences(x,padding = 'post',maxlen = 100)

In [79]:
lber = LabelEncoder()
y = lber.fit_transform(tw['sentiment'])
y

array([2, 2, 2, ..., 2, 2, 2])

In [80]:
y = np_utils.to_categorical(y)
y

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [81]:
x_train, xtest,y_train,ytest = train_test_split(x,y,test_size = 0.4,random_state = 0)
xtest

array([[49, 39, 38, ...,  0,  0,  0],
       [12, 85, 85, ...,  0,  0,  0],
       [59,  3,  1, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [65, 65, 16, ...,  0,  0,  0],
       [59, 13, 64, ...,  0,  0,  0]])

# AULA 11 - LSTM

In [82]:
modelo = Sequential()

modelo.add(Embedding(input_dim = len(token.word_index), output_dim = 128, input_length = x.shape[1]))
modelo.add(SpatialDropout1D(0.2))
modelo.add(LSTM(units=196,dropout = 0.2,recurrent_dropout = 0, activation = 'tanh',recurrent_activation = 'sigmoid',unroll = False,use_bias = True))
modelo.add(Dense(units = 3,activation = 'softmax'))

In [83]:
modelo.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
modelo.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 128)          4324224   
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 100, 128)         0         
 lDropout1D)                                                     
                                                                 
 lstm_4 (LSTM)               (None, 196)               254800    
                                                                 
 dense_3 (Dense)             (None, 3)                 591       
                                                                 
Total params: 4,579,615
Trainable params: 4,579,615
Non-trainable params: 0
_________________________________________________________________


In [84]:
modelo.fit(x_train,y_train, epochs = 5 , batch_size = 500,verbose = True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x25990fb3cd0>

In [85]:
_,accuracy = modelo.evaluate(xtest,ytest)
print('accuracy: ',accuracy)

accuracy:  0.4233926832675934


In [86]:
prev = modelo.predict(xtest)
prev

array([[0.3107088 , 0.41157243, 0.27771878],
       [0.3107088 , 0.41157243, 0.27771878],
       [0.3107088 , 0.41157243, 0.27771878],
       ...,
       [0.3107088 , 0.41157243, 0.27771878],
       [0.3107088 , 0.41157243, 0.27771878],
       [0.3107088 , 0.41157243, 0.27771878]], dtype=float32)

# atividade  - Supervisionado

In [87]:
from keras.layers import Conv1D, MaxPooling1D,Bidirectional,Dropout
model = Sequential()

model.add(Embedding(input_dim = len(token.word_index), output_dim = 128, input_length = x.shape[1]))

model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.4))
model.add(Dense(3, activation='softmax'))



In [88]:
model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 100, 128)          4324224   
                                                                 
 conv1d_4 (Conv1D)           (None, 100, 32)           12320     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 50, 32)           0         
 1D)                                                             
                                                                 
 bidirectional_2 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 3)               

In [89]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [90]:
model.fit(x_train,y_train, epochs = 5 , batch_size = 500,verbose = True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x259a408feb0>

In [91]:
_,accuracy = model.evaluate(xtest,ytest)
print('accuracy: ',accuracy)

accuracy:  0.5320788025856018


In [92]:
prev1 = model.predict(xtest)
prev1

array([[0.44269657, 0.34488168, 0.21242179],
       [0.3625981 , 0.4951298 , 0.14227214],
       [0.18417417, 0.4594543 , 0.35637158],
       ...,
       [0.30174053, 0.3422556 , 0.35600385],
       [0.11905987, 0.8246252 , 0.05631497],
       [0.05312379, 0.7575621 , 0.1893141 ]], dtype=float32)

# Vader

In [93]:

mas = SentimentIntensityAnalyzer()
tw['vader_sentiment'] = ''

for y  in range(len(tw.index)):
    x = mas.polarity_scores(tw['text'].iloc[y])
    del x['compound']
    maior = max(x,key = x.get)
    tw.loc[y,'vander_sentiment'] = maior

In [94]:
tw.groupby(['vander_sentiment']).size()

vander_sentiment
neg     3657
neu    65590
pos     4749
dtype: int64

In [95]:
tw.groupby(['sentiment']).size()

sentiment
Negative    22358
Neutral     30983
Positive    20655
dtype: int64

In [96]:
tw.loc[tw['vander_sentiment'] =='neu','vander_sentiment'] ='Neutral'
tw.loc[tw['vander_sentiment']=='neg','vander_sentiment']='Negative'
tw.loc[tw['vander_sentiment']=='pos','vander_sentiment']='Positive'

In [97]:
tw.groupby(['vander_sentiment']).size()

vander_sentiment
Negative     3657
Neutral     65590
Positive     4749
dtype: int64

In [99]:
y_pred = tw['vander_sentiment']
ytest = tw['sentiment']
cm = confusion_matrix(ytest,y_pred)
cm

array([[ 2002, 19904,   452],
       [ 1121, 28386,  1476],
       [  534, 17300,  2821]], dtype=int64)

In [100]:
accuracy = accuracy_score(ytest,y_pred)
accuracy

0.4487945294340235