In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Embedding,LSTM,SpatialDropout1D
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import np_utils


In [2]:
Tweets=pd.read_csv('Tweets.csv')

In [3]:
Tweets.head(1)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)


In [4]:
Tweets.groupby(['airline_sentiment']).size()

airline_sentiment
negative    9178
neutral     3099
positive    2363
dtype: int64

In [5]:
Tweets = Tweets[Tweets['airline_sentiment_confidence']>0.8]

In [6]:
token = Tokenizer(num_words = 100)
token.fit_on_texts(Tweets['text'].values)

In [8]:
x=token.texts_to_sequences(Tweets['text'].values)
x = pad_sequences(x,padding='post',maxlen = 100)

In [9]:
print(x)

[[97 62  0 ...  0  0  0]
 [97 99  1 ...  0  0  0]
 [97  9 99 ...  0  0  0]
 ...
 [13 98 93 ...  0  0  0]
 [13 89  1 ...  0  0  0]
 [13  6 23 ...  0  0  0]]


In [10]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(Tweets['airline_sentiment'])
print(y)

[1 0 0 ... 0 1 0]


In [11]:
y = np_utils.to_categorical(y)
print(y)

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [12]:
x_train,xtest,y_train,ytest = train_test_split(x, y, test_size=0.3,random_state=0)

In [13]:
xtest

array([[16, 14, 36, ...,  0,  0,  0],
       [13, 79, 38, ...,  0,  0,  0],
       [ 8, 10, 17, ...,  0,  0,  0],
       ...,
       [16, 14, 57, ...,  0,  0,  0],
       [16, 20, 92, ...,  0,  0,  0],
       [16, 80, 26, ...,  0,  0,  0]])

In [14]:
modelo = Sequential()

modelo.add(Embedding(input_dim=len(token.word_index),output_dim = 128,input_length= x.shape[1]))

modelo.add(SpatialDropout1D(0.2))
#units = quantidade de neuronio
modelo.add(LSTM(units = 196,dropout=0.2,recurrent_dropout=0,activation='tanh',recurrent_activation='sigmoid',unroll=False,use_bias=True))

modelo.add(Dense(units=3,activation='softmax'))

In [15]:
modelo.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1638656   
                                                                 
 spatial_dropout1d (SpatialD  (None, 100, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 3)                 591       
                                                                 
Total params: 1,894,047
Trainable params: 1,894,047
Non-trainable params: 0
_________________________________________________________________


In [16]:
modelo.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [17]:
modelo.fit(x_train,y_train,epochs=10,batch_size=30,verbose=True,validation_data=(xtest,ytest))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x283c2a68ee0>

In [18]:
loss,accuracy = modelo.evaluate(xtest, ytest)
print("loss: ", loss,'\n','acuracia: ',accuracy)

loss:  0.8163799047470093 
 acuracia:  0.7026768922805786


In [19]:
prev = modelo.predict(xtest)
prev

array([[0.7125902 , 0.13638921, 0.15102056],
       [0.7125902 , 0.13638921, 0.15102056],
       [0.7125902 , 0.13638921, 0.15102056],
       ...,
       [0.7125902 , 0.13638921, 0.15102056],
       [0.7125902 , 0.13638921, 0.15102056],
       [0.7125902 , 0.13638921, 0.15102056]], dtype=float32)

# atividade 11 - LSTM

In [50]:
model = Sequential()

model.add(Embedding(input_dim=len(token.word_index),output_dim = 128,input_length= x.shape[1]))

model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.4))
model.add(Dense(3, activation='softmax'))

model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 128)          1638656   
                                                                 
 conv1d_2 (Conv1D)           (None, 100, 32)           12320     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 50, 32)           0         
 1D)                                                             
                                                                 
 bidirectional_2 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 3)                

In [51]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [52]:
model.fit(x_train, y_train,validation_data=(xtest,ytest),batch_size=64, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x283dc3b4be0>

In [53]:
loss1,accuracy1 = model.evaluate(xtest, ytest)
print("loss: ", loss1,'\n','acuracia: ',accuracy1)

loss:  0.6433443427085876 
 acuracia:  0.7896749377250671


In [54]:
prev1 = model.predict(xtest)
prev1

array([[3.6864319e-01, 4.9890846e-01, 1.3244836e-01],
       [9.1762817e-01, 8.0004156e-02, 2.3676222e-03],
       [9.7566515e-01, 8.2137771e-03, 1.6121060e-02],
       ...,
       [9.9468106e-01, 4.7343168e-03, 5.8458361e-04],
       [4.5383140e-01, 3.0426808e-02, 5.1574183e-01],
       [9.9838483e-01, 1.3172980e-03, 2.9792110e-04]], dtype=float32)