In [1]:
import pandas as pd
import re

In [2]:
data=pd.read_csv('Sentiment.csv')
data=data[['text','sentiment']]

In [3]:
data=data[data['sentiment']!='Neutral']

In [4]:
data['text']=data['text'].apply(lambda x:x.lower())
data['text']=data['text'].apply(lambda x:re.sub('[^a-zA-Z0-9\s]','',x))

In [5]:
for idx,row in data.iterrows():
    row[0]=row[0].replace('rt',' ')

In [6]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [7]:
tokenizer=Tokenizer(num_words=2000,split=' ') #select 2000 most frequent words

In [8]:
tokenizer.fit_on_texts(data['text'].values)  #texts contains lists, we assume each entry of the lists to be a token.

In [9]:
x=tokenizer.texts_to_sequences(data['text'].values)   #Transforms each sequence into a list of text

In [10]:
from keras.preprocessing.sequence import pad_sequences
x=pad_sequences(x)    #insert 0 at the front

In [11]:
x

array([[   0,    0,    0, ..., 1317, 1403,  742],
       [   0,    0,    0, ...,  233,  723,   17],
       [   0,    0,    0, ...,  207,  371,  670],
       ...,
       [   0,    0,    0, ...,   72,   65,    3],
       [   0,    0,    0, ..., 1018, 1417,   73],
       [   0,    0,    0, ...,  197,    3,  721]], dtype=int32)

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim=128
lstm_out=196
model=Sequential()
model.add(Embedding(2000,embed_dim,input_length=x.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 28, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [13]:
y=pd.get_dummies(data['sentiment']).values

In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=42)

In [15]:
batch_size=32
model.fit(x_train,y_train,epochs=7,batch_size=batch_size,verbose=2)

Instructions for updating:
Use tf.cast instead.
Epoch 1/7
 - 19s - loss: 0.4359 - accuracy: 0.8154
Epoch 2/7
 - 18s - loss: 0.3231 - accuracy: 0.8688
Epoch 3/7
 - 18s - loss: 0.2790 - accuracy: 0.8866
Epoch 4/7
 - 18s - loss: 0.2557 - accuracy: 0.8972
Epoch 5/7
 - 19s - loss: 0.2281 - accuracy: 0.9089
Epoch 6/7
 - 19s - loss: 0.2074 - accuracy: 0.9147
Epoch 7/7
 - 19s - loss: 0.1842 - accuracy: 0.9264


<keras.callbacks.callbacks.History at 0x7febe203f978>

In [16]:
validation_size=1500
x_validate=x_test[-validation_size:]
y_validate=y_test[-validation_size:]
x_test=x_test[:-validation_size]
y_test=y_test[:-validation_size]
score,acc=model.evaluate(x_test,y_test,verbose=2,batch_size=batch_size)
print("score: %.2f"%score)
print("accuracy: %2f"%acc)

score: 0.40
accuracy: 0.833905


In [18]:
import numpy as np
pos_crt,neg_crt,pos_cnt,neg_cnt=0,0,0,0
for x in range(len(x_validate)):
    result=model.predict(x_validate[x].reshape(1,x_test.shape[1]),batch_size=1,verbose=2)
    if np.argmax(result)==np.argmax(y_validate[x]):
        if np.argmax(result)==0:
            neg_crt+=1
        else:
            pos_crt+=1
    if np.argmax(result)==0:
        neg_cnt+=1
    else:
        pos_cnt+=1

In [19]:
print("pos_acc", pos_crt/pos_cnt*100, "%")
print("neg_acc", neg_crt/neg_cnt*100, "%")

pos_acc 62.96296296296296 %
neg_acc 89.85868661679135 %
