In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Model is using 10000 positive and 10000 negative
# 5000 tweets are general while 5000 are airlines for positive sentiments
# 2000 are disaster tweets, 5000 tweets are negative airline sentiments and 3000 general sentiments

Using TensorFlow backend.


In [2]:
data = pd.read_csv('clean_tweet_model6.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

In [3]:
test_data = pd.read_csv('news_tests.csv')
# Keeping only the neccessary columns
test_data = test_data[['text','sentiment']]

In [4]:
data['text'] = data['text'].apply(lambda x: str(x))
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data['sentiment'] == 'Positive'].size)
print(data[ data['sentiment'] == 'Negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X_train = tokenizer.texts_to_sequences(data['text'].values)
X_train = pad_sequences(X_train)

20000
19994


In [89]:
test_data['text'] = test_data['text'].apply(lambda x: str(x))
test_data['text'] = test_data['text'].apply(lambda x: x.lower())
test_data['text'] = test_data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(test_data[ test_data['sentiment'] == 'Positive'].size)
print(test_data[ test_data['sentiment'] == 'Negative'].size)

for idx,row in test_data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
tokenizer_test = Tokenizer(num_words=max_features, split=' ')
tokenizer_test.fit_on_texts(test_data['text'].values)
X_test = tokenizer.texts_to_sequences(test_data['text'].values)
X_test = pad_sequences(X_test, maxlen=X_train.shape[1])

2008
2028


In [6]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 211, 128)          256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 211, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [90]:
Y_train = pd.get_dummies(data['sentiment']).values
Y_test = pd.get_dummies(test_data['sentiment']).values
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(19997, 211) (19997, 2)
(2018, 211) (2018, 2)


In [8]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 2)

Epoch 1/10
 - 393s - loss: 0.4224 - acc: 0.8023
Epoch 2/10
 - 401s - loss: 0.3018 - acc: 0.8739
Epoch 3/10
 - 399s - loss: 0.2647 - acc: 0.8892
Epoch 4/10
 - 4577s - loss: 0.2418 - acc: 0.8999
Epoch 5/10
 - 51843s - loss: 0.2215 - acc: 0.9109
Epoch 6/10
 - 442s - loss: 0.2039 - acc: 0.9176
Epoch 7/10
 - 482s - loss: 0.1888 - acc: 0.9249
Epoch 8/10
 - 403s - loss: 0.1743 - acc: 0.9319
Epoch 9/10
 - 441s - loss: 0.1610 - acc: 0.9383
Epoch 10/10
 - 436s - loss: 0.1484 - acc: 0.9427


<keras.callbacks.History at 0x12337d208>

In [20]:
model.save('my_model7.h5')

In [91]:
# validation_size = 1000
# print(X_test)
# X_validate = X_test[-validation_size:]
# Y_validate = Y_test[-validation_size:]
# X_test = X_test[:-validation_size]
# Y_test = Y_test[:-validation_size]
X_validate = X_test
Y_validate = Y_test
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.23
acc: 0.92


In [93]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1
     
    
print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

40
please stay out of hong kong the riots there are insanely messy
Negative
[1 0]
[0.24292727 0.75707275]
82
please stay out of hong kong the riots there are insanely messy
Negative
[1 0]
[0.24292727 0.75707275]
143
the grenade which the terrorists threw at the camp exploded
Negative
[1 0]
[0.06641155 0.93358845]
149
swearing dishonesty and murder and theft and adultery are rife crime follows upon crime
Negative
[1 0]
[0.34687218 0.65312785]
160
extreme weather is set to compound the sectors slide deeper into debt
Negative
[1 0]
[0.47359404 0.52640593]
188
protests have resulted in several road closures in cape town atlantis
Negative
[1 0]
[0.3720579 0.6279421]
192
basra protests us closes consulate after weeks of deadly protests
Negative
[1 0]
[0.1691438  0.83085614]
280
palestine savepalestine saveaqsa istandwithpalestine standwithaqsa palestineinmyhea  israeliswarcriminal boycotisrael freepalestine
Negative
[1 0]
[0.19537164 0.8046284 ]
368
cholera death toll and suspected cholera c

1230
mate i was on yesterday about perl
Positive
[0 1]
[0.50032836 0.4996717 ]
1231
currently riding in the back seat of my car lol i decided to be nice and let the two tall guys drive and ride shotgun
Positive
[0 1]
[0.94139564 0.05860438]
1232
on da bridge goin back home
Positive
[0 1]
[0.9807968  0.01920323]
1236
second place is first loser but not when the second place shi s look better than first places
Positive
[0 1]
[0.9699828  0.03001717]
1238
hope so too except this subject still touches on font despite promising css lectures it doesn t even look at js at all
Positive
[0 1]
[0.997747   0.00225301]
1263
updating weed wiki chilling with grape fruit and og kush that s organically grown kush not original gangster
Positive
[0 1]
[0.9438333  0.05616671]
1268
woohooo congrats on the two of you how does it feel to finally not be living in sin
Positive
[0 1]
[0.9500257  0.04997428]
1277
new picture but why is my old one showing
Positive
[0 1]
[0.6874487  0.31255132]
1278
lmao pediphile

1855
dont go to england yet come to california pick me up we go to vegas and then we go back to my lovely brazil say yes
Positive
[0 1]
[0.80791026 0.19208977]
1869
the vaccine is therapeutic not preventitive so you really should hope never to need it
Positive
[0 1]
[0.85845125 0.14154871]
1882
hey twilight must be winner x live live
Positive
[0 1]
[0.61223876 0.38776124]
1883
has decided to reactivate his ym
Positive
[0 1]
[0.9831028  0.01689717]
1893
still alive and got one hell of a story to tell you will be back on tuesday to fill you in
Positive
[0 1]
[0.7481956 0.2518044]
1900
hey my kellan and ashley x live live
Positive
[0 1]
[0.76565695 0.23434307]
1902
no great pels here someday in the meantime a pilot isaac newton pr burgundy an estie di steel bl will do
Positive
[0 1]
[0.58084154 0.41915846]
1904
yes nudity is the answer i ll spread the good word for a glimpse of that
Positive
[0 1]
[0.64302707 0.3569729 ]
1914
hey its been a long day dont hate
Positive
[0 1]
[0.9710263  0.

In [110]:
twt = ['there is an hour delay at the train station']
#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the tweet to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=211, dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [101]:
model.save('my_model7.h5')