<h3> Importing Libraries </h3>

In [63]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

<h3> Reducing the scope of clean data </h3>

In [64]:
data = pd.read_csv(r'Twitter_data.csv')
data.category=data.category.astype(str)
data = data[['clean_text','category']]

In [65]:
data = data[ data['category'] != '0.0']

In [66]:
data

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
8,with upcoming election india saga going import...,1.0
...,...,...
162972,engine growth modi unveils indias first 12000 ...,1.0
162973,modi promised 2014 lok sabha elections that be...,1.0
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0


In [67]:
data['category'] = data['category'].apply(lambda x: 'Negative' if x == '-1.0' else 'Positive')

print(data[ data['category'] == 'Positive'].size)
print(data[ data['category'] == 'Negative'].size)

144514
71020


In [69]:
data.clean_text=data.clean_text.astype(str)

<h3> Vectorizing and converting text to sequences </h3>

In [80]:
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['clean_text'].values)
X = tokenizer.texts_to_sequences(data['clean_text'].values)
X = pad_sequences(X)

In [86]:
X

array([[   0,    0,    0, ...,    3,   60,    3],
       [   0,    0,    0, ...,    1,   60,   42],
       [   0,    0,    0, ...,  353,   19,  841],
       ...,
       [   0,    0,    0, ...,   74,  328, 1744],
       [   0,    0,    0, ...,  216,  220, 1555],
       [   0,    0,    0, ...,    7,  147,    4]])

<h3> LSTM Network </h3>

In [81]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 48, 128)           256000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 48, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


<h3> Splitting the dataset </h3>

In [82]:
Y = pd.get_dummies(data['category']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.5, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(53883, 48) (53883, 2)
(53884, 48) (53884, 2)


<h3> Training the model </h3>

In [83]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7
 - 255s - loss: 0.3503 - accuracy: 0.8460
Epoch 2/7
 - 253s - loss: 0.2266 - accuracy: 0.9105
Epoch 3/7
 - 256s - loss: 0.2023 - accuracy: 0.9207
Epoch 4/7
 - 253s - loss: 0.1892 - accuracy: 0.9278
Epoch 5/7
 - 255s - loss: 0.1779 - accuracy: 0.9313
Epoch 6/7
 - 253s - loss: 0.1660 - accuracy: 0.9379
Epoch 7/7
 - 252s - loss: 0.1564 - accuracy: 0.9413


<keras.callbacks.callbacks.History at 0x1ea73d79c48>

<h3> Testing the model </h3>

In [84]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.22
acc: 0.92


In [85]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 96.33663366336633 %
neg_acc 86.73469387755102 %
