# Urdu Sentiment Analysis: Keras LSTM Implementation

In [1]:
# Import the necessary packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import re
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

from keras.constraints import max_norm
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell
# import plotly.figure_factory as ff
# InteractiveShell.ast_node_interactivity = 'all'
# from plotly.offline import iplot
# from PreProcessing import Cleaned_X_Y

In [2]:
# Load the labelled corpus from a CSV file in the working directory.
DATASET_PATH = 'urdu_sentiment.csv'
data = pd.read_csv(DATASET_PATH)

In [3]:
# Raw text of every example: the `sentence` column, kept as a pandas Series.
X_data = data.loc[:, 'sentence']
print(X_data)

0        میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...
1        چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...
2                                 ٹویٹر کا خیال کیسے آیا ؟
3          ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ
4         گندی زبان اور گٹر جیسے دماغ والے جاهل جیالے ه...
                               ...                        
29594                                                   حس
29595                                                   حل
29596                                                   سا
29597                                                   عز
29598                                                   فن
Name: sentence, Length: 29599, dtype: object


In [12]:
# Target for every example: the `review` column (one-hot encoded further down).
y_data = data.loc[:, 'review']

In [13]:
#print("Data available and its shapes: X : {}, Y : {}".format(X_data.shape, y_data.shape))

In [15]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and used as the
# input dimension of the Embedding layer.
MAX_NB_WORDS = 20_000
# Length (in tokens) that every sentence is padded or truncated to.
MAX_SEQUENCE_LENGTH = 50
# Size of the vector the Embedding layer learns for each vocabulary entry.
EMBEDDING_DIM = 300

In [16]:
# Word-level tokenizer limited to MAX_NB_WORDS entries; the characters listed
# in `filters` are stripped from the text before splitting.
# The filter list is a raw string so that `\]` is a literal backslash followed
# by `]` instead of an invalid escape sequence (which newer Python versions
# warn about). The resulting characters are unchanged.
# NOTE(review): the next cell re-creates this tokenizer before fitting it, so
# this instantiation is redundant and the cell could be deleted.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)


In [17]:
# Build the vocabulary from the corpus.
# The filter list is a raw string so that `\]` is a literal backslash followed
# by `]` instead of an invalid escape sequence (which newer Python versions
# warn about). The resulting characters are unchanged.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(X_data)

# `word_index` maps every token seen while fitting to its integer id, so its
# size can exceed MAX_NB_WORDS; the `num_words` cap only takes effect when
# texts are converted to sequences.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 22515 unique tokens.


In [18]:
# Encode each sentence as a sequence of vocabulary ids, then pad/truncate all
# sequences to MAX_SEQUENCE_LENGTH so they stack into one 2-D array.
X = pad_sequences(
    tokenizer.texts_to_sequences(X_data),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (29599, 50)


In [19]:
# One-hot encode the targets: one column per distinct `review` value.
# The dtype is pinned explicitly because the `get_dummies` default depends on
# the installed pandas version (uint8 before 2.0, bool from 2.0 onwards);
# float32 gives the model numeric labels regardless of version.
Y = pd.get_dummies(y_data, dtype='float32').values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (29599, 3)


In [20]:
# Hold out 10% of the examples for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=42,
)
print(f'{X_train.shape} {Y_train.shape}')
print(f'{X_test.shape} {Y_test.shape}')

(26639, 50) (26639, 3)
(2960, 50) (2960, 3)


In [21]:
# Build and compile the classifier:
#   Embedding -> SpatialDropout1D -> LSTM(60) -> Dense(3, softmax)
model = Sequential()
# Vocabulary ids -> EMBEDDING_DIM-dimensional vectors; the sequence length is
# taken from the padded input array.
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.3))
# Dropout is applied to both the inputs and the recurrent state, and the
# kernel, recurrent and bias weights are each constrained with max_norm(3).
model.add(
    LSTM(
        60,
        dropout=0.3,
        recurrent_dropout=0.3,
        kernel_constraint=max_norm(3),
        recurrent_constraint=max_norm(3),
        bias_constraint=max_norm(3),
    )
)
# One output unit per class, matching the width of the one-hot label tensor.
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called directly
# rather than wrapped in print(), which would emit a trailing "None".
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 300)           6000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 300)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 60)                86640     
                                                                 
 dense (Dense)               (None, 3)                 183       
                                                                 
Total params: 6,086,823
Trainable params: 6,086,823
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
from keras.models import load_model

# Classify new text with the model previously saved to disk.
# NOTE(review): this loads 'final_senti.h5' rather than using the `model`
# built above; it presumably was trained with the same tokenizer settings and
# sequence length - confirm.
modellll = load_model('final_senti.h5')
new_complaint = ['وہ ایک آدمی ہے']

# Apply the same preprocessing as the training data: vocabulary ids, then
# padding/truncation to MAX_SEQUENCE_LENGTH.
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = modellll.predict(padded)

# NOTE(review): the label order is assumed to match the column order of the
# one-hot encoded targets - verify against the `review` values.
labels = ['Negative','Neutral','Positive']

# Take the argmax per row (axis=1). Without an axis, argmax indexes the
# flattened (batch, classes) array, which is only valid for a single input.
predicted_labels = [labels[class_id] for class_id in np.argmax(pred, axis=1)]
print(pred, *predicted_labels)


[[0.17966363 0.44262362 0.37771273]] Neutral
