In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN, SpatialDropout1D

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
data = pd.read_csv("sentiment22-final.csv")
data.head()

Unnamed: 0,App_name,Category,User_Review,Sentiments
0,brainly,Education,�Horrible. This app has changed so much. I've ...,Negative
1,brainly,Education,The app is not responding,Negative
2,brainly,Education,Not working properly plz fix the problem,Negative
3,brainly,Education,App is not good,Negative
4,brainly,Education,not working properly,Negative


In [3]:
classes = data['Sentiments']
print(classes.value_counts())

Positive    6177
Negative    3136
Neutral      686
Name: Sentiments, dtype: int64


In [4]:
data.shape

(9999, 4)

In [5]:
data = data[['User_Review', 'Sentiments']]
data.head()

Unnamed: 0,User_Review,Sentiments
0,�Horrible. This app has changed so much. I've ...,Negative
1,The app is not responding,Negative
2,Not working properly plz fix the problem,Negative
3,App is not good,Negative
4,not working properly,Negative


In [7]:
def clean_train_data(x):
    text = x
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) # remove square brackets
    text = re.sub(r'[^\w\s]','',text) # remove punctuation
    text = re.sub('\w*\d\w*', '', text) # remove words containing numbers
    text = re.sub('\n', '', text)
    return text

In [8]:

data['User_Review'] = data.User_Review.apply(lambda x : clean_train_data(x))
data.head()

Unnamed: 0,User_Review,Sentiments
0,horrible this app has changed so much ive plac...,Negative
1,the app is not responding,Negative
2,not working properly plz fix the problem,Negative
3,app is not good,Negative
4,not working properly,Negative


In [11]:
all_cat_data = data.copy()

# 2 Class Analysis

In [9]:
data = data[data['Sentiments'] != 'Neutral']
data.head()

Unnamed: 0,User_Review,Sentiments
0,horrible this app has changed so much ive plac...,Negative
1,the app is not responding,Negative
2,not working properly plz fix the problem,Negative
3,app is not good,Negative
4,not working properly,Negative


In [10]:
print(len(data[data['Sentiments'] == 'Positive']))
print(len(data[ data['Sentiments'] == 'Negative']))
print(len(data[ data['Sentiments'] == 'Neutral']))

6177
3136
0


In [11]:
model1_data = data.copy()

In [12]:
max_features = 2000
token = Tokenizer(num_words=max_features, split = ' ')
token.fit_on_texts(data['User_Review'].values)

X = token.texts_to_sequences(data['User_Review'].values)
X = pad_sequences(X)

In [13]:
X.shape

(9313, 118)

In [14]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 118, 128)          256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 118, 128)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [15]:
Y = pd.get_dummies(data['Sentiments']).values

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.30, random_state=42)

In [17]:
batch_size = 32
history = model.fit(X_train, y_train, epochs=10, batch_size=batch_size, verbose=2)

Epoch 1/10
204/204 - 70s - loss: 0.4251 - accuracy: 0.8087
Epoch 2/10
204/204 - 71s - loss: 0.2695 - accuracy: 0.9061
Epoch 3/10
204/204 - 71s - loss: 0.2319 - accuracy: 0.9189
Epoch 4/10
204/204 - 72s - loss: 0.2057 - accuracy: 0.9297
Epoch 5/10
204/204 - 70s - loss: 0.1904 - accuracy: 0.9348
Epoch 6/10
204/204 - 70s - loss: 0.1732 - accuracy: 0.9417
Epoch 7/10
204/204 - 69s - loss: 0.1747 - accuracy: 0.9408
Epoch 8/10
204/204 - 69s - loss: 0.1563 - accuracy: 0.9474
Epoch 9/10
204/204 - 69s - loss: 0.1476 - accuracy: 0.9495
Epoch 10/10
204/204 - 69s - loss: 0.1465 - accuracy: 0.9486


In [18]:
# score = model.predict(X_test)
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=2)
print('score', score)
print('accuracy', acc)

88/88 - 3s - loss: 0.3134 - accuracy: 0.9123
score 0.31344369053840637
accuracy 0.9123120903968811


In [19]:
text = ['i would recommend it if you have no other options']
text = token.texts_to_sequences(text)
text = pad_sequences(text, maxlen=28, dtype='int32', value=0)
res = model.predict(text, batch_size=1,verbose = 2)
res

1/1 - 0s


array([[0.01426646, 0.9857335 ]], dtype=float32)

In [20]:
if np.argmax(res[0]) == 0:
    print("Negetive Comment")
elif np.argmax(res[0]) == 1:
    print("Positive Comment")

Positive Comment


# Multiclass Sentiment Analysis

### data preprocessing

In [12]:
ms_data = all_cat_data.copy()

In [22]:
ms_data.head()

Unnamed: 0,User_Review,Sentiments
0,horrible this app has changed so much ive plac...,Negative
1,the app is not responding,Negative
2,not working properly plz fix the problem,Negative
3,app is not good,Negative
4,not working properly,Negative


In [13]:
num_of_rows = 4000
shuffled = ms_data.reindex(np.random.permutation(ms_data.index))
nt = shuffled[shuffled['Sentiments'] == 'Neutral'][:num_of_rows]
ng = shuffled[shuffled['Sentiments'] == 'Negative'][:num_of_rows]
ps = shuffled[shuffled['Sentiments'] == 'Positive'][:num_of_rows]
combine_data = pd.concat([nt, ng, ps], ignore_index=True)
combine_data = combine_data.reindex(np.random.permutation(combine_data.index))
combine_data['label'] = 0
combine_data.head()

Unnamed: 0,User_Review,Sentiments,label
912,very worst app,Negative,0
5970,useful and handy,Positive,0
6603,excellent excercise for the brain fun exciting,Positive,0
4527,nice app,Positive,0
1514,sign up problems,Negative,0


In [14]:
classes = data['Sentiments']
print(classes.value_counts())

Positive    6177
Negative    3136
Neutral      686
Name: Sentiments, dtype: int64


In [15]:
print(len(combine_data[combine_data['Sentiments'] == 'Neutral']))
print(len(combine_data[combine_data['Sentiments'] == 'Negative']))
print(len(combine_data[combine_data['Sentiments'] == 'Positive']))

686
3136
4000


In [16]:
ms_data = combine_data.copy()

In [17]:
print(len(ms_data[ms_data['Sentiments'] == 'Neutral']))
print(len(ms_data[ms_data['Sentiments'] == 'Negative']))
print(len(ms_data[ms_data['Sentiments'] == 'Positive']))

686
3136
4000


In [18]:
ms_data.loc[ms_data['Sentiments'] == 'Neutral', 'label'] = 0
ms_data.loc[ms_data['Sentiments'] == 'Negative', 'label'] = 1
ms_data.loc[ms_data['Sentiments'] == 'Positive', 'label'] = 2

In [19]:
ms_data.head(10)

Unnamed: 0,User_Review,Sentiments,label
912,very worst app,Negative,1
5970,useful and handy,Positive,2
6603,excellent excercise for the brain fun exciting,Positive,2
4527,nice app,Positive,2
1514,sign up problems,Negative,1
1697,unreadable much wiese than online version,Negative,1
5195,knowing the meaning of words can be useful in ...,Positive,2
1713,nothing is working,Negative,1
67,most helpful but sometimes videos are not working,Neutral,0
7605,i love this app,Positive,2


In [20]:
from keras.utils import to_categorical

In [21]:
labels = to_categorical(ms_data['label'], num_classes=3)

In [22]:
labels.shape

(7822, 3)

In [23]:
labels[:10]

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]], dtype=float32)

In [24]:
max_features = 3000
max_len = 130
ms_token = Tokenizer(num_words=max_features)
ms_token.fit_on_texts(ms_data['User_Review'].values)
ms_sequences = ms_token.texts_to_sequences(ms_data['User_Review'].values)
X = pad_sequences(ms_sequences, maxlen=max_len)

In [25]:
word_index = ms_token.word_index
print('Found %s unique tokens.' % len(word_index))

Found 5160 unique tokens.


In [26]:
y = labels

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42)

In [28]:
embed_dim = 128
lstm_out = 96

In [29]:
ms_model = Sequential()
ms_model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
ms_model.add(SpatialDropout1D(0.7))
ms_model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
ms_model.add(Dense(3, activation='softmax'))
ms_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

ms_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 130, 128)          384000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 130, 128)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 3)                 195       
Total params: 433,603
Trainable params: 433,603
Non-trainable params: 0
_________________________________________________________________


In [31]:
from keras.callbacks import EarlyStopping

In [32]:
batch_size = 50
ms_history = ms_model.fit(X_train, y_train, epochs=20, batch_size=batch_size,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.0001)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [34]:
from sklearn.metrics import precision_score


In [35]:
loss, accuracy = ms_model.evaluate(X_test, y_test)
print("loss", loss)
print("accuracy", accuracy)


loss 0.4054883122444153
accuracy 0.8713250756263733


In [42]:
from sklearn import metrics
con = metrics.confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1))

In [38]:
predictions = ms_model.predict(X_test)
predictions

array([[1.18352444e-04, 1.09003996e-02, 9.88981247e-01],
       [1.29490690e-02, 6.92314655e-02, 9.17819440e-01],
       [5.84118068e-04, 5.09797316e-03, 9.94317949e-01],
       ...,
       [2.21776980e-04, 9.98198330e-01, 1.57989271e-03],
       [1.79843057e-03, 1.87429547e-01, 8.10772002e-01],
       [4.63938743e-01, 1.39752626e-01, 3.96308631e-01]], dtype=float32)

In [39]:
pd.crosstab(y_test.argmax(axis=1), predictions.argmax(axis=1), rownames=['True'], colnames=['Predicted'], margins=True)


Predicted,0,1,2,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,164,24,22,210
1,12,801,125,938
2,20,99,1080,1199
All,196,924,1227,2347


In [40]:
metrics.confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1))


array([[ 164,   24,   22],
       [  12,  801,  125],
       [  20,   99, 1080]], dtype=int64)

In [43]:
precision = np.diag(con) / (np.diag(con) +(con.sum(axis=0) - np.diag(con)))
precision


array([0.83673469, 0.86688312, 0.8801956 ])

In [44]:
recall = np.diag(con)/ ((con.sum(axis=1) - np.diag(con)) + np.diag(con))
recall

array([0.78095238, 0.85394456, 0.90075063])

In [45]:
f1_score = 2 * ((precision * recall) / (precision + recall))
f1_score

array([0.80788177, 0.8603652 , 0.89035449])

# 70:30

In [48]:
batch_size = 50
ms_history = ms_model.fit(X_train, y_train, epochs=20, batch_size=batch_size,validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.0001)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [49]:
loss, accuracy = ms_model.evaluate(X_test, y_test)
print("loss", loss)
print("accuracy", accuracy)


loss 0.44677942991256714
accuracy 0.8772901296615601


In [50]:
precision = np.diag(con) / (np.diag(con) +(con.sum(axis=0) - np.diag(con)))
precision

array([0.83673469, 0.86688312, 0.8801956 ])