In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN, SpatialDropout1D

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [5]:
data = pd.read_csv("Nega.csv")
data.head()

Unnamed: 0,App_name,Category,User_Review,Sentiments,Review_Type
0,brainly,Education,Not working properly plz fix the problem,Negative,AC
1,brainly,Education,"App is not good, stuck in between",Negative,AC
2,brainly,Education,not working properly,Negative,AC
3,brainly,Education,app is not working,Negative,AC
4,brainly,Education,this app will be not started some issue,Negative,AC


In [6]:
classes = data['Sentiments']
print(classes.value_counts())

Negative    9112
Name: Sentiments, dtype: int64


In [4]:
data.shape

(9112, 5)

In [7]:
data = data[['User_Review', 'Review_Type']]
data.head()

Unnamed: 0,User_Review,Review_Type
0,Not working properly plz fix the problem,AC
1,"App is not good, stuck in between",AC
2,not working properly,AC
3,app is not working,AC
4,this app will be not started some issue,AC


In [8]:
def clean_train_data(x):
    text = x
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) # remove square brackets
    text = re.sub(r'[^\w\s]','',text) # remove punctuation
    text = re.sub('\w*\d\w*', '', text) # remove words containing numbers
    text = re.sub('\n', '', text)
    return text

In [9]:

data['User_Review'] = data.User_Review.apply(lambda x : clean_train_data(x))
data.head()

Unnamed: 0,User_Review,Review_Type
0,not working properly plz fix the problem,AC
1,app is not good stuck in between,AC
2,not working properly,AC
3,app is not working,AC
4,this app will be not started some issue,AC


In [10]:
all_cat_data = data.copy()

# 2 Class Analysis

In [11]:
data = data[data['Sentiments'] != 'Neutral']
data.head()

KeyError: 'Sentiments'

In [10]:
print(len(data[data['Sentiments'] == 'Positive']))
print(len(data[ data['Sentiments'] == 'Negative']))
print(len(data[ data['Sentiments'] == 'Neutral']))

6177
3136
0


In [11]:
model1_data = data.copy()

In [12]:
max_features = 2000
token = Tokenizer(num_words=max_features, split = ' ')
token.fit_on_texts(data['User_Review'].values)

X = token.texts_to_sequences(data['User_Review'].values)
X = pad_sequences(X)

In [13]:
X.shape

(9313, 118)

In [14]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 118, 128)          256000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 118, 128)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [15]:
Y = pd.get_dummies(data['Sentiments']).values

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.30, random_state=42)

In [17]:
batch_size = 32
history = model.fit(X_train, y_train, epochs=10, batch_size=batch_size, verbose=2)

Epoch 1/10
204/204 - 70s - loss: 0.4251 - accuracy: 0.8087
Epoch 2/10
204/204 - 71s - loss: 0.2695 - accuracy: 0.9061
Epoch 3/10
204/204 - 71s - loss: 0.2319 - accuracy: 0.9189
Epoch 4/10
204/204 - 72s - loss: 0.2057 - accuracy: 0.9297
Epoch 5/10
204/204 - 70s - loss: 0.1904 - accuracy: 0.9348
Epoch 6/10
204/204 - 70s - loss: 0.1732 - accuracy: 0.9417
Epoch 7/10
204/204 - 69s - loss: 0.1747 - accuracy: 0.9408
Epoch 8/10
204/204 - 69s - loss: 0.1563 - accuracy: 0.9474
Epoch 9/10
204/204 - 69s - loss: 0.1476 - accuracy: 0.9495
Epoch 10/10
204/204 - 69s - loss: 0.1465 - accuracy: 0.9486


In [18]:
# score = model.predict(X_test)
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=2)
print('score', score)
print('accuracy', acc)

88/88 - 3s - loss: 0.3134 - accuracy: 0.9123
score 0.31344369053840637
accuracy 0.9123120903968811


In [19]:
text = ['i would recommend it if you have no other options']
text = token.texts_to_sequences(text)
text = pad_sequences(text, maxlen=28, dtype='int32', value=0)
res = model.predict(text, batch_size=1,verbose = 2)
res

1/1 - 0s


array([[0.01426646, 0.9857335 ]], dtype=float32)

In [20]:
if np.argmax(res[0]) == 0:
    print("Negetive Comment")
elif np.argmax(res[0]) == 1:
    print("Positive Comment")

Positive Comment


# Multiclass Sentiment Analysis

### data preprocessing

In [12]:
ms_data = all_cat_data.copy()

In [13]:
ms_data.head()

Unnamed: 0,User_Review,Review_Type
0,not working properly plz fix the problem,AC
1,app is not good stuck in between,AC
2,not working properly,AC
3,app is not working,AC
4,this app will be not started some issue,AC


In [14]:
num_of_rows = 4000
shuffled = ms_data.reindex(np.random.permutation(ms_data.index))
nt = shuffled[shuffled['Review_Type'] == 'AC'][:num_of_rows]
ng = shuffled[shuffled['Review_Type'] == 'AI'][:num_of_rows]
ps = shuffled[shuffled['Review_Type'] == 'PI'][:num_of_rows]
of = shuffled[shuffled['Review_Type'] == 'UI'][:num_of_rows]
og = shuffled[shuffled['Review_Type'] == 'VU'][:num_of_rows]
ob = shuffled[shuffled['Review_Type'] == 'BR'][:num_of_rows]
qw = shuffled[shuffled['Review_Type'] == 'FI'][:num_of_rows]
cv = shuffled[shuffled['Review_Type'] == 'FR'][:num_of_rows]



combine_data = pd.concat([nt, ng, ps,of,og,ob,qw,cv], ignore_index=True)
combine_data = combine_data.reindex(np.random.permutation(combine_data.index))
combine_data['label'] = 0
combine_data.head()

Unnamed: 0,User_Review,Review_Type,label
6986,i cant fing create a google docs and when i do...,BR,0
2302,u can search only limited words there is limi...,PI,0
3901,i dont know why they changed the tab layout it...,UI,0
381,this app when i open it was blank,AC,0
2873,rate this app,PI,0


In [15]:
classes = data['Review_Type']
print(classes.value_counts())

PI    2403
BR    2201
AC    1129
FI     937
CI     873
VU     777
AI     363
FR     223
UI     206
Name: Review_Type, dtype: int64


In [16]:
print(len(combine_data[combine_data['Review_Type'] == 'AC']))
print(len(combine_data[combine_data['Review_Type'] == 'AI']))
print(len(combine_data[combine_data['Review_Type'] == 'PI']))
print(len(combine_data[combine_data['Review_Type'] == 'UI']))
print(len(combine_data[combine_data['Review_Type'] == 'VU']))
print(len(combine_data[combine_data['Review_Type'] == 'BR']))
print(len(combine_data[combine_data['Review_Type'] == 'FI']))
print(len(combine_data[combine_data['Review_Type'] == 'FR']))

1129
363
2403
206
777
2201
937
223


In [17]:
ms_data = combine_data.copy()

In [18]:
print(len(ms_data[ms_data['Review_Type'] == 'AC']))
print(len(ms_data[ms_data['Review_Type'] == 'AI']))
print(len(ms_data[ms_data['Review_Type'] == 'PI']))
print(len(ms_data[ms_data['Review_Type'] == 'UI']))
print(len(ms_data[ms_data['Review_Type'] == 'VU']))
print(len(ms_data[ms_data['Review_Type'] == 'BR']))
print(len(ms_data[ms_data['Review_Type'] == 'FI']))
print(len(ms_data[ms_data['Review_Type'] == 'FR']))

1129
363
2403
206
777
2201
937
223


In [19]:
ms_data.loc[ms_data['Review_Type'] == 'AC', 'label'] = 0
ms_data.loc[ms_data['Review_Type'] == 'AI', 'label'] = 1
ms_data.loc[ms_data['Review_Type'] == 'PI', 'label'] = 2
ms_data.loc[ms_data['Review_Type'] == 'UI', 'label'] = 3
ms_data.loc[ms_data['Review_Type'] == 'VU', 'label'] = 4
ms_data.loc[ms_data['Review_Type'] == 'BR', 'label'] = 5
ms_data.loc[ms_data['Review_Type'] == 'FI', 'label'] = 6
ms_data.loc[ms_data['Review_Type'] == 'FR', 'label'] = 7

In [20]:
ms_data.head(10)

Unnamed: 0,User_Review,Review_Type,label
6986,i cant fing create a google docs and when i do...,BR,5
2302,u can search only limited words there is limi...,PI,2
3901,i dont know why they changed the tab layout it...,UI,3
381,this app when i open it was blank,AC,0
2873,rate this app,PI,2
334,i like it but it stuck in between,AC,0
3781,i never recommended this app,PI,2
1624,this a bad app bacuse my mobie is talk on this...,PI,2
2442,elsa could not hear my voice,PI,2
7461,tapping buttons doesnt work consistently,FI,6


In [21]:
from keras.utils import to_categorical

In [22]:
labels = to_categorical(ms_data['label'], num_classes=8)

In [23]:
labels.shape

(8239, 8)

In [24]:
labels[:10]

array([[0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.]], dtype=float32)

In [27]:
max_features = 3000
max_len = 130
ms_token = Tokenizer(num_words=max_features)
ms_token.fit_on_texts(ms_data['User_Review'].values)
ms_sequences = ms_token.texts_to_sequences(ms_data['User_Review'].values)
X = pad_sequences(ms_sequences, maxlen=max_len)

In [28]:
word_index = ms_token.word_index
print('Found %s unique tokens.' % len(word_index))

Found 7992 unique tokens.


In [29]:
y = labels

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [31]:
embed_dim = 128
lstm_out = 96

In [55]:
ms_model = Sequential()
ms_model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
ms_model.add(SpatialDropout1D(0.7))
ms_model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
ms_model.add(Dense(8, activation='softmax'))
ms_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

ms_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 130, 128)          384000    
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 130, 128)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_4 (Dense)              (None, 8)                 520       
Total params: 433,928
Trainable params: 433,928
Non-trainable params: 0
_________________________________________________________________


In [53]:
from keras.callbacks import EarlyStopping

In [56]:
batch_size = 80
ms_history = ms_model.fit(X_train, y_train, epochs=20, batch_size=batch_size,validation_split=0.25,callbacks=[EarlyStopping(monitor='val_loss')])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


In [46]:
from sklearn.metrics import precision_score


In [47]:
loss, accuracy = ms_model.evaluate(X_test, y_test)
print("loss", loss)
print("accuracy", accuracy)


loss 1.1776673793792725
accuracy 0.6145631074905396


In [48]:
from sklearn import metrics
print("accuracy", accuracy)


accuracy 0.6145631074905396


In [58]:
predictions = ms_model.predict(X_test)
predictions

array([[6.30153120e-02, 2.81930761e-03, 4.93694603e-01, ...,
        3.37423265e-01, 9.34701711e-02, 5.75461611e-03],
       [3.16282250e-02, 1.91523705e-03, 5.27920544e-01, ...,
        3.51671398e-01, 8.26125145e-02, 2.86135264e-03],
       [4.40796465e-03, 9.05929599e-04, 3.69890779e-02, ...,
        4.09954697e-01, 1.27686262e-01, 3.08621326e-03],
       ...,
       [1.27625793e-01, 1.85959022e-02, 1.15012921e-01, ...,
        5.79502404e-01, 1.23596214e-01, 2.06099171e-02],
       [3.60909551e-02, 5.73219091e-04, 1.32946044e-01, ...,
        7.51636803e-01, 7.37666488e-02, 2.48475769e-03],
       [1.75582096e-01, 1.94611084e-02, 3.43983620e-01, ...,
        2.00539082e-01, 1.95110813e-01, 3.60732637e-02]], dtype=float32)

In [59]:
pd.crosstab(y_test.argmax(axis=1), predictions.argmax(axis=1), rownames=['True'], colnames=['Predicted'], margins=True)


Predicted,0,1,2,3,4,5,6,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,169,0,23,0,5,63,1,261
1,0,55,23,0,9,7,0,94
2,23,12,470,0,6,86,2,599
3,6,11,6,10,9,15,1,58
4,4,1,6,0,121,61,2,195
5,63,3,66,0,4,419,3,558
6,19,0,59,0,6,108,47,239
7,8,2,12,0,3,15,16,56
All,292,84,665,10,163,774,72,2060


In [60]:
metrics.confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1))


array([[169,   0,  23,   0,   5,  63,   1,   0],
       [  0,  55,  23,   0,   9,   7,   0,   0],
       [ 23,  12, 470,   0,   6,  86,   2,   0],
       [  6,  11,   6,  10,   9,  15,   1,   0],
       [  4,   1,   6,   0, 121,  61,   2,   0],
       [ 63,   3,  66,   0,   4, 419,   3,   0],
       [ 19,   0,  59,   0,   6, 108,  47,   0],
       [  8,   2,  12,   0,   3,  15,  16,   0]], dtype=int64)

In [62]:
from sklearn import metrics
con = metrics.confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1))

In [63]:
precision = np.diag(con) / (np.diag(con) +(con.sum(axis=0) - np.diag(con)))
precision


  precision = np.diag(con) / (np.diag(con) +(con.sum(axis=0) - np.diag(con)))


array([0.57876712, 0.6547619 , 0.70676692, 1.        , 0.74233129,
       0.54134367, 0.65277778,        nan])

In [64]:
recall = np.diag(con)/ ((con.sum(axis=1) - np.diag(con)) + np.diag(con))
recall

array([0.64750958, 0.58510638, 0.78464107, 0.17241379, 0.62051282,
       0.75089606, 0.19665272, 0.        ])

In [65]:
f1_score = 2 * ((precision * recall) / (precision + recall))
f1_score

array([0.61121157, 0.61797753, 0.74367089, 0.29411765, 0.67597765,
       0.62912913, 0.3022508 ,        nan])

In [66]:
print(precision)

[0.57876712 0.6547619  0.70676692 1.         0.74233129 0.54134367
 0.65277778        nan]
