# LSTM sentiment classification

#### I have already implemented BoW and TF-IDF models, but those models predict based only on word frequency and do not preserve semantic meaning. This model, in contrast, predicts sentiment based on the whole review.

Here I use only a fraction of the data, since training this model is very costly in terms of GPU time. I have also used only one LSTM layer, so the model structure is simple; you can change this to get a higher accuracy score if you have the resources.

I processed and cleaned the Amazon review data in another notebook, which is also uploaded as Final on GitHub.<br>
 

In [0]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 
from statistics import mode
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix 

In [0]:
# Load the review dataset from a CSV file in the working directory.
data=pd.read_csv("binaryrating.csv")

In [12]:
# Print the (rows, columns) shape, then display the first rows as the cell output.
print(data.shape)
data.head()

(40000, 2)


Unnamed: 0,review,binary_rating
0,found game bit complicated expected played two...,low
1,im sure would love game could play loved hitma...,low
2,like bf game work wireless xbox controller don...,high
3,game requires open online account play game co...,low
4,st shipment received book instead gamend shipm...,low


In [14]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

Using TensorFlow backend.


In [0]:
import re

# Strip every run of characters that is not an ASCII letter or a space.
# Each cell value is converted with str() before the pattern is applied.
_non_letters = re.compile('[^a-zA-Z ]+')
data['review'] = data['review'].apply(lambda text: _non_letters.sub('', str(text)))

In [0]:
def prepare_data(data):
    """Tokenize texts and left-pad them into a single integer matrix.

    Args:
        data: Iterable of text strings (for example a pandas Series of
            reviews).

    Returns:
        A tuple ``(padded_sequence, max_len_sequence, total_words)``:
        the 2-D array of word indices with zeros padded in front of the
        shorter texts, the token count of the longest text (which is also
        the array width), and the number of distinct words in the
        tokenizer's vocabulary.
    """
    # `filters` is a plain set of characters, not a regular expression, so
    # the digits must be spelled out. A trailing '[0-9]' would only strip the
    # characters '[', '0', '-', '9' and ']'.
    tz = Tokenizer(
        num_words=None,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + '0123456789',
        lower=True,
        split=' ',
    )
    tz.fit_on_texts(data)
    total_words = len(tz.word_index)

    # Convert the whole corpus in one call instead of one call per text.
    input_sequences = tz.texts_to_sequences(data)

    # default=0 keeps an empty corpus from raising inside max().
    max_len_sequence = max((len(seq) for seq in input_sequences), default=0)

    # Pad in front ('pre') so every row has the same width. pad_sequences
    # already returns a NumPy array, so no extra copy is made.
    padded_sequence = pad_sequences(
        input_sequences, maxlen=max_len_sequence, padding='pre'
    )

    return padded_sequence, max_len_sequence, total_words

In [0]:
# Tokenize and pad the review column; keep the padded matrix, its width and
# the vocabulary size for building the model.
predictors,max_len_sequence,total_words=prepare_data(data['review'])

In [20]:
# Report the padded matrix shape, the longest sequence length and the vocabulary size.
print(predictors.shape,'\n')
print(max_len_sequence,'\n')
print(total_words)

(40000, 2967) 

2967 

116287


In [0]:
def create_model(max_len_sequence, total_words, embedding_dim=10,
                 lstm_units=100, dropout_rate=0.1):
    """Build and compile a single-layer LSTM binary classifier.

    Args:
        max_len_sequence: Length of every (padded) input sequence.
        total_words: Number of distinct words in the tokenizer vocabulary.
        embedding_dim: Size of each learned word vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Fraction of LSTM outputs dropped before the output
            layer during training.

    Returns:
        A compiled Keras ``Sequential`` model with one sigmoid output,
        trained with binary cross-entropy and the Adam optimizer.
    """
    model = Sequential([
        # One extra row in the embedding table: index 0 is the padding value,
        # and the tokenizer's word indices start at 1.
        Embedding(total_words + 1, embedding_dim, input_length=max_len_sequence),
        LSTM(lstm_units),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [34]:
model = create_model(max_len_sequence, total_words)

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 2967, 10)          1162880   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 1,207,381
Trainable params: 1,207,381
Non-trainable params: 0
_________________________________________________________________
None 



In [35]:
from sklearn.preprocessing import LabelEncoder
# Encode the string labels as integers; the printed sample of `target`
# shows 'low' encoded as 1 and 'high' as 0.
encoder=LabelEncoder()
target=encoder.fit_transform(data['binary_rating'])

# NOTE(review): test_size=0.6 holds out 60% of the rows for testing, so only
# 40% are used for training — confirm this is the intended way to shrink the
# training set.
train_x,test_x,train_y,test_y=train_test_split(predictors,target,test_size=0.6, random_state=42)

# Sanity-check the encoded labels and the split sizes.
print(target[:5])
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)

[1 1 0 1 1]
(16000, 2967)
(24000, 2967)
(16000,)


In [36]:
# Train for 5 epochs with mini-batches of 128 samples.
# NOTE(review): no validation data or callbacks are passed here, although
# EarlyStopping is imported earlier in the notebook.
model.fit(train_x,train_y,epochs=5,batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3cf05b4278>

In [37]:
# predict_classes() is deprecated and has been removed from newer Keras
# releases. For a single sigmoid output it is equivalent to thresholding the
# predicted probability at 0.5, which works on old and new versions alike.
pred = (model.predict(test_x) > 0.5).astype('int32')
print(confusion_matrix(test_y, pred))
print(accuracy_score(test_y, pred))

[[9962 1959]
 [2841 9238]]
0.8


We got the same accuracy as before with the TF-IDF model.

Some ways we can improve the accuracy are:
<ul>
<li>
Take more data and use more layers in the model.</li>
<li>
Use pretrained embedding vectors, e.g. Word2Vec, instead of training them from scratch: Word2Vec is trained on billions of words, so it will have a more accurate representation of words.</li>
<li>Use a bidirectional LSTM.</li>
</ul>