# Sentiment Analysis of Movie Reviews - LSTM

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import load_model
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
import nltk
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from operator import itemgetter
import utils

In [22]:
corpus = ["Was geht ab hier drüben ärgerte?", "Waß soll das hier", "Hier ist es total langweilig"]
corpus

['Was geht ab hier drüben ärgerte?',
 'Waß soll das hier',
 'Hier ist es total langweilig']

In [23]:
import utils

In [24]:
ce = utils.CorpusEncoding()

In [25]:
corpus_tok = [utils.text_preprocess(doc, stop=False, language='german') for doc in corpus]
corpus_tok

[['was', 'geht', 'ab', 'hier', 'drueb', 'aergert'],
 ['wass', 'soll', 'das', 'hier'],
 ['hier', 'ist', 'es', 'total', 'langweil']]

In [26]:
ce.fit(corpus_tok)

In [27]:
#ce.reduce_vocab(5)

In [28]:
ce.transform(corpus_tok, drop_unknown=True)

[[1, 14, 9, 4, 3, 7, 5], [1, 15, 12, 6, 3], [1, 3, 10, 8, 13, 11]]

In [29]:
def wordcount_encoding(corpus, upper=None, lower=None):
    corpus_prep = []
    for doc in corpus:
        doc = utils.rem_html(doc)
        doc = utils.rem_punctuation(doc)
        doc = utils.rem_additional_whitespaces(doc)
        doc = nltk.word_tokenize(doc)
        doc = [element.lower() for element in doc]
        #doc = utils.stem_words(doc, language='english')
        corpus_prep.append(doc)
    words, counts = utils.wordcount_corpus(corpus_prep)
    sorted_tuplelist = np.array(sorted(zip(words,counts), key=itemgetter(1), reverse=True))
    sorted_tuplelist = sorted_tuplelist[upper:lower,:]
    print(sorted_tuplelist)
    vocab_size = len(sorted_tuplelist)
    indices = np.arange(vocab_size) + 3
    word_to_index = dict(zip(sorted_tuplelist[:,0], indices))
    #word_to_index = {key: value for key, value in wor}
    word_to_index['<PAD>'] = 0
    word_to_index['<START>'] = 1
    word_to_index['<UNKNOWN>'] = 2
    index_to_word = {value: key for key, value in word_to_index.items()}
    corpus_enc = []
    for doc in corpus_prep:
        doc_prep = [1]
        for word in doc:
            try:
                doc_prep.append(word_to_index[word])
            except:
                continue
        corpus_enc.append(doc_prep)
    return word_to_index, index_to_word, corpus_enc

In [30]:
word_to_index, index_to_word, corps = wordcount_encoding(corpus, 0, 3)

[['hier' '3']
 ['ab' '1']
 ['das' '1']]


In [31]:
word_to_index
index_to_word

{3: 'hier', 4: 'ab', 5: 'das', 0: '<PAD>', 1: '<START>', 2: '<UNKNOWN>'}

In [32]:
pad_sequences(corps)

array([[1, 4, 3],
       [1, 5, 3],
       [0, 1, 3]], dtype=int32)

In [33]:
df = pd.read_csv('labeledTrainData.tsv', sep='\t')
df.info()
print(df.head())
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB
       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...


Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [34]:
from sklearn.model_selection import train_test_split

In [35]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
X_train, X_test = df_train['review'].values, df_test['review']
y_train, y_test = df_train['sentiment'], df_test['sentiment']

In [39]:
X_train_prep = [utils.text_preprocess(x) for x in X_train[:20]]

In [42]:
ce.fit(X_train_prep)

In [43]:
ce.word_to_index

{'movi': 3,
 'film': 4,
 'one': 5,
 'get': 6,
 'time': 7,
 'watch': 8,
 'bad': 9,
 'see': 10,
 'stori': 11,
 'thing': 12,
 'would': 13,
 'like': 14,
 'peopl': 15,
 'good': 16,
 'much': 17,
 'veri': 18,
 'well': 19,
 'ever': 20,
 'got': 21,
 'look': 22,
 'onli': 23,
 'read': 24,
 'star': 25,
 'think': 26,
 '10': 27,
 'ani': 28,
 'famili': 29,
 'feel': 30,
 'great': 31,
 'made': 32,
 'make': 33,
 'need': 34,
 'seen': 35,
 'year': 36,
 'act': 37,
 'actual': 38,
 'also': 39,
 'back': 40,
 'best': 41,
 'camera': 42,
 'charact': 43,
 'everi': 44,
 'gandhi': 45,
 'go': 46,
 'left': 47,
 'life': 48,
 'light': 49,
 'origin': 50,
 'perfect': 51,
 'perform': 52,
 'play': 53,
 'seem': 54,
 'shot': 55,
 'show': 56,
 'worst': 57,
 'zombi': 58,
 '90': 59,
 'actor': 60,
 'beauti': 61,
 'becaus': 62,
 'bit': 63,
 'book': 64,
 'come': 65,
 'day': 66,
 'director': 67,
 'even': 68,
 'figur': 69,
 'gore': 70,
 'guy': 71,
 'hammer': 72,
 'happen': 73,
 'horror': 74,
 'know': 75,
 'main': 76,
 'might': 77,
 

In [310]:
word_to_index, index_to_word, X_train_prep = wordcount_encoding(X_train, 50, 5000)

[['more' '11443']
 ['she' '11308']
 ['when' '11303']
 ...
 ['dud' '66']
 ['enormous' '66']
 ['examination' '66']]


In [311]:
#X_test_prep = []
#for doc in X_test:
#    doc_prep = []
#    for word in doc:
#        try:
#            doc_prep.append(word_to_index[word])
#        except:
#            doc_prep.append(1)
#    X_test_prep.append(doc_prep)

In [312]:
pd.Series(X_train_prep).apply(lambda x: len(x)).describe()

count    20000.000000
mean       114.196600
std         83.682466
min          3.000000
25%         63.000000
50%         86.000000
75%        139.000000
max       1118.000000
dtype: float64

In [313]:
X_train_pad = pad_sequences(X_train_prep, maxlen=150)
X_test_pad = pad_sequences(X_test_prep, maxlen=150)

In [314]:
X_train_pad[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    1, 1003,  913,  924, 2818, 1759, 3977, 1298, 3233, 1555,
        1759, 3977, 1167, 1156,  232,  963,  220, 1224,  243, 3312,  668,
        3977, 1962,  135,  931,   26,  840,  452, 3977,   34, 3464,   38,
          28,  423,   22,   49,  913,   26,  195,  181,  789, 1227, 3690,
        1175,   31,  492,  156,  242, 4744,  248,  218,  776, 1809,   28,
         173, 2487, 1305,   96, 2297, 1962,  572,  132, 1226,    5,  319,
        2586, 2317, 2944,    4,   47,   71,  448,    5,  319, 2580,   96,
         276,  196,  951,  144, 3690, 1175,   28,  173, 3977, 3850, 1509,
          51,  215,  547,  485,  238, 1453,  183,  451,  215,  173, 1240,
          23,  172,  319, 2362, 1479, 1635, 1926,   28,  157,   27,   59,
          13,   14,  229,  166,   57, 

In [315]:
from keras.layers import Embedding, Dropout
top_words = np.matrix(X_train_pad)

In [316]:
#X_train_prep = []
len(X_train_pad[0])

150

In [317]:
pd.DataFrame(X_train_pad).max().max()

4952

In [322]:
embedding_vector_length = 32 
model = Sequential() 
model.add(Embedding(4952+2, 256))
model.add(LSTM(150,dropout=0.4, recurrent_dropout=0.4)) 
model.add(Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy']) 
print(model.summary()) 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_27 (Embedding)     (None, None, 256)         1268224   
_________________________________________________________________
lstm_26 (LSTM)               (None, 150)               244200    
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 151       
Total params: 1,512,575
Trainable params: 1,512,575
Non-trainable params: 0
_________________________________________________________________
None


In [323]:
model.fit(X_train_pad, y_train, validation_split=0.2, nb_epoch=10, batch_size=128)

  """Entry point for launching an IPython kernel.


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4866ef8518>

In [17]:
tk.fit_on_texts(corpus)
print(tk.word_counts)
print(tk.document_count)
print(tk.word_index)
print(tk.word_docs)

OrderedDict([('was', 4), ('geht', 2), ('ab', 2), ('hier', 6), ('soll', 2), ('das', 2), ('ist', 2), ('es', 2), ('langweilig', 2)])
6
{'hier': 1, 'was': 2, 'geht': 3, 'ab': 4, 'soll': 5, 'das': 6, 'ist': 7, 'es': 8, 'langweilig': 9}
{'hier': 6, 'was': 4, 'geht': 2, 'ab': 2, 'das': 2, 'soll': 2, 'ist': 2, 'langweilig': 2, 'es': 2}


In [18]:
encoded_docs = tk.texts_to_matrix(corpus, mode='count')
print(encoded_docs)

[[0. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 0. 1. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 1. 1.]]


In [6]:
utils.wordcount_corpus(corpus,flatten=True )

(array([' ', '?', 'H', 'W', 'a', 'b', 'd', 'e', 'g', 'h', 'i', 'l', 'n',
        'o', 'r', 's', 't', 'w'], dtype='<U1'),
 array([9, 1, 1, 2, 5, 1, 1, 6, 3, 3, 6, 4, 1, 1, 3, 6, 2, 1]))

### Read Data

In [2]:
df = pd.read_pickle('reviews.pkl')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 5 columns):
id                               25000 non-null object
sentiment                        25000 non-null int64
review                           25000 non-null object
review_preprocessed              25000 non-null object
review_preprocessed_tokenized    25000 non-null object
dtypes: int64(1), object(4)
memory usage: 976.6+ KB


In [4]:
X_train, X_test, y_train, y_test = train_test_split(df['review_preprocessed_tokenized'],
                                                    df['sentiment'], test_size=0.2, random_state=42) 

In [5]:
X_train

23311    [movi, plain, dumb, cast, ralph, meeker, mike,...
23623    [dahmer, young, confus, man, dahmer, confus, m...
1020     [may, saint, preserv, us, becaus, movi, go, he...
12645    [combin, read, novella, view, film, inspir, wi...
1533     [daniel, day, lewi, left, foot, give, us, one,...
3518     [perhap, former, moscovit, elast, sens, humor,...
483      [kid, took, movi, love, four, children, age, 9...
19370    [well, well, roeg, touch, bit, nerv, genius, c...
12667    [excel, film, understand, whi, mani, peopl, li...
7403     [particular, fond, remak, steal, modern, jargo...
2712     [think, movi, got, low, rate, becaus, got, jud...
11605    [read, plot, summari, worst, one, ever, read, ...
7921     [wait, wait, film, come, trailer, seem, year, ...
7801     [premis, may, seem, goofi, sinc, murphi, chara...
24811    [back, high, school, day, salina, kansa, film,...
20273    [act, film, veri, well, act, film, say, perfor...
6750     [movi, classic, perfect, certain, pace, perfec.

In [12]:
%%bash
pwd

/home/jodahr/jupyter/notebooks/Misc/MovieRatings-TextMining


In [8]:
word2index, indextword = utils.get_vocabulary(df.review_preprocessed_tokenized)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [11]:
def sentence_to_index(token_list, word2index):
    index_sequence = []
    for element in token_list:
        index = word2index[element]
        index_sequence.append(index)
    return index_sequence

In [13]:
df['review_index'] = df.review_preprocessed_tokenized.apply(lambda x: sentence_to_index(x,word2index))

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 6 columns):
id                               25000 non-null object
sentiment                        25000 non-null int64
review                           25000 non-null object
review_preprocessed              25000 non-null object
review_preprocessed_tokenized    25000 non-null object
review_index                     25000 non-null object
dtypes: int64(1), object(5)
memory usage: 1.1+ MB


In [15]:
df['doc_length'] = df.review_index.apply(lambda x: len(x))

In [18]:
df['doc_length'].median()

91.0