In [None]:
!pip install nltk sklearn pandas

You should consider upgrading via the '/opt/python/envs/default/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
import pandas as pd    
import numpy as np  
import nltk
from nltk.corpus import stopwords   
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/datalore/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
data = pd.read_csv('train.csv',index_col=0)
print(data)

                                                    text  label
id                                                             
0      I am at a distinct disadvantage here. I have n...      0
1      Micro-phonies is a classic Stooge short. The g...      1
2      The story has been told before. A deadly disea...      1
3      As I post this comment, IMDb currently rates A...      0
4      This film was reeeeeeallyyyy bad! Was it meant...      0
...                                                  ...    ...
29995  This is a brilliant and well made contribution...      1
29996  When i was told of this movie i thought it wou...      1
29997  It's getting worse, the series is on a serious...      0
29998  Big fat slob 'Uncle Buck', played by John (eat...      0
29999  This is a movie with an excellent concept for ...      0

[30000 rows x 2 columns]


## Pre-processing

In [None]:
english_stops = set(stopwords.words('english'))

In [None]:
def load_dataset():
    df = pd.read_csv('train.csv',index_col=0)

    x_data = df['text']
    y_data = df['label']

    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    return x_data, y_data

x_data, y_data = load_dataset()

print('text')
print(x_data, '\n')
print('label')
print(y_data)

text
id
0        [i, distinct, disadvantage, i, seen, first, tw...
1        [micro, phonies, classic, stooge, short, the, ...
2        [the, story, told, a, deadly, disease, spreadi...
3        [as, i, post, comment, imdb, currently, rates,...
4        [this, film, reeeeeeallyyyy, bad, was, meant, ...
                               ...                        
29995    [this, brilliant, well, made, contribution, gr...
29996    [when, told, movie, thought, would, another, c...
29997    [it, getting, worse, series, serious, fall, th...
29998    [big, fat, slob, uncle, buck, played, john, ea...
29999    [this, movie, excellent, concept, story, got, ...
Name: text, Length: 30000, dtype: object 

label
id
0        0
1        1
2        1
3        0
4        0
        ..
29995    1
29996    1
29997    0
29998    0
29999    0
Name: label, Length: 30000, dtype: int64


## Spliting data

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('******** Train Set *********')
print(x_train, '\n')
print(x_test, '\n')
print('******** Test Set *********')
print(y_train, '\n')
print(y_test)

******** Train Set *********
id
16265    [after, repeated, listenings, cd, soundtrack, ...
2375     [this, one, features, interesting, way, handli...
156      [i, think, croc, hunter, pretty, cool, guy, i,...
26593    [i, sometimes, enjoy, really, lousy, movies, o...
19994    [drawing, restraint, kind, movie, one, either,...
                               ...                        
4720     [i, impressed, i, could, take, year, old, son,...
12558    [repetitive, music, annoying, narration, terri...
23240    [a, sentimental, school, drama, set, denmark, ...
10756    [i, liked, movies, its, another, yash, raj, fi...
15197    [this, one, time, favorite, films, may, move, ...
Name: text, Length: 24000, dtype: object 

id
27783    [this, pleasant, film, even, premise, silly, i...
2196     [considering, teen, films, like, breakfast, cl...
15701    [i, love, book, jane, eyre, seen, many, versio...
21149    [i, found, movie, funny, i, loved, made, polit...
469      [this, dreadful, boring, mov

In [None]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [None]:
token = Tokenizer(lower=False) 
token.fit_on_texts(x_train)

x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('encoded X Train\n', x_train, '\n')
print('encoded X Test\n', x_test, '\n')
print('max review length: ', max_length)

encoded X Train
 [[  305  2299 36654 ...  1317   196   692]
 [    8     5   875 ...     0     0     0]
 [    1    31  7465 ...     0     0     0]
 ...
 [   40  3306   279 ...  3845  1596  7028]
 [    1   346    28 ...    98  1598  1194]
 [    8     5    12 ...     0     0     0]] 

encoded X Test
 [[    8  2366     4 ...     0     0     0]
 [ 1008  1395    37 ...     0     0     0]
 [    1    49   165 ...     0     0     0]
 ...
 [ 1486   198  5917 ...  1407  1219   984]
 [    1   127   549 ...     0     0     0]
 [  600  3265 37505 ...     0     0     0]] 

max review length:  131


## Building model

In [None]:
EMBED_DIM = 32
LSTM_OUT = 32

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='relu'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 131, 32)           2337504   
                                                                 
 lstm_3 (LSTM)               (None, 32)                8320      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2,345,857
Trainable params: 2,345,857
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
checkpoint = ModelCheckpoint(
    './LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

## Training 

In [None]:
model.fit(x_train, y_train, batch_size = 256, epochs = 10, callbacks=[checkpoint])

Epoch 1/10
Epoch 00001: accuracy improved from -inf to 0.57262, saving model to ./LSTM.h5
Epoch 2/10
Epoch 00002: accuracy improved from 0.57262 to 0.72596, saving model to ./LSTM.h5
Epoch 3/10
Epoch 00003: accuracy did not improve from 0.72596
Epoch 4/10
Epoch 00004: accuracy improved from 0.72596 to 0.81937, saving model to ./LSTM.h5
Epoch 5/10
Epoch 00005: accuracy improved from 0.81937 to 0.93275, saving model to ./LSTM.h5
Epoch 6/10
Epoch 00006: accuracy improved from 0.93275 to 0.95779, saving model to ./LSTM.h5
Epoch 7/10
Epoch 00007: accuracy did not improve from 0.95779
Epoch 8/10
Epoch 00008: accuracy did not improve from 0.95779
Epoch 9/10
Epoch 00009: accuracy improved from 0.95779 to 0.96929, saving model to ./LSTM.h5
Epoch 10/10
Epoch 00010: accuracy improved from 0.96929 to 0.98154, saving model to ./LSTM.h5


<keras.callbacks.History at 0x7f8433acce50>

## Testing

In [None]:
model.evaluate(x_test, y_test,batch_size = 128)



[0.5046171545982361, 0.8728333115577698]

In [None]:
loaded_model = load_model('./LSTM.h5')