In [1]:
import string 
import re 
from os import listdir 
from collections import Counter 
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer 
from keras.utils.vis_utils import plot_model 
from keras.models import Sequential 
from keras.layers import Dense
from keras.layers import Flatten 
from pandas import DataFrame 
from matplotlib import pyplot
from keras.layers import Embedding 
from keras.layers.convolutional import Conv1D 
from keras.layers.convolutional import MaxPooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model


Using TensorFlow backend.


In [2]:
# load doc into memory 
def load_doc(filename):
    """Read an entire text file and return its contents as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The full text of the file.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [3]:
# turn a doc into clean tokens 
def clean_doc(doc,vocab):
    """Strip punctuation from a document and return it as a single string.

    Args:
        doc: raw document text.
        vocab: accepted so callers can keep passing it; this function does
            not read it.

    Returns:
        One string: the whitespace-separated tokens of ``doc`` with every
        ``string.punctuation`` character removed, joined by single spaces.
        A token made only of punctuation becomes an empty string, so two
        consecutive spaces can appear in the result.
    """
    # character class matching any single punctuation character
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # split on whitespace, then delete punctuation inside each token
    tokens = [re_punc.sub('', w) for w in doc.split()]
    # single exit point: the result is always a string, never a list
    return ' '.join(tokens)

In [4]:
# load doc, clean and return line of tokens 
def doc_to_line(filename, vocab):
    """Load a file, clean it, and keep only the words found in ``vocab``.

    Args:
        filename: path of the document to load.
        vocab: container of allowed words (membership is tested with ``in``).

    Returns:
        A single string of the surviving words joined by spaces.
    """
    # load the doc
    doc = load_doc(filename)
    # clean_doc returns one space-joined string; split it back into words.
    # Iterating the string directly would test individual characters
    # against vocab instead of whole words.
    tokens = clean_doc(doc, vocab).split()
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

In [5]:

# load all docs in a directory 
def process_docs(directory, vocab,is_train):
    """Clean every document in ``directory`` that belongs to the requested split.

    Files whose name starts with 'cv9' form the test split; all other files
    form the training split.

    Args:
        directory: folder whose files are read.
        vocab: forwarded unchanged to ``clean_doc``.
        is_train: truthy to collect the training split, falsy for the test split.

    Returns:
        A list with one cleaned document (as returned by ``clean_doc``) per
        selected file, in the order ``listdir`` yields them.
    """
    cleaned = list()
    for name in listdir(directory):
        held_out = name.startswith('cv9')
        # drop held-out files when training, and training files when testing
        if (is_train and held_out) or (not is_train and not held_out):
            continue
        # read and clean the file at directory/name
        cleaned.append(clean_doc(load_doc(directory + '/' + name), vocab))
    return cleaned


In [6]:

# load and clean a dataset 
def load_clean_dataset(vocab,is_train):
    """Load the negative and positive reviews of one split with their labels.

    Args:
        vocab: forwarded unchanged to ``process_docs``.
        is_train: forwarded to ``process_docs`` to choose the split.

    Returns:
        A ``(docs, labels)`` pair: ``docs`` lists the negative documents
        followed by the positive ones, and ``labels`` holds 0 for each
        negative document and 1 for each positive one, in the same order.
    """
    negative_docs = process_docs('txt_sentoken/neg', vocab, is_train)
    positive_docs = process_docs('txt_sentoken/pos', vocab, is_train)
    # labels line up with the concatenation order: negatives first
    labels = [0] * len(negative_docs) + [1] * len(positive_docs)
    return negative_docs + positive_docs, labels

In [7]:
# fit a tokenizer 
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: the texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


In [8]:
# integer encode and pad documents 
def encode_docs(tokenizer, max_length, docs):
    """Turn documents into integer sequences padded to ``max_length``.

    Args:
        tokenizer: object providing ``texts_to_sequences``.
        max_length: value passed as ``maxlen`` to ``pad_sequences``.
        docs: the documents to encode.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'``.
    """
    sequences = tokenizer.texts_to_sequences(docs)
    return pad_sequences(sequences, maxlen=max_length, padding='post')


In [9]:
# define the model 
def define_model(vocab_size, max_length):
    """Build and compile the CNN sentiment classifier.

    The network is an embedding layer, one 1-D convolution, max pooling,
    a flatten step, and two dense layers ending in a single sigmoid unit.
    The summary is printed and a diagram is written to 'model_cnn.png'.

    Args:
        vocab_size: number of rows of the embedding layer.
        max_length: ``input_length`` of the embedding layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    stack = [
        Embedding(vocab_size, 100, input_length=max_length),
        Conv1D(filters=32, kernel_size=8, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    # binary target, so binary cross-entropy; accuracy is reported as a metric
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # report the architecture on stdout and as an image file
    model.summary()
    plot_model(model, to_file='model_cnn.png', show_shapes=True)
    return model


In [10]:
# classify a review as negative or positive 
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    """Classify one review as positive or negative.

    Args:
        review: raw review text.
        vocab: forwarded unchanged to ``clean_doc``.
        tokenizer: forwarded to ``encode_docs``.
        max_length: forwarded to ``encode_docs``.
        model: object whose ``predict`` output is indexed at ``[0,0]``.

    Returns:
        A ``(score, label)`` pair. When the model output rounds to 0 the
        label is 'NEGATIVE' and the score is one minus the output;
        otherwise the label is 'POSITIVE' and the score is the output itself.
    """
    cleaned = clean_doc(review, vocab)
    # the encoder expects a list of documents, so wrap the single review
    batch = encode_docs(tokenizer, max_length, [cleaned])
    prob_positive = model.predict(batch, verbose=0)[0,0]
    if round(prob_positive) != 0:
        return prob_positive, 'POSITIVE'
    # report the score of the predicted class, not of the positive class
    return (1-prob_positive), 'NEGATIVE'



In [11]:
# read the vocabulary file into a set of whitespace-separated words
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())
# cleaned training reviews and their 0/1 labels
train_docs, ytrain = load_clean_dataset(vocab, True)
# fit the tokenizer on the training reviews only
tokenizer = create_tokenizer(train_docs)
# one more than the number of distinct words the tokenizer indexed
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# longest training review, measured in whitespace-separated words
max_length = max(len(s.split()) for s in train_docs)
print('Maximum length: %d' % max_length)
# integer-encode and pad the training reviews
Xtrain = encode_docs(tokenizer, max_length, train_docs)
# build the network
model = define_model(vocab_size, max_length)
# train for 10 epochs
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# persist the trained model to disk
model.save('model_cnn.h5')

Vocabulary size: 45242
Maximum length: 2365






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2365, 100)         4524200   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2358, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1179, 32)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 37728)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                377290    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11       

In [12]:
# build the test matrix with the tokenizer and length derived from training data
test_docs, ytest = load_clean_dataset(vocab, False)
Xtest = encode_docs(tokenizer, max_length, test_docs)
# load the model
model = load_model('model_cnn.h5')
# evaluate model on training dataset
# evaluate() yields (loss, accuracy) for this model; take index 1 directly
# rather than unpacking into `_`, which would rebind the notebook's
# last-output variable.
acc = model.evaluate(Xtrain, ytrain, verbose=0)[1]
print('Train Accuracy: %.2f' % (acc*100))
# evaluate model on test dataset
acc = model.evaluate(Xtest, ytest, verbose=0)[1]
print('Test Accuracy: %.2f' % (acc*100))


Train Accuracy: 100.00
Test Accuracy: 89.00


In [13]:
# score one positive and then one negative example review
for text in (
    'Everyone will enjoy this film. I love it, recommended!',
    'This is a bad movie. Do not watch it. It sucks.',
):
    percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
    print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))


Review: [Everyone will enjoy this film. I love it, recommended!]
Sentiment: POSITIVE (55.834%)
Review: [This is a bad movie. Do not watch it. It sucks.]
Sentiment: NEGATIVE (60.949%)


In [14]:
# score a short, informally written review
text = 'It was nice ! i enjoyed it'
percent, sentiment = predict_sentiment(
    text, vocab, tokenizer, max_length, model
)
print(
    'Review: [%s]\nSentiment: %s (%.3f%%)'
    % (text, sentiment, percent*100)
)

Review: [It was nice ! i enjoyed it]
Sentiment: POSITIVE (51.938%)


In [15]:
# score a very short review
# NOTE(review): 'movvie' is misspelled in the input; presumably deliberate, confirm
text = 'I hate the movvie!'
percent, sentiment = predict_sentiment(
    text, vocab, tokenizer, max_length, model
)
print(
    'Review: [%s]\nSentiment: %s (%.3f%%)'
    % (text, sentiment, percent*100)
)

Review: [I hate the movvie!]
Sentiment: NEGATIVE (50.493%)
