In [180]:
import string
import re
from os import listdir
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from keras.utils.vis_utils import plot_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.python.keras.layers.convolutional import Conv1D
from tensorflow.python.keras.layers.convolutional import MaxPooling1D
from tensorflow.python.keras import backend
from tensorflow import keras

In [181]:
# Load a document into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read. The file is opened in text
            mode ('r') with the platform's default encoding.

    Returns:
        The full contents of the file as a single string.
    """
    # The context manager closes the handle even when read() raises,
    # so a failed read no longer leaks an open file descriptor.
    with open(filename, 'r') as file:
        return file.read()

In [182]:
# Reduce a document to a string of in-vocabulary tokens
def clean_doc(doc,vocab):
    """Strip punctuation from a document and keep only known words.

    The document is split on whitespace, every character listed in
    ``string.punctuation`` is removed from each token, and tokens that
    are not members of ``vocab`` are discarded.

    Args:
        doc: Raw document text.
        vocab: Container of accepted tokens (membership is tested with ``in``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    stripped_words = (punctuation_pattern.sub('', word) for word in doc.split())
    return ' '.join(word for word in stripped_words if word in vocab)

In [183]:
# Collect the cleaned documents of one split from a directory
def process_docs(directory,vocab,is_train):
    """Load and clean every file in ``directory`` that belongs to one split.

    Files whose names start with 'cv9' form the held-out split; all other
    files form the training split.

    Args:
        directory: Directory whose entries are listed with ``listdir``.
        vocab: Vocabulary forwarded to ``clean_doc``.
        is_train: Truthy to select the training files, falsy to select
            the held-out 'cv9*' files.

    Returns:
        A list with one cleaned token string per selected file, in the
        order ``listdir`` returned the names.
    """
    want_held_out = not is_train
    selected = [
        name for name in listdir(directory)
        if name.startswith('cv9') == want_held_out
    ]
    return [clean_doc(load_doc(directory + '/' + name), vocab) for name in selected]

In [184]:
# Load and clean a dataset
def load_clean_dataset(vocab,is_train,data_dir='txt_sentoken'):
    """Load the negative and positive reviews of one split with labels.

    Args:
        vocab: Vocabulary forwarded to ``process_docs``.
        is_train: Forwarded to ``process_docs`` to choose the training
            or the held-out split.
        data_dir: Root directory containing the ``neg`` and ``pos``
            sub-directories. Defaults to 'txt_sentoken', the location
            that was previously hard-coded.

    Returns:
        A ``(docs, labels)`` tuple. ``docs`` lists the negative documents
        followed by the positive ones; ``labels`` is a numpy integer
        array holding 0 for each negative and 1 for each positive document.
    """
    neg = process_docs(data_dir + '/neg', vocab, is_train)
    pos = process_docs(data_dir + '/pos', vocab, is_train)
    docs = neg + pos
    # Explicit dtype keeps the labels integer even when a split is empty
    # (array([]) would otherwise default to float64).
    labels = array([0] * len(neg) + [1] * len(pos), dtype=int)
    return docs, labels

In [185]:
# Build a tokenizer fitted on the given texts
def create_tokenizer(lines):
    """Create a ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: Texts passed unchanged to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


In [186]:
# Convert documents to fixed-length integer sequences
def encode_docs(tokenizer,max_length,docs):
    """Integer-encode documents and pad them to a common length.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        max_length: Value passed to ``pad_sequences`` as ``maxlen``.
        docs: Documents to encode.

    Returns:
        The result of ``pad_sequences`` applied to the encoded documents
        with ``padding='post'`` (padding is added at the end).
    """
    sequences = tokenizer.texts_to_sequences(docs)
    return pad_sequences(sequences, maxlen=max_length, padding='post')

In [187]:
# Build and compile the CNN sentiment classifier
def define_model(vocab_size,max_length):
    """Create the compiled embedding + 1-D convolution classifier.

    The layer stack, in order: a 100-dimensional ``Embedding``, a
    ``Conv1D`` with 32 filters of width 8 and ReLU activation, a
    ``MaxPooling1D`` with pool size 2, ``Flatten``, a 10-unit ReLU
    ``Dense`` layer and a single-unit sigmoid ``Dense`` output.

    Args:
        vocab_size: First argument of the ``Embedding`` layer.
        max_length: Passed to the ``Embedding`` layer as ``input_length``.

    Returns:
        The ``Sequential`` model, compiled with binary cross-entropy loss,
        the 'adam' optimizer and the 'accuracy' metric.
    """
    layer_stack = [
        Embedding(vocab_size, 100, input_length=max_length),
        Conv1D(filters=32, kernel_size=8, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [188]:
# Classify a review as positive or negative
def predict_sentiment(review,vocab,tokenizer,max_length,model):
    """Predict the sentiment label of a single review.

    The review is cleaned with ``clean_doc``, encoded with ``encode_docs``
    and passed to ``model.predict``. Element ``[0, 0]`` of the prediction
    is treated as the positive-class score.

    Args:
        review: Raw review text.
        vocab: Vocabulary forwarded to ``clean_doc``.
        tokenizer: Tokenizer forwarded to ``encode_docs``.
        max_length: Padding length forwarded to ``encode_docs``.
        model: Object providing ``predict(inputs, verbose=...)``.

    Returns:
        A ``(confidence, label)`` tuple: ``(score, 'POSITIVE')`` when the
        score does not round to 0, otherwise ``(1 - score, 'NEGATIVE')``.
    """
    cleaned = clean_doc(review, vocab)
    encoded = encode_docs(tokenizer, max_length, [cleaned])
    positive_score = model.predict(encoded, verbose=0)[0, 0]
    if round(positive_score) != 0:
        return positive_score, 'POSITIVE'
    return (1 - positive_score), 'NEGATIVE'

In [None]:
# Read the vocabulary file into a set of known words
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())
# Build the training and test splits of the review corpus
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# Fit the tokenizer on the training documents only
tokenizer = create_tokenizer(train_docs)
# Vocabulary size is one more than the number of entries in word_index
vocab_size = len(tokenizer.word_index) + 1
print('Vocab Size: %d' % vocab_size)
# Longest training document, counted in whitespace-separated tokens
max_length = max(len(s.split()) for s in train_docs)
print('Max Length: %d' % max_length)
# Integer-encode and pad both splits to max_length
Xtrain = encode_docs(tokenizer, max_length, train_docs)
Xtest = encode_docs(tokenizer, max_length, test_docs)
# Build the network and train it for 10 epochs
model = define_model(vocab_size, max_length)
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# Accuracy on the data the model was trained on
_, acc = model.evaluate(Xtrain, ytrain, verbose=0)
print('Train Accuracy: %.2f' % (acc * 100))
# Accuracy on the held-out test split
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %.2f' % (acc * 100))

Vocab Size: 25768
Max Length: 1317
Train on 1800 samples
Epoch 1/10
1800/1800 - 8s - loss: 0.6892 - accuracy: 0.5233
Epoch 2/10
1800/1800 - 8s - loss: 0.5288 - accuracy: 0.7356
Epoch 3/10
1800/1800 - 9s - loss: 0.1351 - accuracy: 0.9478
Epoch 4/10
1800/1800 - 8s - loss: 0.0166 - accuracy: 0.9989
Epoch 5/10
1800/1800 - 10s - loss: 0.0039 - accuracy: 1.0000
Epoch 6/10
1800/1800 - 10s - loss: 0.0020 - accuracy: 1.0000
Epoch 7/10
1800/1800 - 10s - loss: 0.0014 - accuracy: 1.0000
Epoch 8/10
1800/1800 - 10s - loss: 9.7693e-04 - accuracy: 1.0000
Epoch 9/10
1800/1800 - 10s - loss: 7.6753e-04 - accuracy: 1.0000
Epoch 10/10


In [None]:
# Classify one sample review with the trained model and print the result
text = 'Everyone will enjoy this film.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
# 'Review:' replaces the stray double quote in the original label;
# '%%' prints a literal percent sign after the confidence value.
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent * 100))