In [29]:
import re
import string
from collections import Counter
from os import listdir

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from nltk.corpus import stopwords
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.python.keras import backend

In [30]:
# Load doc into memory
def load_doc(filename):
    """Read a text file and return its whole content as a single string.

    Args:
        filename: Path of the file to open in text read mode.

    Returns:
        The complete text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager guarantees the handle is closed even when read()
    # raises, which the manual open/close sequence did not.
    with open(filename, 'r') as file:
        return file.read()

In [31]:
# Turn a doc into clean tokens
def clean_doc(doc):
    """Split raw text into a list of cleaned word tokens.

    The text is split on whitespace, ASCII punctuation is removed from each
    piece, and a piece is kept only when it is purely alphabetic, is not an
    English stop word (case-sensitive comparison), and is longer than one
    character.

    Args:
        doc: The raw document text.

    Returns:
        The surviving tokens, in their original order.
    """
    # Deleting every punctuation character via a translation table.
    strip_punct = str.maketrans('', '', string.punctuation)
    english_stops = set(stopwords.words('english'))

    cleaned = []
    for raw in doc.split():
        candidate = raw.translate(strip_punct)
        if not candidate.isalpha():
            continue
        if candidate in english_stops:
            continue
        # Single-character leftovers carry no useful signal.
        if len(candidate) <= 1:
            continue
        cleaned.append(candidate)
    return cleaned

In [32]:
# Load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    """Load a document and reduce it to one line of in-vocabulary tokens.

    Args:
        filename: Path of the document, passed to load_doc.
        vocab: Container of allowed tokens; anything not in it is dropped.

    Returns:
        The kept tokens joined by single spaces.
    """
    cleaned = clean_doc(load_doc(filename))
    # Keep only the words the vocabulary knows about.
    known = (word for word in cleaned if word in vocab)
    return ' '.join(known)

In [33]:
# load all docs in a directory
def process_docs(directory, vocab):
    """Convert every entry of a directory into a cleaned token line.

    Args:
        directory: Folder whose entries are read; each entry's path is built
            as directory + '/' + entry name.
        vocab: Container of allowed tokens, forwarded to doc_to_line.

    Returns:
        A list with one cleaned line per entry, in listdir order.
    """
    return [
        doc_to_line(directory + '/' + entry, vocab)
        for entry in listdir(directory)
    ]

In [60]:
# Load and clean dataset
def load_clean_dataset(vocab, data_dir='txt_sentoken'):
    """Load the negative and positive reviews together with their labels.

    Args:
        vocab: Container of allowed tokens, forwarded to process_docs.
        data_dir: Root folder holding the 'neg' and 'pos' sub-folders.
            Defaults to 'txt_sentoken', the location that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        A (docs, labels) tuple: docs lists the negative lines followed by the
        positive lines, and labels holds 0 for each negative document and 1
        for each positive one, in the same order.
    """
    # load documents
    neg = process_docs(data_dir + '/neg', vocab)
    pos = process_docs(data_dir + '/pos', vocab)
    docs = neg + pos
    # prepare labels, aligned with the neg-then-pos ordering of docs
    labels = [0] * len(neg) + [1] * len(pos)
    return docs, labels
                          

In [61]:
# fit a tokenizer
def create_tokenizer(lines):
    """Build a Tokenizer and fit it on the given lines of text.

    Args:
        lines: The texts passed to Tokenizer.fit_on_texts.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [62]:
# Define the model
def define_model(n_words):
    """Create, compile and summarize the two-layer classifier network.

    The network is a Sequential stack of a 50-unit ReLU Dense layer that takes
    n_words inputs, followed by a single-unit sigmoid Dense layer. It is
    compiled with binary cross-entropy loss, the Adam optimizer and the
    accuracy metric, and its summary is printed as a side effect.

    Args:
        n_words: Number of input features per sample.

    Returns:
        The compiled model.
    """
    # Layers listed in stacking order.
    stack = (
        Dense(50, input_shape=(n_words,), activation='relu'),
        Dense(1, activation='sigmoid'),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    network.summary()

    # Optional architecture diagram, left disabled:
    #plot_model(model,to_file='model.png',show_shapes = True)
    return network

In [63]:
# Classify a review as negative or positive
def predict_sentiment(review, vocab, tokenizer, model):
    """Classify a single review text as NEGATIVE or POSITIVE.

    The review is cleaned with clean_doc, restricted to vocabulary words,
    encoded as a one-row binary matrix by the tokenizer and passed to the
    model. The value at [0, 0] of the prediction is treated as the positive
    score.

    Args:
        review: Raw review text.
        vocab: Container of allowed tokens.
        tokenizer: Object providing texts_to_matrix(texts, mode=...).
        model: Object providing predict(encoded, verbose=...).

    Returns:
        (score, label): when the positive score rounds to 0 the label is
        'NEGATIVE' and the score is 1 minus the positive score; otherwise the
        label is 'POSITIVE' and the score is the positive score itself.
    """
    # Clean the text and keep only in-vocabulary words, as one line.
    kept_words = [tok for tok in clean_doc(review) if tok in vocab]
    joined = ' '.join(kept_words)

    # Encode the single line and run it through the model.
    features = tokenizer.texts_to_matrix([joined], mode='binary')
    positive_score = model.predict(features, verbose=0)[0, 0]

    if round(positive_score) != 0:
        return positive_score, 'POSITIVE'
    # Report the score of the negative class instead.
    return (1 - positive_score), 'NEGATIVE'

In [64]:
# load the vocabulary (whitespace-separated tokens, kept as a set for fast lookup)
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab)
# NOTE(review): this call is identical to the one above, so the "test" set is
# exactly the training set and any score computed on it is not a held-out
# measurement. A real train/test split is needed before evaluating.
test_docs,ytest = load_clean_dataset(vocab)
# Keras documents NumPy arrays / tensors as target data; plain Python lists of
# labels are rejected by some TensorFlow versions, so convert explicitly.
ytrain = np.array(ytrain)
ytest = np.array(ytest)
# create the tokenizer (fitted on the training documents only)
tokenizer = create_tokenizer(train_docs)
# Encode data as binary bag-of-words matrices
Xtrain = tokenizer.texts_to_matrix(train_docs, mode = 'binary')
Xtest = tokenizer.texts_to_matrix(test_docs,mode = 'binary')
# define network; the input width is the number of columns of the encoding
n_words = Xtrain.shape[1]
model = define_model(n_words)
# fit network
model.fit(Xtrain,ytrain,epochs = 10,verbose=2)

#test positive text
text = 'Best movie ever! It was great, I recommend it'
percent, sentiment = predict_sentiment(text,vocab,tokenizer,model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' %(text, sentiment,percent*100))
# test negative text
text = 'This is a bad movie'
percent,sentiment = predict_sentiment(text,vocab,tokenizer,model)
# Label typo fixed ('Reviw' -> 'Review') so both reports use the same format.
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text,sentiment,percent*100))

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 50)                1288450   
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 51        
Total params: 1,288,501
Trainable params: 1,288,501
Non-trainable params: 0
_________________________________________________________________
Train on 2000 samples
Epoch 1/10
2000/2000 - 1s - loss: 0.4382 - accuracy: 0.8160
Epoch 2/10
2000/2000 - 1s - loss: 0.0474 - accuracy: 0.9975
Epoch 3/10
2000/2000 - 1s - loss: 0.0141 - accuracy: 1.0000
Epoch 4/10
2000/2000 - 1s - loss: 0.0068 - accuracy: 1.0000
Epoch 5/10
2000/2000 - 1s - loss: 0.0037 - accuracy: 1.0000
Epoch 6/10
2000/2000 - 1s - loss: 0.0022 - accuracy: 1.0000
Epoch 7/10
2000/2000 - 1s - loss: 0.0015 - accuracy: 1.0000
Epoch 8/10
2000/2000 - 1s - loss: 0.0011 - accuracy: 1.0000
Epoc