In [3]:
from nltk.corpus import stopwords
import string
import re
import os
from collections import Counter

rootdir_pos = '/Users/Mal/Desktop/review_polarity/txt_sentoken/pos'
rootdir_neg = '/Users/Mal/Desktop/review_polarity/txt_sentoken/neg'

In [4]:
def load_doc(filename):
    file = open(filename, "r")
    text = file.read()
    file.close()
    return text

def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile("[%s]" % re.escape(string.punctuation))
    tokens = [re_punc.sub("", w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

def add_doc_to_vocab(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    vocab.update(tokens)

In [5]:
vocab = Counter()

def process_docs_tokens(pos, neg):
    for subdir, dirs, files in os.walk(pos):
        for file in files:
            filepath = subdir + os.sep + file
            #print(file)
            if file.startswith("cv9"):
                #print("here", file)
                continue
            else:
                add_doc_to_vocab(filepath, vocab)

    for subdir, dirs, files in os.walk(neg):
        for file in files:
            filepath = subdir + os.sep + file
            #print(file)
            if file.startswith("cv9"):
                #print("here", file)
                continue
            else:
                add_doc_to_vocab(filepath, vocab)

    #print(len(vocab))
    #print(vocab.most_common(50))

def save_list(lines, filename):
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()

process_docs_tokens(rootdir_pos, rootdir_neg)
min_occurrence = 2
tokens = [k for k,c in vocab.items() if c >= min_occurrence]
print(len(tokens))
save_list(tokens,"vocab.txt")

25767


In [6]:
def doc_to_line(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return  " ".join(tokens)

def process_docs(directory, vocab, is_train):
    lines = list()
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            filepath = subdir + os.sep + file
            #print(file)
            if is_train and file.startswith("cv9"):
                #print("here", file)
                continue
            if not is_train and not file.startswith("cv9"):
                continue
            else:
                add_doc_to_vocab(filepath, vocab)
                line = doc_to_line(filepath, vocab)
                lines.append(line)
    return lines

def load_clean_dataset(vocab, is_train):
    neg = process_docs(rootdir_neg , vocab, is_train)
    pos = process_docs(rootdir_pos, vocab, is_train)
    docs = neg + pos
    labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
    return docs, labels

from keras.preprocessing.text import Tokenizer

def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


In [7]:

from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
def define_model(n_words):
  # define network
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation= 'relu' ))
    model.add(Dense(1, activation= 'sigmoid' ))
  # compile network
    model.compile(loss= 'binary_crossentropy' , optimizer= 'adam' , metrics=[ 'accuracy'])
  # summarize defined model
    model.summary()
    #plot_model(model, to_file='model.png' , show_shapes=True)
    return model

vocab_filename = "vocab.txt"
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
tokenizer = create_tokenizer(train_docs)
#print(Xtrain.shape, Xtest.shape)
Xtrain = tokenizer.texts_to_matrix(train_docs, mode="freq")
Xtest = tokenizer.texts_to_matrix(test_docs, mode="freq")
# define the model
n_words = Xtest.shape[1]
model = define_model(n_words)
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print( 'Test Accuracy: %f' % (acc*100))


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                2213900   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 2,213,951
Trainable params: 2,213,951
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
 - 6s - loss: 0.6918 - accuracy: 0.5961
Epoch 2/10
 - 4s - loss: 0.6830 - accuracy: 0.8417
Epoch 3/10
 - 4s - loss: 0.6671 - accuracy: 0.8883
Epoch 4/10
 - 4s - loss: 0.6414 - accuracy: 0.8928
Epoch 5/10
 - 4s - loss: 0.6069 - accuracy: 0.9444
Epoch 6/10
 - 4s - loss: 0.5664 - accuracy: 0.9417
Epoch 7/10
 - 4s - loss: 0.5221 - accuracy: 0.9556
Epoch 8/10
 - 4s - loss: 0.4776 - accuracy: 0.9639
Epoch 9/10
 - 4s - loss: 0.4330 - accuracy: 0.9672
Epoch 10/10
 - 4s - loss: 0.3921 - accuracy: 0.

In [8]:
from pandas import DataFrame
from matplotlib import pyplot

def prepare_data(train_docs, test_docs, mode):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest

def evaluate_mode(Xtrain, ytrain, Xtest, ytest):
    scores = list()
    n_repeats = 3
    n_words = Xtest.shape[1]
    for i in range(n_repeats):
    # define network
        model = Sequential()
        model.add(Dense(50, input_shape=(n_words,), activation= 'relu' ))
        model.add(Dense(1, activation='sigmoid' ))
    # compile network
        model.compile(loss= 'binary_crossentropy' , optimizer= 'adam' , metrics=[ 'accuracy' ])
    # fit network
        model.fit(Xtrain, ytrain, epochs=10, verbose=2)
    # evaluate
        loss, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
        print( '%d accuracy: %s'  % ((i+1), acc))
    return scores

vocab_filename =  'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())
# load all reviews
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)
# run experiment
modes = [ 'binary', 'count', 'tfidf', 'freq']
results = DataFrame()
for mode in modes:
  # prepare data for mode
    Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
  # evaluate model on data for mode
    results[mode] = evaluate_mode(Xtrain, ytrain, Xtest, ytest)
# summarize results
print(results.describe())
# plot results
results.boxplot()
pyplot.show()

Epoch 1/10
 - 5s - loss: 0.4659 - accuracy: 0.7828
Epoch 2/10
 - 4s - loss: 0.0406 - accuracy: 0.9978
Epoch 3/10
 - 4s - loss: 0.0113 - accuracy: 1.0000
Epoch 4/10
 - 4s - loss: 0.0058 - accuracy: 1.0000
Epoch 5/10
 - 4s - loss: 0.0034 - accuracy: 1.0000
Epoch 6/10
 - 4s - loss: 0.0020 - accuracy: 1.0000
Epoch 7/10
 - 4s - loss: 0.0013 - accuracy: 1.0000
Epoch 8/10
 - 4s - loss: 8.8571e-04 - accuracy: 1.0000
Epoch 9/10
 - 4s - loss: 6.4882e-04 - accuracy: 1.0000
Epoch 10/10
 - 4s - loss: 4.9566e-04 - accuracy: 1.0000
1 accuracy: 0.925000011920929
Epoch 1/10
 - 5s - loss: 0.4636 - accuracy: 0.7944
Epoch 2/10
 - 4s - loss: 0.0412 - accuracy: 0.9967
Epoch 3/10
 - 4s - loss: 0.0129 - accuracy: 1.0000
Epoch 4/10
 - 4s - loss: 0.0065 - accuracy: 1.0000
Epoch 5/10
 - 4s - loss: 0.0036 - accuracy: 1.0000
Epoch 6/10
 - 4s - loss: 0.0022 - accuracy: 1.0000
Epoch 7/10
 - 4s - loss: 0.0012 - accuracy: 1.0000
Epoch 8/10
 - 4s - loss: 7.5261e-04 - accuracy: 1.0000
Epoch 9/10
 - 4s - loss: 4.9368e-04

<Figure size 640x480 with 1 Axes>

In [10]:
def predict_sentiment(review, vocab, tokenizer, model):
    tokens = clean_doc(review)
    tokens = [w for w in tokens if w in vocab]
    line = ' '.join(tokens)
    encoded = tokenizer.texts_to_matrix([line], mode= 'binary' )
    yhat = model.predict(encoded, verbose=0)
    percent_pos = yhat[0,0]
    if round(percent_pos) == 0:
        return (1-percent_pos), 'NEGATIVE'
    return percent_pos,  'POSITIVE'

In [12]:
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
text = 'This is a bad movie.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))


Review: [Best movie ever! It was great, I recommend it.]
Sentiment: POSITIVE (99.979%)
Review: [This is a bad movie.]
Sentiment: NEGATIVE (99.919%)
