In [1]:
from nltk.corpus import stopwords
import string, re
from os import listdir
from collections import Counter
import keras
from keras.preprocessing.text import Tokenizer
from keras import Sequential
from keras.layers import Dense
# import string and re for string and regex manipulation
# import stopwords to filter them out for memory usuage


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all the text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word)>1]
    return tokens

# add doc_to_line() which cleans and filters out tokens not in vocabulary
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # load the doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

# load all docs in a directory
def process_docs(directory, vocab, is_train):
    lines = []
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip any reviews in the test set
        # added is_train argument which is a boolean variable
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        # create the full path of the file to open
        path = directory + '\\' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines

# load and clean a dataset
def load_clean_dataset(vocab, is_train):
    # takes in our vocabularly list that we've created by removing stopwords
    # from our data and filtering out words that appear < 2 times.
    # load the documents
    # added is_train argument which is a boolean variable
    neg = process_docs('C:\\Users\\Aaron\\Downloads\\txt_sentoken\\neg', vocab, is_train)
    pos = process_docs('C:\\Users\\Aaron\\Downloads\\txt_sentoken\\pos', vocab, is_train)
    docs = neg + pos
    # prepare labels
    labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
    return docs, labels

# important to note that here, the data is still not split yet!
# however, our vocabularly is based off of the first 900 words in neg and 
# first 900 in pos! So we have 200 files that potentially have words that are
# not in our vocabuluary list!

# load our vocabulary text file which was previously saved
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

# load all train reviews
docs, labels = load_clean_dataset(vocab, True)
# quick summary of what we have
print(len(docs), len(labels))

# Keras has a tokenizer class we could use that would just do all of the 
# work that we did above, but if we use it we're in less control and 
# know less of why our tokens are the way they are

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

Using TensorFlow backend.


1800 1800


In [2]:
# load all reviews
# create training and test sets
train_docs, y_train = load_clean_dataset(vocab, True)
test_docs, y_test = load_clean_dataset(vocab, False)
# create the tokenizer
tokenizer = create_tokenizer(train_docs)
# encode data
X_train = tokenizer.texts_to_matrix(train_docs, mode = 'freq')
X_test = tokenizer.texts_to_matrix(test_docs, mode = 'freq')
print(X_train.shape, X_test.shape)

(1800, 25768) (200, 25768)


In [3]:
X_train[0]

array([0.        , 0.01519757, 0.00911854, ..., 0.        , 0.        ,
       0.        ])

In [4]:
# create size of input called n_words for our MLP model
n_words = X_test.shape[1]

In [10]:
# create a checkpoint call back to save the best model
checkpoint_cb = keras.callbacks.ModelCheckpoint("IMDB_Sentiment_Model.h5",
                                                save_best_only=True)

# create model using function
def define_model(n_words):
    # defining quick MLP
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', 
                  optimizer='adam',
                  metrics=['accuracy'],)
    # summarize defined model
    model.summary()
    # plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [11]:
# create model
model = define_model(n_words)
# fit our model
model.fit(X_train, y_train, epochs=8, callbacks=[checkpoint_cb])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 50)                1288450   
_________________________________________________________________
dense_6 (Dense)              (None, 100)               5100      
_________________________________________________________________
dense_7 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_8 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 51        
Total params: 1,308,751
Trainable params: 1,308,751
Non-trainable params: 0
_________________________________________________________________
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.callbacks.History at 0x263800f94c8>

In [12]:
# evaluating our model
loss, acc = model.evaluate(X_test, y_test)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 91.000003


# Predicting Sentiment for New Reviews

In [8]:
# classify a review as negative or positive
def predict_sentiment(review, vocab, tokenizer, model):
    # clean/prep new unseen review for model
    tokens = clean_doc(review)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    # convert to line
    line = ' '.join(tokens)
    # encode the data
    encoded = tokenizer.texts_to_matrix([line], mode = 'binary')
    # predict sentiment
    prediction = model.predict(encoded, verbose = 0)
    percent_pos = prediction[0,0]
    if round(percent_pos) == 0:
        return (1-percent_pos), 'NEGATIVE'
    return percent_pos, 'POSITIVE'

In [9]:
# test positive text
text = 'Best movie ever! It was great and I recommend it.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
# test negative text
text = 'This is a bad movie.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Best movie ever! It was great and I recommend it.]
Sentiment: POSITIVE (100.000%)
Review: [This is a bad movie.]
Sentiment: NEGATIVE (100.000%)
