In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical

hdf5 is not supported on this machine (please install/reinstall h5py for optimal experience)


In [2]:
reviews = pd.read_csv('reviews.txt', header=None)
labels = pd.read_csv('labels.txt', header=None)

In [3]:
from collections import Counter

total_counts = Counter()# bag of words here

for _, row in reviews.iterrows():
    total_counts.update(row[0].split(' '))
print("Total words in data set: ", len(total_counts))

Total words in data set:  74074


In [4]:
# keep the first 10000 most frequent words
vocab = sorted(total_counts, key=total_counts.get, reverse=True)[:10000]
print(vocab[:60])

['', 'the', '.', 'and', 'a', 'of', 'to', 'is', 'br', 'it', 'in', 'i', 'this', 'that', 's', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'you', 'on', 't', 'not', 'he', 'are', 'his', 'have', 'be', 'one', 'all', 'at', 'they', 'by', 'an', 'who', 'so', 'from', 'like', 'there', 'her', 'or', 'just', 'about', 'out', 'if', 'has', 'what', 'some', 'good', 'can', 'more', 'she', 'when', 'very', 'up', 'time', 'no']


In [5]:
word2idx = {word: i for i, word in enumerate(vocab)} ## create the word-to-index dictionary here

In [6]:
len(vocab)

10000

In [7]:
word2idx.get?
# D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None.

In [8]:
#Text to vector function: converts a some text to a word vector.
def text2vector(text):
    word_vector = np.zeros(len(vocab), dtype=np.int_)
    for word in text.split(' '):
        idx = word2idx.get(word, None)
        if idx is None:
            continue
        else:
            word_vector[idx] += 1
    
    return np.array(word_vector)

In [9]:
text2vector('The tea is for a party to celebrate '
            'the movie so she has no time for a cake')[:65]

array([0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0])

In [10]:
word_vectors = np.zeros((len(reviews), len(vocab)), dtype=np.int_)

for ii, (_, text) in enumerate(reviews.iterrows()):
    word_vectors[ii] = text2vector(text[0])

In [11]:
word_vectors[:5, :23]

array([[ 18,   9,  27,   1,   4,   4,   6,   4,   0,   2,   2,   5,   0,
          4,   1,   0,   2,   0,   0,   0,   0,   0,   0],
       [  5,   4,   8,   1,   7,   3,   1,   2,   0,   4,   0,   0,   0,
          1,   2,   0,   0,   1,   3,   0,   0,   0,   1],
       [ 78,  24,  12,   4,  17,   5,  20,   2,   8,   8,   2,   1,   1,
          2,   8,   0,   5,   5,   4,   0,   2,   1,   4],
       [167,  53,  23,   0,  22,  23,  13,  14,   8,  10,   8,  12,   9,
          4,  11,   2,  11,   5,  11,   0,   5,   3,   0],
       [ 19,  10,  11,   4,   6,   2,   2,   5,   0,   1,   2,   3,   1,
          0,   0,   0,   3,   1,   0,   1,   0,   0,   0]])

In [12]:
# Train, Validation, Test sets
Y = (labels=='positive').astype(np.int_)
records = len(labels)

shuffle = np.arange(records)
np.random.shuffle(shuffle)
test_fraction = 0.9

train_split, test_split = shuffle[:int(records*test_fraction)], shuffle[int(records*test_fraction):]
trainX, trainY = word_vectors[train_split,:], to_categorical(Y.values[train_split], 2)
testX, testY = word_vectors[test_split,:], to_categorical(Y.values[test_split], 2)

In [13]:
trainY

array([[ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       ..., 
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.]])

In [103]:
# to_categorical

Y = (labels=='positive').astype(np.int_)
Y.loc[:, (1)] = (Y[0] == 0).astype(int)

inputY = Y.loc[:, [0, 1]].as_matrix()

In [105]:
records = len(labels)

shuffle = np.arange(records)
np.random.shuffle(shuffle)
test_fraction = 0.9

train_split, test_split = shuffle[:int(records*test_fraction)], shuffle[int(records*test_fraction):]

In [107]:
trainX, trainY = word_vectors[train_split,:], inputY[train_split,:]
testX, testY   = word_vectors[test_split,:],  inputY[test_split,:]

In [108]:
trainY

array([[0, 1],
       [1, 0],
       [0, 1],
       ..., 
       [0, 1],
       [1, 0],
       [0, 1]])

In [None]:
# Network building use tf
def build_model():
    # This resets all parameters and variables, leave this here
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    
    # Input
    x = tf.placeholder(tf.float32, [None, 10000])
    
    # Hidden layer(s)
    net = tf.nn.fully_connected(x, 200, activation='ReLU')
    net = tf.nn.fully_connected(net, 25, activation='ReLU')
    
    # Output layer
    net = tf.nn.fully_connected(net, 2, activation='softmax')
    net = tf.contrib.regr(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    
    model = tflearn.DNN(net)
    return model

In [14]:
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    # Input
    net = tflearn.input_data([None, 10000])
    
    # Hidden layer(s)
    net = tflearn.fully_connected(net, 200, activation='ReLU')
    net = tflearn.fully_connected(net, 25, activation='ReLU') #
    
    # Output layer
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    
    model = tflearn.DNN(net)
    return model

In [15]:
model = build_model()

In [16]:
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=10)

Training Step: 1589  | total loss: [1m[32m1.38630[0m[0m | time: 7.571s
| SGD | epoch: 010 | loss: 1.38630 - acc: 0.4885 -- iter: 20224/20250
Training Step: 1590  | total loss: [1m[32m1.38630[0m[0m | time: 8.629s
| SGD | epoch: 010 | loss: 1.38630 - acc: 0.4920 | val_loss: 1.38630 - val_acc: 0.5173 -- iter: 20250/20250
--


In [17]:
# test 

predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.5176


In [20]:
# Helper function that uses your model to predict sentiment
def test_sentence(sentence):
    positive_prob = model.predict([text2vector(sentence.lower())])[0][1]
    print('Sentence: {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), 
          'Positive' if positive_prob > 0.5 else 'Negative')

In [21]:
sentence = "Moonlight is by far the best movie of 2016."
test_sentence(sentence)

sentence = "It's amazing anyone could be talented enough to make something this spectacularly awful"
test_sentence(sentence)

Sentence: Moonlight is by far the best movie of 2016.
P(positive) = 0.500 : Negative
Sentence: It's amazing anyone could be talented enough to make something this spectacularly awful
P(positive) = 0.500 : Negative
