In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter

In [4]:
# Shared WordNet lemmatizer used by create_lexicon and sample_handling.
lemmatizer = WordNetLemmatizer()
# Upper bound on how many lines are read from each input file
# (applied as a slice limit on the list of lines).
hm_lines = 10000000

# returns the lexicon based on the two txt files
def create_lexicon(pos, neg, min_count=50, max_count=1000):
    """Build the lexicon of lemmas found in the two sample files.

    Up to ``hm_lines`` lines of each file are lower-cased and tokenized,
    every token is lemmatized, and only lemmas whose total frequency lies
    strictly between ``min_count`` and ``max_count`` are kept.

    Args:
        pos: Path of the first text file (the caller passes the positive samples).
        neg: Path of the second text file (the caller passes the negative samples).
        min_count: Exclusive lower frequency bound; rarer lemmas are dropped.
        max_count: Exclusive upper frequency bound; more common lemmas are dropped.

    Returns:
        list: The retained lemmas. The size of the list is also printed.
    """
    tokens = []
    for path in (pos, neg):
        with open(path, 'r') as f:
            for line in f.readlines()[:hm_lines]:
                tokens.extend(word_tokenize(line.lower()))

    # Frequency table of the lemmas, e.g. {"the": 5212, "and": 2325}
    w_counts = Counter(lemmatizer.lemmatize(token) for token in tokens)

    # Drop the very common and the very rare words.
    lexicon = [word for word, count in w_counts.items()
               if min_count < count < max_count]
    print(len(lexicon))

    return lexicon


def sample_handling(sample, lexicon, classification):
    """Turn every line of a sample file into a bag-of-words feature vector.

    The result looks like::

        [
            ...
            [[0, 1, 0, 1, 1, ...], [0, 1]],
            [[0, 0, 0, 0, 1, ...], [1, 0]],
            [[0, 1, 0, 0, 1, ...], [0, 1]],
            ...
        ]

    Args:
        sample: Path of the text file to read; each line is one sample.
        lexicon: List of lemmas; entry ``i`` of a feature vector counts the
            occurrences of ``lexicon[i]`` in the line.
        classification: Label stored next to every feature vector of this
            file (the caller passes lists such as ``[1, 0]``).

    Returns:
        list: One ``[features, classification]`` pair per line read (at most
        ``hm_lines``), where ``features`` is a list with ``len(lexicon)``
        counts.
    """
    # Map each lemma to its first position once, instead of scanning the
    # lexicon list twice (``in`` + ``index``) for every token of every line.
    word_to_index = {}
    for position, lemma in enumerate(lexicon):
        word_to_index.setdefault(lemma, position)

    featureset = []
    with open(sample, 'r') as f:
        for line in f.readlines()[:hm_lines]:
            features = np.zeros(len(lexicon))

            for token in word_tokenize(line.lower()):
                index_value = word_to_index.get(lemmatizer.lemmatize(token).lower())
                if index_value is not None:
                    features[index_value] += 1

            featureset.append([list(features), classification])

    return featureset


def create_features_sets_and_labels(pos, neg, test_size=0.1):
    """Build shuffled train/test feature and label lists from two sample files.

    Args:
        pos: Path of the file whose lines are labelled ``[1, 0]``.
        neg: Path of the file whose lines are labelled ``[0, 1]``.
        test_size: Fraction of all samples reserved for the test split.

    Returns:
        tuple: ``(train_x, train_y, test_x, test_y)``; the ``*_x`` entries are
        lists of feature vectors and the ``*_y`` entries are the lists of the
        matching labels.
    """
    lexicon = create_lexicon(pos, neg)

    # Featurize the same files the lexicon was built from, not fixed file names.
    samples = []
    samples += sample_handling(pos, lexicon, [1, 0])
    samples += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(samples)

    # Split on an explicit index: slicing with ``-testing_size`` would produce
    # an empty training set (and an all-sample test set) when it is 0.
    testing_size = int(test_size * len(samples))
    split = len(samples) - testing_size

    # Separate features and labels directly; wrapping the ragged
    # [features, label] pairs in np.array() fails on recent NumPy releases.
    train_x = [features for features, _ in samples[:split]]
    train_y = [label for _, label in samples[:split]]

    test_x = [features for features, _ in samples[split:]]
    test_y = [label for _, label in samples[split:]]

    return train_x, train_y, test_x, test_y

In [5]:
import tensorflow as tf

# context = ssl._create_unverified_context()
# urllib.request("https://storage.googleapis.com/cvdf-datasets/mnist/", context=context)
'''
input > weights > hidden layer 1 (activation function) > weights > hidden layer 2
(activation function) > weights > output layer

compare output to intended output > loss function (cross entropy)
optimization function (optimizer) > minimize cost (AdamOptimizer .. SGD, AdaGrad)

back propagation

feed forward + back prop = epoch #one cycle
'''

# Build the lexicon and the shuffled train/test split from the two sample files.
train_x, train_y, test_x, test_y = create_features_sets_and_labels('pos.txt', 'neg.txt')

# Number of units in each of the three hidden layers.
n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500

# Two output classes, matching the two-element labels ([1, 0] / [0, 1]).
n_classes = 2
# Number of samples fed to the optimizer per training step.
batch_size = 100

# Input placeholder (height x width): any number of rows, one column per
# entry of a feature vector.
x = tf.placeholder('float', [None, len(train_x[0])])
# Label placeholder; no shape is declared here.
y = tf.placeholder('float')


def neural_network_model(data):
    """Build a feed-forward network with three ReLU hidden layers.

    Layer widths come from the module-level ``n_nodes_hl1``, ``n_nodes_hl2``,
    ``n_nodes_hl3`` and ``n_classes``; all weights and biases are initialised
    with ``tf.random_normal``.

    Args:
        data: Input tensor whose second dimension is ``len(train_x[0])``.

    Returns:
        The output-layer tensor with ``n_classes`` columns; no activation is
        applied to it.
    """
    n_inputs = len(train_x[0])

    # Each bias vector is sized by the width of its OWN layer so the graph
    # stays valid when the hidden layers are given different sizes.
    hidden_1_layer = {"weights": tf.Variable(tf.random_normal([n_inputs, n_nodes_hl1])),
                      "biases": tf.Variable(tf.random_normal([n_nodes_hl1]))}

    hidden_2_layer = {"weights": tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                      "biases": tf.Variable(tf.random_normal([n_nodes_hl2]))}

    hidden_3_layer = {"weights": tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                      "biases": tf.Variable(tf.random_normal([n_nodes_hl3]))}

    output_layer = {"weights": tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                    "biases": tf.Variable(tf.random_normal([n_classes]))}

    # (input * weights) + biases, followed by ReLU, for every hidden layer.
    activations = data
    for layer in (hidden_1_layer, hidden_2_layer, hidden_3_layer):
        activations = tf.add(tf.matmul(activations, layer["weights"]), layer["biases"])
        activations = tf.nn.relu(activations)

    output = tf.add(tf.matmul(activations, output_layer["weights"]), output_layer["biases"])

    return output


def train_neural_network(x, epochs=10):
    """Train the network on the training split and print the test accuracy.

    Args:
        x: Input placeholder; it is passed to ``neural_network_model`` and
            used as the feed key for the feature batches.
        epochs: Number of full passes (feed forward + back prop) over the
            training data.

    Returns:
        None. The summed loss of every epoch and the final accuracy on
        ``test_x`` / ``test_y`` are printed.
    """
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    # AdamOptimizer is created with its default learning rate (0.001).
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    # training the network
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(epochs):
            epoch_loss = 0

            # Walk the training set in consecutive mini-batches; the final
            # batch is shorter when the set size is not a multiple of
            # batch_size.
            for start in range(0, len(train_x), batch_size):
                end = start + batch_size
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])
                _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
                epoch_loss += c

            print("Epoch", epoch + 1, "complete out of", epochs, "loss:", epoch_loss)

        # A sample counts as correct when the arg-max of the prediction
        # matches the arg-max of its label.
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, "float"))
        print("Accuracy:", accuracy.eval({x: test_x, y: test_y}))

# Build the graph, train on the training split and print the test accuracy.
train_neural_network(x)

423
Epoch 1 complete out pf 10 loss: 220411.730957
Epoch 2 complete out pf 10 loss: 104618.041016
Epoch 3 complete out pf 10 loss: 65412.6449585
Epoch 4 complete out pf 10 loss: 42028.9488068
Epoch 5 complete out pf 10 loss: 29689.2575073
Epoch 6 complete out pf 10 loss: 31925.911499
Epoch 7 complete out pf 10 loss: 29191.0944748
Epoch 8 complete out pf 10 loss: 20317.8209782
Epoch 9 complete out pf 10 loss: 13368.6230154
Epoch 10 complete out pf 10 loss: 12215.1002598
Accuracy: 0.546904
