In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import random
import pickle
from collections import Counter
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import os
os.chdir("C:/Users/Madhu/NLP/Senti")
pos_data = "pos.txt"
neg_data = "neg.txt"

max_lines=100000

In [2]:
def create_lexicon(pos,neg):
    
    lexicon=[]
    
    with open(pos,"r") as pos_f:
        lines = pos_f.readlines()
        
        for line in lines[:max_lines]:
            tokens=word_tokenize(line)
            lexicon+=list(tokens)
    
    with open(neg,"r") as neg_f:
        lines = neg_f.readlines()
        
        for line in lines[:max_lines]:
            tokens=word_tokenize(line)
            lexicon+=list(tokens)
            
    lexicon = [lemmatizer.lemmatize(word) for word in lexicon]
    words_count = Counter(lexicon)
    
    w_count_filter = []
    for word_count in words_count:
        if 1000 > words_count[word_count] > 50:
            w_count_filter.append(word_count)
            
    return w_count_filter

def sample_handling(file,lexicon,classification):

    featureset = []

    with open(file,'r') as f:
        lines = f.readlines()
        
        for line in lines[:max_lines]:
            tokens = word_tokenize(line.lower())
            tokens = [lemmatizer.lemmatize(word) for word in tokens]
            
            features = np.zeros(len(lexicon))
            for word in tokens:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1

            features = list(features)
            featureset.append([features,classification])

    return featureset

def create_feature_sets_and_labels(test_size=0.1):
    
    lexicon = create_lexicon(pos_data,neg_data)
    
    features = []
    
    features += sample_handling(pos_data,lexicon,[1,0])
    features += sample_handling(neg_data,lexicon,[0,1])
    
    random.shuffle(features)
    features = np.array(features)
    
    
    testing_size = int(test_size*len(features))

    train_x = list(features[:,0][:-testing_size])
    train_y = list(features[:,1][:-testing_size])
    
    test_x = list(features[:,0][-testing_size:])
    test_y = list(features[:,1][-testing_size:])

    return train_x,train_y,test_x,test_y

# Training in Tensor Flow

In [3]:
import tensorflow as tf

train_X , train_Y , test_X , test_Y = create_feature_sets_and_labels()

In [4]:
X = tf.placeholder("float")
Y = tf.placeholder("float")

In [5]:
W_H1 = tf.Variable(tf.random_normal([len(train_X[0]),1625]))
W_H2 = tf.Variable(tf.random_normal([1625,1625]))
W_H3 = tf.Variable(tf.random_normal([1625,1625]))
W_O =  tf.Variable(tf.random_normal([1625,2]))

In [6]:
def model(X, W_H1, W_H2,W_H3, W_O):
    
    h1 = tf.nn.relu(tf.matmul(X,W_H1))
    h2 = tf.nn.relu(tf.matmul(h1,W_H2))
    h3 = tf.nn.relu(tf.matmul(h2,W_H3))
    o = tf.matmul(h3,W_O)
    
    return o

In [7]:
Y_pred = model(X, W_H1, W_H2,W_H3, W_O)

In [8]:
#cost function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=Y_pred, labels=Y))
 # compute mean cross entropy (softmax is applied internally)

In [9]:
optimizer = tf.train.GradientDescentOptimizer(0.001).minimize(cost)

In [10]:
epochs = 15
batch_size = 100
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(epochs):
        epoch_loss = 0
        i=0
        while i < len(train_X):
            start = i
            end = i+batch_size
            batch_x = train_X[start:end]
            batch_y = train_Y[start:end]

            _, c = sess.run([optimizer, cost], feed_dict={X: batch_x, Y: batch_y})
            epoch_loss += c
            i+=batch_size
                
        print('Epoch', epoch+1, 'completed out of',epochs,'loss:',epoch_loss)
    
    
    correct = tf.equal(tf.argmax(Y_pred, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

    print('Accuracy:',accuracy.eval({X:test_X, Y:test_Y}))

            
        

Epoch 1 completed out of 15 loss: 2025286.99146
Epoch 2 completed out of 15 loss: 98842.8157501
Epoch 3 completed out of 15 loss: 24705.1590424
Epoch 4 completed out of 15 loss: 11694.4422455
Epoch 5 completed out of 15 loss: 5011.83792019
Epoch 6 completed out of 15 loss: 2577.47369814
Epoch 7 completed out of 15 loss: 1637.79358387
Epoch 8 completed out of 15 loss: 1158.10869781
Epoch 9 completed out of 15 loss: 949.821534193
Epoch 10 completed out of 15 loss: 781.919290616
Epoch 11 completed out of 15 loss: 723.038212102
Epoch 12 completed out of 15 loss: 660.730764009
Epoch 13 completed out of 15 loss: 637.133991666
Epoch 14 completed out of 15 loss: 645.379779585
Epoch 15 completed out of 15 loss: 621.367512938
Accuracy: 0.575985
