In [19]:
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter

# Single lemmatizer instance shared by create_lexicon and sample_handling.
lemmatizer = WordNetLemmatizer()
# Cap on how many lines are taken from each input file (used as contents[:hm_lines]).
hm_lines = 10000000


def create_lexicon(pos,neg):
    """Build the vocabulary used for the bag-of-words features.

    Reads up to ``hm_lines`` lines from each of the two files, lower-cases and
    tokenizes every line, lemmatizes every token, and keeps the lemmas whose
    total count is strictly between 50 and 1000.

    Args:
        pos: path of the first text file.
        neg: path of the second text file.

    Returns:
        List of kept lemmas, in order of first appearance.
    """
    tokens = []
    for path in (pos, neg):
        with open(path, 'r') as handle:
            lines = handle.readlines()
        for line in lines[:hm_lines]:
            tokens.extend(word_tokenize(line.lower()))

    frequency = Counter(lemmatizer.lemmatize(token) for token in tokens)

    # Drop both very rare and very common lemmas.
    kept = [word for word, count in frequency.items() if 50 < count < 1000]

    print("Lenght of l2: ",len(kept))

    return kept



def sample_handling(sample, lexicon, classification):
    """Turn each line of a text file into a bag-of-words count vector.

    Args:
        sample: path of the text file to read (at most ``hm_lines`` lines).
        lexicon: sequence of vocabulary words; position i of a feature vector
            counts occurrences of ``lexicon[i]``.
        classification: label stored alongside every feature vector (the same
            object is referenced by every row).

    Returns:
        List of ``[features, classification]`` pairs, one per line, where
        ``features`` is a list of ``len(lexicon)`` float counts.
    """
    # Map every lexicon word to its first position once, so each token costs a
    # single O(1) lookup instead of two linear scans (`in` + `.index`).
    word_index = {}
    for position, entry in enumerate(lexicon):
        word_index.setdefault(entry, position)

    featureset = []
    with open(sample,'r') as f:
        contents = f.readlines()

    for line in contents[:hm_lines]:
        # Lower-case BEFORE lemmatizing, as create_lexicon does; lemmatizing
        # the raw-cased token first can leave a form that never matches the
        # lower-cased lexicon.
        current_words = [lemmatizer.lemmatize(token)
                         for token in word_tokenize(line.lower())]
        features = np.zeros(len(lexicon))
        for word in current_words:
            index_value = word_index.get(word.lower())
            if index_value is not None:
                features[index_value] += 1

        featureset.append([list(features), classification])

    print("featureSet len: ",len(featureset))
    return featureset



def create_features_sets_and_labels(pos, neg, test_size = 0.1):
    """Build shuffled train/test splits of bag-of-words vectors and labels.

    Lines from ``pos`` are labelled ``[1,0]`` and lines from ``neg`` are
    labelled ``[0,1]``. The combined samples are shuffled in place with
    ``random.shuffle`` and the last ``int(test_size * n)`` samples become the
    test set.

    Args:
        pos: path of the file whose lines get label ``[1,0]``.
        neg: path of the file whose lines get label ``[0,1]``.
        test_size: fraction of all samples reserved for testing.

    Returns:
        ``(train_x, train_y, test_x, test_y)`` as lists; each ``x`` entry is a
        feature vector and each ``y`` entry is its label.
    """
    lexicon = create_lexicon(pos,neg)
    features = []
    features += sample_handling(pos, lexicon, [1,0])
    print("After pos")
    features += sample_handling(neg, lexicon, [0,1])

    random.shuffle(features)
    print("After random")

    # Split by an explicit index. Slicing with [:-testing_size] breaks when
    # testing_size is 0 ([:-0] is empty), and packing the ragged
    # [vector, label] pairs into np.array fails on recent NumPy releases.
    testing_size = int(test_size*len(features))
    testing_size = max(0, min(len(features), testing_size))
    split_at = len(features) - testing_size

    train_x = [vector for vector, _ in features[:split_at]]
    train_y = [label for _, label in features[:split_at]]
    print("After training")
    test_x = [vector for vector, _ in features[split_at:]]
    test_y = [label for _, label in features[split_at:]]

    print(len(train_y))

    return train_x, train_y, test_x, test_y


# Build the train/test splits from the two text files.
(train_x, train_y,
 test_x, test_y) = create_features_sets_and_labels('pos.txt', 'neg.txt')

# Width of each of the three hidden layers.
n_nodes_hl1 = n_nodes_hl2 = n_nodes_hl3 = 500


'''
In machine learning, an epoch is one full pass over the training samples. Here, we restrict the model
to 10 complete epochs, i.e. 10 passes of the algorithm through the dataset.

The batch_size variable determines how much data is fed to the algorithm
at any given time, in this case 100 samples (one feature vector per line of text).
'''
# Two output classes; 100 samples are fed per training step.
n_classes, batch_size = 2, 100

# Placeholder shape is [rows, columns] (height x width)
# None leaves the batch dimension unspecified

'''
The method tf.placeholder creates graph nodes that are fed with data at run time.
Here, x is a 2-dimensional array holding the bag-of-words feature vectors, with None implying the batch size
(which can be of any size) and len(train_x[0]) being the length of a single feature vector (one entry per lexicon word).
y is the target output: a 2-dimensional array with one row of 2 class values per sample
([1,0] for lines from the positive file, [0,1] for lines from the negative file).

'''
# Input batch: any number of rows, one column per entry of a feature vector.
x = tf.placeholder(dtype='float', shape=[None, len(train_x[0])])
# Labels fed alongside x; shape left unspecified.
y = tf.placeholder(dtype='float')


def neural_network_model(data):
    """Assemble a feed-forward net with three ReLU hidden layers.

    The input width is taken from ``len(train_x[0])`` and the layer widths from
    the module-level ``n_nodes_hl1``/``n_nodes_hl2``/``n_nodes_hl3`` and
    ``n_classes``. All weights and biases are ``tf.random_normal`` variables.

    Args:
        data: input tensor multiplied by the first weight matrix.

    Returns:
        The output layer's pre-activation values (no softmax applied here).
    """
    layer_shapes = [
        (len(train_x[0]), n_nodes_hl1),
        (n_nodes_hl1, n_nodes_hl2),
        (n_nodes_hl2, n_nodes_hl3),
        (n_nodes_hl3, n_classes),
    ]

    # Create every variable up front, weights before biases, layer by layer.
    parameters = []
    for fan_in, fan_out in layer_shapes:
        weights = tf.Variable(tf.random_normal([fan_in, fan_out]))
        biases = tf.Variable(tf.random_normal([fan_out]))
        parameters.append((weights, biases))

    activation = data
    for weights, biases in parameters[:-1]:
        activation = tf.nn.relu(tf.add(tf.matmul(activation, weights), biases))

    # The output layer is left linear.
    out_weights, out_biases = parameters[-1]
    return tf.matmul(activation, out_weights) + out_biases
    

''' We will be using a softmax output for our network. Softmax is a generalization of logistic regression,
usually used in the final layer of a network. It is useful for multi-class classification, where a given
output can be one of many different things.
It provides values between 0 and 1 that can be read as the probability of the output belonging to each class.
'''

def train_neural_network(x):
    """Train the network on the module-level training data and print test accuracy.

    Builds the model on ``x``, minimises mean softmax cross-entropy against the
    module-level placeholder ``y`` with Adam for 10 epochs, feeding
    ``train_x``/``train_y`` in slices of ``batch_size``. After training it
    evaluates on ``test_x``/``test_y``.

    Args:
        x: input placeholder passed to ``neural_network_model`` and used as a
            ``feed_dict`` key.

    Returns:
        None. Per-epoch summed loss and final accuracy are printed.
    """
    logits = neural_network_model(x)

    # Cost to minimise: mean softmax cross-entropy between logits and labels.
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y)
    )
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    hm_epochs = 10

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(hm_epochs):
            epoch_loss = 0
            for start in range(0, len(train_x), batch_size):
                stop = start + batch_size
                feed = {
                    x: np.array(train_x[start:stop]),
                    y: np.array(train_y[start:stop]),
                }
                _, batch_cost = sess.run([optimizer, cost], feed_dict=feed)
                epoch_loss += batch_cost

            print('Epoch ',epoch+1,' completed out of ',hm_epochs,', loss ',epoch_loss)

        # A sample counts as correct when the arg-max of the logits matches
        # the arg-max of its label row.
        hits = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(hits, 'float'))
        print('Accuracy: ',accuracy.eval({ x: test_x, y: test_y }))

        
# hm_epochs = 4, accuracy = 93%
# hm_epochs = 10, accuracy = 95.14%
#hm_epochs = 15, accuracy = 95.54%
# Build the model on placeholder x, train it, and print the test accuracy.
train_neural_network(x)
        

        

Lenght of l2:  423
featureSet len:  5331
After pos
featureSet len:  5331
After random
After training
9596
Epoch  1  completed out of  10 , loss  249044.7716064453
Epoch  2  completed out of  10 , loss  120561.58612060547
Epoch  3  completed out of  10 , loss  74895.1994934082
Epoch  4  completed out of  10 , loss  47944.176193237305
Epoch  5  completed out of  10 , loss  33127.80347442627
Epoch  6  completed out of  10 , loss  26397.741241455078
Epoch  7  completed out of  10 , loss  20259.56967163086
Epoch  8  completed out of  10 , loss  16680.198356628418
Epoch  9  completed out of  10 , loss  11918.138452529907
Epoch  10  completed out of  10 , loss  12110.612785339355
Accuracy:  0.55722326


In [16]:
import tensorflow as tf
import numpy as np
import random
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import random
import pickle
from collections import Counter
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance shared by create_lexicon and sample_handling.
lemmatizer = WordNetLemmatizer()
# Cap on how many lines are taken from each input file (used as contents[:hm_lines]).
hm_lines = 100000

def create_lexicon(pos,neg):
    """Build the vocabulary used for the bag-of-words features.

    Reads up to ``hm_lines`` lines from each file, lower-cases and tokenizes
    every line, lemmatizes every token, and keeps the lemmas whose total count
    is strictly between 50 and 1000. The number of kept lemmas is printed.

    Args:
        pos: path of the first text file.
        neg: path of the second text file.

    Returns:
        List of kept lemmas, in order of first appearance.
    """
    tokens = []
    for path in (pos, neg):
        with open(path,'r') as f:
            contents = f.readlines()
        for line in contents[:hm_lines]:
            # sample_handling only ever looks up lower-cased words, so the
            # lexicon must be lower-cased too; otherwise cased entries are
            # dead columns and their counts are split across spellings.
            tokens.extend(word_tokenize(line.lower()))

    w_counts = Counter(lemmatizer.lemmatize(token) for token in tokens)

    # Drop both very rare and very common lemmas.
    l2 = [word for word, count in w_counts.items() if 1000 > count > 50]
    print(len(l2))
    return l2


def sample_handling(sample,lexicon,classification):
    """Turn each line of a text file into a bag-of-words count vector.

    Args:
        sample: path of the text file to read (at most ``hm_lines`` lines).
        lexicon: sequence of vocabulary words; position i of a feature vector
            counts occurrences of ``lexicon[i]``.
        classification: label stored alongside every feature vector (the same
            object is referenced by every row).

    Returns:
        List of ``[features, classification]`` pairs, one per line, where
        ``features`` is a list of ``len(lexicon)`` float counts.
    """
    # Map every lexicon word to its first position once, so each token costs a
    # single O(1) lookup instead of two linear scans (`in` + `.index`).
    # setdefault keeps the first position, matching list.index on duplicates.
    word_index = {}
    for position, entry in enumerate(lexicon):
        word_index.setdefault(entry, position)

    featureset = []
    with open(sample,'r') as f:
        contents = f.readlines()

    for line in contents[:hm_lines]:
        current_words = [lemmatizer.lemmatize(token)
                         for token in word_tokenize(line.lower())]
        features = np.zeros(len(lexicon))
        for word in current_words:
            index_value = word_index.get(word.lower())
            if index_value is not None:
                features[index_value] += 1
        featureset.append([list(features),classification])
    return featureset

def create_feature_sets_and_labels(pos,neg,test_size = 0.1):
    """Build shuffled train/test splits of bag-of-words vectors and labels.

    Lines from ``pos`` are labelled ``[1,0]`` and lines from ``neg`` are
    labelled ``[0,1]``. The combined samples are shuffled in place with
    ``random.shuffle`` and the last ``int(test_size * n)`` samples become the
    test set.

    Args:
        pos: path of the file whose lines get label ``[1,0]``.
        neg: path of the file whose lines get label ``[0,1]``.
        test_size: fraction of all samples reserved for testing.

    Returns:
        ``(train_x, train_y, test_x, test_y)`` as lists; each ``x`` entry is a
        feature vector and each ``y`` entry is its label.
    """
    lexicon = create_lexicon(pos,neg)
    features = []
    # Use the paths that were passed in rather than hard-coded file names.
    features += sample_handling(pos,lexicon,[1,0])
    features += sample_handling(neg,lexicon,[0,1])
    random.shuffle(features)

    # Split by an explicit index. Slicing with [:-testing_size] breaks when
    # testing_size is 0 ([:-0] is empty), and packing the ragged
    # [vector, label] pairs into np.array fails on recent NumPy releases.
    testing_size = int(test_size*len(features))
    testing_size = max(0, min(len(features), testing_size))
    split_at = len(features) - testing_size

    train_x = [vector for vector, _ in features[:split_at]]
    train_y = [label for _, label in features[:split_at]]
    test_x = [vector for vector, _ in features[split_at:]]
    test_y = [label for _, label in features[split_at:]]

    return train_x,train_y,test_x,test_y

# Build the train/test splits once. The original cell ran this twice (and
# created the placeholders twice), which re-tokenized both files and left an
# unused pair of placeholder nodes behind.
train_x,train_y,test_x,test_y = create_feature_sets_and_labels('pos.txt','neg.txt')

# Width of each of the three hidden layers.
n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500

# Two output classes; 100 samples are fed per training step.
n_classes = 2
batch_size = 100

# Input batch: any number of rows, one column per entry of a feature vector.
x = tf.placeholder('float',[None,len(train_x[0])])
# Labels fed alongside x; shape left unspecified.
y = tf.placeholder('float')

def neural_network_model(data):
    """Assemble a feed-forward net with three ReLU hidden layers.

    The input width is taken from ``len(train_x[0])`` and the layer widths from
    the module-level ``n_nodes_hl1``/``n_nodes_hl2``/``n_nodes_hl3`` and
    ``n_classes``. All weights and biases are ``tf.random_normal`` variables.

    Args:
        data: input tensor multiplied by the first weight matrix.

    Returns:
        The output layer's pre-activation values (no softmax applied here).
    """
    widths = [len(train_x[0]), n_nodes_hl1, n_nodes_hl2, n_nodes_hl3, n_classes]

    # Create every variable up front, weights before biases, layer by layer.
    layers = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        layer = {'weights': tf.Variable(tf.random_normal([fan_in, fan_out])),
                 'biases': tf.Variable(tf.random_normal([fan_out]))}
        layers.append(layer)

    hidden_layers, output_layer = layers[:-1], layers[-1]

    signal = data
    for layer in hidden_layers:
        pre_activation = tf.add(tf.matmul(signal, layer['weights']), layer['biases'])
        signal = tf.nn.relu(pre_activation)

    # The output layer is left linear.
    return tf.matmul(signal, output_layer['weights']) + output_layer['biases']

def train_neural_network(x):
    """Train the network on the module-level training data and print test accuracy.

    Builds the model on ``x``, minimises mean softmax cross-entropy against the
    module-level placeholder ``y`` with Adam for 10 epochs, feeding
    ``train_x``/``train_y`` in slices of ``batch_size``. After training it
    evaluates on ``test_x``/``test_y``.

    Args:
        x: input placeholder passed to ``neural_network_model`` and used as a
            ``feed_dict`` key.

    Returns:
        None. Per-epoch summed loss and final accuracy are printed.
    """
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    hm_epochs = 10

    with tf.Session() as sess:
        # tf.initialize_all_variables() is deprecated; this is its replacement.
        sess.run(tf.global_variables_initializer())

        for epoch in range(hm_epochs):
            epoch_loss=0
            for start in range(0, len(train_x), batch_size):
                end = start + batch_size
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])

                _,c = sess.run([optimizer,cost] , feed_dict = {x: batch_x , y : batch_y})
                epoch_loss+= c
            print("Epoch",epoch+1 , 'completed out of ' ,hm_epochs, ' loss: ', epoch_loss )

        # A sample counts as correct when the arg-max of the prediction
        # matches the arg-max of its label row.
        correct = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy: ', accuracy.eval({x:test_x , y: test_y}))
        

        
# Build the model on placeholder x, train it, and print the test accuracy.
train_neural_network(x)

423
423
Epoch 1 completed out of  10  loss:  236152.9129638672
Epoch 2 completed out of  10  loss:  109876.04608154297
Epoch 3 completed out of  10  loss:  66765.44448852539
Epoch 4 completed out of  10  loss:  42331.59622192383
Epoch 5 completed out of  10  loss:  27228.843627929688
Epoch 6 completed out of  10  loss:  19795.766136169434
Epoch 7 completed out of  10  loss:  20923.42981338501
Epoch 8 completed out of  10  loss:  16688.859075546265
Epoch 9 completed out of  10  loss:  12001.857012748718
Epoch 10 completed out of  10  loss:  21833.35897731781
Accuracy:  0.5778612


In [30]:
# Scratch check: shape (1, 2, 3) yields one block of 2 rows x 3 columns of 1.0.
np.ones((1,2,3))

array([[[1., 1., 1.],
        [1., 1., 1.]]])

In [2]:
# Scratch check of the nesting produced by appending a [features, label] pair
# to a list: the result is a list holding one two-element list.
le = [1,2,34,5,6,67,6]
ls = [1,0]
l = [[le, ls]]
l

[[[1, 2, 34, 5, 6, 67, 6], [1, 0]]]