In [1]:
import tensorflow as tf
from nltk.corpus import movie_reviews
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk
import numpy as np
import random

In [2]:
# File ids the corpus reader returns for the 'neg' and 'pos' review sets.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

In [3]:
# Sanity check: number of tokens in the fifth entry of negids.
len(movie_reviews.words(fileids=negids[4]))

901

In [4]:
# Peek at the tokens of the fifth entry of negids.
print(movie_reviews.words(fileids=negids[4]))

['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...]


In [5]:
# Lemmatization: reducing a word to its normal (dictionary) form.
# A single shared instance is reused by the functions defined below.
lemmatizer = WordNetLemmatizer()

In [6]:
# Build the vocabulary from the words of all reviews, reduced to their normal form.
def create_lexicon(min_count=50, max_count=5000):
    """Collect the lemmatized words whose corpus frequency is in a middle band.

    Every word of every file in ``negids`` and ``posids`` is lemmatized and
    counted. A lemma is kept when ``min_count < count < max_count`` (both
    bounds exclusive).

    Args:
        min_count: exclusive lower bound on a lemma's total count (default 50).
        max_count: exclusive upper bound on a lemma's total count (default 5000).

    Returns:
        list of kept lemmas, in order of first appearance in the corpus.

    Side effects:
        Prints the size of the returned lexicon.
    """
    all_words = movie_reviews.words(fileids=negids) + movie_reviews.words(fileids=posids)
    # Count lemmas straight from a generator; no intermediate list of every lemma is needed.
    word_counts = Counter(lemmatizer.lemmatize(word) for word in all_words)
    lexicon = [word for word, count in word_counts.items()
               if min_count < count < max_count]
    print(len(lexicon))
    return lexicon

In [7]:
# Build the samples; every sample has the same length as the lexicon.
# Feature i of a sample is how many times lexicon word i occurs in the text.
def sample_handling(sample_ids, lexicon, classification):
    """Turn each review into a word-count vector paired with its label.

    Args:
        sample_ids: iterable of file ids, each passed to ``movie_reviews.words``.
        lexicon: sequence of (hashable) vocabulary words; position i defines
            feature i. If a word is repeated, its first position is used.
        classification: label stored with every sample. The same object is
            shared by all returned samples.

    Returns:
        list of ``[features, classification]`` pairs, where ``features`` is a
        list of ``len(lexicon)`` float counts.

    Side effects:
        Prints the number of samples produced.
    """
    # Index the lexicon once so each token costs a dict lookup instead of
    # two linear scans (`in` followed by `.index`). setdefault keeps the first
    # position of a repeated word, which is what list.index would return.
    word_to_index = {}
    for position, word in enumerate(lexicon):
        word_to_index.setdefault(word, position)

    featureset = []
    for file_id in sample_ids:
        features = np.zeros(len(lexicon))
        for word in movie_reviews.words(fileids=file_id):
            # Lemmatize the token, then count it if it belongs to the lexicon.
            index_value = word_to_index.get(lemmatizer.lemmatize(word))
            if index_value is not None:
                features[index_value] += 1
        featureset.append([list(features), classification])

    print(len(featureset))

    return featureset

In [8]:
# Generate the whole data set.
def create_feature_sets_and_labels(pos, neg, test_size = 0.1):
    """Build shuffled train/test feature vectors and labels.

    Args:
        pos: file ids labelled ``[1, 0]``.
        neg: file ids labelled ``[0, 1]``.
        test_size: fraction of all samples held out for testing; the held-out
            count is ``int(n_samples * test_size)``.

    Returns:
        ``(x_train, y_train, x_test, y_test)``, each a list with one entry per
        sample (feature vectors for x, labels for y). The test samples are the
        last ones after shuffling.

    Note:
        Shuffling uses the global ``random`` state, which is not seeded here,
        so the split differs between runs.
    """
    lexicon = create_lexicon()
    samples = sample_handling(pos, lexicon, [1,0]) + sample_handling(neg, lexicon, [0,1])
    random.shuffle(samples)

    # Split the pairs with plain lists. Packing the ragged [vector, label]
    # pairs into np.array() depends on implicit object-array creation, which
    # newer NumPy versions refuse.
    x_all = [features for features, _ in samples]
    y_all = [label for _, label in samples]

    testing_size = int(len(samples) * test_size)
    # Use an explicit split index: with testing_size == 0 the slices
    # [:-0] / [-0:] would yield an empty training set and a full test set.
    split = len(samples) - testing_size

    x_train = x_all[:split]
    y_train = y_all[:split]

    x_test = x_all[split:]
    y_test = y_all[split:]

    return x_train, y_train, x_test, y_test

In [9]:
# Build the data set once, holding out a 0.1 fraction of the samples for testing.
x_train, y_train, x_test, y_test = create_feature_sets_and_labels(posids, negids, test_size = 0.1)

2531
1000
1000


In [10]:
# Sanity check: length of one training feature vector.
len(x_train[1])

2531

In [11]:
# Hidden-layer widths. NN_model reads only n_nodes_1 and n_nodes_2;
# n_nodes_3 and n_nodes_4 are not referenced by the cells visible here.
n_nodes_1 = 2500
n_nodes_2 = 1000
n_nodes_3 = 500
n_nodes_4 = 500

In [12]:
# Width of the network's output layer (one unit per class).
n_classes = 2
# Number of training samples fed to the optimizer per step.
batch_size = 100

In [13]:
# Input placeholder, height x width: any number of rows, each as wide as one feature vector.
x = tf.placeholder('float',[None, len(x_train[0])])
# Label placeholder; no shape is specified for it.
y = tf.placeholder('float')

In [14]:
def NN_model(data):
    """Build a feed-forward network with two ReLU hidden layers.

    Layer sizes come from the notebook globals: the input width is
    ``len(x_train[0])``, the hidden widths are ``n_nodes_1`` and ``n_nodes_2``,
    and the output width is ``n_classes``. All weights and biases are
    ``tf.Variable``s initialised with ``tf.random_normal``.

    Args:
        data: input tensor multiplied by the first weight matrix.

    Returns:
        The output tensor of the last layer (no activation applied).
    """
    def make_params(n_inputs, n_outputs):
        # One layer's parameters: weight matrix first, then bias vector.
        weights = tf.Variable(tf.random_normal([n_inputs, n_outputs]))
        biases = tf.Variable(tf.random_normal([n_outputs]))
        return weights, biases

    # Parameters are created for hidden layer 1, hidden layer 2, then the output layer.
    w_hidden_1, b_hidden_1 = make_params(len(x_train[0]), n_nodes_1)
    w_hidden_2, b_hidden_2 = make_params(n_nodes_1, n_nodes_2)
    w_out, b_out = make_params(n_nodes_2, n_classes)

    # hidden_1 = relu(data * weights_1 + biases_1)
    activation_1 = tf.nn.relu(tf.add(tf.matmul(data, w_hidden_1), b_hidden_1))

    # hidden_2 = relu(hidden_1 * weights_2 + biases_2)
    activation_2 = tf.nn.relu(tf.add(tf.matmul(activation_1, w_hidden_2), b_hidden_2))

    # output = hidden_2 * weights_output + biases_output
    return tf.matmul(activation_2, w_out) + b_out
    

In [15]:
def train_NN(x):
    """Train the network on the training set and print the test accuracy.

    Builds the model with ``NN_model(x)``, minimises the mean softmax
    cross-entropy against the global placeholder ``y`` with Adam for 20
    epochs, feeding ``x_train`` / ``y_train`` in slices of ``batch_size``,
    then evaluates accuracy on ``x_test`` / ``y_test``.

    Args:
        x: input placeholder; used both to build the model and as the
            feed_dict key for the feature batches.

    Side effects:
        Prints the summed batch loss after every epoch and the final accuracy.
    """
    predict = NN_model(x)
    # softmax normalises the output-layer values so that they sum to 1;
    # softmax_cross_entropy_with_logits computes the multi-class error
    #   distance(softmax(outputs), labels) = -sum(labels * log(softmax(outputs)))
    # which is the quantity to minimise.
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = predict, labels = y))

    # Optimizer (an improved stochastic gradient descent).
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    # Accuracy metric: fraction of rows whose arg-max prediction matches the label.
    correct = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct,'float'))

    epochs = 20

    with tf.Session() as session:
        # Initialise every variable in the graph. The session must not be
        # closed before use; the `with` block closes it on exit.
        session.run(tf.global_variables_initializer())

        for epoch in range(epochs):
            epoch_loss = 0

            # Optimise the error on each consecutive slice of the training set.
            for start in range(0, len(x_train), batch_size):
                end = start + batch_size
                epoch_x = np.array(x_train[start:end])
                epoch_y = np.array(y_train[start:end])
                _, c = session.run([optimizer, cost], feed_dict={x:epoch_x, y:epoch_y})
                epoch_loss += c
            print('epoch', epoch, 'complited out of', epochs, 'loss:', epoch_loss)

        # Evaluated inside the `with` block so the session is still the default one.
        print( 'accuracy:', accuracy.eval({x:x_test, y:y_test}))
            
# Build the graph on the input placeholder, train, and print the test accuracy.
train_NN(x)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
epoch 0 complited out of 20 loss: 137045.237793
epoch 1 complited out of 20 loss: 45687.5255127
epoch 2 complited out of 20 loss: 16427.8270264
epoch 3 complited out of 20 loss: 8195.83482361
epoch 4 complited out of 20 loss: 6117.39791107
epoch 5 complited out of 20 loss: 5853.35428572
epoch 6 complited out of 20 loss: 10751.1024132
epoch 7 complited out of 20 loss: 4911.11160851
epoch 8 complited out of 20 loss: 23312.4387531
epoch 9 complited out of 20 loss: 41299.6495662
epoch 10 complited out of 20 loss: 52579.8607895
epoch 11 complited out of 20 loss: 16431.1257191
epoch 12 complited out of 20 loss: 3527.59813309
epoch 13 complited out of 20 loss: 142.419630051
epoch 14 complited out of 20 loss: 0.0
epoch 15 complited out of 20 loss: 0.0
epoch 16 complited out of 20 loss: 0.0
epoch 17 complited out of 20 loss: 0.0
epoch 18 complited out of 20 loss: 0.0
epoch 19 complited out of 20 loss: 0.0
accuracy: 0.805
