In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.layers import Embedding, Dense
from keras.datasets import imdb
from keras import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import os
import json
from google.colab import drive

In [98]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Задание 1. Загрузите данные. Преобразуйте текстовые файлы во внутренние структуры данных, которые используют индексы вместо слов.


In [0]:
data_dir = '/content/drive/My Drive/Bsuir - Big data/4 semester/MO/data/a7/'
imdb_data=pd.read_csv(data_dir + 'IMDB Dataset.csv')

In [160]:
imdb_data.shape

(50000, 2)

In [161]:
imdb_data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [162]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [0]:
labels = list(np.where(imdb_data.sentiment == 'positive', 1, 0))
texts = list(imdb_data.review)

In [0]:
review_length = 200 # cuts off reviews after 200 words
training_samples = 40000 # trains on 20,000 samples
validation_samples = 5000 # validates on 5000 samples
test_samples = 5000 # validates on 5000 samples
max_words = 20000 # only top 20,000 words
batch_size = 64

In [0]:
def tokenize_data(word_indexes=None):
  tokenizer = Tokenizer(num_words=max_words)
  tokenizer.fit_on_texts(texts)
  sequences = tokenizer.texts_to_sequences(texts)
  
  if word_indexes:
    tokenizer.word_index = word_indexes

  word_index = tokenizer.word_index

  print ('Found %s unique tokens.' % len(word_index))

  # add padding up to review length
  return pad_sequences(sequences, maxlen=review_length)

In [166]:
data = tokenize_data()

Found 124252 unique tokens.


In [0]:
labels = np.asarray(labels)
indices = np.arange(data.shape[0])

# shuffle and select training and validation sample
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]

x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

x_test = data[-test_samples:]
y_test = labels[-test_samples:]

In [168]:
x_train

array([[    0,     0,     0, ...,   409,     4,   614],
       [ 4659,   443,    48, ...,    41,     4,   156],
       [    0,     0,     0, ..., 10609,   723,   156],
       ...,
       [    0,     0,     0, ...,  4594, 14313,    15],
       [ 3987,   639,    16, ...,   443, 10851, 10228],
       [    0,     0,     0, ...,  3947,   541,  1138]], dtype=int32)

In [169]:
print('Shape of train X: %s' % (x_train.shape,))
print('Shape of train Y: %s' % (y_train.shape,))
print('Shape of validation X: %s' % (x_val.shape,))
print('Shape of validation Y: %s' % (y_val.shape,))
print('Shape of test X: %s' % (x_test.shape,))
print('Shape of test Y: %s' % (y_test.shape,))

Shape of train X: (40000, 200)
Shape of train Y: (40000,)
Shape of validation X: (5000, 200)
Shape of validation Y: (5000,)
Shape of test X: (5000, 200)
Shape of test Y: (5000,)


In [170]:
word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [171]:
x_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,    15,
         105,     4,     1,  1537,  5047,     1,    17,    13,   372,
         714, 13325,     9,     2,   110,   189,    99,   126,   451,
           4,    55,     2,   451,     4,   290,  9526, 14885,     6,
         138,     3,    78,   563,    10,   190,    12,    59,    58,
          26,    34,    78,     8,     1,   202,     1,  5584,    70,
          34,    78,    18,    33,    61,  1202,     1,   116,   516,
           4,     1,    17,    56,   658,    57,   190,    12,    15,
           3,  2165,   493,     9,  2034,    48,     6,    53,    16,
          12,    17,    13,    52,   561,   348,    10,    80,    21,
         377,     9,

#### Задание 2. Реализуйте и обучите двунаправленную рекуррентную сеть (LSTM или GRU). Какого качества классификации удалось достичь?


In [0]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_words, 128, input_length=review_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [173]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 2,658,945
Trainable params: 2,658,945
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [0]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=1,
         validation_data=[x_val, y_val])

In [0]:
score, acc = model.evaluate(x_test, y_test)
print("Model accuracy:", acc)

Model accuracy: 0.8932


#### Задание 3. Используйте индексы слов и их различное внутреннее представление (word2vec, glove). Как влияет данное преобразование на качество классификации?


In [0]:
! cp "/content/drive/My Drive/Bsuir - Big data/4 semester/MO/data/a7/glove.6B.100d.txt" data

In [0]:
# Load GloVe pretrained embeddings

embedding_dim = 100

embeddings_index = {}
f = open(os.path.join(data_dir, 'glove.6B.'+str(embedding_dim)+'d.txt'), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Word vectors: %s' % len(embeddings_index))
print('Embedding size: %s'% embedding_dim)

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Word vectors: 400000
Embedding size: 100


In [0]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=review_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

In [0]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [0]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [0]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=1,
          validation_data=[x_val, y_val])

Train on 40000 samples, validate on 5000 samples


<tensorflow.python.keras.callbacks.History at 0x7f648149ad68>

In [0]:
score, acc = model.evaluate(x_test, y_test)
print("Model accuracy:", acc)

Model accuracy: 0.864


#### Задание 4. Поэкспериментируйте со структурой сети (добавьте больше рекуррентных, полносвязных или сверточных слоев). Как это повлияло на качество классификации?


In [0]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_words, embedding_dim, input_length=review_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 200, 100)          2000000   
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 128)               84480     
_________________________________________________________________
dense_8 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_9 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 257       
Total params: 2,183,553
Trainable params: 2,183,553
Non-trainable params: 0
___________________________________________

In [0]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [0]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [0]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=1,
          validation_data=[x_val, y_val])

Train on 40000 samples, validate on 5000 samples


<tensorflow.python.keras.callbacks.History at 0x7f6480ad99e8>

In [0]:
score, acc = model.evaluate(x_test, y_test)
print("Model accuracy:", acc)

Model accuracy: 0.861


Добавление двух полносвязных слоев никак не повлияло на точность классификации.

#### Задание 5. Используйте предобученную рекуррентную нейронную сеть (например, DeepMoji или что-то подобное).

Так как эта модель обучалась на другом наборе слов, модель не будет работать с нашим словарем индексов. Поэтому необходимо скачать также словарь, с которым обучалась модель и переиндексировать наш корпус текстов.

In [0]:
with open(data_dir + "word-index.json", "r") as file:
  rnn_vocab = json.load(file)

In [118]:
rnn_data = tokenize_data(rnn_vocab)

Found 16191 unique tokens.


In [0]:
labels = np.asarray(labels)
indices = np.arange(rnn_data.shape[0])

# shuffle and select training and validation sample
np.random.shuffle(indices)
rnn_data = rnn_data[indices]
labels = labels[indices]

x_train = rnn_data[:training_samples]
y_train = labels[:training_samples]

x_val = rnn_data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

x_test = rnn_data[-test_samples:]
y_test = labels[-test_samples:]

Также необходимо заменить последний слой сети на слой с одним выходом и дообучить модель, заморозив остальные слои.

In [152]:
rnn_model = load_model(data_dir + 'train-embeddings-rnn-100-length.h5')

rnn_model._layers.pop()
rnn_model.add(Dense(1, activation='sigmoid'))

for rnn_layer in rnn_model.layers:
  rnn_layer.trainable = False

rnn_model.layers[-1].trainable = True
rnn_model.layers[-2].trainable = True
rnn_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 100)         1619200   
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_9 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 16192)             2088768   
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 16193     
Total params: 5,428,288
Trainable params: 3,758,528
Non-trainable params: 1,669,760
____________________________________

  sample_weight_mode: One of `None` or `"temporal"`.


In [0]:
rnn_model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [0]:
rnn_model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=[x_val, y_val])

In [158]:
score, accuracy = rnn_model.evaluate(x_test, y_test)
print("Model accuracy:", accuracy)

Model accuracy: 0.9231


Данная модель показывает точность 92%, что на 3% больше чем первая модель, обученная без использования word embeddings.