# IMDB

In [1]:
import os 

imdb = '../datasets/aclImdb'
train_dir = os.path.join(imdb, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            labels.append(int(label_type == 'pos'))

## Tokenizing the data

In [4]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [5]:
maxlen = 100 # Max sentences length
training_samples = 200 # Only get first 200 samples to simulate a tiny training data
validation_samples = 10000
max_words = 1000 # Vocabulary size (The 1000 most frequent words)

In [6]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [7]:
words_index = tokenizer.word_index # Dictionary that maps words to tokens
print('Found {} unique tokens.'.format(len(words_index)))

Found 88582 unique tokens.


In [26]:
data = pad_sequences(sequences, maxlen=maxlen)

In [27]:
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


Shuffle tha data

In [28]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

Split in train and validation set

In [29]:
x_train = data[:training_samples]
y_train = labels[:training_samples]

x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

## Preprocessing the embeddings

In [30]:
glove_path = '../datasets/glove.6B.100d.txt'

embedding_index = {}
with open(glove_path) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = values[1:]
        embedding_index[word] = coefs
print('Found {} word vectors'.format(len(embedding_index)))

Found 400000 word vectors


Building the embedding matrix

In [31]:
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in words_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector:
            embedding_matrix[i] = embedding_vector

## Defining a model

In [32]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [33]:
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          100000    
_________________________________________________________________
flatten_3 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 420,065
Trainable params: 420,065
Non-trainable params: 0
_________________________________________________________________


### Loading glove embeddings

In [34]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [38]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

In [39]:
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))
model.save_weights('pre_trained_glove_mode.h5')

Train on 200 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Validate against test set

In [40]:
test_dir = os.path.join(imdb, 'test')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            labels.append(int(label_type == 'pos'))

In [41]:
sequence = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)

In [42]:
model.evaluate(x_test, y_test)



[1.529836594207287, 0.5008]