In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')
from scipy.stats import norm, skew
import numpy as np
import seaborn as sns

import keras
import os, shutil

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Word embedding

In [3]:
from keras.layers import Embedding

# Embedding layer: a vocabulary of 1000 possible tokens, each mapped to a
# 64-dimensional dense vector.
embedding_layer = Embedding(input_dim=1000, output_dim=64)

The Embedding layer takes as input a 2D tensor of integers of shape (samples, sequence_length).

It returns a 3D floating-point tensor of shape (samples, sequence_length, embedding_dimensionality).

Let's apply this to the IMDB dataset.

In [4]:
from keras.datasets import imdb
from keras import preprocessing

max_features = 10000  # vocabulary size: only the most common words are kept
maxlen = 20           # every review is padded/truncated to this many words

# Load the reviews; each one arrives as a list of integer word indices.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Turn the lists of integers into 2D tensors of shape (samples, maxlen).
x_train, x_test = (
    preprocessing.sequence.pad_sequences(reviews, maxlen=maxlen)
    for reviews in (x_train, x_test)
)

In [5]:
from keras.models import Sequential
from keras.layers import Flatten, Dense


# Build a small classifier on top of a learned embedding.
# The Embedding layer outputs activations of shape (samples, maxlen, 8).
# Its vocabulary size is `max_features` (the same value passed to
# imdb.load_data as num_words) instead of a repeated literal, so the two
# cannot drift apart.
model = Sequential()
model.add(Embedding(max_features, 8, input_length=maxlen))

# Flatten the 3D embedding output into a 2D tensor of shape (samples, maxlen * 8).
model.add(Flatten())

# A single sigmoid unit acts as the binary classifier at the end.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy',
              metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Train for 10 epochs, holding out 20% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Word embedding with pre-trained GloVe vectors

In [7]:
# Locate the raw IMDB dataset on disk.
# The location can be overridden with the IMDB_DIR environment variable so the
# notebook also runs on other machines; the original local path is the default.

import os

imdb_dir = os.environ.get(
    'IMDB_DIR',
    '/Users/Laurens/Documents/IE-BIGDATA/Term 3/Machine Learning III/imdb')
train_dir = os.path.join(imdb_dir, 'train')

In [8]:
# Read every training review into `texts` and record its sentiment in `labels`
# (0 = negative, 1 = positive); the two lists stay aligned index by index.
labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of `texts` identical on every run.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            # `with` closes the file even if reading or decoding fails.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Settings for the pre-trained embedding experiment.
max_words = 10000           # vocabulary size handed to the tokenizer
maxlen = 100                # sequence length used when padding the reviews
train_samples = 200         # number of samples kept for training
validation_samples = 10000  # number of samples kept for validation

# Build the vocabulary from the raw reviews, then encode every review as a
# list of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [10]:
word_index = tokenizer.word_index
print('found {} unique tokens.'.format(len(word_index)))

# Pad/truncate every encoded review to exactly `maxlen` entries.
data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('shape of data tensor:', data.shape)
print('shape of label tensor:', labels.shape)

# Shuffle before splitting: the reviews were read label by label (all 'neg'
# first, then all 'pos'), so an unshuffled slice would contain a single class.
# A fixed seed keeps the train/validation split identical on every run, which
# matters when only `train_samples` examples are used for training.
rng = np.random.RandomState(42)
indices = np.arange(data.shape[0])
rng.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Train/validation split: the first `train_samples` rows are used for
# training, the next `validation_samples` rows for validation.
x_train = data[:train_samples]
y_train = labels[:train_samples]
x_val = data[train_samples: train_samples + validation_samples]
y_val = labels[train_samples: train_samples + validation_samples]

found 88582 unique tokens.
shape of data tensor: (25000, 100)
shape of label tensor: (25000,)


In [13]:
# Parse the GloVe word-embedding file into a {word: vector} dictionary.
# The directory can be overridden with the GLOVE_DIR environment variable;
# the original local path is kept as the default.
glove_dir = os.environ.get('GLOVE_DIR', '/Users/Laurens/Downloads/glove.6B')

embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # First whitespace-separated field is the word, the rest are its
        # coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('found {} word vectors.'.format(len(embeddings_index)))

found 400000 word vectors.


In [14]:
# Build the GloVe embedding matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i. Rows for words without a GloVe vector stay zero.

embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    # Indices at or beyond max_words have no row in the matrix.
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [15]:
#defining the model for the word embedding
from keras.layers import Embedding, Flatten, Dense

# Embedding -> Flatten -> Dense(32, relu) -> Dense(1, sigmoid) classifier.
# The embedding layer is the first layer of the stack.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Load the GloVe matrix into the embedding layer (the model's first layer)
# and mark that layer as non-trainable so the pre-trained weights stay the
# same during training.
glove_layer = model.layers[0]
glove_layer.set_weights([embedding_matrix])
glove_layer.trainable = False

In [17]:
# Compile the model, train it on the small training split while validating on
# the held-out samples, then save the resulting weights to disk.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)
model.save_weights('pre_trained_glove_model.h5')

Train on 200 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
