In [79]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from Preprocessing.to_embedding import WordEmbedding
from Preprocessing.data_format import formatting
from Preprocessing.helper_functions import import_embedding, embedding_matrix_word2vec
from sklearn.model_selection import train_test_split

In [80]:
data = formatting("../phase1_movie_reviews-train.csv")
data = data[:round(len(data)*.2)]
y = pd.get_dummies(data['polarity'])
X_train, X_dev, y_train, y_dev = train_test_split(data, y, test_size = 0.10, random_state=42)

X_train_nlp, X_dev_nlp = X_train['reviewText'], X_dev['reviewText']
X_train_meta, X_dev_meta = X_train['year'], X_dev['year']
embedding_size = 300 #number of feature weights in embeddings
max_len = 400

In [81]:
print(X_train_nlp.head())
print(X_train_meta.head())

17223    [like, a, great, many, of, the, dvds, mgm, see...
9853     [i, loved, the, bbc's, documentary, series, qu...
14956    [i, am, not, going, to, compare, this, to, bon...
4509     [i, myself, recently, bought, squirm, on, vide...
14185    [this, review, is, for, the, 2005, twentieth, ...
Name: reviewText, dtype: object
17223    2002
9853     2002
14956    2002
4509     2004
14185    2005
Name: year, dtype: int64


In [82]:
embedding = WordEmbedding(num_features = embedding_size)

WordEmbedding.fit(embedding, X_train_nlp)
WordEmbedding.size(embedding)

Total number of words in the vocabulary:  (24940, 300)


In [83]:
#Save word embedding to dataframe
#train_embeddings = WordEmbedding.to_pd(embedding, X_train)

#Save Save embeddings to file
WordEmbedding.to_file(embedding)

In [84]:
embeddings_index = import_embedding('trained_embedding_word2vec.txt')

## 2. Vectorize text data

In [85]:
#Basic Vectorization of data
#Review data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_nlp)
word_index = tokenizer.word_index

def vectorize(data, tokenizer ,max_len):
    sequences = tokenizer.texts_to_sequences(data)
    padding = pad_sequences(sequences, maxlen = max_len)
    
    return padding

X_train_nlp = vectorize(X_train_nlp, tokenizer , max_len)
X_dev_nlp = vectorize(X_dev_nlp, tokenizer, max_len)

print('Found %s unique tokens.' % len(word_index))
print('Shape of train tensor', X_train_nlp.shape)
print('Shape of dev tensor', X_dev_nlp.shape)

Found 82867 unique tokens.
Shape of train tensor (16200, 400)
Shape of dev tensor (1800, 400)


## 3. Create word vectors with the loaded word2vec model

In [86]:
embedding_matrix, num_words = embedding_matrix_word2vec(word_index, embedding_size, embeddings_index)

### Check train/dev sets

In [87]:
print('Shape of X_train:', X_train_nlp.shape)
print('Shape of y_train:', y_train_nlp.shape)
print('Shape of X_test:', X_dev_nlp.shape)
print('Shape of y_test:', y_dev_nlp.shape)

Shape of X_train: (16200, 400)
Shape of y_train: (16200, 2)
Shape of X_test: (1800, 400)
Shape of y_test: (1800, 2)


In [88]:
y

Unnamed: 0,negative,positive
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1
5,0,1
6,0,1
7,1,0
8,1,0
9,0,1


## 5. Define model

In [89]:
# import keras

# config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 56} ) 
# config.gpu_options.allow_growth = True
# sess = tf.Session(config=config) 
# keras.backend.set_session(sess)

# from keras import backend as K
# K.tensorflow_backend._get_available_gpus()
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12972935982229707506
]


In [76]:
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional, GlobalMaxPool1D, Dropout, Input, concatenate
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras import regularizers

# Define Model
# model = Sequential()
# model.add(Embedding(num_words, 
#                     embedding_size,
#                     input_length = max_len,
#                      dropout=0.2))
# model.add(Bidirectional(LSTM(128, return_sequences = True)))
# model.add(GlobalMaxPool1D())
# model.add(Dense(20, activation="relu"))
# model.add(Dropout(0.05))
# model.add(Dense(2, activation="sigmoid"))


nlp_input = Input(shape=(max_len,), name='nlp_input')
meta_input = Input(shape=(1,), name='meta_input')
emb = Embedding(output_dim=embedding_size, input_dim=num_words, input_length=max_len)(nlp_input)
nlp_out = Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3, kernel_regularizer=regularizers.l2(0.01)))(emb)
x = concatenate([nlp_out, meta_input])
x = Dense(20, activation='relu')(x)
x = Dense(2, activation='sigmoid')(x)
model = Model(inputs=[nlp_input , meta_input], outputs=[x])

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [78]:
X_sets = [X_train_nlp, X_train_meta]
dev_sets = [X_dev_nlp, X_dev_meta]
history = model.fit(X_sets, y_train, batch_size=256, validation_data =(dev_sets, y_dev))

Train on 16200 samples, validate on 1800 samples
Epoch 1/1


In [12]:
history = model.fit(X_train, y_train, batch_size = 256, epochs = 4, validation_data = (X_dev, y_dev), verbose = 1)

Train on 81000 samples, validate on 9000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
 1792/81000 [..............................] - ETA: 16:19 - loss: 0.0449 - acc: 0.9860

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 3, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 3, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Development Accuracy:  {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_dev, y_dev, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)