In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from Preprocessing.to_embedding import WordEmbedding
from Preprocessing.data_format import formatting
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
data = formatting("phase1_movie_reviews-train.csv")
y = data[['polarity']]

In [3]:
embedding = WordEmbedding(num_features = 300)
WordEmbedding.fit(embedding, data['reviewText'])

<gensim.models.word2vec.Word2Vec at 0x180baaba8>

In [4]:
WordEmbedding.size(embedding)

Total number of words in the vocabulary:  (54844, 300)


In [5]:
#Save word embedding to dataframe
#train_embeddings = WordEmbedding.to_pd(embedding, data['reviewText'])

#Save Save embeddings to file
WordEmbedding.to_file(embedding)

## 1. Extract training Word2vec Embeddings from file

In [6]:
embeddings_index = {}
f = open(os.path.join('', 'trained_embedding_word2vec.txt'), encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close

<function TextIOWrapper.close()>

## 2. Vectorize text data

In [7]:
#Basic Vectorization of data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['reviewText'])
sequences = tokenizer.texts_to_sequences(data['reviewText'])

#pad sequences
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

max_length = max([len(s) for s in data['reviewText'].values])
review_pad = pad_sequences(sequences, maxlen = max_length)
sentiment = data['polarity'].values
print('Shape of review tensor', review_pad.shape)
print('Shape of sentiment tensor', sentiment.shape)

Found 190279 unique tokens.
Shape of review tensor (90000, 5745)
Shape of sentiment tensor (90000,)


## 3. Create word vectors with the loaded word2vec model

In [8]:
embedding_size = 300 #number of feature weights in embeddings

num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_size))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector 

## 4. Shaping train/dev data

In [9]:
indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_val_samples = int(0.2 * review_pad.shape[0])

sentiment = pd.get_dummies(sentiment)

X_train = review_pad[:-num_val_samples]
y_train = sentiment[:-num_val_samples]
X_test = review_pad[-num_val_samples:]
y_test = sentiment[-num_val_samples:]

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of X_test:', X_test.shape)
print('Shape of X_test:', X_test.shape)

Shape of X_train: (72000, 5745)
Shape of y_train: (72000, 2)
Shape of X_test: (18000, 5745)
Shape of X_test: (18000, 5745)


In [13]:
#data = data[['year']]
#data = pd.concat([data, train_embeddings], axis=1, sort=False)

#y = pd.get_dummies(y)

#X_train, X_test, y_train, y_test = train_test_split(data, y, test_size = 0.20, random_state = 42)

## 5. Define model

In [107]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

# Define Model
model = Sequential()
embedding_layer = Embedding(num_words,
                            embedding_size,
                            embeddings_initializer = Constant(embedding_matrix),
                            input_length = max_length,
                            trainable = False)

model.add(embedding_layer)
model.add(GRU(units = 32, dropout = 0.2 , recurrent_dropout = 0.2))
model.add(Dense(2, activation = 'sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [108]:
model.fit(X_train, y_train, batch_size = 128, epochs = 25, validation_data = (X_test, y_test), verbose = 2)

Train on 72000 samples, validate on 18000 samples
Epoch 1/25


KeyboardInterrupt: 