In [2]:
from __future__ import print_function

from collections import Counter

import numpy as np
import tensorflow as tf
from keras.utils import to_categorical

## Load the dataset

In [3]:
# The archive stores the reviews as an object array of dicts, which NumPy can
# only restore through pickle; current NumPy versions refuse unless
# allow_pickle=True is passed.
# SECURITY: unpickling can execute arbitrary code - only load this file from
# a trusted source.
f = np.load('shoes_list_of_review_dicts.npz', allow_pickle=True)

In [4]:
reviews_list = f['reviews_list']

In [5]:
print(reviews_list[0])

{'product/productId': 'B0009PK7KO', 'review/userId': 'A12O8IHB65BC1S', 'product/title': "Caligarius Women's Acheta Pump,Black Calf,6 M", 'review/helpfulness': '2/2', 'review/profileName': 'Fifi', 'review/score': '4.0', 'review/time': '1169769600', 'review/text': '... but not enough sizes or colors. Fits true to size on my size 8-1/2 feet.Bottom soles are completely slick... needs some kind of texturing or tread to help prevent slipping.', 'product/price': 'unknown', 'review/summary': 'Beautiful basic pump...'}


In [7]:
# Pull the review body and the star rating out of every review dict.
texts = [review['review/text'] for review in reviews_list]
scores = [review['review/score'] for review in reviews_list]
# Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator that
# has no len() and cannot be sliced or re-read after its first use.
labelled_corpus = list(zip(texts, scores))

In [25]:
# number of review samples 
print(len(texts))

389877


In [87]:
# Collect the distinct score values in order of first appearance.
scores_list = []

for score in scores:
    if score in scores_list:
        continue
    scores_list.append(score)

print(scores_list)

['4.0', '2.0', '5.0', '3.0', '1.0']


In [89]:
# The ratings are strings such as '4.0' (see the distinct values printed
# above); go through float() first because int('4.0') would raise.
# NOTE: this rebinds `scores`, so cells above that read `scores` see integers
# if they are re-run after this one.
scores = [int(float(score)) for score in scores]
print(scores[:10])

[4, 2, 5, 5, 5, 3, 5, 5, 4, 5]


In [92]:
# One-hot encode the integer star ratings.
# NOTE: the ratings observed above are 1-5 and the printed rows have six
# columns, so column 0 is never set for any review.
labels = to_categorical(np.asarray(scores))
print(len(labels))
print(labels[:10])

389877
[[ 0.  0.  0.  0.  1.  0.]
 [ 0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  0.  0.  1.]]


## Clean the dataset

In [31]:
# clean up the corpus 

def filter_text_as_sentence_list(text, end="END"):
    """Split ``text`` into sentences and tag each one with an end marker.

    The text is cut at every '.' character; each piece is stripped of
    surrounding whitespace and pieces that are empty after stripping are
    dropped (so runs like "..." produce nothing).

    Args:
        text: The raw review text.
        end: Marker appended to every sentence, separated by one space.

    Returns:
        A list of strings of the form "<sentence> <end>".
    """
    stripped_pieces = (piece.strip() for piece in text.split('.'))
    return [piece + " " + end for piece in stripped_pieces if piece != '']


def filter_text(text, end="END"):
    """Return the end-tagged sentences of ``text`` joined into one string.

    Delegates to ``filter_text_as_sentence_list`` so both helpers always
    apply the same splitting rule.

    Args:
        text: The raw review text.
        end: Marker appended to every sentence, separated by one space.

    Returns:
        A single string with the tagged sentences separated by one space
        (the empty string when ``text`` contains no sentence).
    """
    return ' '.join(filter_text_as_sentence_list(text, end=end))

In [30]:
print(filter_text(texts[0]))

['but not enough sizes or colors END', 'Fits true to size on my size 8-1/2 feet END', 'Bottom soles are completely slick END', 'needs some kind of texturing or tread to help prevent slipping END']


In [19]:
# filter_text returns one joined string, so slicing its result would print the
# first ten characters. The per-sentence list comes from
# filter_text_as_sentence_list, which is what the [:10] slice expects.
sentences_list = filter_text_as_sentence_list(texts[0])
print(sentences_list[:10])

['but not enough sizes or colors END', 'Fits true to size on my size 8-1/2 feet END', 'Bottom soles are completely slick END', 'needs some kind of texturing or tread to help prevent slipping END']


In [20]:
# One list of END-tagged sentences per review.
all_sentences_list = [filter_text_as_sentence_list(review_text) for review_text in texts]

# Flatten the per-review lists into a single list of sentences.
all_sentences = []
for review_sentences in all_sentences_list:
    all_sentences.extend(review_sentences)

# Whitespace-tokenise every sentence.
all_sentences_as_wordslist = [sentence.split() for sentence in all_sentences]

In [24]:
# number of total reviews -> consistency confirmed
print(len(all_sentences_list))

# number of total sentences -> a lot of short sentences 
print(len(all_sentences_as_wordslist))

389877
1696932


## Divide the dataset

In [None]:
from random import sample

def divide_dataset(labelled_doc, num_train, num_valid, num_test=None, shuffle=False):
    """Split a sequence into train / validation / test parts.

    The training part is the first ``num_train`` items, the validation part
    is the next ``num_valid`` items and the test part is the last
    ``num_test`` items. When the three sizes add up to ``len(labelled_doc)``
    the parts are contiguous and cover the whole sequence.

    Args:
        labelled_doc: A sequence supporting ``len()`` and slicing.
        num_train: Number of training items.
        num_valid: Number of validation items.
        num_test: Number of test items; ``None`` means "everything left
            after the training and validation items".
        shuffle: When true, split a shuffled copy (a list produced by
            ``random.sample``) instead of the sequence itself.

    Returns:
        A ``(train, valid, test)`` tuple of slices.

    Raises:
        ValueError: If a size is negative or the sizes exceed the number of
            available items.
    """
    total = len(labelled_doc)
    if num_test is None:
        num_test = total - num_train - num_valid

    if min(num_train, num_valid, num_test) < 0:
        raise ValueError(
            "split sizes must be non-negative, got train=%d valid=%d test=%d"
            % (num_train, num_valid, num_test))
    if num_train + num_valid + num_test > total:
        raise ValueError(
            "requested %d items but only %d are available"
            % (num_train + num_valid + num_test, total))

    if shuffle:
        temp = sample(labelled_doc, total)
    else:
        temp = labelled_doc

    # Use explicit positive bounds: slicing with -num_test misbehaves when
    # num_test is 0 (temp[-0:] is the whole sequence and temp[a:-0] is empty).
    valid_end = num_train + num_valid
    test_start = total - num_test
    return temp[:num_train], temp[num_train:valid_end], temp[test_start:]

In [None]:
# NOTE(review): the first argument was missing here (a syntax error). The
# (text, score) pairs are assumed to be the corpus meant to be split - confirm,
# and confirm the 1585/250/250 sizes, which are far smaller than the corpus.
train_reviews, valid_reviews, test_reviews = divide_dataset(list(zip(texts, scores)), 1585, 250, 250, shuffle=True)

print(len(train_reviews), len(valid_reviews), len(test_reviews))

## Words to indices

In [None]:
def get_most_common_words_list(sentences, num_words):
    """Return the ``num_words`` most frequent words across ``sentences``.

    Args:
        sentences: Iterable of sentences, each an iterable of words.
        num_words: How many words to keep (passed to ``Counter.most_common``).

    Returns:
        The words only, most frequent first. No unknown-word token is added
        here; reserve a slot for one separately if it is needed.
    """
    word_counts = Counter(word for sent in sentences for word in sent)
    return [word for word, _count in word_counts.most_common(num_words)]
    
def replace_unknown_token(sent_list, words_most_common, unknown_token="UNK"):
    """Replace every word that is not in the vocabulary by ``unknown_token``.

    Args:
        sent_list: Iterable of words (hashable, e.g. strings).
        words_most_common: Collection of vocabulary words.
        unknown_token: Replacement for out-of-vocabulary words.

    Returns:
        A new list with the same length and order as ``sent_list``.
    """
    # Build the lookup set once: testing membership against the list itself
    # rescans the whole vocabulary for every single word.
    known_words = set(words_most_common)
    return [word if word in known_words else unknown_token for word in sent_list]

def map_word_and_index(input_doc, top=10000):
    """Build word<->index lookup tables for the ``top`` most frequent words.

    Args:
        input_doc: Iterable of documents; element ``[0]`` of each document is
            iterated as sentences, and each sentence is iterated as words.
        top: Number of most frequent words to keep.

    Returns:
        ``(word2index, index2word)``. Indices start at 1 and follow
        descending frequency, so index 0 is never assigned to a word.
    """
    words_count_list = Counter(
        word for doc in input_doc for sent in doc[0] for word in sent
    ).most_common(top)

    # Fill both directions in one pass so the two tables cannot disagree.
    word2index = {}
    index2word = {}
    for index, (word, _count) in enumerate(words_count_list, 1):
        word2index[word] = index
        index2word[index] = word
    return word2index, index2word

def convert2words(doc):
    """Flatten a document (an iterable of sentences) into one list of words."""
    words = []
    for sent in doc:
        words.extend(sent)
    return words

def doc2index(doc, word2index, unknown_index=None):
    """Map every word of ``doc`` to a one-element list holding its index.

    Args:
        doc: Iterable of words.
        word2index: Mapping from word to integer index.
        unknown_index: Index to use for words missing from ``word2index``.
            With the default ``None`` a missing word raises ``KeyError``.
            A table that only holds the most frequent words will not cover
            every word, so pass an index here to tolerate the rest.

    Returns:
        A list with one ``[index]`` entry per word, in order.

    Raises:
        KeyError: If a word is missing and ``unknown_index`` is ``None``.
    """
    if unknown_index is None:
        return [[word2index[word]] for word in doc]
    return [[word2index.get(word, unknown_index)] for word in doc]

def convert_corpus(corpus, word2index):
    """Convert every document of ``corpus`` to word indices.

    Args:
        corpus: Iterable of documents; element ``[0]`` of each document is
            its list of sentences. Other elements are not used.
        word2index: Mapping from word to integer index.

    Returns:
        ``np.asarray`` over the per-document index lists.
        NOTE(review): documents of different lengths give a ragged input,
        which recent NumPy releases reject unless dtype=object is requested
        - confirm the NumPy version in use.
    """
    # Only the sentences are needed, so index them directly instead of first
    # building (words, label) tuples whose label was then discarded.
    return np.asarray([doc2index(convert2words(doc[0]), word2index) for doc in corpus])

In [None]:
# NOTE(review): `train_dc` is not defined in any cell shown in this notebook,
# so this line fails on a fresh kernel - confirm which training split was meant.
word2index, index2word = map_word_and_index(train_dc, 10000)

## Word embedding model (Word2Vec)

In [21]:
from gensim.models import Word2Vec

In [27]:
model_trained = Word2Vec(all_sentences_as_wordslist, min_count=10, size=100)

In [28]:
model_trained.save("word2vec_model_amazon_review_shoes")

In [None]:
model = Word2Vec.load("word2vec_model_amazon_review_shoes")
print(len(model.wv.vocab))

In [None]:
import os 
from gensim.models import Word2Vec

def build_word2vec_model(name, sentences=None, min_count=10, size=100):
    """Load the Word2Vec model stored at ``name``, training it first if absent.

    Args:
        name: Path of the saved model file.
        sentences: Tokenised sentences used for training; only needed when
            no file exists at ``name``.
        min_count: Forwarded to ``Word2Vec`` when training.
        size: Forwarded to ``Word2Vec`` when training.
            NOTE(review): gensim 4 renamed this keyword to ``vector_size`` -
            confirm the installed gensim version.

    Returns:
        The loaded or freshly trained (and saved) model.

    Raises:
        ValueError: If no saved model exists and ``sentences`` is ``None``.
    """
    if os.path.isfile(name):
        return Word2Vec.load(name)

    # Without sentences Word2Vec builds an untrained model; saving that under
    # the cache name would make every later call silently load an empty model.
    if sentences is None:
        raise ValueError(
            "no saved model at %r and no sentences given to train one" % (name,))

    model = Word2Vec(sentences, min_count=min_count, size=size)
    model.save(name)
    return model

In [None]:
embedding_model = build_word2vec_model('word2vec_model_amazon_review_shoes', sentences=all_sentences_as_wordslist, min_count=10, size=100)

## Classification model (TensorFlow)

## Classification model (Keras)

In [84]:
import os
import sys

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding

import keras.backend as K

In [47]:
embedding_dim = 100  # width of each word vector; the GloVe file read in this notebook is the 100d variant
max_sequence_length = 1000  # used as maxlen for pad_sequences and as the model's input length
max_num_words = 20000  # used as num_words for the Tokenizer and as the cap on embedding-matrix rows

### Data preprocessing

In [32]:
filtered_texts = [filter_text(text) for text in texts]
# print(filtered_texts[1])

In [54]:
# need to build index mapping words to their embeddings
# embeddings_index[word] = coefficient vector as np.array

BASE_DIR = ''
GLOVE_DIR = BASE_DIR + 'glove.6B/'

embeddings_index = {}
# The context manager closes the file even if parsing a line fails, and the
# dedicated handle name no longer overwrites `f` (the loaded .npz archive).
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # A blank line has no word to index.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [48]:
# Fit the tokenizer on the END-tagged review strings; max_num_words is the
# vocabulary cap handed to Keras.
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(filtered_texts)
sequences = tokenizer.texts_to_sequences(filtered_texts)

# word -> integer index mapping; read again when the embedding matrix is filled.
word_index = tokenizer.word_index

# Bring every review to max_sequence_length entries; the sample printed below
# shows the zeros being filled in at the front.
data = pad_sequences(sequences, maxlen=max_sequence_length)

In [49]:
print(sequences[20])

[12, 11, 13, 354, 15, 9, 43, 57, 1, 3, 83, 55, 12, 10, 2359, 4, 185, 14, 51, 28, 3, 78, 104, 60, 157, 1, 87, 3, 17, 14, 10, 126, 4, 8805, 1, 7, 11, 26, 29, 4, 65, 37, 109, 43, 79, 1, 3, 17, 683, 14, 32, 2639, 6, 690, 18, 52, 14, 71, 1, 3, 259, 12, 15, 694, 4, 47, 241, 125, 14, 1]


In [50]:
print(data[20])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0  

In [93]:
# split the data

validation_split = 0.2
split_seed = 0  # fixed seed so a fresh kernel always produces the same split

# Features and labels are permuted with the same index array, so they must
# describe the same number of reviews.
assert data.shape[0] == len(labels), "data and labels are out of sync"

indices = np.random.RandomState(split_seed).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
num_validation_samples = int(validation_split * data.shape[0])
# Split at an explicit position: slicing with -num_validation_samples would
# give an empty training set if the validation count ever rounded down to 0.
num_training_samples = data.shape[0] - num_validation_samples

x_train = data[:num_training_samples]
y_train = labels[:num_training_samples]
x_val = data[num_training_samples:]
y_val = labels[num_training_samples:]

In [55]:
# prepare embedding matrix and build the embedding layer

# Tokenizer indices start at 1 (0 is the padding value seen in `data`), so a
# vocabulary of N words needs N + 1 rows. Without the "+ 1" a vocabulary
# smaller than max_num_words would write one row past the end of the matrix.
num_words = min(max_num_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        # Beyond the vocabulary cap: no row exists for this word.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words without a GloVe vector keep their all-zero row.
        embedding_matrix[i] = embedding_vector

### Build the model 

In [60]:
# Embedding layer initialised from embedding_matrix; trainable=False keeps
# those weights out of training (the model summary lists them as non-trainable).
embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)

In [96]:
def mm(x):
    """Average ``x`` over axis 1.

    In the model built in this notebook this reduces the embedded sequence
    from (batch, 1000, 100) to (batch, 100), per the printed shapes.
    NOTE(review): every one of the 1000 positions is averaged, including the
    zero-padded ones - confirm that this is intended.
    """
    return tf.reduce_mean(x, axis=1)

# Input: one row of max_sequence_length integer word indices per review.
sequence_input = Input(shape=(max_sequence_length, ), dtype='int32')
print(sequence_input.shape)
# Look up the embedding vector of every position.
embedded_sequences = embedding_layer(sequence_input)
print(embedded_sequences.shape)
# Collapse the sequence axis by averaging the word vectors.
x = Lambda(mm)(embedded_sequences)
print(x.shape)
# Six softmax outputs, matching the six columns of the one-hot `labels`.
preds = Dense(6, activation='softmax')(x)
print(preds.shape)

# NOTE: this rebinds `model`, which an earlier cell used for the Word2Vec model.
model = Model(sequence_input, preds)

(?, 1000)
(?, 1000, 100)
(?, 100)
(?, 6)


In [97]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_19 (InputLayer)        (None, 1000)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 1000, 100)         2000000   
_________________________________________________________________
lambda_4 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 6)                 606       
Total params: 2,000,606.0
Trainable params: 606.0
Non-trainable params: 2,000,000.0
_________________________________________________________________


### Train the model

In [98]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [None]:
import time 
start_time = time.time()

model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val))

print("Training time: ", time.time() - start_time)

Train on 311902 samples, validate on 77975 samples
Epoch 1/10
 28032/311902 [=>............................] - ETA: 212s - loss: 1.5692 - acc: 0.6449  