In [4]:
from __future__ import print_function

import re

import numpy as np
import tensorflow as tf
from keras.utils import to_categorical

## Load the dataset

In [5]:
# The archive stores the reviews as a pickled object array (each entry is a
# dict), which NumPy >= 1.16.3 refuses to read unless allow_pickle=True is
# given. Unpickling can execute arbitrary code, so only load trusted files.
f = np.load('shoes_list_of_review_dicts.npz', allow_pickle=True)

In [6]:
# Pull the review records out of the archive; each record is a dict keyed by
# fields such as 'review/text' and 'review/score'.
reviews_list = f['reviews_list']

In [7]:
# Show one raw record to see which fields are available.
print(reviews_list[0])

{'product/productId': 'B0009PK7KO', 'review/time': '1169769600', 'review/profileName': 'Fifi', 'review/text': '... but not enough sizes or colors. Fits true to size on my size 8-1/2 feet.Bottom soles are completely slick... needs some kind of texturing or tread to help prevent slipping.', 'product/title': "Caligarius Women's Acheta Pump,Black Calf,6 M", 'product/price': 'unknown', 'review/summary': 'Beautiful basic pump...', 'review/userId': 'A12O8IHB65BC1S', 'review/score': '4.0', 'review/helpfulness': '2/2'}


In [8]:
# Walk the records once, collecting the review body and the raw score string
# (e.g. '4.0') into two parallel lists.
texts = []
scores = []
for record in reviews_list:
    texts.append(record['review/text'])
    scores.append(record['review/score'])

In [9]:
# scores_list = []

# for i in range(len(scores)):
#     if scores[i] not in scores_list:
#         scores_list.append(scores[i])
        
# print(scores_list)

In [10]:
# Tally how many reviews carry each raw score string.
scores_dict = {}
for score in scores:
    scores_dict[score] = scores_dict.get(score, 0) + 1

# Every review contributes exactly one score, so the total is the list length.
count = len(scores)

# Report each score together with its share of all reviews.
for key, occurrences in scores_dict.items():
    print(key, occurrences/count)

1.0 0.06256845107559564
2.0 0.044106218114943946
5.0 0.6579074939019229
3.0 0.0721843042805808
4.0 0.1632335326269567


In [11]:
# clean up the corpus 

def filter_text_as_sentence_list(text, end="END"):
    """Split ``text`` on '.' and return its non-blank sentences, each tagged with ``end``.

    Every piece between periods is stripped of surrounding whitespace; pieces
    that are empty after stripping are skipped. Each remaining sentence is
    returned as ``"<sentence> <end>"``.
    """
    stripped_pieces = (piece.strip() for piece in text.split('.'))
    return [sentence + " " + end for sentence in stripped_pieces if sentence != '']

def filter_text(text, end="END"):
    """Return ``text`` as a single string in which every sentence ends with ``end``.

    The text is split on '.', blank pieces are dropped, and each remaining
    sentence is written as ``"<sentence> <end>"``; the sentences are then
    joined with single spaces.
    """
    # Delegate to the sentence splitter instead of repeating its loop, so the
    # list form and the joined form can never drift apart.
    return ' '.join(filter_text_as_sentence_list(text, end=end))

def convert_review_as_sentence(review, end="END"):
    """Split an ``end``-marked review back into its list of sentences.

    Parameters:
        review: string in which every sentence is followed by " <end>".
        end: the sentence marker; defaults to "END".

    Returns:
        The sentences with the marker removed and surrounding whitespace
        stripped. Blank pieces are dropped.
    """
    # Only treat " <end>" as a marker when it is a whole token, i.e. followed
    # by whitespace or the end of the string. A plain substring split would
    # cut words such as "ENDING" or "ENDLESS" in half.
    marker = ' ' + re.escape(end) + r'(?=\s|$)'
    pieces = (piece.strip() for piece in re.split(marker, review))
    # Test the stripped piece so whitespace-only fragments cannot come out as ''.
    return [piece for piece in pieces if piece]

def convert_reviews_to_sentences(texts):
    """Apply ``convert_review_as_sentence`` to every review and return the results as a list."""
    return list(map(convert_review_as_sentence, texts))

In [12]:
# Rewrite every review so that each sentence is closed by the END marker.
filtered_texts = list(map(filter_text, texts))

In [13]:
# Scores arrive as strings such as '4.0', so go through float to reach an int.
scores = list(map(lambda raw_score: int(float(raw_score)), scores))
# One-hot encode the integer scores.
# NOTE(review): with scores in 1..5 and no explicit num_classes, Keras is
# expected to emit 6 columns, with column 0 never set. Confirm downstream
# models expect that width.
labels = to_categorical(np.asarray(scores))

In [14]:
# Sanity check: number of reviews, and the first review after END-marking.
print(len(filtered_texts))
print(filtered_texts[0])

389877
but not enough sizes or colors END Fits true to size on my size 8-1/2 feet END Bottom soles are completely slick END needs some kind of texturing or tread to help prevent slipping END


In [15]:
# Sentence-level view of the corpus: one list of sentence strings per review,
# recovered by splitting each END-marked review on its markers.
# Only the hierarchical attention model needs this structure.
filtered_texts_sentences = convert_reviews_to_sentences(filtered_texts)

In [16]:
# Show the sentence list recovered for the first review.
print(filtered_texts_sentences[0])

['but not enough sizes or colors', 'Fits true to size on my size 8-1/2 feet', 'Bottom soles are completely slick', 'needs some kind of texturing or tread to help prevent slipping']


## Classification model (Keras)

In [17]:
import os
import sys

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding

import keras.backend as K

In [18]:
# Width of each word vector; it has to match the dimensionality of the
# pre-trained embedding file that is loaded.
embedding_dim = 100
# Number of token ids kept per padded sequence. It is used both for whole
# reviews and for the individual sentences of the hierarchical model.
max_sequence_length = 100    # the longest tokenized sequence in the corpus was measured as 2043
# Vocabulary cap handed to the Tokenizer (most frequent words are kept).
max_num_words = 20000

### Data preprocessing

In [19]:
# Build the lookup from each word to its pre-trained GloVe vector:
# embeddings_index[word] = coefficient vector as a float32 np.array

BASE_DIR = ''
GLOVE_DIR = BASE_DIR + 'glove.6B/'

embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids decode errors on platforms whose default is not
# UTF-8. The handle gets its own name so it no longer overwrites `f`, the
# dataset archive opened earlier.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [20]:
# Fit the vocabulary on the END-marked reviews, then map each review to a
# sequence of integer word ids, capped by max_num_words.
# NOTE(review): the default Tokenizer lowercases text, so the END marker
# presumably shares an id with the ordinary word "end". Confirm this is acceptable.
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(filtered_texts)
sequences = tokenizer.texts_to_sequences(filtered_texts)

# Word -> integer id mapping learned during fitting (ids start at 1 in Keras).
word_index = tokenizer.word_index

# Bring every review to exactly max_sequence_length ids.
# NOTE(review): with Keras defaults both padding and truncation happen at the
# front of the sequence. Confirm that is intended.
data = pad_sequences(sequences, maxlen=max_sequence_length)

In [21]:
# print(sequences[19])

In [22]:
# print(word_index['END'])

In [23]:
# maxlength = 0
# for seq in sequences:
#     if len(seq) > maxlength:
#         maxlength = len(seq)
# print(maxlength)

In [24]:
# print(data[20])

In [25]:
# Hierarchical attention model only: tokenize each review sentence by
# sentence, giving one list of id sequences per review.

sequences_sentences = list(map(tokenizer.texts_to_sequences, filtered_texts_sentences))

In [26]:
# Show the per-sentence id sequences of the first review.
print(sequences_sentences[0])

[[20, 30, 194, 310, 48, 198], [368, 188, 6, 33, 19, 13, 33, 193, 149, 104, 38], [357, 250, 11, 628, 1687], [618, 102, 421, 9, 48, 892, 6, 549, 2080, 1113]]


In [27]:
# Despite its name, this is the largest number of sentences found in any
# single review. The leading 0 keeps the result at 0 when there are no reviews.
max_sentence_length = max([0] + [len(review_sequences) for review_sequences in sequences_sentences])
print(max_sentence_length)

107


In [28]:
# Pad each sentence of each review to max_sequence_length ids, giving one
# 2-D array per review with one row per sentence.
data_sentences = []
for sentence_sequences in sequences_sentences:
    data_sentences.append(pad_sequences(sentence_sequences, maxlen=max_sequence_length))


In [29]:
# print(type(data_sentences[0]))
# Left-pad every review with all-zero sentence rows so that each entry has
# exactly max_sentence_length rows of max_sequence_length ids.
data_sentences_padded = []
for data_sentence in data_sentences:
    missing_rows = max_sentence_length - len(data_sentence)
    # A review that already has the maximum number of sentences needs zero
    # extra rows and must still be kept. Skipping it would leave this list
    # shorter than data_sentences and out of step with the labels.
    # The padding uses the same dtype as the sentence rows, so every entry
    # shares one dtype.
    zero_rows = np.zeros((missing_rows, max_sequence_length), dtype=data_sentence.dtype)
    data_sentences_padded.append(np.concatenate((zero_rows, data_sentence), axis=0))

In [30]:
# print(data_sentences[0][0])
# Number of sentence rows in the first review's padded array.
print(len(data_sentences[0]))

4


In [31]:
# Shuffle the corpus and split it into training and validation sets.

validation_split = 0.2
split_seed = 42  # fixed so that Restart & Run All reproduces the same shuffle

# Shuffle with a private, seeded generator rather than the global NumPy state.
indices = np.arange(data.shape[0])
np.random.RandomState(split_seed).shuffle(indices)
data = data[indices]
labels = labels[indices]
# The sentence-level sequences are saved next to `data` and `labels`, so they
# must follow the same permutation or they would be paired with the wrong labels.
# NOTE(review): data_sentences and data_sentences_padded are not reordered
# here. Reorder them with `indices` too if they are used after this point.
sequences_sentences = [sequences_sentences[i] for i in indices]
num_validation_samples = int(validation_split * data.shape[0])

# Split at an explicit positive index. Negative slicing would break when
# num_validation_samples is 0 (`[:-0]` is empty and `[-0:]` is everything).
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [32]:
# Prepare the embedding matrix: row i holds the pre-trained vector of the word
# whose tokenizer id is i. Words without a pre-trained vector keep an all-zero row.

# Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V words
# needs V + 1 rows. Without the + 1, a vocabulary smaller than max_num_words
# would make the highest id index past the end of the matrix. When the
# vocabulary has at least max_num_words entries the result is unchanged.
num_words = min(max_num_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        # Ids beyond the cap are never produced by the tokenizer's sequences.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Save constructed data and embedding matrix

In [36]:
# Persist the prepared arrays and the vocabulary for the attention model and
# the other models.
import pickle

# sequences_sentences is ragged (reviews differ in sentence count and
# sentences differ in length). Recent NumPy versions refuse to build an array
# from such a nested list implicitly, so store it explicitly as a 1-D object
# array with one list of sentence sequences per review. Reading it back
# requires np.load(..., allow_pickle=True).
sentences_array = np.empty(len(sequences_sentences), dtype=object)
for position, review_sequences in enumerate(sequences_sentences):
    sentences_array[position] = review_sequences

np.savez("data_and_embedding100", num_words=num_words, embedding_dim=embedding_dim,
         max_sequence_length=max_sequence_length, data=data,
         labels=labels, embedding_matrix=embedding_matrix,
         max_sentence_length=max_sentence_length,
         sequences_sentences=sentences_array)

# Word -> id mapping, needed later to interpret or extend the sequences.
with open('word2index.pickle', 'wb') as handle:
    pickle.dump(word_index, handle)