In [8]:
from __future__ import print_function 
import numpy as np
import tensorflow as tf 
from keras.utils import to_categorical

## Load the dataset

In [9]:
# The archive holds Python dicts (an object array), which NumPy >= 1.16.3 refuses to
# unpickle unless allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
f = np.load('shoes_list_of_review_dicts.npz', allow_pickle=True)

In [10]:
# Pull the 'reviews_list' array out of the loaded .npz archive.
reviews_list = f['reviews_list']

In [11]:
# Peek at one record to see which fields each review dict carries.
print(reviews_list[0])

{'product/title': "Caligarius Women's Acheta Pump,Black Calf,6 M", 'review/text': '... but not enough sizes or colors. Fits true to size on my size 8-1/2 feet.Bottom soles are completely slick... needs some kind of texturing or tread to help prevent slipping.', 'review/summary': 'Beautiful basic pump...', 'review/score': '4.0', 'review/profileName': 'Fifi', 'review/time': '1169769600', 'product/productId': 'B0009PK7KO', 'product/price': 'unknown', 'review/userId': 'A12O8IHB65BC1S', 'review/helpfulness': '2/2'}


In [12]:
# Extract the free-text body and the rating (still a string such as '4.0') of every review.
texts = [entry['review/text'] for entry in reviews_list]
scores = [entry['review/score'] for entry in reviews_list]

In [13]:
# scores_list = []

# for i in range(len(scores)):
#     if scores[i] not in scores_list:
#         scores_list.append(scores[i])
        
# print(scores_list)

In [14]:
# clean up the corpus 

def filter_text_as_sentence_list(text, end="END"):
    """Split `text` into sentences and append an end-of-sentence marker to each.

    The text is split on '.', every piece is stripped of surrounding
    whitespace, and pieces that are empty after stripping (e.g. those
    produced by '...' or a trailing period) are dropped.

    Args:
        text: The raw review text.
        end: Marker token appended (after a single space) to every sentence.

    Returns:
        A list of strings of the form "<sentence> <end>", in original order.
    """
    pieces = (piece.strip() for piece in text.split('.'))
    return [piece + " " + end for piece in pieces if piece != '']

def filter_text(text, end="END"):
    """Return `text` as a single string with `end` marking every sentence end.

    This is the space-joined form of `filter_text_as_sentence_list`; it
    delegates to that function so the splitting rules live in one place.

    Args:
        text: The raw review text.
        end: Marker token appended (after a single space) to every sentence.

    Returns:
        The marked sentences joined by single spaces ('' if there are none).
    """
    return ' '.join(filter_text_as_sentence_list(text, end))

In [15]:
# Apply the sentence-marker cleanup to every review body.
filtered_texts = list(map(filter_text, texts))

In [16]:
# Ratings are strings such as '4.0'; go through float() first because int('4.0') raises.
scores = [int(float(raw_score)) for raw_score in scores]
# One-hot encode the integer ratings.
labels = to_categorical(np.asarray(scores))

## Classification model (Keras)

In [17]:
import os
import sys

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding

import keras.backend as K

In [18]:
# Width of each word vector; matches the 100-d GloVe file loaded below.
embedding_dim = 100
# Length every review is padded to by pad_sequences.
max_sequence_length = 1000
# Vocabulary cap passed to the Tokenizer and used to size the embedding matrix.
max_num_words = 20000

### Data preprocessing

In [19]:
# need to build index mapping words to their embeddings 
# embeddings_index[word] = coefficient vector as np.array

BASE_DIR = ''
GLOVE_DIR = BASE_DIR + 'glove.6B/'

# Each line of the GloVe file is "<word> <coef_1> ... <coef_n>".
embeddings_index = {}
# `with` guarantees the handle is closed even if a line fails to parse.
# NOTE(review): the file is read with the platform default encoding; if this only
# has to run on Python 3, pass encoding='utf-8' to open() -- confirm target version.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [20]:
# Build the vocabulary from the cleaned reviews, capped via num_words=max_num_words.
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(filtered_texts)
word_index = tokenizer.word_index

# Encode every review as a list of word indices, then pad them all to
# max_sequence_length (zeros are prepended, as the printed sample shows).
sequences = tokenizer.texts_to_sequences(filtered_texts)
data = pad_sequences(sequences, maxlen=max_sequence_length)

In [21]:
# Spot-check one encoded review (word indices, before padding).
print(sequences[20])

[12, 11, 13, 354, 15, 9, 43, 57, 1, 3, 83, 55, 12, 10, 2364, 4, 185, 14, 51, 28, 3, 78, 104, 60, 157, 1, 87, 3, 17, 14, 10, 126, 4, 8791, 1, 7, 11, 26, 29, 4, 65, 37, 109, 43, 79, 1, 3, 17, 683, 14, 32, 2638, 6, 690, 18, 52, 14, 71, 1, 3, 259, 12, 15, 694, 4, 47, 241, 125, 14, 1]


In [22]:
# Same review after pad_sequences: the row now starts with padding zeros.
print(data[20])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0  

In [23]:
# split the data 

validation_split = 0.2 
random_seed = 42  # fixed so that Restart & Run All reproduces the same split

# Shuffle samples and labels with the same permutation so they stay aligned.
rng = np.random.RandomState(random_seed)
indices = np.arange(data.shape[0])
rng.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(validation_split * data.shape[0])

# Slice with a positive boundary: data[:-0] would be EMPTY (and data[-0:] everything)
# if the validation set rounds down to zero samples.
num_train_samples = data.shape[0] - num_validation_samples

x_train = data[:num_train_samples]
y_train = labels[:num_train_samples]
x_val = data[num_train_samples:]
y_val = labels[num_train_samples:]

In [24]:
# prepare embedding matrix and build the embedding layer 

# Tokenizer word indices start at 1 (0 is the padding value), so the matrix needs
# len(word_index) + 1 rows to be able to hold the highest index. Without the +1,
# a vocabulary smaller than max_num_words raises IndexError on its last word.
num_words = min(max_num_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        # index falls outside the capped vocabulary
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words without a GloVe vector keep their all-zero row
        embedding_matrix[i] = embedding_vector

### Save constructed data and embedding matrix

In [25]:
# Persist the padded sequences, one-hot labels and embedding matrix, together with
# the scalar settings they were built with.
np.savez(
    "data_and_embedding",
    num_words=num_words,
    embedding_dim=embedding_dim,
    max_sequence_length=max_sequence_length,
    data=data,
    labels=labels,
    embedding_matrix=embedding_matrix,
)