In [1]:
# Importing packages
# Keras 2.2.4 compatible with Tensorflow 1.14.0
import json
import numpy as np
import pandas as pd
import os
import string
# Load Basic Model
import codecs
from keras_bert import load_trained_model_from_checkpoint
# Convert Data to Array
from tqdm import tqdm
from keras_bert import Tokenizer
# Build Custom Model
from tensorflow.python import keras
from keras_bert import AdamWarmup, calc_train_steps
# Initialize Variables
import tensorflow as tf
import tensorflow.keras.backend as K
# Convert to TPU Model
from keras_bert import get_custom_objects

Using TensorFlow backend.


In [2]:
# Reading in JSON file and putting into a list
runway = []
for line in open("renttherunway_final_data.json", 'r'):
    runway.append(json.loads(line))

In [3]:
# Looking at the structure of the data
# We will train using the review text as feature and rating as label
runway[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

In [4]:
# Looking at the length of the dataset
len(runway)

192544

In [5]:
# Isolating labels
rating = []
for i in range(192544):
    rating.append(runway[i]['rating'])

In [6]:
# Isolating features
review_text = []
for j in range(192544):
    review_text.append(runway[j]['review_text'])

In [7]:
# Defining a preprocessing function for the review text
def split_to_tokens(text):
    '''Function takes in text and preprocesses it for use in Bert'''
    for x in string.punctuation:
        text = [item.replace(x, '') for item in text]
    for y in string.digits:
        text = [item.replace(y, '8') for item in text]
    text = [item.lower() for item in text]
    return(text)

In [8]:
# Preprocessing the review text
review_processed = split_to_tokens(review_text)
review_processed[0:3]

['an adorable romper belt and zipper were a little hard to navigate in a full day of wearbathroom use but thats to be expected wish it had pockets but other than that absolutely perfect i got a million compliments',
 'i rented this dress for a photo shoot the theme was hollywood glam and big beautiful hats the dress was very comfortable and easy to move around in it is definitely on my list to rent again for another formal event ',
 'this hugged in all the right places it was a perfect dress for my event and i received so many compliments on it not to mention customer service was great getting this to me in less than 88 hours']

In [9]:
# Looking at the different categories of ratings available
set(rating)

{'10', '2', '4', '6', '8', None}

In [10]:
# Put in dataframe and one-hot encode
review_df = pd.DataFrame()
review_df['Text'] = review_text
review_df['Rating'] = rating

one_hot = pd.get_dummies(review_df["Rating"])
review_one_hot = pd.DataFrame()
review_one_hot = pd.concat([review_df['Text'], one_hot], axis=1)
review_one_hot.head()

Unnamed: 0,Text,10,2,4,6,8
0,An adorable romper! Belt and zipper were a lit...,1,0,0,0,0
1,I rented this dress for a photo shoot. The the...,1,0,0,0,0
2,This hugged in all the right places! It was a ...,1,0,0,0,0
3,I rented this for my company's black tie award...,0,0,0,0,1
4,I have always been petite in my upper body and...,1,0,0,0,0


In [11]:
# Defining hyperparameters
SEQ_LEN = 128
BATCH_SIZE = 128
EPOCHS = 5
LR = 1e-4

In [12]:
# These files must be put in current directory for paths to work
# Need to install wget via homebrew before can download pretrained model
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

# TF_KERAS must be added to environment variables in order to use TPU
os.environ['TF_KERAS'] = '1'

In [13]:
# Load Basic Model
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)
        
model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=True,
    trainable=True,
    seq_len=SEQ_LEN,
)

W0709 19:33:32.773498 140735895495552 deprecation_wrapper.py:119] From /Users/jamesdarmody/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0709 19:33:32.806383 140735895495552 deprecation_wrapper.py:119] From /Users/jamesdarmody/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0709 19:33:32.854593 140735895495552 deprecation_wrapper.py:119] From /Users/jamesdarmody/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:131: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0709 19:33:32.855237 140735895495552 deprecation_wrapper.py:119] From /Users/jamesdarmody/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Plea

In [14]:
len(review_processed)

192544

In [15]:
# Tokenizer class is used for splitting texts and generating indices
tokenizer = Tokenizer(token_dict)
token_indices, review_indices = [], []
for j in tqdm(range(0,len(review_processed))):
    ids, segments = tokenizer.encode(review_processed[j], max_len=SEQ_LEN)
    token_indices.append(ids)
    review_indices.append(segments)

100%|██████████| 192544/192544 [01:55<00:00, 1667.34it/s]


In [16]:
df_one_hot = pd.DataFrame()
df_one_hot['Token Indices'] = token_indices
df_one_hot = pd.concat([df_one_hot['Token Indices'], one_hot], axis=1)
df_one_hot.head()

Unnamed: 0,Token Indices,10,2,4,6,8
0,"[101, 2019, 23677, 17083, 4842, 5583, 1998, 22...",1,0,0,0,0
1,"[101, 1045, 12524, 2023, 4377, 2005, 1037, 630...",1,0,0,0,0
2,"[101, 2023, 10308, 1999, 2035, 1996, 2157, 318...",1,0,0,0,0
3,"[101, 1045, 12524, 2023, 2005, 2026, 2194, 201...",0,0,0,0,1
4,"[101, 1045, 2031, 2467, 2042, 20146, 1999, 202...",1,0,0,0,0


In [17]:
len(df_one_hot)

192544

In [18]:
# Build custom model
# import statement of keras needs to be here for this to work
import keras
inputs = model.inputs[:2]
# NSP > Next Sentence Prediction Dense Layer
dense = model.get_layer('NSP-Dense').output
outputs = keras.layers.Dense(units=1, activation='softmax')(dense)
decay_steps, warmup_steps = calc_train_steps(
    len(df_one_hot),
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,)

model = keras.models.Model(inputs, outputs)
model.compile(
    AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'])

W0709 19:36:36.404079 140735895495552 deprecation_wrapper.py:119] From /Users/jamesdarmody/anaconda3/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [19]:
tf.shape(dense)

<tf.Tensor 'Shape:0' shape=(2,) dtype=int32>

In [20]:
# Initialize Variables
sess = K.get_session()
uninitialized_variables = set([i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())])
init_op = tf.variables_initializer(
    [v for v in tf.global_variables() if v.name.split(':')[0] in uninitialized_variables])
sess.run(init_op)

W0709 19:36:44.836297 140735895495552 deprecation.py:323] From /Users/jamesdarmody/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py:1354: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [None]:
# Convert to TPU Model
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
strategy = tf.contribu.tpu.TPUDistributionStrategy(
    tf.contrib.cluster_resolver.TPUClusterResolver(tpu=tpu_address))
with tf.keras.utils.custom_object_scope(get_custom_objects()):
    tpu_model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)

In [21]:
# Converting the 3 inputs to arrays
token_indices_array = np.array(token_indices)
zeros_array = np.array(review_indices)
user_reviews_array = np.array(df_one_hot.iloc[:,1:])
rating_array = np.array(rating)

# Shaping way model requires
from sklearn.model_selection import train_test_split
#train_x = [token_indices_array, zeros_array]
#train_y = rating_array
X_train, X_test, y_train, y_test = train_test_split(token_indices_array,
                        rating_array, test_size=0.25, random_state=42)

In [22]:
train_x = [X_train, np.zeros_like(X_train)]
test_x = [X_test, np.zeros_like(X_test)]
train_y = y_train
test_y = y_test

In [23]:
with tf.keras.utils.custom_object_scope(get_custom_objects()):
    tqdm(model.fit(
        train_x,
        train_y,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE
    ))

Epoch 1/5


InvalidArgumentError: Received a label value of 10 which is outside the valid range of [0, 1).  Label values: 8 10 10 8 10 10 10 10 8 10 6 8 10 10 8 8 10 8 10 10 10 10 6 10 8 10 10 10 8 10 10 8 10 8 10 10 10 10 8 10 10 8 10 10 10 6 10 10 8 8 10 2 10 10 10 6 8 10 8 10 10 10 8 8 8 10 8 10 10 10 10 10 10 8 10 10 10 10 8 10 8 10 6 10 10 10 10 8 6 10 10 6 8 10 10 10 8 10 10 8 8 10 10 8 4 8 8 8 8 10 6 8 8 10 8 8 10 10 8 8 10 8 10 10 8 10 10 10
	 [[{{node loss/dense_1_loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]]

In [None]:
type(df)

In [None]:
# Sandbox
# Convert data to Array

#token_list = []
#for i in tqdm(range(0,len(review_processed))):
#    tokens = tokenizer.tokenize(review_processed[i])
#    token_list.append(tokens)

In [None]:
# pip uninstall keras
# pip install keras==2.2.4
# pip uninstall tensorflow
# pip install tensorflow==1.12.0
# pip install tensorflow==2.0.0-beta1
# python -c 'import tensorflow as tf; print(tf.__version__)'
# python -c 'import keras; print(keras.__version__)'
# believe keras only compatible with python 2.7-3.6, not 3.7
# I have python 3.7.3